lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".  But almost all occurrences of objaddr have been
  10 # removed in favor of using 'no overloading'.  You also would have to go
  11 # through and replace occurrences like:
  12 #       my $addr; { no overloading; $addr = 0+$self; }
  13 # with
  14 #       my $addr = main::objaddr $self;
  15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
  16 # that instituted this change.)
  17
  18 require 5.010_001;
  19 use strict;
  20 use warnings;
  21 use Carp;
  22 use File::Find;
  23 use File::Path;
  24 use File::Spec;
  25 use Text::Tabs;
  26
  27 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  28
  29 ##########################################################################
  30 #
  31 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  32 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  33 # a pod file and a .t file
  34 #
  35 # The structure of this file is:
  36 #   First these introductory comments; then
  37 #   code needed for everywhere, such as debugging stuff; then
  38 #   code to handle input parameters; then
  39 #   data structures likely to be of external interest (some of which depend on
  40 #       the input parameters, so follows them; then
  41 #   more data structures and subroutine and package (class) definitions; then
  42 #   the small actual loop to process the input files and finish up; then
  43 #   a __DATA__ section, for the .t tests
  44 #
  45 # This program works on all releases of Unicode through at least 5.2.  The
  46 # outputs have been scrutinized most intently for release 5.1.  The others
  47 # have been checked for somewhat more than just sanity.  It can handle all
  48 # existing Unicode character properties in those releases.
  49 #
  50 # This program is mostly about Unicode character (or code point) properties.
  51 # A property describes some attribute or quality of a code point, like if it
  52 # is lowercase or not, its name, what version of Unicode it was first defined
  53 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  54 # possibilities by making all properties into mappings from each code point
  55 # into some corresponding value.  In the case of it being lowercase or not,
  56 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  57 # property maps each Unicode code point to a single value, called a "property
  58 # value".  (Hence each Unicode property is a true mathematical function with
  59 # exactly one value per code point.)
  60 #
  61 # When using a property in a regular expression, what is desired isn't the
  62 # mapping of the code point to its property's value, but the reverse (or the
  63 # mathematical "inverse relation"): starting with the property value, "Does a
  64 # code point map to it?"  These are written in a "compound" form:
  65 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  66 # files containing the lists of code points that map to each such regular
  67 # expression property value, one file per list
  68 #
  69 # There is also a single form shortcut that Perl adds for many of the commonly
  70 # used properties.  This happens for all binary properties, plus script,
  71 # general_category, and block properties.
  72 #
  73 # Thus the outputs of this program are files.  There are map files, mostly in
  74 # the 'To' directory; and there are list files for use in regular expression
  75 # matching, all in subdirectories of the 'lib' directory, with each
  76 # subdirectory being named for the property that the lists in it are for.
  77 # Bookkeeping, test, and documentation files are also generated.
  78
  79 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  80 my $map_directory = 'To';        # Where map files go.
  81
  82 # DATA STRUCTURES
  83 #
  84 # The major data structures of this program are Property, of course, but also
  85 # Table.  There are two kinds of tables, very similar to each other.
  86 # "Match_Table" is the data structure giving the list of code points that have
  87 # a particular property value, mentioned above.  There is also a "Map_Table"
  88 # data structure which gives the property's mapping from code point to value.
  89 # There are two structures because the match tables need to be combined in
  90 # various ways, such as constructing unions, intersections, complements, etc.,
  91 # and the map ones don't.  And there would be problems, perhaps subtle, if
  92 # a map table were inadvertently operated on in some of those ways.
  93 # The use of separate classes with operations defined on one but not the other
  94 # prevents accidentally confusing the two.
  95 #
  96 # At the heart of each table's data structure is a "Range_List", which is just
  97 # an ordered list of "Ranges", plus ancillary information, and methods to
  98 # operate on them.  A Range is a compact way to store property information.
  99 # Each range has a starting code point, an ending code point, and a value that
 100 # is meant to apply to all the code points between the two end points,
 101 # inclusive.  For a map table, this value is the property value for those
 102 # code points.  Two such ranges could be written like this:
 103 #   0x41 .. 0x5A, 'Upper',
 104 #   0x61 .. 0x7A, 'Lower'
 105 #
 106 # Each range also has a type used as a convenience to classify the values.
 107 # Most ranges in this program will be Type 0, or normal, but there are some
 108 # ranges that have a non-zero type.  These are used only in map tables, and
 109 # are for mappings that don't fit into the normal scheme of things.  Mappings
 110 # that require a hash entry to communicate with utf8.c are one example;
 111 # another example is mappings for charnames.pm to use which indicate a name
 112 # that is algorithmically determinable from its code point (and vice-versa).
 113 # These are used to significantly compact these tables, instead of listing
 114 # each one of the tens of thousands individually.
 115 #
 116 # In a match table, the value of a range is irrelevant (and hence the type as
 117 # well, which will always be 0), and arbitrarily set to the null string.
 118 # Using the example above, there would be two match tables for those two
 119 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 120 # named Lower would contain 0x61..0x7A.
 121 #
 122 # Actually, there are two types of range lists, "Range_Map" is the one
 123 # associated with map tables, and "Range_List" with match tables.
 124 # Again, this is so that methods can be defined on one and not the other so as
 125 # to prevent operating on them in incorrect ways.
 126 #
 127 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 128 # in the perl core.  All tables could in theory be written, but some are
 129 # suppressed because there is no current practical use for them.  It is easy
 130 # to change which get written by changing various lists that are near the top
 131 # of the actual code in this file.  The table data structures contain enough
 132 # ancillary information to allow them to be treated as separate entities for
 133 # writing, such as the path to each one's file.  There is a heading in each
 134 # map table that gives the format of its entries, and what the map is for all
 135 # the code points missing from it.  (This allows tables to be more compact.)
 136 #
 137 # The Property data structure contains one or more tables.  All properties
 138 # contain a map table (except the $perl property which is a
 139 # pseudo-property containing only match tables), and any properties that
 140 # are usable in regular expression matches also contain various matching
 141 # tables, one for each value the property can have.  A binary property can
 142 # have two values, True and False (or Y and N, which are preferred by Unicode
 143 # terminology).  Thus each of these properties will have a map table that
 144 # takes every code point and maps it to Y or N (but having ranges cuts the
 145 # number of entries in that table way down), and two match tables, one
 146 # which has a list of all the code points that map to Y, and one for all the
 147 # code points that map to N.  (For each of these, a third table is also
 148 # generated for the pseudo Perl property.  It contains the identical code
 149 # points as the Y table, but can be written, not in the compound form, but in
 150 # a "single" form like \p{IsUppercase}.)  Many properties are binary, but some
 151 # properties have several possible values, some have many, and properties like
 152 # Name have a different value for every named code point.  Those will not,
 153 # unless the controlling lists are changed, have their match tables written
 154 # out.  But all the ones which can be used in regular expression \p{} and \P{}
 155 # constructs will.  Generally a property will have either its map table or its
 156 # match tables written but not both.  Again, what gets written is controlled
 157 # by lists which can easily be changed.
 158 #
 159 # For information about the Unicode properties, see Unicode's UAX44 document:
 160
 161 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 162
 163 # As stated earlier, this program will work on any release of Unicode so far.
 164 # Most obvious problems in earlier data have NOT been corrected except when
 165 # necessary to make Perl or this program work reasonably.  For example, no
 166 # folding information was given in early releases, so this program uses the
 167 # substitute of lower case, just so that a regular expression with the /i
 168 # option will do something that actually gives the right results in many
 169 # cases.  There are also a couple other corrections for version 1.1.5,
 170 # commented at the point they are made.  As an example of corrections that
 171 # weren't made (but could be) is this statement from DerivedAge.txt: "The
 172 # supplementary private use code points and the non-character code points were
 173 # assigned in version 2.0, but not specifically listed in the UCD until
 174 # versions 3.0 and 3.1 respectively."  (To be precise it was 3.0.1 not 3.0.0)
 175 # More information on Unicode version glitches is further down in these
 176 # introductory comments.
 177 #
 178 # This program works on all properties as of 5.2, though the files for some
 179 # are suppressed from apparent lack of demand for them.  You can change which
 180 # are output by changing lists in this program.
 181 #
 182 # The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
 183 # loose matchings rules (from Unicode TR18):
 184 #
 185 #    The recommended names for UCD properties and property values are in
 186 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 187 #    [PropValue]. There are both abbreviated names and longer, more
 188 #    descriptive names. It is strongly recommended that both names be
 189 #    recognized, and that loose matching of property names be used,
 190 #    whereby the case distinctions, whitespace, hyphens, and underbar
 191 #    are ignored.
 192 # The program still allows Fuzzy to override its determination of if loose
 193 # matching should be used, but it isn't currently used, as it is no longer
 194 # needed; the calculations it makes are good enough.
 195 #
 196 # SUMMARY OF HOW IT WORKS:
 197 #
 198 #   Process arguments
 199 #
 200 #   A list is constructed containing each input file that is to be processed
 201 #
 202 #   Each file on the list is processed in a loop, using the associated handler
 203 #   code for each:
 204 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 205 #            first.  These files name the properties and property values.
 206 #            Objects are created of all the property and property value names
 207 #            that the rest of the input should expect, including all synonyms.
 208 #        The other input files give mappings from properties to property
 209 #           values.  That is, they list code points and say what the mapping
 210 #           is under the given property.  Some files give the mappings for
 211 #           just one property; and some for many.  This program goes through
 212 #           each file and populates the properties from them.  Some properties
 213 #           are listed in more than one file, and Unicode has set up a
 214 #           precedence as to which has priority if there is a conflict.  Thus
 215 #           the order of processing matters, and this program handles the
 216 #           conflict possibility by processing the overriding input files
 217 #           last, so that if necessary they replace earlier values.
 218 #        After this is all done, the program creates the property mappings not
 219 #            furnished by Unicode, but derivable from what it does give.
 220 #        The tables of code points that match each property value in each
 221 #            property that is accessible by regular expressions are created.
 222 #        The Perl-defined properties are created and populated.  Many of these
 223 #            require data determined from the earlier steps
 224 #        Any Perl-defined synonyms are created, and name clashes between Perl
 225 #            and Unicode are reconciled and warned about.
 226 #        All the properties are written to files
 227 #        Any other files are written, and final warnings issued.
 228 #
 229 # For clarity, a number of operators have been overloaded to work on tables:
 230 #   ~ means invert (take all characters not in the set).  The more
 231 #       conventional '!' is not used because of the possibility of confusing
 232 #       it with the actual boolean operation.
 233 #   + means union
 234 #   - means subtraction
 235 #   & means intersection
 236 # The precedence of these is the order listed.  Parentheses should be
 237 # copiously used.  These are not a general scheme.  The operations aren't
 238 # defined for a number of things, deliberately, to avoid getting into trouble.
 239 # Operations are done on references and affect the underlying structures, so
 240 # that the copy constructors for them have been overloaded to not return a new
 241 # clone, but the input object itself.
 242 #
 243 # The bool operator is deliberately not overloaded to avoid confusion with
 244 # "should it mean if the object merely exists, or also is non-empty?".
 245 #
 246 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 247 #
 248 # This program needs to be able to run under miniperl.  Therefore, it uses a
 249 # minimum of other modules, and hence implements some things itself that could
 250 # be gotten from CPAN
 251 #
 252 # This program uses inputs published by the Unicode Consortium.  These can
 253 # change incompatibly between releases without the Perl maintainers realizing
 254 # it.  Therefore this program is now designed to try to flag these.  It looks
 255 # at the directories where the inputs are, and flags any unrecognized files.
 256 # It keeps track of all the properties in the files it handles, and flags any
 257 # that it doesn't know how to handle.  It also flags any input lines that
 258 # don't match the expected syntax, among other checks.
 259 #
 260 # It is also designed so if a new input file matches one of the known
 261 # templates, one hopefully just needs to add it to a list to have it
 262 # processed.
 263 #
 264 # As mentioned earlier, some properties are given in more than one file.  In
 265 # particular, the files in the extracted directory are supposedly just
 266 # reformattings of the others.  But they contain information not easily
 267 # derivable from the other files, including results for Unihan, which this
 268 # program doesn't ordinarily look at, and for unassigned code points.  They
 269 # also have historically had errors or been incomplete.  In an attempt to
 270 # create the best possible data, this program thus processes them first to
 271 # glean information missing from the other files; then processes those other
 272 # files to override any errors in the extracted ones.  Much of the design was
 273 # driven by this need to store things and then possibly override them.
 274 #
 275 # It tries to keep fatal errors to a minimum, to generate something usable for
 276 # testing purposes.  It always looks for files that could be inputs, and will
 277 # warn about any that it doesn't know how to handle (the -q option suppresses
 278 # the warning).
 279 #
 280 # Why have files written out for binary 'N' matches?
 281 #   For binary properties, if you know the mapping for either Y or N; the
 282 #   other is trivial to construct, so could be done at Perl run-time by just
 283 #   complementing the result, instead of having a file for it.  That is, if
 284 #   someone types in \p{foo: N}, Perl could translate that to \P{foo: Y} and
 285 #   not need a file.   The problem is communicating to Perl that a given
 286 #   property is binary.  Perl can't figure it out from looking at the N (or
 287 #   No), as some non-binary properties have these as property values.  So
 288 #   rather than inventing a way to communicate this info back to the core,
 289 #   which would have required changes there as well, it was simpler just to
 290 #   add the extra tables.
 291 #
 292 # Why is there more than one type of range?
 293 #   This simplified things.  There are some very specialized code points that
 294 #   have to be handled specially for output, such as Hangul syllable names.
 295 #   By creating a range type (done late in the development process), it
 296 #   allowed this to be stored with the range, and overridden by other input.
 297 #   Originally these were stored in another data structure, and it became a
 298 #   mess trying to decide if a second file that was for the same property was
 299 #   overriding the earlier one or not.
 300 #
 301 # Why are there two kinds of tables, match and map?
 302 #   (And there is a base class shared by the two as well.)  As stated above,
 303 #   they actually are for different things.  Development proceeded much more
 304 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 305 #   give the property value for every code point (actually every code point
 306 #   that doesn't map to a default value).  Match tables are used for regular
 307 #   expression matches, and are essentially the inverse mapping.  Separating
 308 #   the two allows more specialized methods, and error checks so that one
 309 #   can't just take the intersection of two map tables, for example, as that
 310 #   is nonsensical.
 311 #
 312 # There are no match tables generated for matches of the null string.  These
 313 # would like like qr/\p{JSN=}/ currently without modifying the regex code.
 314 # Perhaps something like them could be added if necessary.  The JSN does have
 315 # a real code point U+110B that maps to the null string, but it is a
 316 # contributory property, and therefore not output by default.  And it's easily
 317 # handled so far by making the null string the default where it is a
 318 # possibility.
 319 #
 320 # DEBUGGING
 321 #
 322 # This program is written so it will run under miniperl.  Occasionally changes
 323 # will cause an error where the backtrace doesn't work well under miniperl.
 324 # To diagnose the problem, you can instead run it under regular perl, if you
 325 # have one compiled.
 326 #
 327 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 328 # to return true.  Then a line like
 329 #
 330 # local $to_trace = 1 if main::DEBUG;
 331 #
 332 # can be added to enable tracing in its lexical scope or until you insert
 333 # another line:
 334 #
 335 # local $to_trace = 0 if main::DEBUG;
 336 #
 337 # then use a line like "trace $a, @b, %c, ...;
 338 #
 339 # Some of the more complex subroutines already have trace statements in them.
 340 # Permanent trace statements should be like:
 341 #
 342 # trace ... if main::DEBUG && $to_trace;
 343 #
 344 # If there is just one or a few files that you're debugging, you can easily
 345 # cause most everything else to be skipped.  Change the line
 346 #
 347 # my $debug_skip = 0;
 348 #
 349 # to 1, and every file whose object is in @input_file_objects and doesn't have
 350 # a, 'non_skip => 1,' in its constructor will be skipped.
 351 #
 352 # FUTURE ISSUES
 353 #
 354 # The program would break if Unicode were to change its names so that
 355 # interior white space, underscores, or dashes differences were significant
 356 # within property and property value names.
 357 #
 358 # It might be easier to use the xml versions of the UCD if this program ever
 359 # would need heavy revision, and the ability to handle old versions was not
 360 # required.
 361 #
 362 # There is the potential for name collisions, in that Perl has chosen names
 363 # that Unicode could decide it also likes.  There have been such collisions in
 364 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 365 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 366 # collisions, which were withdrawn before the final release, because of Perl's
 367 # and other's protests.  These all involved new properties which began with
 368 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 369 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 370 # Unicode document, so they are unlikely to be used by Unicode for another
 371 # purpose.  However, they might try something beginning with 'In', or use any
 372 # of the other Perl-defined properties.  This program will warn you of name
 373 # collisions, and refuse to generate tables with them, but manual intervention
 374 # will be required in this event.  One scheme that could be implemented, if
 375 # necessary, would be to have this program generate another file, or add a
 376 # field to mktables.lst that gives the date of first definition of a property.
 377 # Each new release of Unicode would use that file as a basis for the next
 378 # iteration.  And the Perl synonym addition code could sort based on the age
 379 # of the property, so older properties get priority, and newer ones that clash
 380 # would be refused; hence existing code would not be impacted, and some other
 381 # synonym would have to be used for the new property.  This is ugly, and
 382 # manual intervention would certainly be easier to do in the short run; lets
 383 # hope it never comes to this.
 384 #
 385 # A NOTE ON UNIHAN
 386 #
 387 # This program can generate tables from the Unihan database.  But it doesn't
 388 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 389 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 390 # database was split into 8 different files, all beginning with the letters
 391 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 392 # know which of the many properties in the file(s) should have tables created
 393 # for them.  It will create tables for any properties listed in
 394 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 395 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 396 # property you want is not in those files of the release you are building
 397 # against, you must add it to those two arrays.  Starting in 4.0, the
 398 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 399 # is present in the directory, a table will be generated for that property.
 400 # In 5.2, several more properties were added.  For your convenience, the two
 401 # arrays are initialized with all the 5.2 listed properties that are also in
 402 # earlier releases.  But these are commented out.  You can just uncomment the
 403 # ones you want, or use them as a template for adding entries for other
 404 # properties.
 405 #
 406 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 407 # and filter_unihan_line() are the functions where this is done.  This program
 408 # already does some adjusting to make the lines look more like the rest of the
 409 # Unicode DB;  You can see what that is in filter_unihan_line()
 410 #
 411 # There is a bug in the 3.2 data file in which some values for the
 412 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 413 # could be added for these; or for a particular installation, the Unihan.txt
 414 # file could be edited to fix them.
 415 # have to be
 416 #
 417 # HOW TO ADD A FILE TO BE PROCESSED
 418 #
 419 # A new file from Unicode needs to have an object constructed for it in
 420 # @input_file_objects, probably at the end or at the end of the extracted
 421 # ones.  The program should warn you if its name will clash with others on
 422 # restrictive file systems, like DOS.  If so, figure out a better name, and
 423 # add lines to the README.perl file giving that.  If the file is a character
 424 # property, it should be in the format that Unicode has by default
 425 # standardized for such files for the more recently introduced ones.
 426 # If so, the Input_file constructor for @input_file_objects can just be the
 427 # file name and release it first appeared in.  If not, then it should be
 428 # possible to construct an each_line_handler() to massage the line into the
 429 # standardized form.
 430 #
 431 # For non-character properties, more code will be needed.  You can look at
 432 # the existing entries for clues.
 433 #
 434 # UNICODE VERSIONS NOTES
 435 #
 436 # The Unicode UCD has had a number of errors in it over the versions.  And
 437 # these remain, by policy, in the standard for that version.  Therefore it is
 438 # risky to correct them, because code may be expecting the error.  So this
 439 # program doesn't generally make changes, unless the error breaks the Perl
 440 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 441 # for U+1105, which causes real problems for the algorithms for Jamo
 442 # calculations, so it is changed here.
 443 #
 444 # But it isn't so clear cut as to what to do about concepts that are
 445 # introduced in a later release; should they extend back to earlier releases
 446 # where the concept just didn't exist?  It was easier to do this than to not,
 447 # so that's what was done.  For example, the default value for code points not
 448 # in the files for various properties was probably undefined until changed by
 449 # some version.  No_Block for blocks is such an example.  This program will
 450 # assign No_Block even in Unicode versions that didn't have it.  This has the
 451 # benefit that code being written doesn't have to special case earlier
 452 # versions; and the detriment that it doesn't match the Standard precisely for
 453 # the affected versions.
 454 #
 455 # Here are some observations about some of the issues in early versions:
 456 #
 457 # The number of code points in \p{alpha} halve in 2.1.9.  It turns out that
 458 # the reason is that the CJK block starting at 4E00 was removed from PropList,
 459 # and was not put back in until 3.1.0
 460 #
 461 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 462 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 463 # reason is that 3.2 introduced U+205F=medium math space, which was not
 464 # classed as white space, but Perl figured out that it should have been. 4.0
 465 # reclassified it correctly.
 466 #
 467 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 468 # this was erroneously a synonym for 202.  In 4.0, ATB became 202, and ATBL
 469 # was left with no code points, as all the ones that mapped to 202 stayed
 470 # mapped to 202.  Thus if your program used the numeric name for the class,
 471 # it would not have been affected, but if it used the mnemonic, it would have
 472 # been.
 473 #
 474 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that code
 475 # points which eventually came to have this script property value, instead
 476 # mapped to "Unknown".  But in the next release all these code points were
 477 # moved to \p{sc=common} instead.
 478 #
 479 # The default for missing code points for BidiClass is complicated.  Starting
 480 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 481 # tries to do the best it can for earlier releases.  It is done in
 482 # process_PropertyAliases()
 483 #
 484 ##############################################################################
 485
 486 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 487                         # and errors
 488 my $MAX_LINE_WIDTH = 78;
 489
 490 # Debugging aid to skip most files so as to not be distracted by them when
 491 # concentrating on the ones being debugged.  Add
 492 # non_skip => 1,
 493 # to the constructor for those files you want processed when you set this.
 494 # Files with a first version number of 0 are special: they are always
 495 # processed regardless of the state of this flag.
 496 my $debug_skip = 0;
 497
 498 # Set to 1 to enable tracing.
 499 our $to_trace = 0;
 500
 501 { # Closure for trace: debugging aid
 502     my $print_caller = 1;        # ? Include calling subroutine name
 503     my $main_with_colon = 'main::';
 504     my $main_colon_length = length($main_with_colon);
 505
 506     sub trace {
 507         return unless $to_trace;        # Do nothing if global flag not set
 508
 509         my @input = @_;
 510
 511         local $DB::trace = 0;
 512         $DB::trace = 0;          # Quiet 'used only once' message
 513
 514         my $line_number;
 515
 516         # Loop looking up the stack to get the first non-trace caller
 517         my $caller_line;
 518         my $caller_name;
 519         my $i = 0;
 520         do {
 521             $line_number = $caller_line;
 522             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 523             $caller = $main_with_colon unless defined $caller;
 524
 525             $caller_name = $caller;
 526
 527             # get rid of pkg
 528             $caller_name =~ s/.*:://;
 529             if (substr($caller_name, 0, $main_colon_length)
 530                 eq $main_with_colon)
 531             {
 532                 $caller_name = substr($caller_name, $main_colon_length);
 533             }
 534
 535         } until ($caller_name ne 'trace');
 536
 537         # If the stack was empty, we were called from the top level
 538         $caller_name = 'main' if ($caller_name eq ""
 539                                     || $caller_name eq 'trace');
 540
 541         my $output = "";
 542         foreach my $string (@input) {
 543             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 544             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 545                 $output .= simple_dumper($string);
 546             }
 547             else {
 548                 $string = "$string" if ref $string;
 549                 $string = $UNDEF unless defined $string;
 550                 chomp $string;
 551                 $string = '""' if $string eq "";
 552                 $output .= " " if $output ne ""
 553                                 && $string ne ""
 554                                 && substr($output, -1, 1) ne " "
 555                                 && substr($string, 0, 1) ne " ";
 556                 $output .= $string;
 557             }
 558         }
 559
 560         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 561         print STDERR "$caller_name: " if $print_caller;
 562         print STDERR $output, "\n";
 563         return;
 564     }
 565 }
 566
 567 # This is for a rarely used development feature that allows you to compare two
 568 # versions of the Unicode standard without having to deal with changes caused
 569 # by the code points introduced in the later verson.  Change the 0 to a SINGLE
 570 # dotted Unicode release number (e.g. 2.1).  Only code points introduced in
 571 # that release and earlier will be used; later ones are thrown away.  You use
 572 # the version number of the earliest one you want to compare; then run this
 573 # program on directory structures containing each release, and compare the
 574 # outputs.  These outputs will therefore include only the code points common
 575 # to both releases, and you can see the changes caused just by the underlying
 576 # release semantic changes.  For versions earlier than 3.2, you must copy a
 577 # version of DAge.txt into the directory.
 578 my $string_compare_versions = DEBUG && 0; #  e.g., v2.1;
 579 my $compare_versions = DEBUG
 580                        && $string_compare_versions
 581                        && pack "C*", split /\./, $string_compare_versions;
 582
 583 sub uniques {
 584     # Returns non-duplicated input values.  From "Perl Best Practices:
 585     # Encapsulated Cleverness".  p. 455 in first edition.
 586
 587     my %seen;
 588     return grep { ! $seen{$_}++ } @_;
 589 }
 590
 591 $0 = File::Spec->canonpath($0);
 592
 593 my $make_test_script = 0;      # ? Should we output a test script
 594 my $write_unchanged_files = 0; # ? Should we update the output files even if
 595                                #    we don't think they have changed
 596 my $use_directory = "";        # ? Should we chdir somewhere.
 597 my $pod_directory;             # input directory to store the pod file.
 598 my $pod_file = 'perluniprops';
 599 my $t_path;                     # Path to the .t test file
 600 my $file_list = 'mktables.lst'; # File to store input and output file names.
 601                                # This is used to speed up the build, by not
 602                                # executing the main body of the program if
 603                                # nothing on the list has changed since the
 604                                # previous build
 605 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 606                                # make a list so that when the pumpking is
 607                                # preparing a release, s/he won't have to do
 608                                # special things
 609 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 610                                # in the input.
 611 my $output_range_counts = 1;   # ? Should we include the number of code points
 612                                # in ranges in the output
 613 # Verbosity levels; 0 is quiet
 614 my $NORMAL_VERBOSITY = 1;
 615 my $PROGRESS = 2;
 616 my $VERBOSE = 3;
 617
 618 my $verbosity = $NORMAL_VERBOSITY;
 619
 620 # Process arguments
 621 while (@ARGV) {
 622     my $arg = shift @ARGV;
 623     if ($arg eq '-v') {
 624         $verbosity = $VERBOSE;
 625     }
 626     elsif ($arg eq '-p') {
 627         $verbosity = $PROGRESS;
 628         $| = 1;     # Flush buffers as we go.
 629     }
 630     elsif ($arg eq '-q') {
 631         $verbosity = 0;
 632     }
 633     elsif ($arg eq '-w') {
 634         $write_unchanged_files = 1; # update the files even if havent changed
 635     }
 636     elsif ($arg eq '-check') {
 637         my $this = shift @ARGV;
 638         my $ok = shift @ARGV;
 639         if ($this ne $ok) {
 640             print "Skipping as check params are not the same.\n";
 641             exit(0);
 642         }
 643     }
 644     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 645         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 646     }
 647     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 648     {
 649         $make_test_script = 1;
 650     }
 651     elsif ($arg eq '-makelist') {
 652         $make_list = 1;
 653     }
 654     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 655         -d $use_directory or croak "Unknown directory '$use_directory'";
 656     }
 657     elsif ($arg eq '-L') {
 658
 659         # Existence not tested until have chdir'd
 660         $file_list = shift;
 661     }
 662     elsif ($arg eq '-globlist') {
 663         $glob_list = 1;
 664     }
 665     elsif ($arg eq '-c') {
 666         $output_range_counts = ! $output_range_counts
 667     }
 668     else {
 669         my $with_c = 'with';
 670         $with_c .= 'out' if $output_range_counts;   # Complements the state
 671         croak <<END;
 672 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 673           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 674           [-check A B ]
 675   -c          : Output comments $with_c number of code points in ranges
 676   -q          : Quiet Mode: Only output serious warnings.
 677   -p          : Set verbosity level to normal plus show progress.
 678   -v          : Set Verbosity level high:  Show progress and non-serious
 679                 warnings
 680   -w          : Write files regardless
 681   -C dir      : Change to this directory before proceeding. All relative paths
 682                 except those specified by the -P and -T options will be done
 683                 with respect to this directory.
 684   -P dir      : Output $pod_file file to directory 'dir'.
 685   -T path     : Create a test script as 'path'; overrides -maketest
 686   -L filelist : Use alternate 'filelist' instead of standard one
 687   -globlist   : Take as input all non-Test *.txt files in current and sub
 688                 directories
 689   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 690                 overrides -T
 691   -makelist   : Rewrite the file list $file_list based on current setup
 692   -check A B  : Executes $0 only if A and B are the same
 693 END
 694     }
 695 }
 696
 697 # Stores the most-recently changed file.  If none have changed, can skip the
 698 # build
 699 my $youngest = -M $0;   # Do this before the chdir!
 700
 701 # Change directories now, because need to read 'version' early.
 702 if ($use_directory) {
 703     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 704         $pod_directory = File::Spec->rel2abs($pod_directory);
 705     }
 706     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 707         $t_path = File::Spec->rel2abs($t_path);
 708     }
 709     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 710     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 711         $pod_directory = File::Spec->abs2rel($pod_directory);
 712     }
 713     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 714         $t_path = File::Spec->abs2rel($t_path);
 715     }
 716 }
 717
 718 # Get Unicode version into regular and v-string.  This is done now because
 719 # various tables below get populated based on it.  These tables are populated
 720 # here to be near the top of the file, and so easily seeable by those needing
 721 # to modify things.
 722 open my $VERSION, "<", "version"
 723                     or croak "$0: can't open required file 'version': $!\n";
 724 my $string_version = <$VERSION>;
 725 close $VERSION;
 726 chomp $string_version;
 727 my $v_version = pack "C*", split /\./, $string_version;        # v string
 728
 729 # The following are the complete names of properties with property values that
 730 # are known to not match any code points in some versions of Unicode, but that
 731 # may change in the future so they should be matchable, hence an empty file is
 732 # generated for them.
 733 my @tables_that_may_be_empty = (
 734                                 'Joining_Type=Left_Joining',
 735                                 );
 736 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 737 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 738 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 739                                                     if $v_version ge v4.1.0;
 740
 741 # The lists below are hashes, so the key is the item in the list, and the
 742 # value is the reason why it is in the list.  This makes generation of
 743 # documentation easier.
 744
 745 my %why_suppressed;  # No file generated for these.
 746
 747 # Files aren't generated for empty extraneous properties.  This is arguable.
 748 # Extraneous properties generally come about because a property is no longer
 749 # used in a newer version of Unicode.  If we generated a file without code
 750 # points, programs that used to work on that property will still execute
 751 # without errors.  It just won't ever match (or will always match, with \P{}).
 752 # This means that the logic is now likely wrong.  I (khw) think its better to
 753 # find this out by getting an error message.  Just move them to the table
 754 # above to change this behavior
 755 my %why_suppress_if_empty_warn_if_not = (
 756
 757    # It is the only property that has ever officially been removed from the
 758    # Standard.  The database never contained any code points for it.
 759    'Special_Case_Condition' => 'Obsolete',
 760
 761    # Apparently never official, but there were code points in some versions of
 762    # old-style PropList.txt
 763    'Non_Break' => 'Obsolete',
 764 );
 765
 766 # These would normally go in the warn table just above, but they were changed
 767 # a long time before this program was written, so warnings about them are
 768 # moot.
 769 if ($v_version gt v3.2.0) {
 770     push @tables_that_may_be_empty,
 771                                 'Canonical_Combining_Class=Attached_Below_Left'
 772 }
 773
 774 # These are listed in the Property aliases file in 5.2, but Unihan is ignored
 775 # unless explicitly added.
 776 if ($v_version ge v5.2.0) {
 777     my $unihan = 'Unihan; remove from list if using Unihan';
 778     foreach my $table qw (
 779                            kAccountingNumeric
 780                            kOtherNumeric
 781                            kPrimaryNumeric
 782                            kCompatibilityVariant
 783                            kIICore
 784                            kIRG_GSource
 785                            kIRG_HSource
 786                            kIRG_JSource
 787                            kIRG_KPSource
 788                            kIRG_MSource
 789                            kIRG_KSource
 790                            kIRG_TSource
 791                            kIRG_USource
 792                            kIRG_VSource
 793                            kRSUnicode
 794                         )
 795     {
 796         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 797     }
 798 }
 799
 800 # Properties that this program ignores.
 801 my @unimplemented_properties = (
 802 'Unicode_Radical_Stroke'    # Remove if changing to handle this one.
 803 );
 804
 805 # There are several types of obsolete properties defined by Unicode.  These
 806 # must be hand-edited for every new Unicode release.
 807 my %why_deprecated;  # Generates a deprecated warning message if used.
 808 my %why_stabilized;  # Documentation only
 809 my %why_obsolete;    # Documentation only
 810
 811 {   # Closure
 812     my $simple = 'Perl uses the more complete version of this property';
 813     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 814
 815     my $other_properties = 'other properties';
 816     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 817     my $why_no_expand  = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
 818
 819     %why_deprecated = (
 820         'Grapheme_Link' => 'Deprecated by Unicode.  Use ccc=vr (Canonical_Combining_Class=Virama) instead',
 821         'Jamo_Short_Name' => $contributory,
 822         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 823         'Other_Alphabetic' => $contributory,
 824         'Other_Default_Ignorable_Code_Point' => $contributory,
 825         'Other_Grapheme_Extend' => $contributory,
 826         'Other_ID_Continue' => $contributory,
 827         'Other_ID_Start' => $contributory,
 828         'Other_Lowercase' => $contributory,
 829         'Other_Math' => $contributory,
 830         'Other_Uppercase' => $contributory,
 831     );
 832
 833     %why_suppressed = (
 834         # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
 835         # contains the same information, but without the algorithmically
 836         # determinable Hangul syllables'.  This file is not published, so it's
 837         # existence is not noted in the comment.
 838         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
 839
 840         'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo.  Obsoleted, and code points for it removed in Unicode 5.2',
 841         'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo.  If there is no later name for a code point, then this one is used instead in charnames",
 842
 843         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold",
 844         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 845         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 846         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 847
 848         'Name' => "Accessible via 'use charnames;'",
 849         'Name_Alias' => "Accessible via 'use charnames;'",
 850
 851         # These are sort of jumping the gun; deprecation is proposed for
 852         # Unicode version 6.0, but they have never been exposed by Perl, and
 853         # likely are soon to be deprecated, so best not to expose them.
 854         FC_NFKC_Closure => 'Use NFKC_Casefold instead',
 855         Expands_On_NFC => $why_no_expand,
 856         Expands_On_NFD => $why_no_expand,
 857         Expands_On_NFKC => $why_no_expand,
 858         Expands_On_NFKD => $why_no_expand,
 859     );
 860
 861     # The following are suppressed because they were made contributory or
 862     # deprecated by Unicode before Perl ever thought about supporting them.
 863     foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
 864         $why_suppressed{$property} = $why_deprecated{$property};
 865     }
 866
 867     # Customize the message for all the 'Other_' properties
 868     foreach my $property (keys %why_deprecated) {
 869         next if (my $main_property = $property) !~ s/^Other_//;
 870         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
 871     }
 872 }
 873
 874 if ($v_version ge 4.0.0) {
 875     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
 876 }
 877 if ($v_version ge 5.2.0) {
 878     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
 879 }
 880
 881 # Probably obsolete forever
 882 if ($v_version ge v4.1.0) {
 883     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common"';
 884 }
 885
 886 # This program can create files for enumerated-like properties, such as
 887 # 'Numeric_Type'.  This file would be the same format as for a string
 888 # property, with a mapping from code point to its value, so you could look up,
 889 # for example, the script a code point is in.  But no one so far wants this
 890 # mapping, or they have found another way to get it since this is a new
 891 # feature.  So no file is generated except if it is in this list.
 892 my @output_mapped_properties = split "\n", <<END;
 893 END
 894
 895 # If you are using the Unihan database, you need to add the properties that
 896 # you want to extract from it to this table.  For your convenience, the
 897 # properties in the 5.2 PropertyAliases.txt file are listed, commented out
 898 my @cjk_properties = split "\n", <<'END';
 899 #cjkAccountingNumeric; kAccountingNumeric
 900 #cjkOtherNumeric; kOtherNumeric
 901 #cjkPrimaryNumeric; kPrimaryNumeric
 902 #cjkCompatibilityVariant; kCompatibilityVariant
 903 #cjkIICore ; kIICore
 904 #cjkIRG_GSource; kIRG_GSource
 905 #cjkIRG_HSource; kIRG_HSource
 906 #cjkIRG_JSource; kIRG_JSource
 907 #cjkIRG_KPSource; kIRG_KPSource
 908 #cjkIRG_KSource; kIRG_KSource
 909 #cjkIRG_TSource; kIRG_TSource
 910 #cjkIRG_USource; kIRG_USource
 911 #cjkIRG_VSource; kIRG_VSource
 912 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
 913 END
 914
 915 # Similarly for the property values.  For your convenience, the lines in the
 916 # 5.2 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
 917 # '#' marks
 918 my @cjk_property_values = split "\n", <<'END';
 919 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
 920 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
 921 ## @missing: 0000..10FFFF; cjkIICore; <none>
 922 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
 923 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
 924 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
 925 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
 926 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
 927 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
 928 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
 929 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
 930 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
 931 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
 932 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
 933 END
 934
 935 # The input files don't list every code point.  Those not listed are to be
 936 # defaulted to some value.  Below are hard-coded what those values are for
 937 # non-binary properties as of 5.1.  Starting in 5.0, there are
 938 # machine-parsable comment lines in the files the give the defaults; so this
 939 # list shouldn't have to be extended.  The claim is that all missing entries
 940 # for binary properties will default to 'N'.  Unicode tried to change that in
 941 # 5.2, but the beta period produced enough protest that they backed off.
 942 #
 943 # The defaults for the fields that appear in UnicodeData.txt in this hash must
 944 # be in the form that it expects.  The others may be synonyms.
 945 my $CODE_POINT = '<code point>';
 946 my %default_mapping = (
 947     Age => "Unassigned",
 948     # Bidi_Class => Complicated; set in code
 949     Bidi_Mirroring_Glyph => "",
 950     Block => 'No_Block',
 951     Canonical_Combining_Class => 0,
 952     Case_Folding => $CODE_POINT,
 953     Decomposition_Mapping => $CODE_POINT,
 954     Decomposition_Type => 'None',
 955     East_Asian_Width => "Neutral",
 956     FC_NFKC_Closure => $CODE_POINT,
 957     General_Category => 'Cn',
 958     Grapheme_Cluster_Break => 'Other',
 959     Hangul_Syllable_Type => 'NA',
 960     ISO_Comment => "",
 961     Jamo_Short_Name => "",
 962     Joining_Group => "No_Joining_Group",
 963     # Joining_Type => Complicated; set in code
 964     kIICore => 'N',   #                       Is converted to binary
 965     #Line_Break => Complicated; set in code
 966     Lowercase_Mapping => $CODE_POINT,
 967     Name => "",
 968     Name_Alias => "",
 969     NFC_QC => 'Yes',
 970     NFD_QC => 'Yes',
 971     NFKC_QC => 'Yes',
 972     NFKD_QC => 'Yes',
 973     Numeric_Type => 'None',
 974     Numeric_Value => 'NaN',
 975     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
 976     Sentence_Break => 'Other',
 977     Simple_Case_Folding => $CODE_POINT,
 978     Simple_Lowercase_Mapping => $CODE_POINT,
 979     Simple_Titlecase_Mapping => $CODE_POINT,
 980     Simple_Uppercase_Mapping => $CODE_POINT,
 981     Titlecase_Mapping => $CODE_POINT,
 982     Unicode_1_Name => "",
 983     Unicode_Radical_Stroke => "",
 984     Uppercase_Mapping => $CODE_POINT,
 985     Word_Break => 'Other',
 986 );
 987
 988 # Below are files that Unicode furnishes, but this program ignores, and why
 989 my %ignored_files = (
 990     'CJKRadicals.txt' => 'Unihan data',
 991     'Index.txt' => 'An index, not actual data',
 992     'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
 993     'NamesList.txt' => 'Just adds commentary',
 994     'NormalizationCorrections.txt' => 'Data is already in other files.',
 995     'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
 996     'ReadMe.txt' => 'Just comments',
 997     'README.TXT' => 'Just comments',
 998     'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property.  Does not fit into current scheme where one code point is mapped',
 999 );
1000
1001 ### End of externally interesting definitions, except for @input_file_objects
1002
1003 my $HEADER=<<"EOF";
1004 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1005 # This file is machine-generated by $0 from the Unicode
1006 # database, Version $string_version.  Any changes made here will be lost!
1007 EOF
1008
1009 my $INTERNAL_ONLY=<<"EOF";
1010
1011 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1012 # This file is for internal use by the Perl program only.  The format and even
1013 # the name or existence of this file are subject to change without notice.
1014 # Don't use it directly.
1015 EOF
1016
1017 my $DEVELOPMENT_ONLY=<<"EOF";
1018 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1019 # This file contains information artificially constrained to code points
1020 # present in Unicode release $string_compare_versions.
1021 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1022 # not be used for production.
1023
1024 EOF
1025
1026 my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1027 my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1028 my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1029
1030 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1031 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1032 # to decrease backtracking
1033 my $code_point_re =
1034         qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1035
1036 # This matches the beginning of the line in the Unicode db files that give the
1037 # defaults for code points not listed (i.e., missing) in the file.  The code
1038 # depends on this ending with a semi-colon, so it can assume it is a valid
1039 # field when the line is split() by semi-colons
1040 my $missing_defaults_prefix =
1041             qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1042
1043 # Property types.  Unicode has more types, but these are sufficient for our
1044 # purposes.
1045 my $UNKNOWN = -1;   # initialized to illegal value
1046 my $NON_STRING = 1; # Either binary or enum
1047 my $BINARY = 2;
1048 my $ENUM = 3;       # Include catalog
1049 my $STRING = 4;     # Anything else: string or misc
1050
1051 # Some input files have lines that give default values for code points not
1052 # contained in the file.  Sometimes these should be ignored.
1053 my $NO_DEFAULTS = 0;        # Must evaluate to false
1054 my $NOT_IGNORED = 1;
1055 my $IGNORED = 2;
1056
1057 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1058 # and will appear in the main body of the tables in the output files, but
1059 # there are other types of ranges as well, listed below, that are specially
1060 # handled.   There are pseudo-types as well that will never be stored as a
1061 # type, but will affect the calculation of the type.
1062
1063 # 0 is for normal, non-specials
1064 my $MULTI_CP = 1;           # Sequence of more than code point
1065 my $HANGUL_SYLLABLE = 2;
1066 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1067 my $NULL = 4;               # The map is to the null string; utf8.c can't
1068                             # handle these, nor is there an accepted syntax
1069                             # for them in \p{} constructs
1070 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1071                              # otherwise be $MULTI_CP type are instead type 0
1072
1073 # process_generic_property_file() can accept certain overrides in its input.
1074 # Each of these must begin AND end with $CMD_DELIM.
1075 my $CMD_DELIM = "\a";
1076 my $REPLACE_CMD = 'replace';    # Override the Replace
1077 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1078
1079 my $NO = 0;
1080 my $YES = 1;
1081
1082 # Values for the Replace argument to add_range.
1083 # $NO                      # Don't replace; add only the code points not
1084                            # already present.
1085 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1086                            # the comments at the subroutine definition.
1087 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1088 my $MULTIPLE = 4;          # Don't replace, but add a duplicate record if
1089                            # already there
1090
1091 # Flags to give property statuses.  The phrases are to remind maintainers that
1092 # if the flag is changed, the indefinite article referring to it in the
1093 # documentation may need to be as well.
1094 my $NORMAL = "";
1095 my $SUPPRESSED = 'z';   # The character should never actually be seen, since
1096                         # it is suppressed
1097 my $PLACEHOLDER = 'P';  # Implies no pod entry generated
1098 my $DEPRECATED = 'D';
1099 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1100 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1101 my $DISCOURAGED = 'X';
1102 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1103 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1104 my $STRICTER = 'T';
1105 my $a_bold_stricter = "a 'B<$STRICTER>'";
1106 my $A_bold_stricter = "A 'B<$STRICTER>'";
1107 my $STABILIZED = 'S';
1108 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1109 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1110 my $OBSOLETE = 'O';
1111 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1112 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1113
1114 my %status_past_participles = (
1115     $DISCOURAGED => 'discouraged',
1116     $SUPPRESSED => 'should never be generated',
1117     $STABILIZED => 'stabilized',
1118     $OBSOLETE => 'obsolete',
1119     $DEPRECATED => 'deprecated',
1120 );
1121
1122 # The format of the values of the map tables:
1123 my $BINARY_FORMAT = 'b';
1124 my $DECIMAL_FORMAT = 'd';
1125 my $FLOAT_FORMAT = 'f';
1126 my $INTEGER_FORMAT = 'i';
1127 my $HEX_FORMAT = 'x';
1128 my $RATIONAL_FORMAT = 'r';
1129 my $STRING_FORMAT = 's';
1130
1131 my %map_table_formats = (
1132     $BINARY_FORMAT => 'binary',
1133     $DECIMAL_FORMAT => 'single decimal digit',
1134     $FLOAT_FORMAT => 'floating point number',
1135     $INTEGER_FORMAT => 'integer',
1136     $HEX_FORMAT => 'positive hex whole number; a code point',
1137     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1138     $STRING_FORMAT => 'arbitrary string',
1139 );
1140
1141 # Unicode didn't put such derived files in a separate directory at first.
1142 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1143 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1144 my $AUXILIARY = 'auxiliary';
1145
1146 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1147 my %loose_to_file_of;       # loosely maps table names to their respective
1148                             # files
1149 my %stricter_to_file_of;    # same; but for stricter mapping.
1150 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1151                              # their rational equivalent
1152 my %loose_property_name_of; # Loosely maps property names to standard form
1153
1154 # These constants names and values were taken from the Unicode standard,
1155 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1156 # syllables
1157 my $SBase = 0xAC00;
1158 my $LBase = 0x1100;
1159 my $VBase = 0x1161;
1160 my $TBase = 0x11A7;
1161 my $SCount = 11172;
1162 my $LCount = 19;
1163 my $VCount = 21;
1164 my $TCount = 28;
1165 my $NCount = $VCount * $TCount;
1166
1167 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1168 # with the above published constants.
1169 my %Jamo;
1170 my %Jamo_L;     # Leading consonants
1171 my %Jamo_V;     # Vowels
1172 my %Jamo_T;     # Trailing consonants
1173
1174 my @backslash_X_tests;     # List of tests read in for testing \X
1175 my @unhandled_properties;  # Will contain a list of properties found in
1176                            # the input that we didn't process.
1177 my @match_properties;      # Properties that have match tables, to be
1178                            # listed in the pod
1179 my @map_properties;        # Properties that get map files written
1180 my @named_sequences;       # NamedSequences.txt contents.
1181 my %potential_files;       # Generated list of all .txt files in the directory
1182                            # structure so we can warn if something is being
1183                            # ignored.
1184 my @files_actually_output; # List of files we generated.
1185 my @more_Names;            # Some code point names are compound; this is used
1186                            # to store the extra components of them.
1187 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1188                            # the minimum before we consider it equivalent to a
1189                            # candidate rational
1190 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1191
1192 # These store references to certain commonly used property objects
1193 my $gc;
1194 my $perl;
1195 my $block;
1196
1197 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1198 my $has_In_conflicts = 0;
1199 my $has_Is_conflicts = 0;
1200
1201 sub internal_file_to_platform ($) {
1202     # Convert our file paths which have '/' separators to those of the
1203     # platform.
1204
1205     my $file = shift;
1206     return undef unless defined $file;
1207
1208     return File::Spec->join(split '/', $file);
1209 }
1210
1211 sub file_exists ($) {   # platform independent '-e'.  This program internally
1212                         # uses slash as a path separator.
1213     my $file = shift;
1214     return 0 if ! defined $file;
1215     return -e internal_file_to_platform($file);
1216 }
1217
1218 sub objaddr($) {
1219     # Returns the address of the blessed input object.
1220     # It doesn't check for blessedness because that would do a string eval
1221     # every call, and the program is structured so that this is never called
1222     # for a non-blessed object.
1223
1224     no overloading; # If overloaded, numifying below won't work.
1225
1226     # Numifying a ref gives its address.
1227     return 0 + $_[0];
1228 }
1229
1230 # Commented code below should work on Perl 5.8.
1231 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1232 ## the native perl version of it (which is what would operate under miniperl)
1233 ## is extremely slow, as it does a string eval every call.
1234 #my $has_fast_scalar_util = $\18 !~ /miniperl/
1235 #                            && defined eval "require Scalar::Util";
1236 #
1237 #sub objaddr($) {
1238 #    # Returns the address of the blessed input object.  Uses the XS version if
1239 #    # available.  It doesn't check for blessedness because that would do a
1240 #    # string eval every call, and the program is structured so that this is
1241 #    # never called for a non-blessed object.
1242 #
1243 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1244 #
1245 #    # Check at least that is a ref.
1246 #    my $pkg = ref($_[0]) or return undef;
1247 #
1248 #    # Change to a fake package to defeat any overloaded stringify
1249 #    bless $_[0], 'main::Fake';
1250 #
1251 #    # Numifying a ref gives its address.
1252 #    my $addr = 0 + $_[0];
1253 #
1254 #    # Return to original class
1255 #    bless $_[0], $pkg;
1256 #    return $addr;
1257 #}
1258
1259 sub max ($$) {
1260     my $a = shift;
1261     my $b = shift;
1262     return $a if $a >= $b;
1263     return $b;
1264 }
1265
1266 sub min ($$) {
1267     my $a = shift;
1268     my $b = shift;
1269     return $a if $a <= $b;
1270     return $b;
1271 }
1272
1273 sub clarify_number ($) {
1274     # This returns the input number with underscores inserted every 3 digits
1275     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1276     # checked.
1277
1278     my $number = shift;
1279     my $pos = length($number) - 3;
1280     return $number if $pos <= 1;
1281     while ($pos > 0) {
1282         substr($number, $pos, 0) = '_';
1283         $pos -= 3;
1284     }
1285     return $number;
1286 }
1287
1288
1289 package Carp;
1290
1291 # These routines give a uniform treatment of messages in this program.  They
1292 # are placed in the Carp package to cause the stack trace to not include them,
1293 # although an alternative would be to use another package and set @CARP_NOT
1294 # for it.
1295
1296 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1297
1298 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1299 # and overload trying to load Scalar:Util under miniperl.  See
1300 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1301 undef $overload::VERSION;
1302
1303 sub my_carp {
1304     my $message = shift || "";
1305     my $nofold = shift || 0;
1306
1307     if ($message) {
1308         $message = main::join_lines($message);
1309         $message =~ s/^$0: *//;     # Remove initial program name
1310         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1311         $message = "\n$0: $message;";
1312
1313         # Fold the message with program name, semi-colon end punctuation
1314         # (which looks good with the message that carp appends to it), and a
1315         # hanging indent for continuation lines.
1316         $message = main::simple_fold($message, "", 4) unless $nofold;
1317         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1318                                     # appends is to the same line
1319     }
1320
1321     return $message if defined wantarray;   # If a caller just wants the msg
1322
1323     carp $message;
1324     return;
1325 }
1326
1327 sub my_carp_bug {
1328     # This is called when it is clear that the problem is caused by a bug in
1329     # this program.
1330
1331     my $message = shift;
1332     $message =~ s/^$0: *//;
1333     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1334     carp $message;
1335     return;
1336 }
1337
1338 sub carp_too_few_args {
1339     if (@_ != 2) {
1340         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1341         return;
1342     }
1343
1344     my $args_ref = shift;
1345     my $count = shift;
1346
1347     my_carp_bug("Need at least $count arguments to "
1348         . (caller 1)[3]
1349         . ".  Instead got: '"
1350         . join ', ', @$args_ref
1351         . "'.  No action taken.");
1352     return;
1353 }
1354
1355 sub carp_extra_args {
1356     my $args_ref = shift;
1357     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1358
1359     unless (ref $args_ref) {
1360         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1361         return;
1362     }
1363     my ($package, $file, $line) = caller;
1364     my $subroutine = (caller 1)[3];
1365
1366     my $list;
1367     if (ref $args_ref eq 'HASH') {
1368         foreach my $key (keys %$args_ref) {
1369             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1370         }
1371         $list = join ', ', each %{$args_ref};
1372     }
1373     elsif (ref $args_ref eq 'ARRAY') {
1374         foreach my $arg (@$args_ref) {
1375             $arg = $UNDEF unless defined $arg;
1376         }
1377         $list = join ', ', @$args_ref;
1378     }
1379     else {
1380         my_carp_bug("Can't cope with ref "
1381                 . ref($args_ref)
1382                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1383         return;
1384     }
1385
1386     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1387     return;
1388 }
1389
1390 package main;
1391
1392 { # Closure
1393
1394     # This program uses the inside-out method for objects, as recommended in
1395     # "Perl Best Practices".  This closure aids in generating those.  There
1396     # are two routines.  setup_package() is called once per package to set
1397     # things up, and then set_access() is called for each hash representing a
1398     # field in the object.  These routines arrange for the object to be
1399     # properly destroyed when no longer used, and for standard accessor
1400     # functions to be generated.  If you need more complex accessors, just
1401     # write your own and leave those accesses out of the call to set_access().
1402     # More details below.
1403
1404     my %constructor_fields; # fields that are to be used in constructors; see
1405                             # below
1406
1407     # The values of this hash will be the package names as keys to other
1408     # hashes containing the name of each field in the package as keys, and
1409     # references to their respective hashes as values.
1410     my %package_fields;
1411
1412     sub setup_package {
1413         # Sets up the package, creating standard DESTROY and dump methods
1414         # (unless already defined).  The dump method is used in debugging by
1415         # simple_dumper().
1416         # The optional parameters are:
1417         #   a)  a reference to a hash, that gets populated by later
1418         #       set_access() calls with one of the accesses being
1419         #       'constructor'.  The caller can then refer to this, but it is
1420         #       not otherwise used by these two routines.
1421         #   b)  a reference to a callback routine to call during destruction
1422         #       of the object, before any fields are actually destroyed
1423
1424         my %args = @_;
1425         my $constructor_ref = delete $args{'Constructor_Fields'};
1426         my $destroy_callback = delete $args{'Destroy_Callback'};
1427         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1428
1429         my %fields;
1430         my $package = (caller)[0];
1431
1432         $package_fields{$package} = \%fields;
1433         $constructor_fields{$package} = $constructor_ref;
1434
1435         unless ($package->can('DESTROY')) {
1436             my $destroy_name = "${package}::DESTROY";
1437             no strict "refs";
1438
1439             # Use typeglob to give the anonymous subroutine the name we want
1440             *$destroy_name = sub {
1441                 my $self = shift;
1442                 my $addr; { no overloading; $addr = 0+$self; }
1443
1444                 $self->$destroy_callback if $destroy_callback;
1445                 foreach my $field (keys %{$package_fields{$package}}) {
1446                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1447                     delete $package_fields{$package}{$field}{$addr};
1448                 }
1449                 return;
1450             }
1451         }
1452
1453         unless ($package->can('dump')) {
1454             my $dump_name = "${package}::dump";
1455             no strict "refs";
1456             *$dump_name = sub {
1457                 my $self = shift;
1458                 return dump_inside_out($self, $package_fields{$package}, @_);
1459             }
1460         }
1461         return;
1462     }
1463
1464     sub set_access {
1465         # Arrange for the input field to be garbage collected when no longer
1466         # needed.  Also, creates standard accessor functions for the field
1467         # based on the optional parameters-- none if none of these parameters:
1468         #   'addable'    creates an 'add_NAME()' accessor function.
1469         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1470         #                function.
1471         #   'settable'   creates a 'set_NAME()' accessor function.
1472         #   'constructor' doesn't create an accessor function, but adds the
1473         #                field to the hash that was previously passed to
1474         #                setup_package();
1475         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1476         # 'add' etc. all mean 'addable'.
1477         # The read accessor function will work on both array and scalar
1478         # values.  If another accessor in the parameter list is 'a', the read
1479         # access assumes an array.  You can also force it to be array access
1480         # by specifying 'readable_array' instead of 'readable'
1481         #
1482         # A sort-of 'protected' access can be set-up by preceding the addable,
1483         # readable or settable with some initial portion of 'protected_' (but,
1484         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1485         # "protection" is only by convention.  All that happens is that the
1486         # accessor functions' names begin with an underscore.  So instead of
1487         # calling set_foo, the call is _set_foo.  (Real protection could be
1488         # accomplished by having a new subroutine, end_package called at the
1489         # end of each package, and then storing the __LINE__ ranges and
1490         # checking them on every accessor.  But that is way overkill.)
1491
1492         # We create anonymous subroutines as the accessors and then use
1493         # typeglobs to assign them to the proper package and name
1494
1495         my $name = shift;   # Name of the field
1496         my $field = shift;  # Reference to the inside-out hash containing the
1497                             # field
1498
1499         my $package = (caller)[0];
1500
1501         if (! exists $package_fields{$package}) {
1502             croak "$0: Must call 'setup_package' before 'set_access'";
1503         }
1504
1505         # Stash the field so DESTROY can get it.
1506         $package_fields{$package}{$name} = $field;
1507
1508         # Remaining arguments are the accessors.  For each...
1509         foreach my $access (@_) {
1510             my $access = lc $access;
1511
1512             my $protected = "";
1513
1514             # Match the input as far as it goes.
1515             if ($access =~ /^(p[^_]*)_/) {
1516                 $protected = $1;
1517                 if (substr('protected_', 0, length $protected)
1518                     eq $protected)
1519                 {
1520
1521                     # Add 1 for the underscore not included in $protected
1522                     $access = substr($access, length($protected) + 1);
1523                     $protected = '_';
1524                 }
1525                 else {
1526                     $protected = "";
1527                 }
1528             }
1529
1530             if (substr('addable', 0, length $access) eq $access) {
1531                 my $subname = "${package}::${protected}add_$name";
1532                 no strict "refs";
1533
1534                 # add_ accessor.  Don't add if already there, which we
1535                 # determine using 'eq' for scalars and '==' otherwise.
1536                 *$subname = sub {
1537                     use strict "refs";
1538                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1539                     my $self = shift;
1540                     my $value = shift;
1541                     my $addr; { no overloading; $addr = 0+$self; }
1542                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1543                     if (ref $value) {
1544                         return if grep { $value == $_ } @{$field->{$addr}};
1545                     }
1546                     else {
1547                         return if grep { $value eq $_ } @{$field->{$addr}};
1548                     }
1549                     push @{$field->{$addr}}, $value;
1550                     return;
1551                 }
1552             }
1553             elsif (substr('constructor', 0, length $access) eq $access) {
1554                 if ($protected) {
1555                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1556                 }
1557                 else {
1558                     $constructor_fields{$package}{$name} = $field;
1559                 }
1560             }
1561             elsif (substr('readable_array', 0, length $access) eq $access) {
1562
1563                 # Here has read access.  If one of the other parameters for
1564                 # access is array, or this one specifies array (by being more
1565                 # than just 'readable_'), then create a subroutine that
1566                 # assumes the data is an array.  Otherwise just a scalar
1567                 my $subname = "${package}::${protected}$name";
1568                 if (grep { /^a/i } @_
1569                     or length($access) > length('readable_'))
1570                 {
1571                     no strict "refs";
1572                     *$subname = sub {
1573                         use strict "refs";
1574                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1575                         my $addr; { no overloading; $addr = 0+$_[0]; }
1576                         if (ref $field->{$addr} ne 'ARRAY') {
1577                             my $type = ref $field->{$addr};
1578                             $type = 'scalar' unless $type;
1579                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1580                             return;
1581                         }
1582                         return scalar @{$field->{$addr}} unless wantarray;
1583
1584                         # Make a copy; had problems with caller modifying the
1585                         # original otherwise
1586                         my @return = @{$field->{$addr}};
1587                         return @return;
1588                     }
1589                 }
1590                 else {
1591
1592                     # Here not an array value, a simpler function.
1593                     no strict "refs";
1594                     *$subname = sub {
1595                         use strict "refs";
1596                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1597                         no overloading;
1598                         return $field->{0+$_[0]};
1599                     }
1600                 }
1601             }
1602             elsif (substr('settable', 0, length $access) eq $access) {
1603                 my $subname = "${package}::${protected}set_$name";
1604                 no strict "refs";
1605                 *$subname = sub {
1606                     use strict "refs";
1607                     if (main::DEBUG) {
1608                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1609                         Carp::carp_extra_args(\@_) if @_ > 2;
1610                     }
1611                     # $self is $_[0]; $value is $_[1]
1612                     no overloading;
1613                     $field->{0+$_[0]} = $_[1];
1614                     return;
1615                 }
1616             }
1617             else {
1618                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
1619             }
1620         }
1621         return;
1622     }
1623 }
1624
1625 package Input_file;
1626
1627 # All input files use this object, which stores various attributes about them,
1628 # and provides for convenient, uniform handling.  The run method wraps the
1629 # processing.  It handles all the bookkeeping of opening, reading, and closing
1630 # the file, returning only significant input lines.
1631 #
1632 # Each object gets a handler which processes the body of the file, and is
1633 # called by run().  Most should use the generic, default handler, which has
1634 # code scrubbed to handle things you might not expect.  A handler should
1635 # basically be a while(next_line()) {...} loop.
1636 #
1637 # You can also set up handlers to
1638 #   1) call before the first line is read for pre processing
1639 #   2) call to adjust each line of the input before the main handler gets them
1640 #   3) call upon EOF before the main handler exits its loop
1641 #   4) call at the end for post processing
1642 #
1643 # $_ is used to store the input line, and is to be filtered by the
1644 # each_line_handler()s.  So, if the format of the line is not in the desired
1645 # format for the main handler, these are used to do that adjusting.  They can
1646 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1647 # so the $_ output of one is used as the input to the next.  None of the other
1648 # handlers are stackable, but could easily be changed to be so.
1649 #
1650 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1651 # which insert the parameters as lines to be processed before the next input
1652 # file line is read.  This allows the EOF handler to flush buffers, for
1653 # example.  The difference between the two routines is that the lines inserted
1654 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
1655 # called it from such a handler, you would get infinite recursion.)  Lines
1656 # inserted by insert_adjusted_lines() go directly to the main handler without
1657 # any adjustments.  If the  post-processing handler calls any of these, there
1658 # will be no effect.  Some error checking for these conditions could be added,
1659 # but it hasn't been done.
1660 #
1661 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1662 # to prevent further processing of the line.  This routine will output the
1663 # message as a warning once, and then keep a count of the lines that have the
1664 # same message, and output that count at the end of the file's processing.
1665 # This keeps the number of messages down to a manageable amount.
1666 #
1667 # get_missings() should be called to retrieve any @missing input lines.
1668 # Messages will be raised if this isn't done if the options aren't to ignore
1669 # missings.
1670
1671 sub trace { return main::trace(@_); }
1672
1673 { # Closure
1674     # Keep track of fields that are to be put into the constructor.
1675     my %constructor_fields;
1676
1677     main::setup_package(Constructor_Fields => \%constructor_fields);
1678
1679     my %file; # Input file name, required
1680     main::set_access('file', \%file, qw{ c r });
1681
1682     my %first_released; # Unicode version file was first released in, required
1683     main::set_access('first_released', \%first_released, qw{ c r });
1684
1685     my %handler;    # Subroutine to process the input file, defaults to
1686                     # 'process_generic_property_file'
1687     main::set_access('handler', \%handler, qw{ c });
1688
1689     my %property;
1690     # name of property this file is for.  defaults to none, meaning not
1691     # applicable, or is otherwise determinable, for example, from each line.
1692     main::set_access('property', \%property, qw{ c });
1693
1694     my %optional;
1695     # If this is true, the file is optional.  If not present, no warning is
1696     # output.  If it is present, the string given by this parameter is
1697     # evaluated, and if false the file is not processed.
1698     main::set_access('optional', \%optional, 'c', 'r');
1699
1700     my %non_skip;
1701     # This is used for debugging, to skip processing of all but a few input
1702     # files.  Add 'non_skip => 1' to the constructor for those files you want
1703     # processed when you set the $debug_skip global.
1704     main::set_access('non_skip', \%non_skip, 'c');
1705
1706     my %skip;
1707     # This is used to skip processing of this input file semi-permanently.
1708     # It is used for files that we aren't planning to process anytime soon,
1709     # but want to allow to be in the directory and not raise a message that we
1710     # are not handling.  Mostly for test files.  This is in contrast to the
1711     # non_skip element, which is supposed to be used very temporarily for
1712     # debugging.  Sets 'optional' to 1
1713     main::set_access('skip', \%skip, 'c');
1714
1715     my %each_line_handler;
1716     # list of subroutines to look at and filter each non-comment line in the
1717     # file.  defaults to none.  The subroutines are called in order, each is
1718     # to adjust $_ for the next one, and the final one adjusts it for
1719     # 'handler'
1720     main::set_access('each_line_handler', \%each_line_handler, 'c');
1721
1722     my %has_missings_defaults;
1723     # ? Are there lines in the file giving default values for code points
1724     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
1725     # the norm, but IGNORED means it has such lines, but the handler doesn't
1726     # use them.  Having these three states allows us to catch changes to the
1727     # UCD that this program should track
1728     main::set_access('has_missings_defaults',
1729                                         \%has_missings_defaults, qw{ c r });
1730
1731     my %pre_handler;
1732     # Subroutine to call before doing anything else in the file.  If undef, no
1733     # such handler is called.
1734     main::set_access('pre_handler', \%pre_handler, qw{ c });
1735
1736     my %eof_handler;
1737     # Subroutine to call upon getting an EOF on the input file, but before
1738     # that is returned to the main handler.  This is to allow buffers to be
1739     # flushed.  The handler is expected to call insert_lines() or
1740     # insert_adjusted() with the buffered material
1741     main::set_access('eof_handler', \%eof_handler, qw{ c r });
1742
1743     my %post_handler;
1744     # Subroutine to call after all the lines of the file are read in and
1745     # processed.  If undef, no such handler is called.
1746     main::set_access('post_handler', \%post_handler, qw{ c });
1747
1748     my %progress_message;
1749     # Message to print to display progress in lieu of the standard one
1750     main::set_access('progress_message', \%progress_message, qw{ c });
1751
1752     my %handle;
1753     # cache open file handle, internal.  Is undef if file hasn't been
1754     # processed at all, empty if has;
1755     main::set_access('handle', \%handle);
1756
1757     my %added_lines;
1758     # cache of lines added virtually to the file, internal
1759     main::set_access('added_lines', \%added_lines);
1760
1761     my %errors;
1762     # cache of errors found, internal
1763     main::set_access('errors', \%errors);
1764
1765     my %missings;
1766     # storage of '@missing' defaults lines
1767     main::set_access('missings', \%missings);
1768
1769     sub new {
1770         my $class = shift;
1771
1772         my $self = bless \do{ my $anonymous_scalar }, $class;
1773         my $addr; { no overloading; $addr = 0+$self; }
1774
1775         # Set defaults
1776         $handler{$addr} = \&main::process_generic_property_file;
1777         $non_skip{$addr} = 0;
1778         $skip{$addr} = 0;
1779         $has_missings_defaults{$addr} = $NO_DEFAULTS;
1780         $handle{$addr} = undef;
1781         $added_lines{$addr} = [ ];
1782         $each_line_handler{$addr} = [ ];
1783         $errors{$addr} = { };
1784         $missings{$addr} = [ ];
1785
1786         # Two positional parameters.
1787         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1788         $file{$addr} = main::internal_file_to_platform(shift);
1789         $first_released{$addr} = shift;
1790
1791         # The rest of the arguments are key => value pairs
1792         # %constructor_fields has been set up earlier to list all possible
1793         # ones.  Either set or push, depending on how the default has been set
1794         # up just above.
1795         my %args = @_;
1796         foreach my $key (keys %args) {
1797             my $argument = $args{$key};
1798
1799             # Note that the fields are the lower case of the constructor keys
1800             my $hash = $constructor_fields{lc $key};
1801             if (! defined $hash) {
1802                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
1803                 next;
1804             }
1805             if (ref $hash->{$addr} eq 'ARRAY') {
1806                 if (ref $argument eq 'ARRAY') {
1807                     foreach my $argument (@{$argument}) {
1808                         next if ! defined $argument;
1809                         push @{$hash->{$addr}}, $argument;
1810                     }
1811                 }
1812                 else {
1813                     push @{$hash->{$addr}}, $argument if defined $argument;
1814                 }
1815             }
1816             else {
1817                 $hash->{$addr} = $argument;
1818             }
1819             delete $args{$key};
1820         };
1821
1822         # If the file has a property for it, it means that the property is not
1823         # listed in the file's entries.  So add a handler to the list of line
1824         # handlers to insert the property name into the lines, to provide a
1825         # uniform interface to the final processing subroutine.
1826         # the final code doesn't have to worry about that.
1827         if ($property{$addr}) {
1828             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
1829         }
1830
1831         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
1832             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
1833         }
1834
1835         $optional{$addr} = 1 if $skip{$addr};
1836
1837         return $self;
1838     }
1839
1840
1841     use overload
1842         fallback => 0,
1843         qw("") => "_operator_stringify",
1844         "." => \&main::_operator_dot,
1845     ;
1846
1847     sub _operator_stringify {
1848         my $self = shift;
1849
1850         return __PACKAGE__ . " object for " . $self->file;
1851     }
1852
1853     # flag to make sure extracted files are processed early
1854     my $seen_non_extracted_non_age = 0;
1855
1856     sub run {
1857         # Process the input object $self.  This opens and closes the file and
1858         # calls all the handlers for it.  Currently,  this can only be called
1859         # once per file, as it destroy's the EOF handler
1860
1861         my $self = shift;
1862         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1863
1864         my $addr; { no overloading; $addr = 0+$self; }
1865
1866         my $file = $file{$addr};
1867
1868         # Don't process if not expecting this file (because released later
1869         # than this Unicode version), and isn't there.  This means if someone
1870         # copies it into an earlier version's directory, we will go ahead and
1871         # process it.
1872         return if $first_released{$addr} gt $v_version && ! -e $file;
1873
1874         # If in debugging mode and this file doesn't have the non-skip
1875         # flag set, and isn't one of the critical files, skip it.
1876         if ($debug_skip
1877             && $first_released{$addr} ne v0
1878             && ! $non_skip{$addr})
1879         {
1880             print "Skipping $file in debugging\n" if $verbosity;
1881             return;
1882         }
1883
1884         # File could be optional
1885         if ($optional{$addr}) {
1886             return unless -e $file;
1887             my $result = eval $optional{$addr};
1888             if (! defined $result) {
1889                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
1890                 return;
1891             }
1892             if (! $result) {
1893                 if ($verbosity) {
1894                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
1895                 }
1896                 return;
1897             }
1898         }
1899
1900         if (! defined $file || ! -e $file) {
1901
1902             # If the file doesn't exist, see if have internal data for it
1903             # (based on first_released being 0).
1904             if ($first_released{$addr} eq v0) {
1905                 $handle{$addr} = 'pretend_is_open';
1906             }
1907             else {
1908                 if (! $optional{$addr}  # File could be optional
1909                     && $v_version ge $first_released{$addr})
1910                 {
1911                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
1912                 }
1913                 return;
1914             }
1915         }
1916         else {
1917
1918             # Here, the file exists.  Some platforms may change the case of
1919             # its name
1920             if ($seen_non_extracted_non_age) {
1921                 if ($file =~ /$EXTRACTED/i) {
1922                     Carp::my_carp_bug(join_lines(<<END
1923 $file should be processed just after the 'Prop...Alias' files, and before
1924 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
1925 have subtle problems
1926 END
1927                     ));
1928                 }
1929             }
1930             elsif ($EXTRACTED_DIR
1931                     && $first_released{$addr} ne v0
1932                     && $file !~ /$EXTRACTED/i
1933                     && lc($file) ne 'dage.txt')
1934             {
1935                 # We don't set this (by the 'if' above) if we have no
1936                 # extracted directory, so if running on an early version,
1937                 # this test won't work.  Not worth worrying about.
1938                 $seen_non_extracted_non_age = 1;
1939             }
1940
1941             # And mark the file as having being processed, and warn if it
1942             # isn't a file we are expecting.  As we process the files,
1943             # they are deleted from the hash, so any that remain at the
1944             # end of the program are files that we didn't process.
1945             my $fkey = File::Spec->rel2abs($file);
1946             my $expecting = delete $potential_files{$fkey};
1947             $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
1948             Carp::my_carp("Was not expecting '$file'.") if
1949                     ! $expecting
1950                     && ! defined $handle{$addr};
1951
1952             # Having deleted from expected files, we can quit if not to do
1953             # anything.  Don't print progress unless really want verbosity
1954             if ($skip{$addr}) {
1955                 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
1956                 return;
1957             }
1958
1959             # Open the file, converting the slashes used in this program
1960             # into the proper form for the OS
1961             my $file_handle;
1962             if (not open $file_handle, "<", $file) {
1963                 Carp::my_carp("Can't open $file.  Skipping: $!");
1964                 return 0;
1965             }
1966             $handle{$addr} = $file_handle; # Cache the open file handle
1967         }
1968
1969         if ($verbosity >= $PROGRESS) {
1970             if ($progress_message{$addr}) {
1971                 print "$progress_message{$addr}\n";
1972             }
1973             else {
1974                 # If using a virtual file, say so.
1975                 print "Processing ", (-e $file)
1976                                        ? $file
1977                                        : "substitute $file",
1978                                      "\n";
1979             }
1980         }
1981
1982
1983         # Call any special handler for before the file.
1984         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
1985
1986         # Then the main handler
1987         &{$handler{$addr}}($self);
1988
1989         # Then any special post-file handler.
1990         &{$post_handler{$addr}}($self) if $post_handler{$addr};
1991
1992         # If any errors have been accumulated, output the counts (as the first
1993         # error message in each class was output when it was encountered).
1994         if ($errors{$addr}) {
1995             my $total = 0;
1996             my $types = 0;
1997             foreach my $error (keys %{$errors{$addr}}) {
1998                 $total += $errors{$addr}->{$error};
1999                 delete $errors{$addr}->{$error};
2000                 $types++;
2001             }
2002             if ($total > 1) {
2003                 my $message
2004                         = "A total of $total lines had errors in $file.  ";
2005
2006                 $message .= ($types == 1)
2007                             ? '(Only the first one was displayed.)'
2008                             : '(Only the first of each type was displayed.)';
2009                 Carp::my_carp($message);
2010             }
2011         }
2012
2013         if (@{$missings{$addr}}) {
2014             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2015         }
2016
2017         # If a real file handle, close it.
2018         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2019                                                         ref $handle{$addr};
2020         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2021                                # the file, as opposed to undef
2022         return;
2023     }
2024
2025     sub next_line {
2026         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2027         # if such a line exists.  'logical' means that any lines that have
2028         # been added via insert_lines() will be returned in $_ before the file
2029         # is read again.
2030
2031         my $self = shift;
2032         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2033
2034         my $addr; { no overloading; $addr = 0+$self; }
2035
2036         # Here the file is open (or if the handle is not a ref, is an open
2037         # 'virtual' file).  Get the next line; any inserted lines get priority
2038         # over the file itself.
2039         my $adjusted;
2040
2041         LINE:
2042         while (1) { # Loop until find non-comment, non-empty line
2043             #local $to_trace = 1 if main::DEBUG;
2044             my $inserted_ref = shift @{$added_lines{$addr}};
2045             if (defined $inserted_ref) {
2046                 ($adjusted, $_) = @{$inserted_ref};
2047                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2048                 return 1 if $adjusted;
2049             }
2050             else {
2051                 last if ! ref $handle{$addr}; # Don't read unless is real file
2052                 last if ! defined ($_ = readline $handle{$addr});
2053             }
2054             chomp;
2055             trace $_ if main::DEBUG && $to_trace;
2056
2057             # See if this line is the comment line that defines what property
2058             # value that code points that are not listed in the file should
2059             # have.  The format or existence of these lines is not guaranteed
2060             # by Unicode since they are comments, but the documentation says
2061             # that this was added for machine-readability, so probably won't
2062             # change.  This works starting in Unicode Version 5.0.  They look
2063             # like:
2064             #
2065             # @missing: 0000..10FFFF; Not_Reordered
2066             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2067             # @missing: 0000..10FFFF; ; NaN
2068             #
2069             # Save the line for a later get_missings() call.
2070             if (/$missing_defaults_prefix/) {
2071                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2072                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2073                 }
2074                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2075                     my @defaults = split /\s* ; \s*/x, $_;
2076
2077                     # The first field is the @missing, which ends in a
2078                     # semi-colon, so can safely shift.
2079                     shift @defaults;
2080
2081                     # Some of these lines may have empty field placeholders
2082                     # which get in the way.  An example is:
2083                     # @missing: 0000..10FFFF; ; NaN
2084                     # Remove them.  Process starting from the top so the
2085                     # splice doesn't affect things still to be looked at.
2086                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2087                         next if $defaults[$i] ne "";
2088                         splice @defaults, $i, 1;
2089                     }
2090
2091                     # What's left should be just the property (maybe) and the
2092                     # default.  Having only one element means it doesn't have
2093                     # the property.
2094                     my $default;
2095                     my $property;
2096                     if (@defaults >= 1) {
2097                         if (@defaults == 1) {
2098                             $default = $defaults[0];
2099                         }
2100                         else {
2101                             $property = $defaults[0];
2102                             $default = $defaults[1];
2103                         }
2104                     }
2105
2106                     if (@defaults < 1
2107                         || @defaults > 2
2108                         || ($default =~ /^</
2109                             && $default !~ /^<code *point>$/i
2110                             && $default !~ /^<none>$/i))
2111                     {
2112                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2113                     }
2114                     else {
2115
2116                         # If the property is missing from the line, it should
2117                         # be the one for the whole file
2118                         $property = $property{$addr} if ! defined $property;
2119
2120                         # Change <none> to the null string, which is what it
2121                         # really means.  If the default is the code point
2122                         # itself, set it to <code point>, which is what
2123                         # Unicode uses (but sometimes they've forgotten the
2124                         # space)
2125                         if ($default =~ /^<none>$/i) {
2126                             $default = "";
2127                         }
2128                         elsif ($default =~ /^<code *point>$/i) {
2129                             $default = $CODE_POINT;
2130                         }
2131
2132                         # Store them as a sub-arrays with both components.
2133                         push @{$missings{$addr}}, [ $default, $property ];
2134                     }
2135                 }
2136
2137                 # There is nothing for the caller to process on this comment
2138                 # line.
2139                 next;
2140             }
2141
2142             # Remove comments and trailing space, and skip this line if the
2143             # result is empty
2144             s/#.*//;
2145             s/\s+$//;
2146             next if /^$/;
2147
2148             # Call any handlers for this line, and skip further processing of
2149             # the line if the handler sets the line to null.
2150             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2151                 &{$sub_ref}($self);
2152                 next LINE if /^$/;
2153             }
2154
2155             # Here the line is ok.  return success.
2156             return 1;
2157         } # End of looping through lines.
2158
2159         # If there is an EOF handler, call it (only once) and if it generates
2160         # more lines to process go back in the loop to handle them.
2161         if ($eof_handler{$addr}) {
2162             &{$eof_handler{$addr}}($self);
2163             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2164             goto LINE if $added_lines{$addr};
2165         }
2166
2167         # Return failure -- no more lines.
2168         return 0;
2169
2170     }
2171
2172 #   Not currently used, not fully tested.
2173 #    sub peek {
2174 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2175 #        # record.  Not callable from an each_line_handler(), nor does it call
2176 #        # an each_line_handler() on the line.
2177 #
2178 #        my $self = shift;
2179 #        my $addr; { no overloading; $addr = 0+$self; }
2180 #
2181 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2182 #            my ($adjusted, $line) = @{$inserted_ref};
2183 #            next if $adjusted;
2184 #
2185 #            # Remove comments and trailing space, and return a non-empty
2186 #            # resulting line
2187 #            $line =~ s/#.*//;
2188 #            $line =~ s/\s+$//;
2189 #            return $line if $line ne "";
2190 #        }
2191 #
2192 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2193 #        while (1) { # Loop until find non-comment, non-empty line
2194 #            local $to_trace = 1 if main::DEBUG;
2195 #            trace $_ if main::DEBUG && $to_trace;
2196 #            return if ! defined (my $line = readline $handle{$addr});
2197 #            chomp $line;
2198 #            push @{$added_lines{$addr}}, [ 0, $line ];
2199 #
2200 #            $line =~ s/#.*//;
2201 #            $line =~ s/\s+$//;
2202 #            return $line if $line ne "";
2203 #        }
2204 #
2205 #        return;
2206 #    }
2207
2208
2209     sub insert_lines {
2210         # Lines can be inserted so that it looks like they were in the input
2211         # file at the place it was when this routine is called.  See also
2212         # insert_adjusted_lines().  Lines inserted via this routine go through
2213         # any each_line_handler()
2214
2215         my $self = shift;
2216
2217         # Each inserted line is an array, with the first element being 0 to
2218         # indicate that this line hasn't been adjusted, and needs to be
2219         # processed.
2220         no overloading;
2221         push @{$added_lines{0+$self}}, map { [ 0, $_ ] } @_;
2222         return;
2223     }
2224
2225     sub insert_adjusted_lines {
2226         # Lines can be inserted so that it looks like they were in the input
2227         # file at the place it was when this routine is called.  See also
2228         # insert_lines().  Lines inserted via this routine are already fully
2229         # adjusted, ready to be processed; each_line_handler()s handlers will
2230         # not be called.  This means this is not a completely general
2231         # facility, as only the last each_line_handler on the stack should
2232         # call this.  It could be made more general, by passing to each of the
2233         # line_handlers their position on the stack, which they would pass on
2234         # to this routine, and that would replace the boolean first element in
2235         # the anonymous array pushed here, so that the next_line routine could
2236         # use that to call only those handlers whose index is after it on the
2237         # stack.  But this is overkill for what is needed now.
2238
2239         my $self = shift;
2240         trace $_[0] if main::DEBUG && $to_trace;
2241
2242         # Each inserted line is an array, with the first element being 1 to
2243         # indicate that this line has been adjusted
2244         no overloading;
2245         push @{$added_lines{0+$self}}, map { [ 1, $_ ] } @_;
2246         return;
2247     }
2248
2249     sub get_missings {
2250         # Returns the stored up @missings lines' values, and clears the list.
2251         # The values are in an array, consisting of the default in the first
2252         # element, and the property in the 2nd.  However, since these lines
2253         # can be stacked up, the return is an array of all these arrays.
2254
2255         my $self = shift;
2256         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2257
2258         my $addr; { no overloading; $addr = 0+$self; }
2259
2260         # If not accepting a list return, just return the first one.
2261         return shift @{$missings{$addr}} unless wantarray;
2262
2263         my @return = @{$missings{$addr}};
2264         undef @{$missings{$addr}};
2265         return @return;
2266     }
2267
2268     sub _insert_property_into_line {
2269         # Add a property field to $_, if this file requires it.
2270
2271         my $self = shift;
2272         my $addr; { no overloading; $addr = 0+$self; }
2273         my $property = $property{$addr};
2274         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2275
2276         $_ =~ s/(;|$)/; $property$1/;
2277         return;
2278     }
2279
2280     sub carp_bad_line {
2281         # Output consistent error messages, using either a generic one, or the
2282         # one given by the optional parameter.  To avoid gazillions of the
2283         # same message in case the syntax of a  file is way off, this routine
2284         # only outputs the first instance of each message, incrementing a
2285         # count so the totals can be output at the end of the file.
2286
2287         my $self = shift;
2288         my $message = shift;
2289         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2290
2291         my $addr; { no overloading; $addr = 0+$self; }
2292
2293         $message = 'Unexpected line' unless $message;
2294
2295         # No trailing punctuation so as to fit with our addenda.
2296         $message =~ s/[.:;,]$//;
2297
2298         # If haven't seen this exact message before, output it now.  Otherwise
2299         # increment the count of how many times it has occurred
2300         unless ($errors{$addr}->{$message}) {
2301             Carp::my_carp("$message in '$_' in "
2302                             . $file{$addr}
2303                             . " at line $..  Skipping this line;");
2304             $errors{$addr}->{$message} = 1;
2305         }
2306         else {
2307             $errors{$addr}->{$message}++;
2308         }
2309
2310         # Clear the line to prevent any further (meaningful) processing of it.
2311         $_ = "";
2312
2313         return;
2314     }
2315 } # End closure
2316
2317 package Multi_Default;
2318
2319 # Certain properties in early versions of Unicode had more than one possible
2320 # default for code points missing from the files.  In these cases, one
2321 # default applies to everything left over after all the others are applied,
2322 # and for each of the others, there is a description of which class of code
2323 # points applies to it.  This object helps implement this by storing the
2324 # defaults, and for all but that final default, an eval string that generates
2325 # the class that it applies to.
2326
2327
2328 {   # Closure
2329
2330     main::setup_package();
2331
2332     my %class_defaults;
2333     # The defaults structure for the classes
2334     main::set_access('class_defaults', \%class_defaults);
2335
2336     my %other_default;
2337     # The default that applies to everything left over.
2338     main::set_access('other_default', \%other_default, 'r');
2339
2340
2341     sub new {
2342         # The constructor is called with default => eval pairs, terminated by
2343         # the left-over default. e.g.
2344         # Multi_Default->new(
2345         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2346         #               -  0x200D',
2347         #        'R' => 'some other expression that evaluates to code points',
2348         #        .
2349         #        .
2350         #        .
2351         #        'U'));
2352
2353         my $class = shift;
2354
2355         my $self = bless \do{my $anonymous_scalar}, $class;
2356         my $addr; { no overloading; $addr = 0+$self; }
2357
2358         while (@_ > 1) {
2359             my $default = shift;
2360             my $eval = shift;
2361             $class_defaults{$addr}->{$default} = $eval;
2362         }
2363
2364         $other_default{$addr} = shift;
2365
2366         return $self;
2367     }
2368
2369     sub get_next_defaults {
2370         # Iterates and returns the next class of defaults.
2371         my $self = shift;
2372         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2373
2374         my $addr; { no overloading; $addr = 0+$self; }
2375
2376         return each %{$class_defaults{$addr}};
2377     }
2378 }
2379
2380 package Alias;
2381
2382 # An alias is one of the names that a table goes by.  This class defines them
2383 # including some attributes.  Everything is currently setup in the
2384 # constructor.
2385
2386
2387 {   # Closure
2388
2389     main::setup_package();
2390
2391     my %name;
2392     main::set_access('name', \%name, 'r');
2393
2394     my %loose_match;
2395     # Determined by the constructor code if this name should match loosely or
2396     # not.  The constructor parameters can override this, but it isn't fully
2397     # implemented, as should have ability to override Unicode one's via
2398     # something like a set_loose_match()
2399     main::set_access('loose_match', \%loose_match, 'r');
2400
2401     my %make_pod_entry;
2402     # Some aliases should not get their own entries because they are covered
2403     # by a wild-card, and some we want to discourage use of.  Binary
2404     main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2405
2406     my %status;
2407     # Aliases have a status, like deprecated, or even suppressed (which means
2408     # they don't appear in documentation).  Enum
2409     main::set_access('status', \%status, 'r');
2410
2411     my %externally_ok;
2412     # Similarly, some aliases should not be considered as usable ones for
2413     # external use, such as file names, or we don't want documentation to
2414     # recommend them.  Boolean
2415     main::set_access('externally_ok', \%externally_ok, 'r');
2416
2417     sub new {
2418         my $class = shift;
2419
2420         my $self = bless \do { my $anonymous_scalar }, $class;
2421         my $addr; { no overloading; $addr = 0+$self; }
2422
2423         $name{$addr} = shift;
2424         $loose_match{$addr} = shift;
2425         $make_pod_entry{$addr} = shift;
2426         $externally_ok{$addr} = shift;
2427         $status{$addr} = shift;
2428
2429         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2430
2431         # Null names are never ok externally
2432         $externally_ok{$addr} = 0 if $name{$addr} eq "";
2433
2434         return $self;
2435     }
2436 }
2437
2438 package Range;
2439
2440 # A range is the basic unit for storing code points, and is described in the
2441 # comments at the beginning of the program.  Each range has a starting code
2442 # point; an ending code point (not less than the starting one); a value
2443 # that applies to every code point in between the two end-points, inclusive;
2444 # and an enum type that applies to the value.  The type is for the user's
2445 # convenience, and has no meaning here, except that a non-zero type is
2446 # considered to not obey the normal Unicode rules for having standard forms.
2447 #
2448 # The same structure is used for both map and match tables, even though in the
2449 # latter, the value (and hence type) is irrelevant and could be used as a
2450 # comment.  In map tables, the value is what all the code points in the range
2451 # map to.  Type 0 values have the standardized version of the value stored as
2452 # well, so as to not have to recalculate it a lot.
2453
2454 sub trace { return main::trace(@_); }
2455
2456 {   # Closure
2457
2458     main::setup_package();
2459
2460     my %start;
2461     main::set_access('start', \%start, 'r', 's');
2462
2463     my %end;
2464     main::set_access('end', \%end, 'r', 's');
2465
2466     my %value;
2467     main::set_access('value', \%value, 'r');
2468
2469     my %type;
2470     main::set_access('type', \%type, 'r');
2471
2472     my %standard_form;
2473     # The value in internal standard form.  Defined only if the type is 0.
2474     main::set_access('standard_form', \%standard_form);
2475
2476     # Note that if these fields change, the dump() method should as well
2477
2478     sub new {
2479         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2480         my $class = shift;
2481
2482         my $self = bless \do { my $anonymous_scalar }, $class;
2483         my $addr; { no overloading; $addr = 0+$self; }
2484
2485         $start{$addr} = shift;
2486         $end{$addr} = shift;
2487
2488         my %args = @_;
2489
2490         my $value = delete $args{'Value'};  # Can be 0
2491         $value = "" unless defined $value;
2492         $value{$addr} = $value;
2493
2494         $type{$addr} = delete $args{'Type'} || 0;
2495
2496         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2497
2498         if (! $type{$addr}) {
2499             $standard_form{$addr} = main::standardize($value);
2500         }
2501
2502         return $self;
2503     }
2504
2505     use overload
2506         fallback => 0,
2507         qw("") => "_operator_stringify",
2508         "." => \&main::_operator_dot,
2509     ;
2510
2511     sub _operator_stringify {
2512         my $self = shift;
2513         my $addr; { no overloading; $addr = 0+$self; }
2514
2515         # Output it like '0041..0065 (value)'
2516         my $return = sprintf("%04X", $start{$addr})
2517                         .  '..'
2518                         . sprintf("%04X", $end{$addr});
2519         my $value = $value{$addr};
2520         my $type = $type{$addr};
2521         $return .= ' (';
2522         $return .= "$value";
2523         $return .= ", Type=$type" if $type != 0;
2524         $return .= ')';
2525
2526         return $return;
2527     }
2528
2529     sub standard_form {
2530         # The standard form is the value itself if the standard form is
2531         # undefined (that is if the value is special)
2532
2533         my $self = shift;
2534         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2535
2536         my $addr; { no overloading; $addr = 0+$self; }
2537
2538         return $standard_form{$addr} if defined $standard_form{$addr};
2539         return $value{$addr};
2540     }
2541
2542     sub dump {
2543         # Human, not machine readable.  For machine readable, comment out this
2544         # entire routine and let the standard one take effect.
2545         my $self = shift;
2546         my $indent = shift;
2547         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2548
2549         my $addr; { no overloading; $addr = 0+$self; }
2550
2551         my $return = $indent
2552                     . sprintf("%04X", $start{$addr})
2553                     . '..'
2554                     . sprintf("%04X", $end{$addr})
2555                     . " '$value{$addr}';";
2556         if (! defined $standard_form{$addr}) {
2557             $return .= "(type=$type{$addr})";
2558         }
2559         elsif ($standard_form{$addr} ne $value{$addr}) {
2560             $return .= "(standard '$standard_form{$addr}')";
2561         }
2562         return $return;
2563     }
2564 } # End closure
2565
2566 package _Range_List_Base;
2567
2568 # Base class for range lists.  A range list is simply an ordered list of
2569 # ranges, so that the ranges with the lowest starting numbers are first in it.
2570 #
2571 # When a new range is added that is adjacent to an existing range that has the
2572 # same value and type, it merges with it to form a larger range.
2573 #
2574 # Ranges generally do not overlap, except that there can be multiple entries
2575 # of single code point ranges.  This is because of NameAliases.txt.
2576 #
2577 # In this program, there is a standard value such that if two different
2578 # values, have the same standard value, they are considered equivalent.  This
2579 # value was chosen so that it gives correct results on Unicode data
2580
2581 # There are a number of methods to manipulate range lists, and some operators
2582 # are overloaded to handle them.
2583
2584 sub trace { return main::trace(@_); }
2585
2586 { # Closure
2587
2588     our $addr;
2589
2590     main::setup_package();
2591
2592     my %ranges;
2593     # The list of ranges
2594     main::set_access('ranges', \%ranges, 'readable_array');
2595
2596     my %max;
2597     # The highest code point in the list.  This was originally a method, but
2598     # actual measurements said it was used a lot.
2599     main::set_access('max', \%max, 'r');
2600
2601     my %each_range_iterator;
2602     # Iterator position for each_range()
2603     main::set_access('each_range_iterator', \%each_range_iterator);
2604
2605     my %owner_name_of;
2606     # Name of parent this is attached to, if any.  Solely for better error
2607     # messages.
2608     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2609
2610     my %_search_ranges_cache;
2611     # A cache of the previous result from _search_ranges(), for better
2612     # performance
2613     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2614
2615     sub new {
2616         my $class = shift;
2617         my %args = @_;
2618
2619         # Optional initialization data for the range list.
2620         my $initialize = delete $args{'Initialize'};
2621
2622         my $self;
2623
2624         # Use _union() to initialize.  _union() returns an object of this
2625         # class, which means that it will call this constructor recursively.
2626         # But it won't have this $initialize parameter so that it won't
2627         # infinitely loop on this.
2628         return _union($class, $initialize, %args) if defined $initialize;
2629
2630         $self = bless \do { my $anonymous_scalar }, $class;
2631         my $addr; { no overloading; $addr = 0+$self; }
2632
2633         # Optional parent object, only for debug info.
2634         $owner_name_of{$addr} = delete $args{'Owner'};
2635         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2636
2637         # Stringify, in case it is an object.
2638         $owner_name_of{$addr} = "$owner_name_of{$addr}";
2639
2640         # This is used only for error messages, and so a colon is added
2641         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2642
2643         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2644
2645         # Max is initialized to a negative value that isn't adjacent to 0,
2646         # for simpler tests
2647         $max{$addr} = -2;
2648
2649         $_search_ranges_cache{$addr} = 0;
2650         $ranges{$addr} = [];
2651
2652         return $self;
2653     }
2654
2655     use overload
2656         fallback => 0,
2657         qw("") => "_operator_stringify",
2658         "." => \&main::_operator_dot,
2659     ;
2660
2661     sub _operator_stringify {
2662         my $self = shift;
2663         my $addr; { no overloading; $addr = 0+$self; }
2664
2665         return "Range_List attached to '$owner_name_of{$addr}'"
2666                                                 if $owner_name_of{$addr};
2667         return "anonymous Range_List " . \$self;
2668     }
2669
2670     sub _union {
2671         # Returns the union of the input code points.  It can be called as
2672         # either a constructor or a method.  If called as a method, the result
2673         # will be a new() instance of the calling object, containing the union
2674         # of that object with the other parameter's code points;  if called as
2675         # a constructor, the first parameter gives the class the new object
2676         # should be, and the second parameter gives the code points to go into
2677         # it.
2678         # In either case, there are two parameters looked at by this routine;
2679         # any additional parameters are passed to the new() constructor.
2680         #
2681         # The code points can come in the form of some object that contains
2682         # ranges, and has a conventionally named method to access them; or
2683         # they can be an array of individual code points (as integers); or
2684         # just a single code point.
2685         #
2686         # If they are ranges, this routine doesn't make any effort to preserve
2687         # the range values of one input over the other.  Therefore this base
2688         # class should not allow _union to be called from other than
2689         # initialization code, so as to prevent two tables from being added
2690         # together where the range values matter.  The general form of this
2691         # routine therefore belongs in a derived class, but it was moved here
2692         # to avoid duplication of code.  The failure to overload this in this
2693         # class keeps it safe.
2694         #
2695
2696         my $self;
2697         my @args;   # Arguments to pass to the constructor
2698
2699         my $class = shift;
2700
2701         # If a method call, will start the union with the object itself, and
2702         # the class of the new object will be the same as self.
2703         if (ref $class) {
2704             $self = $class;
2705             $class = ref $self;
2706             push @args, $self;
2707         }
2708
2709         # Add the other required parameter.
2710         push @args, shift;
2711         # Rest of parameters are passed on to the constructor
2712
2713         # Accumulate all records from both lists.
2714         my @records;
2715         for my $arg (@args) {
2716             #local $to_trace = 0 if main::DEBUG;
2717             trace "argument = $arg" if main::DEBUG && $to_trace;
2718             if (! defined $arg) {
2719                 my $message = "";
2720                 if (defined $self) {
2721                     no overloading;
2722                     $message .= $owner_name_of{0+$self};
2723                 }
2724                 Carp::my_carp_bug($message .= "Undefined argument to _union.  No union done.");
2725                 return;
2726             }
2727             $arg = [ $arg ] if ! ref $arg;
2728             my $type = ref $arg;
2729             if ($type eq 'ARRAY') {
2730                 foreach my $element (@$arg) {
2731                     push @records, Range->new($element, $element);
2732                 }
2733             }
2734             elsif ($arg->isa('Range')) {
2735                 push @records, $arg;
2736             }
2737             elsif ($arg->can('ranges')) {
2738                 push @records, $arg->ranges;
2739             }
2740             else {
2741                 my $message = "";
2742                 if (defined $self) {
2743                     no overloading;
2744                     $message .= $owner_name_of{0+$self};
2745                 }
2746                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
2747                 return;
2748             }
2749         }
2750
2751         # Sort with the range containing the lowest ordinal first, but if
2752         # two ranges start at the same code point, sort with the bigger range
2753         # of the two first, because it takes fewer cycles.
2754         @records = sort { ($a->start <=> $b->start)
2755                                       or
2756                                     # if b is shorter than a, b->end will be
2757                                     # less than a->end, and we want to select
2758                                     # a, so want to return -1
2759                                     ($b->end <=> $a->end)
2760                                    } @records;
2761
2762         my $new = $class->new(@_);
2763
2764         # Fold in records so long as they add new information.
2765         for my $set (@records) {
2766             my $start = $set->start;
2767             my $end   = $set->end;
2768             my $value   = $set->value;
2769             if ($start > $new->max) {
2770                 $new->_add_delete('+', $start, $end, $value);
2771             }
2772             elsif ($end > $new->max) {
2773                 $new->_add_delete('+', $new->max +1, $end, $value);
2774             }
2775         }
2776
2777         return $new;
2778     }
2779
2780     sub range_count {        # Return the number of ranges in the range list
2781         my $self = shift;
2782         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2783
2784         no overloading;
2785         return scalar @{$ranges{0+$self}};
2786     }
2787
2788     sub min {
2789         # Returns the minimum code point currently in the range list, or if
2790         # the range list is empty, 2 beyond the max possible.  This is a
2791         # method because used so rarely, that not worth saving between calls,
2792         # and having to worry about changing it as ranges are added and
2793         # deleted.
2794
2795         my $self = shift;
2796         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2797
2798         my $addr; { no overloading; $addr = 0+$self; }
2799
2800         # If the range list is empty, return a large value that isn't adjacent
2801         # to any that could be in the range list, for simpler tests
2802         return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2803         return $ranges{$addr}->[0]->start;
2804     }
2805
2806     sub contains {
2807         # Boolean: Is argument in the range list?  If so returns $i such that:
2808         #   range[$i]->end < $codepoint <= range[$i+1]->end
2809         # which is one beyond what you want; this is so that the 0th range
2810         # doesn't return false
2811         my $self = shift;
2812         my $codepoint = shift;
2813         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2814
2815         my $i = $self->_search_ranges($codepoint);
2816         return 0 unless defined $i;
2817
2818         # The search returns $i, such that
2819         #   range[$i-1]->end < $codepoint <= range[$i]->end
2820         # So is in the table if and only iff it is at least the start position
2821         # of range $i.
2822         no overloading;
2823         return 0 if $ranges{0+$self}->[$i]->start > $codepoint;
2824         return $i + 1;
2825     }
2826
2827     sub value_of {
2828         # Returns the value associated with the code point, undef if none
2829
2830         my $self = shift;
2831         my $codepoint = shift;
2832         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2833
2834         my $i = $self->contains($codepoint);
2835         return unless $i;
2836
2837         # contains() returns 1 beyond where we should look
2838         no overloading;
2839         return $ranges{0+$self}->[$i-1]->value;
2840     }
2841
2842     sub _search_ranges {
2843         # Find the range in the list which contains a code point, or where it
2844         # should go if were to add it.  That is, it returns $i, such that:
2845         #   range[$i-1]->end < $codepoint <= range[$i]->end
2846         # Returns undef if no such $i is possible (e.g. at end of table), or
2847         # if there is an error.
2848
2849         my $self = shift;
2850         my $code_point = shift;
2851         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2852
2853         my $addr; { no overloading; $addr = 0+$self; }
2854
2855         return if $code_point > $max{$addr};
2856         my $r = $ranges{$addr};                # The current list of ranges
2857         my $range_list_size = scalar @$r;
2858         my $i;
2859
2860         use integer;        # want integer division
2861
2862         # Use the cached result as the starting guess for this one, because,
2863         # an experiment on 5.1 showed that 90% of the time the cache was the
2864         # same as the result on the next call (and 7% it was one less).
2865         $i = $_search_ranges_cache{$addr};
2866         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
2867                                             # from an intervening deletion
2868         #local $to_trace = 1 if main::DEBUG;
2869         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
2870         return $i if $code_point <= $r->[$i]->end
2871                      && ($i == 0 || $r->[$i-1]->end < $code_point);
2872
2873         # Here the cache doesn't yield the correct $i.  Try adding 1.
2874         if ($i < $range_list_size - 1
2875             && $r->[$i]->end < $code_point &&
2876             $code_point <= $r->[$i+1]->end)
2877         {
2878             $i++;
2879             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
2880             $_search_ranges_cache{$addr} = $i;
2881             return $i;
2882         }
2883
2884         # Here, adding 1 also didn't work.  We do a binary search to
2885         # find the correct position, starting with current $i
2886         my $lower = 0;
2887         my $upper = $range_list_size - 1;
2888         while (1) {
2889             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
2890
2891             if ($code_point <= $r->[$i]->end) {
2892
2893                 # Here we have met the upper constraint.  We can quit if we
2894                 # also meet the lower one.
2895                 last if $i == 0 || $r->[$i-1]->end < $code_point;
2896
2897                 $upper = $i;        # Still too high.
2898
2899             }
2900             else {
2901
2902                 # Here, $r[$i]->end < $code_point, so look higher up.
2903                 $lower = $i;
2904             }
2905
2906             # Split search domain in half to try again.
2907             my $temp = ($upper + $lower) / 2;
2908
2909             # No point in continuing unless $i changes for next time
2910             # in the loop.
2911             if ($temp == $i) {
2912
2913                 # We can't reach the highest element because of the averaging.
2914                 # So if one below the upper edge, force it there and try one
2915                 # more time.
2916                 if ($i == $range_list_size - 2) {
2917
2918                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
2919                     $i = $range_list_size - 1;
2920
2921                     # Change $lower as well so if fails next time through,
2922                     # taking the average will yield the same $i, and we will
2923                     # quit with the error message just below.
2924                     $lower = $i;
2925                     next;
2926                 }
2927                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
2928                 return;
2929             }
2930             $i = $temp;
2931         } # End of while loop
2932
2933         if (main::DEBUG && $to_trace) {
2934             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
2935             trace "i=  [ $i ]", $r->[$i];
2936             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
2937         }
2938
2939         # Here we have found the offset.  Cache it as a starting point for the
2940         # next call.
2941         $_search_ranges_cache{$addr} = $i;
2942         return $i;
2943     }
2944
2945     sub _add_delete {
2946         # Add, replace or delete ranges to or from a list.  The $type
2947         # parameter gives which:
2948         #   '+' => insert or replace a range, returning a list of any changed
2949         #          ranges.
2950         #   '-' => delete a range, returning a list of any deleted ranges.
2951         #
2952         # The next three parameters give respectively the start, end, and
2953         # value associated with the range.  'value' should be null unless the
2954         # operation is '+';
2955         #
2956         # The range list is kept sorted so that the range with the lowest
2957         # starting position is first in the list, and generally, adjacent
2958         # ranges with the same values are merged into single larger one (see
2959         # exceptions below).
2960         #
2961         # There are more parameters, all are key => value pairs:
2962         #   Type    gives the type of the value.  It is only valid for '+'.
2963         #           All ranges have types; if this parameter is omitted, 0 is
2964         #           assumed.  Ranges with type 0 are assumed to obey the
2965         #           Unicode rules for casing, etc; ranges with other types are
2966         #           not.  Otherwise, the type is arbitrary, for the caller's
2967         #           convenience, and looked at only by this routine to keep
2968         #           adjacent ranges of different types from being merged into
2969         #           a single larger range, and when Replace =>
2970         #           $IF_NOT_EQUIVALENT is specified (see just below).
2971         #   Replace  determines what to do if the range list already contains
2972         #            ranges which coincide with all or portions of the input
2973         #            range.  It is only valid for '+':
2974         #       => $NO            means that the new value is not to replace
2975         #                         any existing ones, but any empty gaps of the
2976         #                         range list coinciding with the input range
2977         #                         will be filled in with the new value.
2978         #       => $UNCONDITIONALLY  means to replace the existing values with
2979         #                         this one unconditionally.  However, if the
2980         #                         new and old values are identical, the
2981         #                         replacement is skipped to save cycles
2982         #       => $IF_NOT_EQUIVALENT means to replace the existing values
2983         #                         with this one if they are not equivalent.
2984         #                         Ranges are equivalent if their types are the
2985         #                         same, and they are the same string, or if
2986         #                         both are type 0 ranges, if their Unicode
2987         #                         standard forms are identical.  In this last
2988         #                         case, the routine chooses the more "modern"
2989         #                         one to use.  This is because some of the
2990         #                         older files are formatted with values that
2991         #                         are, for example, ALL CAPs, whereas the
2992         #                         derived files have a more modern style,
2993         #                         which looks better.  By looking for this
2994         #                         style when the pre-existing and replacement
2995         #                         standard forms are the same, we can move to
2996         #                         the modern style
2997         #       => $MULTIPLE      means that if this range duplicates an
2998         #                         existing one, but has a different value,
2999         #                         don't replace the existing one, but insert
3000         #                         this, one so that the same range can occur
3001         #                         multiple times.
3002         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
3003         #
3004         # "same value" means identical for type-0 ranges, and it means having
3005         # the same standard forms for non-type-0 ranges.
3006
3007         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3008
3009         my $self = shift;
3010         my $operation = shift;   # '+' for add/replace; '-' for delete;
3011         my $start = shift;
3012         my $end   = shift;
3013         my $value = shift;
3014
3015         my %args = @_;
3016
3017         $value = "" if not defined $value;        # warning: $value can be "0"
3018
3019         my $replace = delete $args{'Replace'};
3020         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3021
3022         my $type = delete $args{'Type'};
3023         $type = 0 unless defined $type;
3024
3025         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3026
3027         my $addr; { no overloading; $addr = 0+$self; }
3028
3029         if ($operation ne '+' && $operation ne '-') {
3030             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
3031             return;
3032         }
3033         unless (defined $start && defined $end) {
3034             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
3035             return;
3036         }
3037         unless ($end >= $start) {
3038             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
3039             return;
3040         }
3041         #local $to_trace = 1 if main::DEBUG;
3042
3043         if ($operation eq '-') {
3044             if ($replace != $IF_NOT_EQUIVALENT) {
3045                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
3046                 $replace = $IF_NOT_EQUIVALENT;
3047             }
3048             if ($type) {
3049                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
3050                 $type = 0;
3051             }
3052             if ($value ne "") {
3053                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
3054                 $value = "";
3055             }
3056         }
3057
3058         my $r = $ranges{$addr};               # The current list of ranges
3059         my $range_list_size = scalar @$r;     # And its size
3060         my $max = $max{$addr};                # The current high code point in
3061                                               # the list of ranges
3062
3063         # Do a special case requiring fewer machine cycles when the new range
3064         # starts after the current highest point.  The Unicode input data is
3065         # structured so this is common.
3066         if ($start > $max) {
3067
3068             trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3069             return if $operation eq '-'; # Deleting a non-existing range is a
3070                                          # no-op
3071
3072             # If the new range doesn't logically extend the current final one
3073             # in the range list, create a new range at the end of the range
3074             # list.  (max cleverly is initialized to a negative number not
3075             # adjacent to 0 if the range list is empty, so even adding a range
3076             # to an empty range list starting at 0 will have this 'if'
3077             # succeed.)
3078             if ($start > $max + 1        # non-adjacent means can't extend.
3079                 || @{$r}[-1]->value ne $value # values differ, can't extend.
3080                 || @{$r}[-1]->type != $type # types differ, can't extend.
3081             ) {
3082                 push @$r, Range->new($start, $end,
3083                                      Value => $value,
3084                                      Type => $type);
3085             }
3086             else {
3087
3088                 # Here, the new range starts just after the current highest in
3089                 # the range list, and they have the same type and value.
3090                 # Extend the current range to incorporate the new one.
3091                 @{$r}[-1]->set_end($end);
3092             }
3093
3094             # This becomes the new maximum.
3095             $max{$addr} = $end;
3096
3097             return;
3098         }
3099         #local $to_trace = 0 if main::DEBUG;
3100
3101         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3102
3103         # Here, the input range isn't after the whole rest of the range list.
3104         # Most likely 'splice' will be needed.  The rest of the routine finds
3105         # the needed splice parameters, and if necessary, does the splice.
3106         # First, find the offset parameter needed by the splice function for
3107         # the input range.  Note that the input range may span multiple
3108         # existing ones, but we'll worry about that later.  For now, just find
3109         # the beginning.  If the input range is to be inserted starting in a
3110         # position not currently in the range list, it must (obviously) come
3111         # just after the range below it, and just before the range above it.
3112         # Slightly less obviously, it will occupy the position currently
3113         # occupied by the range that is to come after it.  More formally, we
3114         # are looking for the position, $i, in the array of ranges, such that:
3115         #
3116         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3117         #
3118         # (The ordered relationships within existing ranges are also shown in
3119         # the equation above).  However, if the start of the input range is
3120         # within an existing range, the splice offset should point to that
3121         # existing range's position in the list; that is $i satisfies a
3122         # somewhat different equation, namely:
3123         #
3124         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3125         #
3126         # More briefly, $start can come before or after r[$i]->start, and at
3127         # this point, we don't know which it will be.  However, these
3128         # two equations share these constraints:
3129         #
3130         #   r[$i-1]->end < $start <= r[$i]->end
3131         #
3132         # And that is good enough to find $i.
3133
3134         my $i = $self->_search_ranges($start);
3135         if (! defined $i) {
3136             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3137             return;
3138         }
3139
3140         # The search function returns $i such that:
3141         #
3142         # r[$i-1]->end < $start <= r[$i]->end
3143         #
3144         # That means that $i points to the first range in the range list
3145         # that could possibly be affected by this operation.  We still don't
3146         # know if the start of the input range is within r[$i], or if it
3147         # points to empty space between r[$i-1] and r[$i].
3148         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3149
3150         # Special case the insertion of data that is not to replace any
3151         # existing data.
3152         if ($replace == $NO) {  # If $NO, has to be operation '+'
3153             #local $to_trace = 1 if main::DEBUG;
3154             trace "Doesn't replace" if main::DEBUG && $to_trace;
3155
3156             # Here, the new range is to take effect only on those code points
3157             # that aren't already in an existing range.  This can be done by
3158             # looking through the existing range list and finding the gaps in
3159             # the ranges that this new range affects, and then calling this
3160             # function recursively on each of those gaps, leaving untouched
3161             # anything already in the list.  Gather up a list of the changed
3162             # gaps first so that changes to the internal state as new ranges
3163             # are added won't be a problem.
3164             my @gap_list;
3165
3166             # First, if the starting point of the input range is outside an
3167             # existing one, there is a gap from there to the beginning of the
3168             # existing range -- add a span to fill the part that this new
3169             # range occupies
3170             if ($start < $r->[$i]->start) {
3171                 push @gap_list, Range->new($start,
3172                                            main::min($end,
3173                                                      $r->[$i]->start - 1),
3174                                            Type => $type);
3175                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3176             }
3177
3178             # Then look through the range list for other gaps until we reach
3179             # the highest range affected by the input one.
3180             my $j;
3181             for ($j = $i+1; $j < $range_list_size; $j++) {
3182                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3183                 last if $end < $r->[$j]->start;
3184
3185                 # If there is a gap between when this range starts and the
3186                 # previous one ends, add a span to fill it.  Note that just
3187                 # because there are two ranges doesn't mean there is a
3188                 # non-zero gap between them.  It could be that they have
3189                 # different values or types
3190                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3191                     push @gap_list,
3192                         Range->new($r->[$j-1]->end + 1,
3193                                    $r->[$j]->start - 1,
3194                                    Type => $type);
3195                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3196                 }
3197             }
3198
3199             # Here, we have either found an existing range in the range list,
3200             # beyond the area affected by the input one, or we fell off the
3201             # end of the loop because the input range affects the whole rest
3202             # of the range list.  In either case, $j is 1 higher than the
3203             # highest affected range.  If $j == $i, it means that there are no
3204             # affected ranges, that the entire insertion is in the gap between
3205             # r[$i-1], and r[$i], which we already have taken care of before
3206             # the loop.
3207             # On the other hand, if there are affected ranges, it might be
3208             # that there is a gap that needs filling after the final such
3209             # range to the end of the input range
3210             if ($r->[$j-1]->end < $end) {
3211                     push @gap_list, Range->new(main::max($start,
3212                                                          $r->[$j-1]->end + 1),
3213                                                $end,
3214                                                Type => $type);
3215                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3216             }
3217
3218             # Call recursively to fill in all the gaps.
3219             foreach my $gap (@gap_list) {
3220                 $self->_add_delete($operation,
3221                                    $gap->start,
3222                                    $gap->end,
3223                                    $value,
3224                                    Type => $type);
3225             }
3226
3227             return;
3228         }
3229
3230         # Here, we have taken care of the case where $replace is $NO, which
3231         # means that whatever action we now take is done unconditionally.  It
3232         # still could be that this call will result in a no-op, if duplicates
3233         # aren't allowed, and we are inserting a range that merely duplicates
3234         # data already in the range list; or also if deleting a non-existent
3235         # range.
3236         # $i still points to the first potential affected range.  Now find the
3237         # highest range affected, which will determine the length parameter to
3238         # splice.  (The input range can span multiple existing ones.)  While
3239         # we are looking through the range list, see also if this is an
3240         # insertion that will change the values of at least one of the
3241         # affected ranges.  We don't need to do this check unless this is an
3242         # insertion of non-multiples, and also since this is a boolean, we
3243         # don't need to do it if have already determined that it will make a
3244         # change; just unconditionally change them.  $cdm is created to be 1
3245         # if either of these is true. (The 'c' in the name comes from below)
3246         my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
3247         my $j;        # This will point to the highest affected range
3248
3249         # For non-zero types, the standard form is the value itself;
3250         my $standard_form = ($type) ? $value : main::standardize($value);
3251
3252         for ($j = $i; $j < $range_list_size; $j++) {
3253             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3254
3255             # If find a range that it doesn't overlap into, we can stop
3256             # searching
3257             last if $end < $r->[$j]->start;
3258
3259             # Here, overlaps the range at $j.  If the value's don't match,
3260             # and this is supposedly an insertion, it becomes a change
3261             # instead.  This is what the 'c' stands for in $cdm.
3262             if (! $cdm) {
3263                 if ($r->[$j]->standard_form ne $standard_form) {
3264                     $cdm = 1;
3265                 }
3266                 else {
3267
3268                     # Here, the two values are essentially the same.  If the
3269                     # two are actually identical, replacing wouldn't change
3270                     # anything so skip it.
3271                     my $pre_existing = $r->[$j]->value;
3272                     if ($pre_existing ne $value) {
3273
3274                         # Here the new and old standardized values are the
3275                         # same, but the non-standardized values aren't.  If
3276                         # replacing unconditionally, then replace
3277                         if( $replace == $UNCONDITIONALLY) {
3278                             $cdm = 1;
3279                         }
3280                         else {
3281
3282                             # Here, are replacing conditionally.  Decide to
3283                             # replace or not based on which appears to look
3284                             # the "nicest".  If one is mixed case and the
3285                             # other isn't, choose the mixed case one.
3286                             my $new_mixed = $value =~ /[A-Z]/
3287                                             && $value =~ /[a-z]/;
3288                             my $old_mixed = $pre_existing =~ /[A-Z]/
3289                                             && $pre_existing =~ /[a-z]/;
3290
3291                             if ($old_mixed != $new_mixed) {
3292                                 $cdm = 1 if $new_mixed;
3293                                 if (main::DEBUG && $to_trace) {
3294                                     if ($cdm) {
3295                                         trace "Replacing $pre_existing with $value";
3296                                     }
3297                                     else {
3298                                         trace "Retaining $pre_existing over $value";
3299                                     }
3300                                 }
3301                             }
3302                             else {
3303
3304                                 # Here casing wasn't different between the two.
3305                                 # If one has hyphens or underscores and the
3306                                 # other doesn't, choose the one with the
3307                                 # punctuation.
3308                                 my $new_punct = $value =~ /[-_]/;
3309                                 my $old_punct = $pre_existing =~ /[-_]/;
3310
3311                                 if ($old_punct != $new_punct) {
3312                                     $cdm = 1 if $new_punct;
3313                                     if (main::DEBUG && $to_trace) {
3314                                         if ($cdm) {
3315                                             trace "Replacing $pre_existing with $value";
3316                                         }
3317                                         else {
3318                                             trace "Retaining $pre_existing over $value";
3319                                         }
3320                                     }
3321                                 }   # else existing one is just as "good";
3322                                     # retain it to save cycles.
3323                             }
3324                         }
3325                     }
3326                 }
3327             }
3328         } # End of loop looking for highest affected range.
3329
3330         # Here, $j points to one beyond the highest range that this insertion
3331         # affects (hence to beyond the range list if that range is the final
3332         # one in the range list).
3333
3334         # The splice length is all the affected ranges.  Get it before
3335         # subtracting, for efficiency, so we don't have to later add 1.
3336         my $length = $j - $i;
3337
3338         $j--;        # $j now points to the highest affected range.
3339         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3340
3341         # If inserting a multiple record, this is where it goes, after all the
3342         # existing ones for this range.  This implies an insertion, and no
3343         # change to any existing ranges.  Note that $j can be -1 if this new
3344         # range doesn't actually duplicate any existing, and comes at the
3345         # beginning of the list, in which case we can handle it like any other
3346         # insertion, and is easier to do so.
3347         if ($replace == $MULTIPLE && $j >= 0) {
3348
3349             # This restriction could be remedied with a little extra work, but
3350             # it won't hopefully ever be necessary
3351             if ($r->[$j]->start != $r->[$j]->end) {
3352                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point.  No action taken.");
3353                 return;
3354             }
3355
3356             # Don't add an exact duplicate, as it isn't really a multiple
3357             return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
3358
3359             trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
3360             my @return = splice @$r,
3361                                 $j+1,
3362                                 0,
3363                                 Range->new($start,
3364                                            $end,
3365                                            Value => $value,
3366                                            Type => $type);
3367             if (main::DEBUG && $to_trace) {
3368                 trace "After splice:";
3369                 trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
3370                 trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
3371                 trace "j  =[", $j, "]", $r->[$j] if $j >= 0;
3372                 trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
3373                 trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
3374                 trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
3375             }
3376             return @return;
3377         }
3378
3379         # Here, have taken care of $NO and $MULTIPLE replaces.
3380         # $j points to the highest affected range.  But it can be < $i or even
3381         # -1.  These happen only if the insertion is entirely in the gap
3382         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
3383         # above exited first time through with $end < $r->[$i]->start.  (And
3384         # then we subtracted one from j)  This implies also that $start <
3385         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3386         # $start, so the entire input range is in the gap.
3387         if ($j < $i) {
3388
3389             # Here the entire input range is in the gap before $i.
3390
3391             if (main::DEBUG && $to_trace) {
3392                 if ($i) {
3393                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
3394                 }
3395                 else {
3396                     trace "Entire range is before $r->[$i]";
3397                 }
3398             }
3399             return if $operation ne '+'; # Deletion of a non-existent range is
3400                                          # a no-op
3401         }
3402         else {
3403
3404             # Here the entire input range is not in the gap before $i.  There
3405             # is an affected one, and $j points to the highest such one.
3406
3407             # At this point, here is the situation:
3408             # This is not an insertion of a multiple, nor of tentative ($NO)
3409             # data.
3410             #   $i  points to the first element in the current range list that
3411             #            may be affected by this operation.  In fact, we know
3412             #            that the range at $i is affected because we are in
3413             #            the else branch of this 'if'
3414             #   $j  points to the highest affected range.
3415             # In other words,
3416             #   r[$i-1]->end < $start <= r[$i]->end
3417             # And:
3418             #   r[$i-1]->end < $start <= $end <= r[$j]->end
3419             #
3420             # Also:
3421             #   $cdm is a boolean which is set true if and only if this is a
3422             #        change or deletion (multiple was handled above).  In
3423             #        other words, it could be renamed to be just $cd.
3424
3425             # We now have enough information to decide if this call is a no-op
3426             # or not.  It is a no-op if it is a deletion of a non-existent
3427             # range, or an insertion of already existing data.
3428
3429             if (main::DEBUG && $to_trace && ! $cdm
3430                                          && $i == $j
3431                                          && $start >= $r->[$i]->start)
3432             {
3433                     trace "no-op";
3434             }
3435             return if ! $cdm      # change or delete => not no-op
3436                       && $i == $j # more than one affected range => not no-op
3437
3438                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3439                       # Further, $start and/or $end is >= r[$i]->start
3440                       # The test below hence guarantees that
3441                       #     r[$i]->start < $start <= $end <= r[$i]->end
3442                       # This means the input range is contained entirely in
3443                       # the one at $i, so is a no-op
3444                       && $start >= $r->[$i]->start;
3445         }
3446
3447         # Here, we know that some action will have to be taken.  We have
3448         # calculated the offset and length (though adjustments may be needed)
3449         # for the splice.  Now start constructing the replacement list.
3450         my @replacement;
3451         my $splice_start = $i;
3452
3453         my $extends_below;
3454         my $extends_above;
3455
3456         # See if should extend any adjacent ranges.
3457         if ($operation eq '-') { # Don't extend deletions
3458             $extends_below = $extends_above = 0;
3459         }
3460         else {  # Here, should extend any adjacent ranges.  See if there are
3461                 # any.
3462             $extends_below = ($i > 0
3463                             # can't extend unless adjacent
3464                             && $r->[$i-1]->end == $start -1
3465                             # can't extend unless are same standard value
3466                             && $r->[$i-1]->standard_form eq $standard_form
3467                             # can't extend unless share type
3468                             && $r->[$i-1]->type == $type);
3469             $extends_above = ($j+1 < $range_list_size
3470                             && $r->[$j+1]->start == $end +1
3471                             && $r->[$j+1]->standard_form eq $standard_form
3472                             && $r->[$j-1]->type == $type);
3473         }
3474         if ($extends_below && $extends_above) { # Adds to both
3475             $splice_start--;     # start replace at element below
3476             $length += 2;        # will replace on both sides
3477             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3478
3479             # The result will fill in any gap, replacing both sides, and
3480             # create one large range.
3481             @replacement = Range->new($r->[$i-1]->start,
3482                                       $r->[$j+1]->end,
3483                                       Value => $value,
3484                                       Type => $type);
3485         }
3486         else {
3487
3488             # Here we know that the result won't just be the conglomeration of
3489             # a new range with both its adjacent neighbors.  But it could
3490             # extend one of them.
3491
3492             if ($extends_below) {
3493
3494                 # Here the new element adds to the one below, but not to the
3495                 # one above.  If inserting, and only to that one range,  can
3496                 # just change its ending to include the new one.
3497                 if ($length == 0 && ! $cdm) {
3498                     $r->[$i-1]->set_end($end);
3499                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3500                     return;
3501                 }
3502                 else {
3503                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3504                     $splice_start--;        # start replace at element below
3505                     $length++;              # will replace the element below
3506                     $start = $r->[$i-1]->start;
3507                 }
3508             }
3509             elsif ($extends_above) {
3510
3511                 # Here the new element adds to the one above, but not below.
3512                 # Mirror the code above
3513                 if ($length == 0 && ! $cdm) {
3514                     $r->[$j+1]->set_start($start);
3515                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3516                     return;
3517                 }
3518                 else {
3519                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3520                     $length++;        # will replace the element above
3521                     $end = $r->[$j+1]->end;
3522                 }
3523             }
3524
3525             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3526
3527             # Finally, here we know there will have to be a splice.
3528             # If the change or delete affects only the highest portion of the
3529             # first affected range, the range will have to be split.  The
3530             # splice will remove the whole range, but will replace it by a new
3531             # range containing just the unaffected part.  So, in this case,
3532             # add to the replacement list just this unaffected portion.
3533             if (! $extends_below
3534                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3535             {
3536                 push @replacement,
3537                     Range->new($r->[$i]->start,
3538                                $start - 1,
3539                                Value => $r->[$i]->value,
3540                                Type => $r->[$i]->type);
3541             }
3542
3543             # In the case of an insert or change, but not a delete, we have to
3544             # put in the new stuff;  this comes next.
3545             if ($operation eq '+') {
3546                 push @replacement, Range->new($start,
3547                                               $end,
3548                                               Value => $value,
3549                                               Type => $type);
3550             }
3551
3552             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3553             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3554
3555             # And finally, if we're changing or deleting only a portion of the
3556             # highest affected range, it must be split, as the lowest one was.
3557             if (! $extends_above
3558                 && $j >= 0  # Remember that j can be -1 if before first
3559                             # current element
3560                 && $end >= $r->[$j]->start
3561                 && $end < $r->[$j]->end)
3562             {
3563                 push @replacement,
3564                     Range->new($end + 1,
3565                                $r->[$j]->end,
3566                                Value => $r->[$j]->value,
3567                                Type => $r->[$j]->type);
3568             }
3569         }
3570
3571         # And do the splice, as calculated above
3572         if (main::DEBUG && $to_trace) {
3573             trace "replacing $length element(s) at $i with ";
3574             foreach my $replacement (@replacement) {
3575                 trace "    $replacement";
3576             }
3577             trace "Before splice:";
3578             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3579             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3580             trace "i  =[", $i, "]", $r->[$i];
3581             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3582             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3583         }
3584
3585         my @return = splice @$r, $splice_start, $length, @replacement;
3586
3587         if (main::DEBUG && $to_trace) {
3588             trace "After splice:";
3589             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3590             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3591             trace "i  =[", $i, "]", $r->[$i];
3592             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3593             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3594             trace "removed @return";
3595         }
3596
3597         # An actual deletion could have changed the maximum in the list.
3598         # There was no deletion if the splice didn't return something, but
3599         # otherwise recalculate it.  This is done too rarely to worry about
3600         # performance.
3601         if ($operation eq '-' && @return) {
3602             $max{$addr} = $r->[-1]->end;
3603         }
3604         return @return;
3605     }
3606
3607     sub reset_each_range {  # reset the iterator for each_range();
3608         my $self = shift;
3609         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3610
3611         no overloading;
3612         undef $each_range_iterator{0+$self};
3613         return;
3614     }
3615
3616     sub each_range {
3617         # Iterate over each range in a range list.  Results are undefined if
3618         # the range list is changed during the iteration.
3619
3620         my $self = shift;
3621         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3622
3623         my $addr; { no overloading; $addr = 0+$self; }
3624
3625         return if $self->is_empty;
3626
3627         $each_range_iterator{$addr} = -1
3628                                 if ! defined $each_range_iterator{$addr};
3629         $each_range_iterator{$addr}++;
3630         return $ranges{$addr}->[$each_range_iterator{$addr}]
3631                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
3632         undef $each_range_iterator{$addr};
3633         return;
3634     }
3635
3636     sub count {        # Returns count of code points in range list
3637         my $self = shift;
3638         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3639
3640         my $addr; { no overloading; $addr = 0+$self; }
3641
3642         my $count = 0;
3643         foreach my $range (@{$ranges{$addr}}) {
3644             $count += $range->end - $range->start + 1;
3645         }
3646         return $count;
3647     }
3648
3649     sub delete_range {    # Delete a range
3650         my $self = shift;
3651         my $start = shift;
3652         my $end = shift;
3653
3654         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3655
3656         return $self->_add_delete('-', $start, $end, "");
3657     }
3658
3659     sub is_empty { # Returns boolean as to if a range list is empty
3660         my $self = shift;
3661         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3662
3663         no overloading;
3664         return scalar @{$ranges{0+$self}} == 0;
3665     }
3666
3667     sub hash {
3668         # Quickly returns a scalar suitable for separating tables into
3669         # buckets, i.e. it is a hash function of the contents of a table, so
3670         # there are relatively few conflicts.
3671
3672         my $self = shift;
3673         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3674
3675         my $addr; { no overloading; $addr = 0+$self; }
3676
3677         # These are quickly computable.  Return looks like 'min..max;count'
3678         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3679     }
3680 } # End closure for _Range_List_Base
3681
3682 package Range_List;
3683 use base '_Range_List_Base';
3684
3685 # A Range_List is a range list for match tables; i.e. the range values are
3686 # not significant.  Thus a number of operations can be safely added to it,
3687 # such as inversion, intersection.  Note that union is also an unsafe
3688 # operation when range values are cared about, and that method is in the base
3689 # class, not here.  But things are set up so that that method is callable only
3690 # during initialization.  Only in this derived class, is there an operation
3691 # that combines two tables.  A Range_Map can thus be used to initialize a
3692 # Range_List, and its mappings will be in the list, but are not significant to
3693 # this class.
3694
3695 sub trace { return main::trace(@_); }
3696
3697 { # Closure
3698
3699     use overload
3700         fallback => 0,
3701         '+' => sub { my $self = shift;
3702                     my $other = shift;
3703
3704                     return $self->_union($other)
3705                 },
3706         '&' => sub { my $self = shift;
3707                     my $other = shift;
3708
3709                     return $self->_intersect($other, 0);
3710                 },
3711         '~' => "_invert",
3712         '-' => "_subtract",
3713     ;
3714
3715     sub _invert {
3716         # Returns a new Range_List that gives all code points not in $self.
3717
3718         my $self = shift;
3719
3720         my $new = Range_List->new;
3721
3722         # Go through each range in the table, finding the gaps between them
3723         my $max = -1;   # Set so no gap before range beginning at 0
3724         for my $range ($self->ranges) {
3725             my $start = $range->start;
3726             my $end   = $range->end;
3727
3728             # If there is a gap before this range, the inverse will contain
3729             # that gap.
3730             if ($start > $max + 1) {
3731                 $new->add_range($max + 1, $start - 1);
3732             }
3733             $max = $end;
3734         }
3735
3736         # And finally, add the gap from the end of the table to the max
3737         # possible code point
3738         if ($max < $LAST_UNICODE_CODEPOINT) {
3739             $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3740         }
3741         return $new;
3742     }
3743
3744     sub _subtract {
3745         # Returns a new Range_List with the argument deleted from it.  The
3746         # argument can be a single code point, a range, or something that has
3747         # a range, with the _range_list() method on it returning them
3748
3749         my $self = shift;
3750         my $other = shift;
3751         my $reversed = shift;
3752         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3753
3754         if ($reversed) {
3755             Carp::my_carp_bug("Can't cope with a "
3756              .  __PACKAGE__
3757              . " being the second parameter in a '-'.  Subtraction ignored.");
3758             return $self;
3759         }
3760
3761         my $new = Range_List->new(Initialize => $self);
3762
3763         if (! ref $other) { # Single code point
3764             $new->delete_range($other, $other);
3765         }
3766         elsif ($other->isa('Range')) {
3767             $new->delete_range($other->start, $other->end);
3768         }
3769         elsif ($other->can('_range_list')) {
3770             foreach my $range ($other->_range_list->ranges) {
3771                 $new->delete_range($range->start, $range->end);
3772             }
3773         }
3774         else {
3775             Carp::my_carp_bug("Can't cope with a "
3776                         . ref($other)
3777                         . " argument to '-'.  Subtraction ignored."
3778                         );
3779             return $self;
3780         }
3781
3782         return $new;
3783     }
3784
3785     sub _intersect {
3786         # Returns either a boolean giving whether the two inputs' range lists
3787         # intersect (overlap), or a new Range_List containing the intersection
3788         # of the two lists.  The optional final parameter being true indicates
3789         # to do the check instead of the intersection.
3790
3791         my $a_object = shift;
3792         my $b_object = shift;
3793         my $check_if_overlapping = shift;
3794         $check_if_overlapping = 0 unless defined $check_if_overlapping;
3795         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3796
3797         if (! defined $b_object) {
3798             my $message = "";
3799             $message .= $a_object->_owner_name_of if defined $a_object;
3800             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
3801             return;
3802         }
3803
3804         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
3805         # Thus the intersection could be much more simply be written:
3806         #   return ~(~$a_object + ~$b_object);
3807         # But, this is slower, and when taking the inverse of a large
3808         # range_size_1 table, back when such tables were always stored that
3809         # way, it became prohibitively slow, hence the code was changed to the
3810         # below
3811
3812         if ($b_object->isa('Range')) {
3813             $b_object = Range_List->new(Initialize => $b_object,
3814                                         Owner => $a_object->_owner_name_of);
3815         }
3816         $b_object = $b_object->_range_list if $b_object->can('_range_list');
3817
3818         my @a_ranges = $a_object->ranges;
3819         my @b_ranges = $b_object->ranges;
3820
3821         #local $to_trace = 1 if main::DEBUG;
3822         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
3823
3824         # Start with the first range in each list
3825         my $a_i = 0;
3826         my $range_a = $a_ranges[$a_i];
3827         my $b_i = 0;
3828         my $range_b = $b_ranges[$b_i];
3829
3830         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
3831                                                 if ! $check_if_overlapping;
3832
3833         # If either list is empty, there is no intersection and no overlap
3834         if (! defined $range_a || ! defined $range_b) {
3835             return $check_if_overlapping ? 0 : $new;
3836         }
3837         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3838
3839         # Otherwise, must calculate the intersection/overlap.  Start with the
3840         # very first code point in each list
3841         my $a = $range_a->start;
3842         my $b = $range_b->start;
3843
3844         # Loop through all the ranges of each list; in each iteration, $a and
3845         # $b are the current code points in their respective lists
3846         while (1) {
3847
3848             # If $a and $b are the same code point, ...
3849             if ($a == $b) {
3850
3851                 # it means the lists overlap.  If just checking for overlap
3852                 # know the answer now,
3853                 return 1 if $check_if_overlapping;
3854
3855                 # The intersection includes this code point plus anything else
3856                 # common to both current ranges.
3857                 my $start = $a;
3858                 my $end = main::min($range_a->end, $range_b->end);
3859                 if (! $check_if_overlapping) {
3860                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
3861                     $new->add_range($start, $end);
3862                 }
3863
3864                 # Skip ahead to the end of the current intersect
3865                 $a = $b = $end;
3866
3867                 # If the current intersect ends at the end of either range (as
3868                 # it must for at least one of them), the next possible one
3869                 # will be the beginning code point in it's list's next range.
3870                 if ($a == $range_a->end) {
3871                     $range_a = $a_ranges[++$a_i];
3872                     last unless defined $range_a;
3873                     $a = $range_a->start;
3874                 }
3875                 if ($b == $range_b->end) {
3876                     $range_b = $b_ranges[++$b_i];
3877                     last unless defined $range_b;
3878                     $b = $range_b->start;
3879                 }
3880
3881                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3882             }
3883             elsif ($a < $b) {
3884
3885                 # Not equal, but if the range containing $a encompasses $b,
3886                 # change $a to be the middle of the range where it does equal
3887                 # $b, so the next iteration will get the intersection
3888                 if ($range_a->end >= $b) {
3889                     $a = $b;
3890                 }
3891                 else {
3892
3893                     # Here, the current range containing $a is entirely below
3894                     # $b.  Go try to find a range that could contain $b.
3895                     $a_i = $a_object->_search_ranges($b);
3896
3897                     # If no range found, quit.
3898                     last unless defined $a_i;
3899
3900                     # The search returns $a_i, such that
3901                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
3902                     # Set $a to the beginning of this new range, and repeat.
3903                     $range_a = $a_ranges[$a_i];
3904                     $a = $range_a->start;
3905                 }
3906             }
3907             else { # Here, $b < $a.
3908
3909                 # Mirror image code to the leg just above
3910                 if ($range_b->end >= $a) {
3911                     $b = $a;
3912                 }
3913                 else {
3914                     $b_i = $b_object->_search_ranges($a);
3915                     last unless defined $b_i;
3916                     $range_b = $b_ranges[$b_i];
3917                     $b = $range_b->start;
3918                 }
3919             }
3920         } # End of looping through ranges.
3921
3922         # Intersection fully computed, or now know that there is no overlap
3923         return $check_if_overlapping ? 0 : $new;
3924     }
3925
3926     sub overlaps {
3927         # Returns boolean giving whether the two arguments overlap somewhere
3928
3929         my $self = shift;
3930         my $other = shift;
3931         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3932
3933         return $self->_intersect($other, 1);
3934     }
3935
3936     sub add_range {
3937         # Add a range to the list.
3938
3939         my $self = shift;
3940         my $start = shift;
3941         my $end = shift;
3942         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3943
3944         return $self->_add_delete('+', $start, $end, "");
3945     }
3946
3947     my $non_ASCII = (ord('A') != 65);   # Assumes test on same platform
3948
3949     sub is_code_point_usable {
3950         # This used only for making the test script.  See if the input
3951         # proposed trial code point is one that Perl will handle.  If second
3952         # parameter is 0, it won't select some code points for various
3953         # reasons, noted below.
3954
3955         my $code = shift;
3956         my $try_hard = shift;
3957         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3958
3959         return 0 if $code < 0;                # Never use a negative
3960
3961         # For non-ASCII, we shun the characters that don't have Perl encoding-
3962         # independent symbols for them.  'A' is such a symbol, so is "\n".
3963         return $try_hard if $non_ASCII
3964                             && $code <= 0xFF
3965                             && ($code >= 0x7F
3966                                 || ($code >= 0x0E && $code <= 0x1F)
3967                                 || ($code >= 0x01 && $code <= 0x06)
3968                                 || $code == 0x0B);
3969
3970         # shun null.  I'm (khw) not sure why this was done, but NULL would be
3971         # the character very frequently used.
3972         return $try_hard if $code == 0x0000;
3973
3974         return 0 if $try_hard;  # XXX Temporary until fix utf8.c
3975
3976         # shun non-character code points.
3977         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
3978         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
3979
3980         return $try_hard if $code > $LAST_UNICODE_CODEPOINT;   # keep in range
3981         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
3982
3983         return 1;
3984     }
3985
3986     sub get_valid_code_point {
3987         # Return a code point that's part of the range list.  Returns nothing
3988         # if the table is empty or we can't find a suitable code point.  This
3989         # used only for making the test script.
3990
3991         my $self = shift;
3992         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3993
3994         my $addr; { no overloading; $addr = 0+$self; }
3995
3996         # On first pass, don't choose less desirable code points; if no good
3997         # one is found, repeat, allowing a less desirable one to be selected.
3998         for my $try_hard (0, 1) {
3999
4000             # Look through all the ranges for a usable code point.
4001             for my $set ($self->ranges) {
4002
4003                 # Try the edge cases first, starting with the end point of the
4004                 # range.
4005                 my $end = $set->end;
4006                 return $end if is_code_point_usable($end, $try_hard);
4007
4008                 # End point didn't, work.  Start at the beginning and try
4009                 # every one until find one that does work.
4010                 for my $trial ($set->start .. $end - 1) {
4011                     return $trial if is_code_point_usable($trial, $try_hard);
4012                 }
4013             }
4014         }
4015         return ();  # If none found, give up.
4016     }
4017
4018     sub get_invalid_code_point {
4019         # Return a code point that's not part of the table.  Returns nothing
4020         # if the table covers all code points or a suitable code point can't
4021         # be found.  This used only for making the test script.
4022
4023         my $self = shift;
4024         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4025
4026         # Just find a valid code point of the inverse, if any.
4027         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4028     }
4029 } # end closure for Range_List
4030
4031 package Range_Map;
4032 use base '_Range_List_Base';
4033
4034 # A Range_Map is a range list in which the range values (called maps) are
4035 # significant, and hence shouldn't be manipulated by our other code, which
4036 # could be ambiguous or lose things.  For example, in taking the union of two
4037 # lists, which share code points, but which have differing values, which one
4038 # has precedence in the union?
4039 # It turns out that these operations aren't really necessary for map tables,
4040 # and so this class was created to make sure they aren't accidentally
4041 # applied to them.
4042
4043 { # Closure
4044
4045     sub add_map {
4046         # Add a range containing a mapping value to the list
4047
4048         my $self = shift;
4049         # Rest of parameters passed on
4050
4051         return $self->_add_delete('+', @_);
4052     }
4053
4054     sub add_duplicate {
4055         # Adds entry to a range list which can duplicate an existing entry
4056
4057         my $self = shift;
4058         my $code_point = shift;
4059         my $value = shift;
4060         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4061
4062         return $self->add_map($code_point, $code_point,
4063                                 $value, Replace => $MULTIPLE);
4064     }
4065 } # End of closure for package Range_Map
4066
4067 package _Base_Table;
4068
4069 # A table is the basic data structure that gets written out into a file for
4070 # use by the Perl core.  This is the abstract base class implementing the
4071 # common elements from the derived ones.  A list of the methods to be
4072 # furnished by an implementing class is just after the constructor.
4073
4074 sub standardize { return main::standardize($_[0]); }
4075 sub trace { return main::trace(@_); }
4076
4077 { # Closure
4078
4079     main::setup_package();
4080
4081     my %range_list;
4082     # Object containing the ranges of the table.
4083     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4084
4085     my %full_name;
4086     # The full table name.
4087     main::set_access('full_name', \%full_name, 'r');
4088
4089     my %name;
4090     # The table name, almost always shorter
4091     main::set_access('name', \%name, 'r');
4092
4093     my %short_name;
4094     # The shortest of all the aliases for this table, with underscores removed
4095     main::set_access('short_name', \%short_name);
4096
4097     my %nominal_short_name_length;
4098     # The length of short_name before removing underscores
4099     main::set_access('nominal_short_name_length',
4100                     \%nominal_short_name_length);
4101
4102     my %complete_name;
4103     # The complete name, including property.
4104     main::set_access('complete_name', \%complete_name, 'r');
4105
4106     my %property;
4107     # Parent property this table is attached to.
4108     main::set_access('property', \%property, 'r');
4109
4110     my %aliases;
4111     # Ordered list of aliases of the table's name.  The first ones in the list
4112     # are output first in comments
4113     main::set_access('aliases', \%aliases, 'readable_array');
4114
4115     my %comment;
4116     # A comment associated with the table for human readers of the files
4117     main::set_access('comment', \%comment, 's');
4118
4119     my %description;
4120     # A comment giving a short description of the table's meaning for human
4121     # readers of the files.
4122     main::set_access('description', \%description, 'readable_array');
4123
4124     my %note;
4125     # A comment giving a short note about the table for human readers of the
4126     # files.
4127     main::set_access('note', \%note, 'readable_array');
4128
4129     my %internal_only;
4130     # Boolean; if set means any file that contains this table is marked as for
4131     # internal-only use.
4132     main::set_access('internal_only', \%internal_only);
4133
4134     my %find_table_from_alias;
4135     # The parent property passes this pointer to a hash which this class adds
4136     # all its aliases to, so that the parent can quickly take an alias and
4137     # find this table.
4138     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4139
4140     my %locked;
4141     # After this table is made equivalent to another one; we shouldn't go
4142     # changing the contents because that could mean it's no longer equivalent
4143     main::set_access('locked', \%locked, 'r');
4144
4145     my %file_path;
4146     # This gives the final path to the file containing the table.  Each
4147     # directory in the path is an element in the array
4148     main::set_access('file_path', \%file_path, 'readable_array');
4149
4150     my %status;
4151     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4152     main::set_access('status', \%status, 'r');
4153
4154     my %status_info;
4155     # A comment about its being obsolete, or whatever non normal status it has
4156     main::set_access('status_info', \%status_info, 'r');
4157
4158     my %range_size_1;
4159     # Is the table to be output with each range only a single code point?
4160     # This is done to avoid breaking existing code that may have come to rely
4161     # on this behavior in previous versions of this program.)
4162     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4163
4164     my %perl_extension;
4165     # A boolean set iff this table is a Perl extension to the Unicode
4166     # standard.
4167     main::set_access('perl_extension', \%perl_extension, 'r');
4168
4169     my %output_range_counts;
4170     # A boolean set iff this table is to have comments written in the
4171     # output file that contain the number of code points in the range.
4172     # The constructor can override the global flag of the same name.
4173     main::set_access('output_range_counts', \%output_range_counts, 'r');
4174
4175     sub new {
4176         # All arguments are key => value pairs, which you can see below, most
4177         # of which match fields documented above.  Otherwise: Pod_Entry,
4178         # Externally_Ok, and Fuzzy apply to the names of the table, and are
4179         # documented in the Alias package
4180
4181         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4182
4183         my $class = shift;
4184
4185         my $self = bless \do { my $anonymous_scalar }, $class;
4186         my $addr; { no overloading; $addr = 0+$self; }
4187
4188         my %args = @_;
4189
4190         $name{$addr} = delete $args{'Name'};
4191         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4192         $full_name{$addr} = delete $args{'Full_Name'};
4193         my $complete_name = $complete_name{$addr}
4194                           = delete $args{'Complete_Name'};
4195         $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
4196         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4197         $property{$addr} = delete $args{'_Property'};
4198         $range_list{$addr} = delete $args{'_Range_List'};
4199         $status{$addr} = delete $args{'Status'} || $NORMAL;
4200         $status_info{$addr} = delete $args{'_Status_Info'} || "";
4201         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4202
4203         my $description = delete $args{'Description'};
4204         my $externally_ok = delete $args{'Externally_Ok'};
4205         my $loose_match = delete $args{'Fuzzy'};
4206         my $note = delete $args{'Note'};
4207         my $make_pod_entry = delete $args{'Pod_Entry'};
4208         my $perl_extension = delete $args{'Perl_Extension'};
4209
4210         # Shouldn't have any left over
4211         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4212
4213         # Can't use || above because conceivably the name could be 0, and
4214         # can't use // operator in case this program gets used in Perl 5.8
4215         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4216         $output_range_counts{$addr} = $output_range_counts if
4217                                         ! defined $output_range_counts{$addr};
4218
4219         $aliases{$addr} = [ ];
4220         $comment{$addr} = [ ];
4221         $description{$addr} = [ ];
4222         $note{$addr} = [ ];
4223         $file_path{$addr} = [ ];
4224         $locked{$addr} = "";
4225
4226         push @{$description{$addr}}, $description if $description;
4227         push @{$note{$addr}}, $note if $note;
4228
4229         if ($status{$addr} eq $PLACEHOLDER) {
4230
4231             # A placeholder table doesn't get documented, is a perl extension,
4232             # and quite likely will be empty
4233             $make_pod_entry = 0 if ! defined $make_pod_entry;
4234             $perl_extension = 1 if ! defined $perl_extension;
4235             push @tables_that_may_be_empty, $complete_name{$addr};
4236         }
4237         elsif (! $status{$addr}) {
4238
4239             # If hasn't set its status already, see if it is on one of the
4240             # lists of properties or tables that have particular statuses; if
4241             # not, is normal.  The lists are prioritized so the most serious
4242             # ones are checked first
4243             if (exists $why_suppressed{$complete_name}) {
4244                 $status{$addr} = $SUPPRESSED;
4245             }
4246             elsif (exists $why_deprecated{$complete_name}) {
4247                 $status{$addr} = $DEPRECATED;
4248             }
4249             elsif (exists $why_stabilized{$complete_name}) {
4250                 $status{$addr} = $STABILIZED;
4251             }
4252             elsif (exists $why_obsolete{$complete_name}) {
4253                 $status{$addr} = $OBSOLETE;
4254             }
4255
4256             # Existence above doesn't necessarily mean there is a message
4257             # associated with it.  Use the most serious message.
4258             if ($status{$addr}) {
4259                 if ($why_suppressed{$complete_name}) {
4260                     $status_info{$addr}
4261                                 = $why_suppressed{$complete_name};
4262                 }
4263                 elsif ($why_deprecated{$complete_name}) {
4264                     $status_info{$addr}
4265                                 = $why_deprecated{$complete_name};
4266                 }
4267                 elsif ($why_stabilized{$complete_name}) {
4268                     $status_info{$addr}
4269                                 = $why_stabilized{$complete_name};
4270                 }
4271                 elsif ($why_obsolete{$complete_name}) {
4272                     $status_info{$addr}
4273                                 = $why_obsolete{$complete_name};
4274                 }
4275             }
4276         }
4277
4278         $perl_extension{$addr} = $perl_extension || 0;
4279
4280         # By convention what typically gets printed only or first is what's
4281         # first in the list, so put the full name there for good output
4282         # clarity.  Other routines rely on the full name being first on the
4283         # list
4284         $self->add_alias($full_name{$addr},
4285                             Externally_Ok => $externally_ok,
4286                             Fuzzy => $loose_match,
4287                             Pod_Entry => $make_pod_entry,
4288                             Status => $status{$addr},
4289                             );
4290
4291         # Then comes the other name, if meaningfully different.
4292         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4293             $self->add_alias($name{$addr},
4294                             Externally_Ok => $externally_ok,
4295                             Fuzzy => $loose_match,
4296                             Pod_Entry => $make_pod_entry,
4297                             Status => $status{$addr},
4298                             );
4299         }
4300
4301         return $self;
4302     }
4303
4304     # Here are the methods that are required to be defined by any derived
4305     # class
4306     for my $sub qw(
4307                     append_to_body
4308                     pre_body
4309                 )
4310                 # append_to_body and pre_body are called in the write() method
4311                 # to add stuff after the main body of the table, but before
4312                 # its close; and to prepend stuff before the beginning of the
4313                 # table.
4314     {
4315         no strict "refs";
4316         *$sub = sub {
4317             Carp::my_carp_bug( __LINE__
4318                               . ": Must create method '$sub()' for "
4319                               . ref shift);
4320             return;
4321         }
4322     }
4323
4324     use overload
4325         fallback => 0,
4326         "." => \&main::_operator_dot,
4327         '!=' => \&main::_operator_not_equal,
4328         '==' => \&main::_operator_equal,
4329     ;
4330
4331     sub ranges {
4332         # Returns the array of ranges associated with this table.
4333
4334         no overloading;
4335         return $range_list{0+shift}->ranges;
4336     }
4337
4338     sub add_alias {
4339         # Add a synonym for this table.
4340
4341         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4342
4343         my $self = shift;
4344         my $name = shift;       # The name to add.
4345         my $pointer = shift;    # What the alias hash should point to.  For
4346                                 # map tables, this is the parent property;
4347                                 # for match tables, it is the table itself.
4348
4349         my %args = @_;
4350         my $loose_match = delete $args{'Fuzzy'};
4351
4352         my $make_pod_entry = delete $args{'Pod_Entry'};
4353         $make_pod_entry = $YES unless defined $make_pod_entry;
4354
4355         my $externally_ok = delete $args{'Externally_Ok'};
4356         $externally_ok = 1 unless defined $externally_ok;
4357
4358         my $status = delete $args{'Status'};
4359         $status = $NORMAL unless defined $status;
4360
4361         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4362
4363         # Capitalize the first letter of the alias unless it is one of the CJK
4364         # ones which specifically begins with a lower 'k'.  Do this because
4365         # Unicode has varied whether they capitalize first letters or not, and
4366         # have later changed their minds and capitalized them, but not the
4367         # other way around.  So do it always and avoid changes from release to
4368         # release
4369         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4370
4371         my $addr; { no overloading; $addr = 0+$self; }
4372
4373         # Figure out if should be loosely matched if not already specified.
4374         if (! defined $loose_match) {
4375
4376             # Is a loose_match if isn't null, and doesn't begin with an
4377             # underscore and isn't just a number
4378             if ($name ne ""
4379                 && substr($name, 0, 1) ne '_'
4380                 && $name !~ qr{^[0-9_.+-/]+$})
4381             {
4382                 $loose_match = 1;
4383             }
4384             else {
4385                 $loose_match = 0;
4386             }
4387         }
4388
4389         # If this alias has already been defined, do nothing.
4390         return if defined $find_table_from_alias{$addr}->{$name};
4391
4392         # That includes if it is standardly equivalent to an existing alias,
4393         # in which case, add this name to the list, so won't have to search
4394         # for it again.
4395         my $standard_name = main::standardize($name);
4396         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4397             $find_table_from_alias{$addr}->{$name}
4398                         = $find_table_from_alias{$addr}->{$standard_name};
4399             return;
4400         }
4401
4402         # Set the index hash for this alias for future quick reference.
4403         $find_table_from_alias{$addr}->{$name} = $pointer;
4404         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4405         local $to_trace = 0 if main::DEBUG;
4406         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4407         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4408
4409
4410         # Put the new alias at the end of the list of aliases unless the final
4411         # element begins with an underscore (meaning it is for internal perl
4412         # use) or is all numeric, in which case, put the new one before that
4413         # one.  This floats any all-numeric or underscore-beginning aliases to
4414         # the end.  This is done so that they are listed last in output lists,
4415         # to encourage the user to use a better name (either more descriptive
4416         # or not an internal-only one) instead.  This ordering is relied on
4417         # implicitly elsewhere in this program, like in short_name()
4418         my $list = $aliases{$addr};
4419         my $insert_position = (@$list == 0
4420                                 || (substr($list->[-1]->name, 0, 1) ne '_'
4421                                     && $list->[-1]->name =~ /\D/))
4422                             ? @$list
4423                             : @$list - 1;
4424         splice @$list,
4425                 $insert_position,
4426                 0,
4427                 Alias->new($name, $loose_match, $make_pod_entry,
4428                                                     $externally_ok, $status);
4429
4430         # This name may be shorter than any existing ones, so clear the cache
4431         # of the shortest, so will have to be recalculated.
4432         no overloading;
4433         undef $short_name{0+$self};
4434         return;
4435     }
4436
4437     sub short_name {
4438         # Returns a name suitable for use as the base part of a file name.
4439         # That is, shorter wins.  It can return undef if there is no suitable
4440         # name.  The name has all non-essential underscores removed.
4441
4442         # The optional second parameter is a reference to a scalar in which
4443         # this routine will store the length the returned name had before the
4444         # underscores were removed, or undef if the return is undef.
4445
4446         # The shortest name can change if new aliases are added.  So using
4447         # this should be deferred until after all these are added.  The code
4448         # that does that should clear this one's cache.
4449         # Any name with alphabetics is preferred over an all numeric one, even
4450         # if longer.
4451
4452         my $self = shift;
4453         my $nominal_length_ptr = shift;
4454         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4455
4456         my $addr; { no overloading; $addr = 0+$self; }
4457
4458         # For efficiency, don't recalculate, but this means that adding new
4459         # aliases could change what the shortest is, so the code that does
4460         # that needs to undef this.
4461         if (defined $short_name{$addr}) {
4462             if ($nominal_length_ptr) {
4463                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4464             }
4465             return $short_name{$addr};
4466         }
4467
4468         # Look at each alias
4469         foreach my $alias ($self->aliases()) {
4470
4471             # Don't use an alias that isn't ok to use for an external name.
4472             next if ! $alias->externally_ok;
4473
4474             my $name = main::Standardize($alias->name);
4475             trace $self, $name if main::DEBUG && $to_trace;
4476
4477             # Take the first one, or a shorter one that isn't numeric.  This
4478             # relies on numeric aliases always being last in the array
4479             # returned by aliases().  Any alpha one will have precedence.
4480             if (! defined $short_name{$addr}
4481                 || ($name =~ /\D/
4482                     && length($name) < length($short_name{$addr})))
4483             {
4484                 # Remove interior underscores.
4485                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4486
4487                 $nominal_short_name_length{$addr} = length $name;
4488             }
4489         }
4490
4491         # If no suitable external name return undef
4492         if (! defined $short_name{$addr}) {
4493             $$nominal_length_ptr = undef if $nominal_length_ptr;
4494             return;
4495         }
4496
4497         # Don't allow a null external name.
4498         if ($short_name{$addr} eq "") {
4499             $short_name{$addr} = '_';
4500             $nominal_short_name_length{$addr} = 1;
4501         }
4502
4503         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4504
4505         if ($nominal_length_ptr) {
4506             $$nominal_length_ptr = $nominal_short_name_length{$addr};
4507         }
4508         return $short_name{$addr};
4509     }
4510
4511     sub external_name {
4512         # Returns the external name that this table should be known by.  This
4513         # is usually the short_name, but not if the short_name is undefined.
4514
4515         my $self = shift;
4516         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4517
4518         my $short = $self->short_name;
4519         return $short if defined $short;
4520
4521         return '_';
4522     }
4523
4524     sub add_description { # Adds the parameter as a short description.
4525
4526         my $self = shift;
4527         my $description = shift;
4528         chomp $description;
4529         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4530
4531         no overloading;
4532         push @{$description{0+$self}}, $description;
4533
4534         return;
4535     }
4536
4537     sub add_note { # Adds the parameter as a short note.
4538
4539         my $self = shift;
4540         my $note = shift;
4541         chomp $note;
4542         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4543
4544         no overloading;
4545         push @{$note{0+$self}}, $note;
4546
4547         return;
4548     }
4549
4550     sub add_comment { # Adds the parameter as a comment.
4551
4552         my $self = shift;
4553         my $comment = shift;
4554         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4555
4556         chomp $comment;
4557
4558         no overloading;
4559         push @{$comment{0+$self}}, $comment;
4560
4561         return;
4562     }
4563
4564     sub comment {
4565         # Return the current comment for this table.  If called in list
4566         # context, returns the array of comments.  In scalar, returns a string
4567         # of each element joined together with a period ending each.
4568
4569         my $self = shift;
4570         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4571
4572         my $addr; { no overloading; $addr = 0+$self; }
4573         my @list = @{$comment{$addr}};
4574         return @list if wantarray;
4575         my $return = "";
4576         foreach my $sentence (@list) {
4577             $return .= '.  ' if $return;
4578             $return .= $sentence;
4579             $return =~ s/\.$//;
4580         }
4581         $return .= '.' if $return;
4582         return $return;
4583     }
4584
4585     sub initialize {
4586         # Initialize the table with the argument which is any valid
4587         # initialization for range lists.
4588
4589         my $self = shift;
4590         my $addr; { no overloading; $addr = 0+$self; }
4591         my $initialization = shift;
4592         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4593
4594         # Replace the current range list with a new one of the same exact
4595         # type.
4596         my $class = ref $range_list{$addr};
4597         $range_list{$addr} = $class->new(Owner => $self,
4598                                         Initialize => $initialization);
4599         return;
4600
4601     }
4602
4603     sub header {
4604         # The header that is output for the table in the file it is written
4605         # in.
4606
4607         my $self = shift;
4608         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4609
4610         my $return = "";
4611         $return .= $DEVELOPMENT_ONLY if $compare_versions;
4612         $return .= $HEADER;
4613         no overloading;
4614         $return .= $INTERNAL_ONLY if $internal_only{0+$self};
4615         return $return;
4616     }
4617
4618     sub write {
4619         # Write a representation of the table to its file.
4620
4621         my $self = shift;
4622         my $tab_stops = shift;       # The number of tab stops over to put any
4623                                      # comment.
4624         my $suppress_value = shift;  # Optional, if the value associated with
4625                                      # a range equals this one, don't write
4626                                      # the range
4627         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4628
4629         my $addr; { no overloading; $addr = 0+$self; }
4630
4631         # Start with the header
4632         my @OUT = $self->header;
4633
4634         # Then the comments
4635         push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4636                                                         if $comment{$addr};
4637
4638         # Then any pre-body stuff.
4639         my $pre_body = $self->pre_body;
4640         push @OUT, $pre_body, "\n" if $pre_body;
4641
4642         # The main body looks like a 'here' document
4643         push @OUT, "return <<'END';\n";
4644
4645         if ($range_list{$addr}->is_empty) {
4646
4647             # This is a kludge for empty tables to silence a warning in
4648             # utf8.c, which can't really deal with empty tables, but it can
4649             # deal with a table that matches nothing, as the inverse of 'Any'
4650             # does.
4651             push @OUT, "!utf8::IsAny\n";
4652         }
4653         else {
4654             my $range_size_1 = $range_size_1{$addr};
4655
4656             # Output each range as part of the here document.
4657             for my $set ($range_list{$addr}->ranges) {
4658                 my $start = $set->start;
4659                 my $end   = $set->end;
4660                 my $value  = $set->value;
4661
4662                 # Don't output ranges whose value is the one to suppress
4663                 next if defined $suppress_value && $value eq $suppress_value;
4664
4665                 # If has or wants a single point range output
4666                 if ($start == $end || $range_size_1) {
4667                     for my $i ($start .. $end) {
4668                         push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
4669                     }
4670                 }
4671                 else  {
4672                     push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
4673
4674                     # Add a comment with the size of the range, if requested.
4675                     # Expand Tabs to make sure they all start in the same
4676                     # column, and then unexpand to use mostly tabs.
4677                     if (! $output_range_counts{$addr}) {
4678                         $OUT[-1] .= "\n";
4679                     }
4680                     else {
4681                         $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4682                         my $count = main::clarify_number($end - $start + 1);
4683                         use integer;
4684
4685                         my $width = $tab_stops * 8 - 1;
4686                         $OUT[-1] = sprintf("%-*s # [%s]\n",
4687                                             $width,
4688                                             $OUT[-1],
4689                                             $count);
4690                         $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4691                     }
4692                 }
4693             } # End of loop through all the table's ranges
4694         }
4695
4696         # Add anything that goes after the main body, but within the here
4697         # document,
4698         my $append_to_body = $self->append_to_body;
4699         push @OUT, $append_to_body if $append_to_body;
4700
4701         # And finish the here document.
4702         push @OUT, "END\n";
4703
4704         # All these files have a .pl suffix
4705         $file_path{$addr}->[-1] .= '.pl';
4706
4707         main::write($file_path{$addr}, \@OUT);
4708         return;
4709     }
4710
4711     sub set_status {    # Set the table's status
4712         my $self = shift;
4713         my $status = shift; # The status enum value
4714         my $info = shift;   # Any message associated with it.
4715         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4716
4717         my $addr; { no overloading; $addr = 0+$self; }
4718
4719         $status{$addr} = $status;
4720         $status_info{$addr} = $info;
4721         return;
4722     }
4723
4724     sub lock {
4725         # Don't allow changes to the table from now on.  This stores a stack
4726         # trace of where it was called, so that later attempts to modify it
4727         # can immediately show where it got locked.
4728
4729         my $self = shift;
4730         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4731
4732         my $addr; { no overloading; $addr = 0+$self; }
4733
4734         $locked{$addr} = "";
4735
4736         my $line = (caller(0))[2];
4737         my $i = 1;
4738
4739         # Accumulate the stack trace
4740         while (1) {
4741             my ($pkg, $file, $caller_line, $caller) = caller $i++;
4742
4743             last unless defined $caller;
4744
4745             $locked{$addr} .= "    called from $caller() at line $line\n";
4746             $line = $caller_line;
4747         }
4748         $locked{$addr} .= "    called from main at line $line\n";
4749
4750         return;
4751     }
4752
4753     sub carp_if_locked {
4754         # Return whether a table is locked or not, and, by the way, complain
4755         # if is locked
4756
4757         my $self = shift;
4758         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4759
4760         my $addr; { no overloading; $addr = 0+$self; }
4761
4762         return 0 if ! $locked{$addr};
4763         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
4764         return 1;
4765     }
4766
4767     sub set_file_path { # Set the final directory path for this table
4768         my $self = shift;
4769         # Rest of parameters passed on
4770
4771         no overloading;
4772         @{$file_path{0+$self}} = @_;
4773         return
4774     }
4775
4776     # Accessors for the range list stored in this table.  First for
4777     # unconditional
4778     for my $sub qw(
4779                     contains
4780                     count
4781                     each_range
4782                     hash
4783                     is_empty
4784                     max
4785                     min
4786                     range_count
4787                     reset_each_range
4788                     value_of
4789                 )
4790     {
4791         no strict "refs";
4792         *$sub = sub {
4793             use strict "refs";
4794             my $self = shift;
4795             no overloading;
4796             return $range_list{0+$self}->$sub(@_);
4797         }
4798     }
4799
4800     # Then for ones that should fail if locked
4801     for my $sub qw(
4802                     delete_range
4803                 )
4804     {
4805         no strict "refs";
4806         *$sub = sub {
4807             use strict "refs";
4808             my $self = shift;
4809
4810             return if $self->carp_if_locked;
4811             no overloading;
4812             return $range_list{0+$self}->$sub(@_);
4813         }
4814     }
4815
4816 } # End closure
4817
4818 package Map_Table;
4819 use base '_Base_Table';
4820
4821 # A Map Table is a table that contains the mappings from code points to
4822 # values.  There are two weird cases:
4823 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
4824 #    are written in the table's file at the end of the table nonetheless.  It
4825 #    requires specially constructed code to handle these; utf8.c can not read
4826 #    these in, so they should not go in $map_directory.  As of this writing,
4827 #    the only case that these happen is for named sequences used in
4828 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
4829 #    something else could come along that uses it.
4830 # 2) Specials are anything that doesn't fit syntactically into the body of the
4831 #    table.  The ranges for these have a map type of non-zero.  The code below
4832 #    knows about and handles each possible type.   In most cases, these are
4833 #    written as part of the header.
4834 #
4835 # A map table deliberately can't be manipulated at will unlike match tables.
4836 # This is because of the ambiguities having to do with what to do with
4837 # overlapping code points.  And there just isn't a need for those things;
4838 # what one wants to do is just query, add, replace, or delete mappings, plus
4839 # write the final result.
4840 # However, there is a method to get the list of possible ranges that aren't in
4841 # this table to use for defaulting missing code point mappings.  And,
4842 # map_add_or_replace_non_nulls() does allow one to add another table to this
4843 # one, but it is clearly very specialized, and defined that the other's
4844 # non-null values replace this one's if there is any overlap.
4845
4846 sub trace { return main::trace(@_); }
4847
4848 { # Closure
4849
4850     main::setup_package();
4851
4852     my %default_map;
4853     # Many input files omit some entries; this gives what the mapping for the
4854     # missing entries should be
4855     main::set_access('default_map', \%default_map, 'r');
4856
4857     my %anomalous_entries;
4858     # Things that go in the body of the table which don't fit the normal
4859     # scheme of things, like having a range.  Not much can be done with these
4860     # once there except to output them.  This was created to handle named
4861     # sequences.
4862     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
4863     main::set_access('anomalous_entries',       # Append singular, read plural
4864                     \%anomalous_entries,
4865                     'readable_array');
4866
4867     my %format;
4868     # The format of the entries of the table.  This is calculated from the
4869     # data in the table (or passed in the constructor).  This is an enum e.g.,
4870     # $STRING_FORMAT
4871     main::set_access('format', \%format);
4872
4873     my %core_access;
4874     # This is a string, solely for documentation, indicating how one can get
4875     # access to this property via the Perl core.
4876     main::set_access('core_access', \%core_access, 'r', 's');
4877
4878     my %has_specials;
4879     # Boolean set when non-zero map-type ranges are added to this table,
4880     # which happens in only a few tables.  This is purely for performance, to
4881     # avoid having to search through every table upon output, so if all the
4882     # non-zero maps got deleted before output, this would remain set, and the
4883     # only penalty would be performance.  Currently, most map tables that get
4884     # output have specials in them, so this doesn't help that much anyway.
4885     main::set_access('has_specials', \%has_specials);
4886
4887     my %to_output_map;
4888     # Boolean as to whether or not to write out this map table
4889     main::set_access('to_output_map', \%to_output_map, 's');
4890
4891
4892     sub new {
4893         my $class = shift;
4894         my $name = shift;
4895
4896         my %args = @_;
4897
4898         # Optional initialization data for the table.
4899         my $initialize = delete $args{'Initialize'};
4900
4901         my $core_access = delete $args{'Core_Access'};
4902         my $default_map = delete $args{'Default_Map'};
4903         my $format = delete $args{'Format'};
4904         my $property = delete $args{'_Property'};
4905         my $full_name = delete $args{'Full_Name'};
4906         # Rest of parameters passed on
4907
4908         my $range_list = Range_Map->new(Owner => $property);
4909
4910         my $self = $class->SUPER::new(
4911                                     Name => $name,
4912                                     Complete_Name =>  $full_name,
4913                                     Full_Name => $full_name,
4914                                     _Property => $property,
4915                                     _Range_List => $range_list,
4916                                     %args);
4917
4918         my $addr; { no overloading; $addr = 0+$self; }
4919
4920         $anomalous_entries{$addr} = [];
4921         $core_access{$addr} = $core_access;
4922         $default_map{$addr} = $default_map;
4923         $format{$addr} = $format;
4924
4925         $self->initialize($initialize) if defined $initialize;
4926
4927         return $self;
4928     }
4929
4930     use overload
4931         fallback => 0,
4932         qw("") => "_operator_stringify",
4933     ;
4934
4935     sub _operator_stringify {
4936         my $self = shift;
4937
4938         my $name = $self->property->full_name;
4939         $name = '""' if $name eq "";
4940         return "Map table for Property '$name'";
4941     }
4942
4943     sub add_alias {
4944         # Add a synonym for this table (which means the property itself)
4945         my $self = shift;
4946         my $name = shift;
4947         # Rest of parameters passed on.
4948
4949         $self->SUPER::add_alias($name, $self->property, @_);
4950         return;
4951     }
4952
4953     sub add_map {
4954         # Add a range of code points to the list of specially-handled code
4955         # points.  $MULTI_CP is assumed if the type of special is not passed
4956         # in.
4957
4958         my $self = shift;
4959         my $lower = shift;
4960         my $upper = shift;
4961         my $string = shift;
4962         my %args = @_;
4963
4964         my $type = delete $args{'Type'} || 0;
4965         # Rest of parameters passed on
4966
4967         # Can't change the table if locked.
4968         return if $self->carp_if_locked;
4969
4970         my $addr; { no overloading; $addr = 0+$self; }
4971
4972         $has_specials{$addr} = 1 if $type;
4973
4974         $self->_range_list->add_map($lower, $upper,
4975                                     $string,
4976                                     @_,
4977                                     Type => $type);
4978         return;
4979     }
4980
4981     sub append_to_body {
4982         # Adds to the written HERE document of the table's body any anomalous
4983         # entries in the table..
4984
4985         my $self = shift;
4986         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4987
4988         my $addr; { no overloading; $addr = 0+$self; }
4989
4990         return "" unless @{$anomalous_entries{$addr}};
4991         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
4992     }
4993
4994     sub map_add_or_replace_non_nulls {
4995         # This adds the mappings in the table $other to $self.  Non-null
4996         # mappings from $other override those in $self.  It essentially merges
4997         # the two tables, with the second having priority except for null
4998         # mappings.
4999
5000         my $self = shift;
5001         my $other = shift;
5002         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5003
5004         return if $self->carp_if_locked;
5005
5006         if (! $other->isa(__PACKAGE__)) {
5007             Carp::my_carp_bug("$other should be a "
5008                         . __PACKAGE__
5009                         . ".  Not a '"
5010                         . ref($other)
5011                         . "'.  Not added;");
5012             return;
5013         }
5014
5015         my $addr; { no overloading; $addr = 0+$self; }
5016         my $other_addr; { no overloading; $other_addr = 0+$other; }
5017
5018         local $to_trace = 0 if main::DEBUG;
5019
5020         my $self_range_list = $self->_range_list;
5021         my $other_range_list = $other->_range_list;
5022         foreach my $range ($other_range_list->ranges) {
5023             my $value = $range->value;
5024             next if $value eq "";
5025             $self_range_list->_add_delete('+',
5026                                           $range->start,
5027                                           $range->end,
5028                                           $value,
5029                                           Type => $range->type,
5030                                           Replace => $UNCONDITIONALLY);
5031         }
5032
5033         # Copy the specials information from the other table to $self
5034         if ($has_specials{$other_addr}) {
5035             $has_specials{$addr} = 1;
5036         }
5037
5038         return;
5039     }
5040
5041     sub set_default_map {
5042         # Define what code points that are missing from the input files should
5043         # map to
5044
5045         my $self = shift;
5046         my $map = shift;
5047         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5048
5049         my $addr; { no overloading; $addr = 0+$self; }
5050
5051         # Convert the input to the standard equivalent, if any (won't have any
5052         # for $STRING properties)
5053         my $standard = $self->_find_table_from_alias->{$map};
5054         $map = $standard->name if defined $standard;
5055
5056         # Warn if there already is a non-equivalent default map for this
5057         # property.  Note that a default map can be a ref, which means that
5058         # what it actually means is delayed until later in the program, and it
5059         # IS permissible to override it here without a message.
5060         my $default_map = $default_map{$addr};
5061         if (defined $default_map
5062             && ! ref($default_map)
5063             && $default_map ne $map
5064             && main::Standardize($map) ne $default_map)
5065         {
5066             my $property = $self->property;
5067             my $map_table = $property->table($map);
5068             my $default_table = $property->table($default_map);
5069             if (defined $map_table
5070                 && defined $default_table
5071                 && $map_table != $default_table)
5072             {
5073                 Carp::my_carp("Changing the default mapping for "
5074                             . $property
5075                             . " from $default_map to $map'");
5076             }
5077         }
5078
5079         $default_map{$addr} = $map;
5080
5081         # Don't also create any missing table for this map at this point,
5082         # because if we did, it could get done before the main table add is
5083         # done for PropValueAliases.txt; instead the caller will have to make
5084         # sure it exists, if desired.
5085         return;
5086     }
5087
5088     sub to_output_map {
5089         # Returns boolean: should we write this map table?
5090
5091         my $self = shift;
5092         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5093
5094         my $addr; { no overloading; $addr = 0+$self; }
5095
5096         # If overridden, use that
5097         return $to_output_map{$addr} if defined $to_output_map{$addr};
5098
5099         my $full_name = $self->full_name;
5100
5101         # If table says to output, do so; if says to suppress it, do do.
5102         return 1 if grep { $_ eq $full_name } @output_mapped_properties;
5103         return 0 if $self->status eq $SUPPRESSED;
5104
5105         my $type = $self->property->type;
5106
5107         # Don't want to output binary map tables even for debugging.
5108         return 0 if $type == $BINARY;
5109
5110         # But do want to output string ones.
5111         return 1 if $type == $STRING;
5112
5113         # Otherwise is an $ENUM, don't output it
5114         return 0;
5115     }
5116
5117     sub inverse_list {
5118         # Returns a Range_List that is gaps of the current table.  That is,
5119         # the inversion
5120
5121         my $self = shift;
5122         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5123
5124         my $current = Range_List->new(Initialize => $self->_range_list,
5125                                 Owner => $self->property);
5126         return ~ $current;
5127     }
5128
5129     sub set_final_comment {
5130         # Just before output, create the comment that heads the file
5131         # containing this table.
5132
5133         my $self = shift;
5134         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5135
5136         # No sense generating a comment if aren't going to write it out.
5137         return if ! $self->to_output_map;
5138
5139         my $addr; { no overloading; $addr = 0+$self; }
5140
5141         my $property = $self->property;
5142
5143         # Get all the possible names for this property.  Don't use any that
5144         # aren't ok for use in a file name, etc.  This is perhaps causing that
5145         # flag to do double duty, and may have to be changed in the future to
5146         # have our own flag for just this purpose; but it works now to exclude
5147         # Perl generated synonyms from the lists for properties, where the
5148         # name is always the proper Unicode one.
5149         my @property_aliases = grep { $_->externally_ok } $self->aliases;
5150
5151         my $count = $self->count;
5152         my $default_map = $default_map{$addr};
5153
5154         # The ranges that map to the default aren't output, so subtract that
5155         # to get those actually output.  A property with matching tables
5156         # already has the information calculated.
5157         if ($property->type != $STRING) {
5158             $count -= $property->table($default_map)->count;
5159         }
5160         elsif (defined $default_map) {
5161
5162             # But for $STRING properties, must calculate now.  Subtract the
5163             # count from each range that maps to the default.
5164             foreach my $range ($self->_range_list->ranges) {
5165                 if ($range->value eq $default_map) {
5166                     $count -= $range->end +1 - $range->start;
5167                 }
5168             }
5169
5170         }
5171
5172         # Get a  string version of $count with underscores in large numbers,
5173         # for clarity.
5174         my $string_count = main::clarify_number($count);
5175
5176         my $code_points = ($count == 1)
5177                         ? 'single code point'
5178                         : "$string_count code points";
5179
5180         my $mapping;
5181         my $these_mappings;
5182         my $are;
5183         if (@property_aliases <= 1) {
5184             $mapping = 'mapping';
5185             $these_mappings = 'this mapping';
5186             $are = 'is'
5187         }
5188         else {
5189             $mapping = 'synonymous mappings';
5190             $these_mappings = 'these mappings';
5191             $are = 'are'
5192         }
5193         my $cp;
5194         if ($count >= $MAX_UNICODE_CODEPOINTS) {
5195             $cp = "any code point in Unicode Version $string_version";
5196         }
5197         else {
5198             my $map_to;
5199             if ($default_map eq "") {
5200                 $map_to = 'the null string';
5201             }
5202             elsif ($default_map eq $CODE_POINT) {
5203                 $map_to = "itself";
5204             }
5205             else {
5206                 $map_to = "'$default_map'";
5207             }
5208             if ($count == 1) {
5209                 $cp = "the single code point";
5210             }
5211             else {
5212                 $cp = "one of the $code_points";
5213             }
5214             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5215         }
5216
5217         my $comment = "";
5218
5219         my $status = $self->status;
5220         if ($status) {
5221             my $warn = uc $status_past_participles{$status};
5222             $comment .= <<END;
5223
5224 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
5225  All property or property=value combinations contained in this file are $warn.
5226  See $unicode_reference_url for what this means.
5227
5228 END
5229         }
5230         $comment .= "This file returns the $mapping:\n";
5231
5232         for my $i (0 .. @property_aliases - 1) {
5233             $comment .= sprintf("%-8s%s\n",
5234                                 " ",
5235                                 $property_aliases[$i]->name . '(cp)'
5236                                 );
5237         }
5238         $comment .=
5239                 "\nwhere 'cp' is $cp.  Note that $these_mappings $are ";
5240
5241         my $access = $core_access{$addr};
5242         if ($access) {
5243             $comment .= "accessible through the Perl core via $access.";
5244         }
5245         else {
5246             $comment .= "not accessible through the Perl core directly.";
5247         }
5248
5249         # And append any commentary already set from the actual property.
5250         $comment .= "\n\n" . $self->comment if $self->comment;
5251         if ($self->description) {
5252             $comment .= "\n\n" . join " ", $self->description;
5253         }
5254         if ($self->note) {
5255             $comment .= "\n\n" . join " ", $self->note;
5256         }
5257         $comment .= "\n";
5258
5259         if (! $self->perl_extension) {
5260             $comment .= <<END;
5261
5262 For information about what this property really means, see:
5263 $unicode_reference_url
5264 END
5265         }
5266
5267         if ($count) {        # Format differs for empty table
5268                 $comment.= "\nThe format of the ";
5269             if ($self->range_size_1) {
5270                 $comment.= <<END;
5271 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5272 is in hex; MAPPING is what CODE_POINT maps to.
5273 END
5274             }
5275             else {
5276
5277                 # There are tables which end up only having one element per
5278                 # range, but it is not worth keeping track of for making just
5279                 # this comment a little better.
5280                 $comment.= <<END;
5281 non-comment portions of the main body of lines of this file is:
5282 START\\tSTOP\\tMAPPING where START is the starting code point of the
5283 range, in hex; STOP is the ending point, or if omitted, the range has just one
5284 code point; MAPPING is what each code point between START and STOP maps to.
5285 END
5286                 if ($self->output_range_counts) {
5287                     $comment .= <<END;
5288 Numbers in comments in [brackets] indicate how many code points are in the
5289 range (omitted when the range is a single code point or if the mapping is to
5290 the null string).
5291 END
5292                 }
5293             }
5294         }
5295         $self->set_comment(main::join_lines($comment));
5296         return;
5297     }
5298
5299     my %swash_keys; # Makes sure don't duplicate swash names.
5300
5301     sub pre_body {
5302         # Returns the string that should be output in the file before the main
5303         # body of this table.  This includes some hash entries identifying the
5304         # format of the body, and what the single value should be for all
5305         # ranges missing from it.  It also includes any code points which have
5306         # map_types that don't go in the main table.
5307
5308         my $self = shift;
5309         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5310
5311         my $addr; { no overloading; $addr = 0+$self; }
5312
5313         my $name = $self->property->swash_name;
5314
5315         if (defined $swash_keys{$name}) {
5316             Carp::my_carp(join_lines(<<END
5317 Already created a swash name '$name' for $swash_keys{$name}.  This means that
5318 the same name desired for $self shouldn't be used.  Bad News.  This must be
5319 fixed before production use, but proceeding anyway
5320 END
5321             ));
5322         }
5323         $swash_keys{$name} = "$self";
5324
5325         my $default_map = $default_map{$addr};
5326
5327         my $pre_body = "";
5328         if ($has_specials{$addr}) {
5329
5330             # Here, some maps with non-zero type have been added to the table.
5331             # Go through the table and handle each of them.  None will appear
5332             # in the body of the table, so delete each one as we go.  The
5333             # code point count has already been calculated, so ok to delete
5334             # now.
5335
5336             my @multi_code_point_maps;
5337             my $has_hangul_syllables = 0;
5338
5339             # The key is the base name of the code point, and the value is an
5340             # array giving all the ranges that use this base name.  Each range
5341             # is actually a hash giving the 'low' and 'high' values of it.
5342             my %names_ending_in_code_point;
5343
5344             # Inverse mapping.  The list of ranges that have these kinds of
5345             # names.  Each element contains the low, high, and base names in a
5346             # hash.
5347             my @code_points_ending_in_code_point;
5348
5349             my $range_map = $self->_range_list;
5350             foreach my $range ($range_map->ranges) {
5351                 next unless $range->type != 0;
5352                 my $low = $range->start;
5353                 my $high = $range->end;
5354                 my $map = $range->value;
5355                 my $type = $range->type;
5356
5357                 # No need to output the range if it maps to the default.  And
5358                 # the write method won't output it either, so no need to
5359                 # delete it to keep it from being output, and is faster to
5360                 # skip than to delete anyway.
5361                 next if $map eq $default_map;
5362
5363                 # Delete the range to keep write() from trying to output it
5364                 $range_map->delete_range($low, $high);
5365
5366                 # Switch based on the map type...
5367                 if ($type == $HANGUL_SYLLABLE) {
5368
5369                     # These are entirely algorithmically determinable based on
5370                     # some constants furnished by Unicode; for now, just set a
5371                     # flag to indicate that have them.  Below we will output
5372                     # the code that does the algorithm.
5373                     $has_hangul_syllables = 1;
5374                 }
5375                 elsif ($type == $CP_IN_NAME) {
5376
5377                     # If the name ends in the code point it represents, are
5378                     # also algorithmically determinable, but need information
5379                     # about the map to do so.  Both the map and its inverse
5380                     # are stored in data structures output in the file.
5381                     push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5382                     push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5383
5384                     push @code_points_ending_in_code_point, { low => $low,
5385                                                               high => $high,
5386                                                               name => $map
5387                                                             };
5388                 }
5389                 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5390
5391                     # Multi-code point maps and null string maps have an entry
5392                     # for each code point in the range.  They use the same
5393                     # output format.
5394                     for my $code_point ($low .. $high) {
5395
5396                         # The pack() below can't cope with surrogates.
5397                         if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5398                             Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self.  No map created");
5399                             next;
5400                         }
5401
5402                         # Generate the hash entries for these in the form that
5403                         # utf8.c understands.
5404                         my $tostr = "";
5405                         foreach my $to (split " ", $map) {
5406                             if ($to !~ /^$code_point_re$/) {
5407                                 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
5408                                 next;
5409                             }
5410                             $tostr .= sprintf "\\x{%s}", $to;
5411                         }
5412
5413                         # I (khw) have never waded through this line to
5414                         # understand it well enough to comment it.
5415                         my $utf8 = sprintf(qq["%s" => "$tostr",],
5416                                 join("", map { sprintf "\\x%02X", $_ }
5417                                     unpack("U0C*", pack("U", $code_point))));
5418
5419                         # Add a comment so that a human reader can more easily
5420                         # see what's going on.
5421                         push @multi_code_point_maps,
5422                                 sprintf("%-45s # U+%04X => %s", $utf8,
5423                                                                 $code_point,
5424                                                                 $map);
5425                     }
5426                 }
5427                 else {
5428                     Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Using type 0 instead");
5429                     $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0);
5430                 }
5431             } # End of loop through all ranges
5432
5433             # Here have gone through the whole file.  If actually generated
5434             # anything for each map type, add its respective header and
5435             # trailer
5436             if (@multi_code_point_maps) {
5437                 $pre_body .= <<END;
5438
5439 # Some code points require special handling because their mappings are each to
5440 # multiple code points.  These do not appear in the main body, but are defined
5441 # in the hash below.
5442
5443 # The key: UTF-8 _bytes_, the value: UTF-8 (speed hack)
5444 %utf8::ToSpec$name = (
5445 END
5446                 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5447             }
5448
5449             if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5450
5451                 # Convert these structures to output format.
5452                 my $code_points_ending_in_code_point =
5453                     main::simple_dumper(\@code_points_ending_in_code_point,
5454                                         ' ' x 8);
5455                 my $names = main::simple_dumper(\%names_ending_in_code_point,
5456                                                 ' ' x 8);
5457
5458                 # Do the same with the Hangul names,
5459                 my $jamo;
5460                 my $jamo_l;
5461                 my $jamo_v;
5462                 my $jamo_t;
5463                 my $jamo_re;
5464                 if ($has_hangul_syllables) {
5465
5466                     # Construct a regular expression of all the possible
5467                     # combinations of the Hangul syllables.
5468                     my @L_re;   # Leading consonants
5469                     for my $i ($LBase .. $LBase + $LCount - 1) {
5470                         push @L_re, $Jamo{$i}
5471                     }
5472                     my @V_re;   # Middle vowels
5473                     for my $i ($VBase .. $VBase + $VCount - 1) {
5474                         push @V_re, $Jamo{$i}
5475                     }
5476                     my @T_re;   # Trailing consonants
5477                     for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5478                         push @T_re, $Jamo{$i}
5479                     }
5480
5481                     # The whole re is made up of the L V T combination.
5482                     $jamo_re = '('
5483                                . join ('|', sort @L_re)
5484                                . ')('
5485                                . join ('|', sort @V_re)
5486                                . ')('
5487                                . join ('|', sort @T_re)
5488                                . ')?';
5489
5490                     # These hashes needed by the algorithm were generated
5491                     # during reading of the Jamo.txt file
5492                     $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5493                     $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5494                     $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5495                     $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5496                 }
5497
5498                 $pre_body .= <<END;
5499
5500 # To achieve significant memory savings when this file is read in,
5501 # algorithmically derivable code points are omitted from the main body below.
5502 # Instead, the following routines can be used to translate between name and
5503 # code point and vice versa
5504
5505 { # Closure
5506
5507     # Matches legal code point.  4-6 hex numbers, If there are 6, the
5508     # first two must be '10'; if there are 5, the first must not be a '0'.
5509     my \$code_point_re = qr/$code_point_re/;
5510
5511     # In the following hash, the keys are the bases of names which includes
5512     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The values
5513     # of each key is another hash which is used to get the low and high ends
5514     # for each range of code points that apply to the name
5515     my %names_ending_in_code_point = (
5516 $names
5517     );
5518
5519     # And the following array gives the inverse mapping from code points to
5520     # names.  Lowest code points are first
5521     my \@code_points_ending_in_code_point = (
5522 $code_points_ending_in_code_point
5523     );
5524 END
5525                 # Earlier releases didn't have Jamos.  No sense outputting
5526                 # them unless will be used.
5527                 if ($has_hangul_syllables) {
5528                     $pre_body .= <<END;
5529
5530     # Convert from code point to Jamo short name for use in composing Hangul
5531     # syllable names
5532     my %Jamo = (
5533 $jamo
5534     );
5535
5536     # Leading consonant (can be null)
5537     my %Jamo_L = (
5538 $jamo_l
5539     );
5540
5541     # Vowel
5542     my %Jamo_V = (
5543 $jamo_v
5544     );
5545
5546     # Optional trailing consonant
5547     my %Jamo_T = (
5548 $jamo_t
5549     );
5550
5551     # Computed re that splits up a Hangul name into LVT or LV syllables
5552     my \$syllable_re = qr/$jamo_re/;
5553
5554     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
5555     my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
5556
5557     # These constants names and values were taken from the Unicode standard,
5558     # version 5.1, section 3.12.  They are used in conjunction with Hangul
5559     # syllables
5560     my \$SBase = 0xAC00;
5561     my \$LBase = 0x1100;
5562     my \$VBase = 0x1161;
5563     my \$TBase = 0x11A7;
5564     my \$SCount = 11172;
5565     my \$LCount = 19;
5566     my \$VCount = 21;
5567     my \$TCount = 28;
5568     my \$NCount = \$VCount * \$TCount;
5569 END
5570                 } # End of has Jamos
5571
5572                 $pre_body .= << 'END';
5573
5574     sub name_to_code_point_special {
5575         my $name = shift;
5576
5577         # Returns undef if not one of the specially handled names; otherwise
5578         # returns the code point equivalent to the input name
5579 END
5580                 if ($has_hangul_syllables) {
5581                     $pre_body .= << 'END';
5582
5583         if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
5584             $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
5585             return if $name !~ qr/^$syllable_re$/;
5586             my $L = $Jamo_L{$1};
5587             my $V = $Jamo_V{$2};
5588             my $T = (defined $3) ? $Jamo_T{$3} : 0;
5589             return ($L * $VCount + $V) * $TCount + $T + $SBase;
5590         }
5591 END
5592                 }
5593                 $pre_body .= << 'END';
5594
5595         # Name must end in '-code_point' for this to handle.
5596         if ($name !~ /^ (.*) - ($code_point_re) $/x) {
5597             return;
5598         }
5599
5600         my $base = $1;
5601         my $code_point = CORE::hex $2;
5602
5603         # Name must be one of the ones which has the code point in it.
5604         return if ! $names_ending_in_code_point{$base};
5605
5606         # Look through the list of ranges that apply to this name to see if
5607         # the code point is in one of them.
5608         for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
5609             return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
5610             next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
5611
5612             # Here, the code point is in the range.
5613             return $code_point;
5614         }
5615
5616         # Here, looked like the name had a code point number in it, but
5617         # did not match one of the valid ones.
5618         return;
5619     }
5620
5621     sub code_point_to_name_special {
5622         my $code_point = shift;
5623
5624         # Returns the name of a code point if algorithmically determinable;
5625         # undef if not
5626 END
5627                 if ($has_hangul_syllables) {
5628                     $pre_body .= << 'END';
5629
5630         # If in the Hangul range, calculate the name based on Unicode's
5631         # algorithm
5632         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
5633             use integer;
5634             my $SIndex = $code_point - $SBase;
5635             my $L = $LBase + $SIndex / $NCount;
5636             my $V = $VBase + ($SIndex % $NCount) / $TCount;
5637             my $T = $TBase + $SIndex % $TCount;
5638             $name = "$HANGUL_SYLLABLE $Jamo{$L}$Jamo{$V}";
5639             $name .= $Jamo{$T} if $T != $TBase;
5640             return $name;
5641         }
5642 END
5643                 }
5644                 $pre_body .= << 'END';
5645
5646         # Look through list of these code points for one in range.
5647         foreach my $hash (@code_points_ending_in_code_point) {
5648             return if $code_point < $hash->{'low'};
5649             if ($code_point <= $hash->{'high'}) {
5650                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
5651             }
5652         }
5653         return;            # None found
5654     }
5655 } # End closure
5656
5657 END
5658             } # End of has hangul or code point in name maps.
5659         } # End of has specials
5660
5661         # Calculate the format of the table if not already done.
5662         my $format = $format{$addr};
5663         my $property = $self->property;
5664         my $type = $property->type;
5665         if (! defined $format) {
5666             if ($type == $BINARY) {
5667
5668                 # Don't bother checking the values, because we elsewhere
5669                 # verify that a binary table has only 2 values.
5670                 $format = $BINARY_FORMAT;
5671             }
5672             else {
5673                 my @ranges = $self->_range_list->ranges;
5674
5675                 # default an empty table based on its type and default map
5676                 if (! @ranges) {
5677
5678                     # But it turns out that the only one we can say is a
5679                     # non-string (besides binary, handled above) is when the
5680                     # table is a string and the default map is to a code point
5681                     if ($type == $STRING && $default_map eq $CODE_POINT) {
5682                         $format = $HEX_FORMAT;
5683                     }
5684                     else {
5685                         $format = $STRING_FORMAT;
5686                     }
5687                 }
5688                 else {
5689
5690                     # Start with the most restrictive format, and as we find
5691                     # something that doesn't fit with that, change to the next
5692                     # most restrictive, and so on.
5693                     $format = $DECIMAL_FORMAT;
5694                     foreach my $range (@ranges) {
5695                         my $map = $range->value;
5696                         if ($map ne $default_map) {
5697                             last if $format eq $STRING_FORMAT;  # already at
5698                                                                 # least
5699                                                                 # restrictive
5700                             $format = $INTEGER_FORMAT
5701                                                 if $format eq $DECIMAL_FORMAT
5702                                                     && $map !~ / ^ [0-9] $ /x;
5703                             $format = $FLOAT_FORMAT
5704                                             if $format eq $INTEGER_FORMAT
5705                                                 && $map !~ / ^ -? [0-9]+ $ /x;
5706                             $format = $RATIONAL_FORMAT
5707                                 if $format eq $FLOAT_FORMAT
5708                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
5709                             $format = $HEX_FORMAT
5710                             if $format eq $RATIONAL_FORMAT
5711                                 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
5712                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
5713                                                        && $map =~ /[^0-9A-F]/;
5714                         }
5715                     }
5716                 }
5717             }
5718         } # end of calculating format
5719
5720         my $return = <<END;
5721 # The name this swash is to be known by, with the format of the mappings in
5722 # the main body of the table, and what all code points missing from this file
5723 # map to.
5724 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
5725 END
5726         my $missing = $default_map;
5727         if ($missing eq $CODE_POINT
5728             && $format ne $HEX_FORMAT
5729             && ! defined $format{$addr})    # Is expected if was manually set
5730         {
5731             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
5732         }
5733         $format{$addr} = $format;
5734         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$missing';";
5735         if ($missing eq $CODE_POINT) {
5736             $return .= ' # code point maps to itself';
5737         }
5738         elsif ($missing eq "") {
5739             $return .= ' # code point maps to the null string';
5740         }
5741         $return .= "\n";
5742
5743         $return .= $pre_body;
5744
5745         return $return;
5746     }
5747
5748     sub write {
5749         # Write the table to the file.
5750
5751         my $self = shift;
5752         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5753
5754         my $addr; { no overloading; $addr = 0+$self; }
5755
5756         return $self->SUPER::write(
5757             ($self->property == $block)
5758                 ? 7     # block file needs more tab stops
5759                 : 3,
5760             $default_map{$addr});   # don't write defaulteds
5761     }
5762
5763     # Accessors for the underlying list that should fail if locked.
5764     for my $sub qw(
5765                     add_duplicate
5766                 )
5767     {
5768         no strict "refs";
5769         *$sub = sub {
5770             use strict "refs";
5771             my $self = shift;
5772
5773             return if $self->carp_if_locked;
5774             return $self->_range_list->$sub(@_);
5775         }
5776     }
5777 } # End closure for Map_Table
5778
5779 package Match_Table;
5780 use base '_Base_Table';
5781
5782 # A Match table is one which is a list of all the code points that have
5783 # the same property and property value, for use in \p{property=value}
5784 # constructs in regular expressions.  It adds very little data to the base
5785 # structure, but many methods, as these lists can be combined in many ways to
5786 # form new ones.
5787 # There are only a few concepts added:
5788 # 1) Equivalents and Relatedness.
5789 #    Two tables can match the identical code points, but have different names.
5790 #    This always happens when there is a perl single form extension
5791 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
5792 #    tables are set to be related, with the Perl extension being a child, and
5793 #    the Unicode property being the parent.
5794 #
5795 #    It may be that two tables match the identical code points and we don't
5796 #    know if they are related or not.  This happens most frequently when the
5797 #    Block and Script properties have the exact range.  But note that a
5798 #    revision to Unicode could add new code points to the script, which would
5799 #    now have to be in a different block (as the block was filled, or there
5800 #    would have been 'Unknown' script code points in it and they wouldn't have
5801 #    been identical).  So we can't rely on any two properties from Unicode
5802 #    always matching the same code points from release to release, and thus
5803 #    these tables are considered coincidentally equivalent--not related.  When
5804 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
5805 #    'leader', and the others are 'equivalents'.  This concept is useful
5806 #    to minimize the number of tables written out.  Only one file is used for
5807 #    any identical set of code points, with entries in Heavy.pl mapping all
5808 #    the involved tables to it.
5809 #
5810 #    Related tables will always be identical; we set them up to be so.  Thus
5811 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
5812 #    unrelated tables.  Relatedness makes generating the documentation easier.
5813 #
5814 # 2) Conflicting.  It may be that there will eventually be name clashes, with
5815 #    the same name meaning different things.  For a while, there actually were
5816 #    conflicts, but they have so far been resolved by changing Perl's or
5817 #    Unicode's definitions to match the other, but when this code was written,
5818 #    it wasn't clear that that was what was going to happen.  (Unicode changed
5819 #    because of protests during their beta period.)  Name clashes are warned
5820 #    about during compilation, and the documentation.  The generated tables
5821 #    are sane, free of name clashes, because the code suppresses the Perl
5822 #    version.  But manual intervention to decide what the actual behavior
5823 #    should be may be required should this happen.  The introductory comments
5824 #    have more to say about this.
5825
5826 sub standardize { return main::standardize($_[0]); }
5827 sub trace { return main::trace(@_); }
5828
5829
5830 { # Closure
5831
5832     main::setup_package();
5833
5834     my %leader;
5835     # The leader table of this one; initially $self.
5836     main::set_access('leader', \%leader, 'r');
5837
5838     my %equivalents;
5839     # An array of any tables that have this one as their leader
5840     main::set_access('equivalents', \%equivalents, 'readable_array');
5841
5842     my %parent;
5843     # The parent table to this one, initially $self.  This allows us to
5844     # distinguish between equivalent tables that are related, and those which
5845     # may not be, but share the same output file because they match the exact
5846     # same set of code points in the current Unicode release.
5847     main::set_access('parent', \%parent, 'r');
5848
5849     my %children;
5850     # An array of any tables that have this one as their parent
5851     main::set_access('children', \%children, 'readable_array');
5852
5853     my %conflicting;
5854     # Array of any tables that would have the same name as this one with
5855     # a different meaning.  This is used for the generated documentation.
5856     main::set_access('conflicting', \%conflicting, 'readable_array');
5857
5858     my %matches_all;
5859     # Set in the constructor for tables that are expected to match all code
5860     # points.
5861     main::set_access('matches_all', \%matches_all, 'r');
5862
5863     sub new {
5864         my $class = shift;
5865
5866         my %args = @_;
5867
5868         # The property for which this table is a listing of property values.
5869         my $property = delete $args{'_Property'};
5870
5871         my $name = delete $args{'Name'};
5872         my $full_name = delete $args{'Full_Name'};
5873         $full_name = $name if ! defined $full_name;
5874
5875         # Optional
5876         my $initialize = delete $args{'Initialize'};
5877         my $matches_all = delete $args{'Matches_All'} || 0;
5878         # Rest of parameters passed on.
5879
5880         my $range_list = Range_List->new(Initialize => $initialize,
5881                                          Owner => $property);
5882
5883         my $complete = $full_name;
5884         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
5885                                               # but this helps debug if it
5886                                               # does
5887         # The complete name for a match table includes it's property in a
5888         # compound form 'property=table', except if the property is the
5889         # pseudo-property, perl, in which case it is just the single form,
5890         # 'table' (If you change the '=' must also change the ':' in lots of
5891         # places in this program that assume an equal sign)
5892         $complete = $property->full_name . "=$complete" if $property != $perl;
5893
5894         my $self = $class->SUPER::new(%args,
5895                                       Name => $name,
5896                                       Complete_Name => $complete,
5897                                       Full_Name => $full_name,
5898                                       _Property => $property,
5899                                       _Range_List => $range_list,
5900                                       );
5901         my $addr; { no overloading; $addr = 0+$self; }
5902
5903         $conflicting{$addr} = [ ];
5904         $equivalents{$addr} = [ ];
5905         $children{$addr} = [ ];
5906         $matches_all{$addr} = $matches_all;
5907         $leader{$addr} = $self;
5908         $parent{$addr} = $self;
5909
5910         return $self;
5911     }
5912
5913     # See this program's beginning comment block about overloading these.
5914     use overload
5915         fallback => 0,
5916         qw("") => "_operator_stringify",
5917         '=' => sub {
5918                     my $self = shift;
5919
5920                     return if $self->carp_if_locked;
5921                     return $self;
5922                 },
5923
5924         '+' => sub {
5925                         my $self = shift;
5926                         my $other = shift;
5927
5928                         return $self->_range_list + $other;
5929                     },
5930         '&' => sub {
5931                         my $self = shift;
5932                         my $other = shift;
5933
5934                         return $self->_range_list & $other;
5935                     },
5936         '+=' => sub {
5937                         my $self = shift;
5938                         my $other = shift;
5939
5940                         return if $self->carp_if_locked;
5941
5942                         my $addr; { no overloading; $addr = 0+$self; }
5943
5944                         if (ref $other) {
5945
5946                             # Change the range list of this table to be the
5947                             # union of the two.
5948                             $self->_set_range_list($self->_range_list
5949                                                     + $other);
5950                         }
5951                         else {    # $other is just a simple value
5952                             $self->add_range($other, $other);
5953                         }
5954                         return $self;
5955                     },
5956         '-' => sub { my $self = shift;
5957                     my $other = shift;
5958                     my $reversed = shift;
5959
5960                     if ($reversed) {
5961                         Carp::my_carp_bug("Can't cope with a "
5962                             .  __PACKAGE__
5963                             . " being the first parameter in a '-'.  Subtraction ignored.");
5964                         return;
5965                     }
5966
5967                     return $self->_range_list - $other;
5968                 },
5969         '~' => sub { my $self = shift;
5970                     return ~ $self->_range_list;
5971                 },
5972     ;
5973
5974     sub _operator_stringify {
5975         my $self = shift;
5976
5977         my $name = $self->complete_name;
5978         return "Table '$name'";
5979     }
5980
5981     sub add_alias {
5982         # Add a synonym for this table.  See the comments in the base class
5983
5984         my $self = shift;
5985         my $name = shift;
5986         # Rest of parameters passed on.
5987
5988         $self->SUPER::add_alias($name, $self, @_);
5989         return;
5990     }
5991
5992     sub add_conflicting {
5993         # Add the name of some other object to the list of ones that name
5994         # clash with this match table.
5995
5996         my $self = shift;
5997         my $conflicting_name = shift;   # The name of the conflicting object
5998         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
5999         my $conflicting_object = shift; # Optional, the conflicting object
6000                                         # itself.  This is used to
6001                                         # disambiguate the text if the input
6002                                         # name is identical to any of the
6003                                         # aliases $self is known by.
6004                                         # Sometimes the conflicting object is
6005                                         # merely hypothetical, so this has to
6006                                         # be an optional parameter.
6007         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6008
6009         my $addr; { no overloading; $addr = 0+$self; }
6010
6011         # Check if the conflicting name is exactly the same as any existing
6012         # alias in this table (as long as there is a real object there to
6013         # disambiguate with).
6014         if (defined $conflicting_object) {
6015             foreach my $alias ($self->aliases) {
6016                 if ($alias->name eq $conflicting_name) {
6017
6018                     # Here, there is an exact match.  This results in
6019                     # ambiguous comments, so disambiguate by changing the
6020                     # conflicting name to its object's complete equivalent.
6021                     $conflicting_name = $conflicting_object->complete_name;
6022                     last;
6023                 }
6024             }
6025         }
6026
6027         # Convert to the \p{...} final name
6028         $conflicting_name = "\\$p" . "{$conflicting_name}";
6029
6030         # Only add once
6031         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6032
6033         push @{$conflicting{$addr}}, $conflicting_name;
6034
6035         return;
6036     }
6037
6038     sub is_equivalent_to {
6039         # Return boolean of whether or not the other object is a table of this
6040         # type and has been marked equivalent to this one.
6041
6042         my $self = shift;
6043         my $other = shift;
6044         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6045
6046         return 0 if ! defined $other; # Can happen for incomplete early
6047                                       # releases
6048         unless ($other->isa(__PACKAGE__)) {
6049             my $ref_other = ref $other;
6050             my $ref_self = ref $self;
6051             Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
6052             return 0;
6053         }
6054
6055         # Two tables are equivalent if they have the same leader.
6056         no overloading;
6057         return $leader{0+$self} == $leader{0+$other};
6058         return;
6059     }
6060
6061     sub matches_identically_to {
6062         # Return a boolean as to whether or not two tables match identical
6063         # sets of code points.
6064
6065         my $self = shift;
6066         my $other = shift;
6067         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6068
6069         unless ($other->isa(__PACKAGE__)) {
6070             my $ref_other = ref $other;
6071             my $ref_self = ref $self;
6072             Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
6073             return 0;
6074         }
6075
6076         # These are ordered in increasing real time to figure out (at least
6077         # until a patch changes that and doesn't change this)
6078         return 0 if $self->max != $other->max;
6079         return 0 if $self->min != $other->min;
6080         return 0 if $self->range_count != $other->range_count;
6081         return 0 if $self->count != $other->count;
6082
6083         # Here they could be identical because all the tests above passed.
6084         # The loop below is somewhat simpler since we know they have the same
6085         # number of elements.  Compare range by range, until reach the end or
6086         # find something that differs.
6087         my @a_ranges = $self->_range_list->ranges;
6088         my @b_ranges = $other->_range_list->ranges;
6089         for my $i (0 .. @a_ranges - 1) {
6090             my $a = $a_ranges[$i];
6091             my $b = $b_ranges[$i];
6092             trace "self $a; other $b" if main::DEBUG && $to_trace;
6093             return 0 if $a->start != $b->start || $a->end != $b->end;
6094         }
6095         return 1;
6096     }
6097
6098     sub set_equivalent_to {
6099         # Set $self equivalent to the parameter table.
6100         # The required Related => 'x' parameter is a boolean indicating
6101         # whether these tables are related or not.  If related, $other becomes
6102         # the 'parent' of $self; if unrelated it becomes the 'leader'
6103         #
6104         # Related tables share all characteristics except names; equivalents
6105         # not quite so many.
6106         # If they are related, one must be a perl extension.  This is because
6107         # we can't guarantee that Unicode won't change one or the other in a
6108         # later release even if they are idential now.
6109
6110         my $self = shift;
6111         my $other = shift;
6112
6113         my %args = @_;
6114         my $related = delete $args{'Related'};
6115
6116         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6117
6118         return if ! defined $other;     # Keep on going; happens in some early
6119                                         # Unicode releases.
6120
6121         if (! defined $related) {
6122             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
6123             $related = 0;
6124         }
6125
6126         # If already are equivalent, no need to re-do it;  if subroutine
6127         # returns null, it found an error, also do nothing
6128         my $are_equivalent = $self->is_equivalent_to($other);
6129         return if ! defined $are_equivalent || $are_equivalent;
6130
6131         my $addr; { no overloading; $addr = 0+$self; }
6132         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
6133
6134         if ($related &&
6135             ! $other->perl_extension
6136             && ! $current_leader->perl_extension)
6137         {
6138             Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
6139             $related = 0;
6140         }
6141
6142         my $leader; { no overloading; $leader = 0+$current_leader; }
6143         my $other_addr; { no overloading; $other_addr = 0+$other; }
6144
6145         # Any tables that are equivalent to or children of this table must now
6146         # instead be equivalent to or (children) to the new leader (parent),
6147         # still equivalent.  The equivalency includes their matches_all info,
6148         # and for related tables, their status
6149         # All related tables are of necessity equivalent, but the converse
6150         # isn't necessarily true
6151         my $status = $other->status;
6152         my $status_info = $other->status_info;
6153         my $matches_all = $matches_all{other_addr};
6154         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6155             next if $table == $other;
6156             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6157
6158             my $table_addr; { no overloading; $table_addr = 0+$table; }
6159             $leader{$table_addr} = $other;
6160             $matches_all{$table_addr} = $matches_all;
6161             $self->_set_range_list($other->_range_list);
6162             push @{$equivalents{$other_addr}}, $table;
6163             if ($related) {
6164                 $parent{$table_addr} = $other;
6165                 push @{$children{$other_addr}}, $table;
6166                 $table->set_status($status, $status_info);
6167             }
6168         }
6169
6170         # Now that we've declared these to be equivalent, any changes to one
6171         # of the tables would invalidate that equivalency.
6172         $self->lock;
6173         $other->lock;
6174         return;
6175     }
6176
6177     sub add_range { # Add a range to the list for this table.
6178         my $self = shift;
6179         # Rest of parameters passed on
6180
6181         return if $self->carp_if_locked;
6182         return $self->_range_list->add_range(@_);
6183     }
6184
6185     sub pre_body {  # Does nothing for match tables.
6186         return
6187     }
6188
6189     sub append_to_body {  # Does nothing for match tables.
6190         return
6191     }
6192
6193     sub write {
6194         my $self = shift;
6195         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6196
6197         return $self->SUPER::write(2); # 2 tab stops
6198     }
6199
6200     sub set_final_comment {
6201         # This creates a comment for the file that is to hold the match table
6202         # $self.  It is somewhat convoluted to make the English read nicely,
6203         # but, heh, it's just a comment.
6204         # This should be called only with the leader match table of all the
6205         # ones that share the same file.  It lists all such tables, ordered so
6206         # that related ones are together.
6207
6208         my $leader = shift;   # Should only be called on the leader table of
6209                               # an equivalent group
6210         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6211
6212         my $addr; { no overloading; $addr = 0+$leader; }
6213
6214         if ($leader{$addr} != $leader) {
6215             Carp::my_carp_bug(<<END
6216 set_final_comment() must be called on a leader table, which $leader is not.
6217 It is equivalent to $leader{$addr}.  No comment created
6218 END
6219             );
6220             return;
6221         }
6222
6223         # Get the number of code points matched by each of the tables in this
6224         # file, and add underscores for clarity.
6225         my $count = $leader->count;
6226         my $string_count = main::clarify_number($count);
6227
6228         my $loose_count = 0;        # how many aliases loosely matched
6229         my $compound_name = "";     # ? Are any names compound?, and if so, an
6230                                     # example
6231         my $properties_with_compound_names = 0;    # count of these
6232
6233
6234         my %flags;              # The status flags used in the file
6235         my $total_entries = 0;  # number of entries written in the comment
6236         my $matches_comment = ""; # The portion of the comment about the
6237                                   # \p{}'s
6238         my @global_comments;    # List of all the tables' comments that are
6239                                 # there before this routine was called.
6240
6241         # Get list of all the parent tables that are equivalent to this one
6242         # (including itself).
6243         my @parents = grep { $parent{main::objaddr $_} == $_ }
6244                             main::uniques($leader, @{$equivalents{$addr}});
6245         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
6246                                               # tables
6247
6248         for my $parent (@parents) {
6249
6250             my $property = $parent->property;
6251
6252             # Special case 'N' tables in properties with two match tables when
6253             # the other is a 'Y' one.  These are likely to be binary tables,
6254             # but not necessarily.  In either case, \P{} will match the
6255             # complement of \p{}, and so if something is a synonym of \p, the
6256             # complement of that something will be the synonym of \P.  This
6257             # would be true of any property with just two match tables, not
6258             # just those whose values are Y and N; but that would require a
6259             # little extra work, and there are none such so far in Unicode.
6260             my $perl_p = 'p';        # which is it?  \p{} or \P{}
6261             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
6262
6263             if (scalar $property->tables == 2
6264                 && $parent == $property->table('N')
6265                 && defined (my $yes = $property->table('Y')))
6266             {
6267                 my $yes_addr; { no overloading; $yes_addr = 0+$yes; }
6268                 @yes_perl_synonyms
6269                     = grep { $_->property == $perl }
6270                                     main::uniques($yes,
6271                                                 $parent{$yes_addr},
6272                                                 $parent{$yes_addr}->children);
6273
6274                 # But these synonyms are \P{} ,not \p{}
6275                 $perl_p = 'P';
6276             }
6277
6278             my @description;        # Will hold the table description
6279             my @note;               # Will hold the table notes.
6280             my @conflicting;        # Will hold the table conflicts.
6281
6282             # Look at the parent, any yes synonyms, and all the children
6283             my $parent_addr; { no overloading; $parent_addr = 0+$parent; }
6284             for my $table ($parent,
6285                            @yes_perl_synonyms,
6286                            @{$children{$parent_addr}})
6287             {
6288                 my $table_addr; { no overloading; $table_addr = 0+$table; }
6289                 my $table_property = $table->property;
6290
6291                 # Tables are separated by a blank line to create a grouping.
6292                 $matches_comment .= "\n" if $matches_comment;
6293
6294                 # The table is named based on the property and value
6295                 # combination it is for, like script=greek.  But there may be
6296                 # a number of synonyms for each side, like 'sc' for 'script',
6297                 # and 'grek' for 'greek'.  Any combination of these is a valid
6298                 # name for this table.  In this case, there are three more,
6299                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
6300                 # listing all possible combinations in the comment, we make
6301                 # sure that each synonym occurs at least once, and add
6302                 # commentary that the other combinations are possible.
6303                 my @property_aliases = $table_property->aliases;
6304                 my @table_aliases = $table->aliases;
6305
6306                 Carp::my_carp_bug("$table doesn't have any names.  Proceeding anyway.") unless @table_aliases;
6307
6308                 # The alias lists above are already ordered in the order we
6309                 # want to output them.  To ensure that each synonym is listed,
6310                 # we must use the max of the two numbers.
6311                 my $listed_combos = main::max(scalar @table_aliases,
6312                                                 scalar @property_aliases);
6313                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6314
6315                 my $property_had_compound_name = 0;
6316
6317                 for my $i (0 .. $listed_combos - 1) {
6318                     $total_entries++;
6319
6320                     # The current alias for the property is the next one on
6321                     # the list, or if beyond the end, start over.  Similarly
6322                     # for the table (\p{prop=table})
6323                     my $property_alias = $property_aliases
6324                                             [$i % @property_aliases]->name;
6325                     my $table_alias_object = $table_aliases
6326                                                         [$i % @table_aliases];
6327                     my $table_alias = $table_alias_object->name;
6328                     my $loose_match = $table_alias_object->loose_match;
6329
6330                     if ($table_alias !~ /\D/) { # Clarify large numbers.
6331                         $table_alias = main::clarify_number($table_alias)
6332                     }
6333
6334                     # Add a comment for this alias combination
6335                     my $current_match_comment;
6336                     if ($table_property == $perl) {
6337                         $current_match_comment = "\\$perl_p"
6338                                                     . "{$table_alias}";
6339                     }
6340                     else {
6341                         $current_match_comment
6342                                         = "\\p{$property_alias=$table_alias}";
6343                         $property_had_compound_name = 1;
6344                     }
6345
6346                     # Flag any abnormal status for this table.
6347                     my $flag = $property->status
6348                                 || $table->status
6349                                 || $table_alias_object->status;
6350                     if ($flag) {
6351                         if ($flag ne $PLACEHOLDER) {
6352                             $flags{$flag} = $status_past_participles{$flag};
6353                         } else {
6354                             $flags{$flag} = <<END;
6355 a placeholder because it is not in Version $string_version of Unicode, but is
6356 needed by the Perl core to work gracefully.  Because it is not in this version
6357 of Unicode, it will not be listed in $pod_file.pod
6358 END
6359                         }
6360                     }
6361
6362                     $loose_count++;
6363
6364                     # Pretty up the comment.  Note the \b; it says don't make
6365                     # this line a continuation.
6366                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
6367                                         $flag,
6368                                         " " x 7,
6369                                         $current_match_comment);
6370                 } # End of generating the entries for this table.
6371
6372                 # Save these for output after this group of related tables.
6373                 push @description, $table->description;
6374                 push @note, $table->note;
6375                 push @conflicting, $table->conflicting;
6376
6377                 # And this for output after all the tables.
6378                 push @global_comments, $table->comment;
6379
6380                 # Compute an alternate compound name using the final property
6381                 # synonym and the first table synonym with a colon instead of
6382                 # the equal sign used elsewhere.
6383                 if ($property_had_compound_name) {
6384                     $properties_with_compound_names ++;
6385                     if (! $compound_name || @property_aliases > 1) {
6386                         $compound_name = $property_aliases[-1]->name
6387                                         . ': '
6388                                         . $table_aliases[0]->name;
6389                     }
6390                 }
6391             } # End of looping through all children of this table
6392
6393             # Here have assembled in $matches_comment all the related tables
6394             # to the current parent (preceded by the same info for all the
6395             # previous parents).  Put out information that applies to all of
6396             # the current family.
6397             if (@conflicting) {
6398
6399                 # But output the conflicting information now, as it applies to
6400                 # just this table.
6401                 my $conflicting = join ", ", @conflicting;
6402                 if ($conflicting) {
6403                     $matches_comment .= <<END;
6404
6405     Note that contrary to what you might expect, the above is NOT the same as
6406 END
6407                     $matches_comment .= "any of: " if @conflicting > 1;
6408                     $matches_comment .= "$conflicting\n";
6409                 }
6410             }
6411             if (@description) {
6412                 $matches_comment .= "\n    Meaning: "
6413                                     . join('; ', @description)
6414                                     . "\n";
6415             }
6416             if (@note) {
6417                 $matches_comment .= "\n    Note: "
6418                                     . join("\n    ", @note)
6419                                     . "\n";
6420             }
6421         } # End of looping through all tables
6422
6423
6424         my $code_points;
6425         my $match;
6426         my $any_of_these;
6427         if ($count == 1) {
6428             $match = 'matches';
6429             $code_points = 'single code point';
6430         }
6431         else {
6432             $match = 'match';
6433             $code_points = "$string_count code points";
6434         }
6435
6436         my $synonyms;
6437         my $entries;
6438         if ($total_entries <= 1) {
6439             $synonyms = "";
6440             $entries = 'entry';
6441             $any_of_these = 'this'
6442         }
6443         else {
6444             $synonyms = " any of the following regular expression constructs";
6445             $entries = 'entries';
6446             $any_of_these = 'any of these'
6447         }
6448
6449         my $comment = "";
6450         if ($has_unrelated) {
6451             $comment .= <<END;
6452 This file is for tables that are not necessarily related:  To conserve
6453 resources, every table that matches the identical set of code points in this
6454 version of Unicode uses this file.  Each one is listed in a separate group
6455 below.  It could be that the tables will match the same set of code points in
6456 other Unicode releases, or it could be purely coincidence that they happen to
6457 be the same in Unicode $string_version, and hence may not in other versions.
6458
6459 END
6460         }
6461
6462         if (%flags) {
6463             foreach my $flag (sort keys %flags) {
6464                 $comment .= <<END;
6465 '$flag' below means that this form is $flags{$flag}.
6466 END
6467                 next if $flag eq $PLACEHOLDER;
6468                 $comment .= "Consult $pod_file.pod\n";
6469             }
6470             $comment .= "\n";
6471         }
6472
6473         $comment .= <<END;
6474 This file returns the $code_points in Unicode Version $string_version that
6475 $match$synonyms:
6476
6477 $matches_comment
6478 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
6479 including if adding or subtracting white space, underscore, and hyphen
6480 characters matters or doesn't matter, and other permissible syntactic
6481 variants.  Upper/lower case distinctions never matter.
6482 END
6483
6484         if ($compound_name) {
6485             $comment .= <<END;
6486
6487 A colon can be substituted for the equals sign, and
6488 END
6489             if ($properties_with_compound_names > 1) {
6490                 $comment .= <<END;
6491 within each group above,
6492 END
6493             }
6494             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6495
6496             # Note the \b below, it says don't make that line a continuation.
6497             $comment .= <<END;
6498 anything to the left of the equals (or colon) can be combined with anything to
6499 the right.  Thus, for example,
6500 $compound_name
6501 \bis also valid.
6502 END
6503         }
6504
6505         # And append any comment(s) from the actual tables.  They are all
6506         # gathered here, so may not read all that well.
6507         if (@global_comments) {
6508             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
6509         }
6510
6511         if ($count) {   # The format differs if no code points, and needs no
6512                         # explanation in that case
6513                 $comment.= <<END;
6514
6515 The format of the lines of this file is:
6516 END
6517             $comment.= <<END;
6518 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6519 STOP is the ending point, or if omitted, the range has just one code point.
6520 END
6521             if ($leader->output_range_counts) {
6522                 $comment .= <<END;
6523 Numbers in comments in [brackets] indicate how many code points are in the
6524 range.
6525 END
6526             }
6527         }
6528
6529         $leader->set_comment(main::join_lines($comment));
6530         return;
6531     }
6532
6533     # Accessors for the underlying list
6534     for my $sub qw(
6535                     get_valid_code_point
6536                     get_invalid_code_point
6537                 )
6538     {
6539         no strict "refs";
6540         *$sub = sub {
6541             use strict "refs";
6542             my $self = shift;
6543
6544             return $self->_range_list->$sub(@_);
6545         }
6546     }
6547 } # End closure for Match_Table
6548
6549 package Property;
6550
6551 # The Property class represents a Unicode property, or the $perl
6552 # pseudo-property.  It contains a map table initialized empty at construction
6553 # time, and for properties accessible through regular expressions, various
6554 # match tables, created through the add_match_table() method, and referenced
6555 # by the table('NAME') or tables() methods, the latter returning a list of all
6556 # of the match tables.  Otherwise table operations implicitly are for the map
6557 # table.
6558 #
6559 # Most of the data in the property is actually about its map table, so it
6560 # mostly just uses that table's accessors for most methods.  The two could
6561 # have been combined into one object, but for clarity because of their
6562 # differing semantics, they have been kept separate.  It could be argued that
6563 # the 'file' and 'directory' fields should be kept with the map table.
6564 #
6565 # Each property has a type.  This can be set in the constructor, or in the
6566 # set_type accessor, but mostly it is figured out by the data.  Every property
6567 # starts with unknown type, overridden by a parameter to the constructor, or
6568 # as match tables are added, or ranges added to the map table, the data is
6569 # inspected, and the type changed.  After the table is mostly or entirely
6570 # filled, compute_type() should be called to finalize they analysis.
6571 #
6572 # There are very few operations defined.  One can safely remove a range from
6573 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
6574 # table to this one, replacing any in the intersection of the two.
6575
6576 sub standardize { return main::standardize($_[0]); }
6577 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
6578
6579 {   # Closure
6580
6581     # This hash will contain as keys, all the aliases of all properties, and
6582     # as values, pointers to their respective property objects.  This allows
6583     # quick look-up of a property from any of its names.
6584     my %alias_to_property_of;
6585
6586     sub dump_alias_to_property_of {
6587         # For debugging
6588
6589         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
6590         return;
6591     }
6592
6593     sub property_ref {
6594         # This is a package subroutine, not called as a method.
6595         # If the single parameter is a literal '*' it returns a list of all
6596         # defined properties.
6597         # Otherwise, the single parameter is a name, and it returns a pointer
6598         # to the corresponding property object, or undef if none.
6599         #
6600         # Properties can have several different names.  The 'standard' form of
6601         # each of them is stored in %alias_to_property_of as they are defined.
6602         # But it's possible that this subroutine will be called with some
6603         # variant, so if the initial lookup fails, it is repeated with the
6604         # standarized form of the input name.  If found, besides returning the
6605         # result, the input name is added to the list so future calls won't
6606         # have to do the conversion again.
6607
6608         my $name = shift;
6609
6610         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6611
6612         if (! defined $name) {
6613             Carp::my_carp_bug("Undefined input property.  No action taken.");
6614             return;
6615         }
6616
6617         return main::uniques(values %alias_to_property_of) if $name eq '*';
6618
6619         # Return cached result if have it.
6620         my $result = $alias_to_property_of{$name};
6621         return $result if defined $result;
6622
6623         # Convert the input to standard form.
6624         my $standard_name = standardize($name);
6625
6626         $result = $alias_to_property_of{$standard_name};
6627         return unless defined $result;        # Don't cache undefs
6628
6629         # Cache the result before returning it.
6630         $alias_to_property_of{$name} = $result;
6631         return $result;
6632     }
6633
6634
6635     main::setup_package();
6636
6637     my %map;
6638     # A pointer to the map table object for this property
6639     main::set_access('map', \%map);
6640
6641     my %full_name;
6642     # The property's full name.  This is a duplicate of the copy kept in the
6643     # map table, but is needed because stringify needs it during
6644     # construction of the map table, and then would have a chicken before egg
6645     # problem.
6646     main::set_access('full_name', \%full_name, 'r');
6647
6648     my %table_ref;
6649     # This hash will contain as keys, all the aliases of any match tables
6650     # attached to this property, and as values, the pointers to their
6651     # respective tables.  This allows quick look-up of a table from any of its
6652     # names.
6653     main::set_access('table_ref', \%table_ref);
6654
6655     my %type;
6656     # The type of the property, $ENUM, $BINARY, etc
6657     main::set_access('type', \%type, 'r');
6658
6659     my %file;
6660     # The filename where the map table will go (if actually written).
6661     # Normally defaulted, but can be overridden.
6662     main::set_access('file', \%file, 'r', 's');
6663
6664     my %directory;
6665     # The directory where the map table will go (if actually written).
6666     # Normally defaulted, but can be overridden.
6667     main::set_access('directory', \%directory, 's');
6668
6669     my %pseudo_map_type;
6670     # This is used to affect the calculation of the map types for all the
6671     # ranges in the table.  It should be set to one of the values that signify
6672     # to alter the calculation.
6673     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
6674
6675     my %has_only_code_point_maps;
6676     # A boolean used to help in computing the type of data in the map table.
6677     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
6678
6679     my %unique_maps;
6680     # A list of the first few distinct mappings this property has.  This is
6681     # used to disambiguate between binary and enum property types, so don't
6682     # have to keep more than three.
6683     main::set_access('unique_maps', \%unique_maps);
6684
6685     sub new {
6686         # The only required parameter is the positionally first, name.  All
6687         # other parameters are key => value pairs.  See the documentation just
6688         # above for the meanings of the ones not passed directly on to the map
6689         # table constructor.
6690
6691         my $class = shift;
6692         my $name = shift || "";
6693
6694         my $self = property_ref($name);
6695         if (defined $self) {
6696             my $options_string = join ", ", @_;
6697             $options_string = ".  Ignoring options $options_string" if $options_string;
6698             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
6699             return $self;
6700         }
6701
6702         my %args = @_;
6703
6704         $self = bless \do { my $anonymous_scalar }, $class;
6705         my $addr; { no overloading; $addr = 0+$self; }
6706
6707         $directory{$addr} = delete $args{'Directory'};
6708         $file{$addr} = delete $args{'File'};
6709         $full_name{$addr} = delete $args{'Full_Name'} || $name;
6710         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
6711         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
6712         # Rest of parameters passed on.
6713
6714         $has_only_code_point_maps{$addr} = 1;
6715         $table_ref{$addr} = { };
6716         $unique_maps{$addr} = { };
6717
6718         $map{$addr} = Map_Table->new($name,
6719                                     Full_Name => $full_name{$addr},
6720                                     _Alias_Hash => \%alias_to_property_of,
6721                                     _Property => $self,
6722                                     %args);
6723         return $self;
6724     }
6725
6726     # See this program's beginning comment block about overloading the copy
6727     # constructor.  Few operations are defined on properties, but a couple are
6728     # useful.  It is safe to take the inverse of a property, and to remove a
6729     # single code point from it.
6730     use overload
6731         fallback => 0,
6732         qw("") => "_operator_stringify",
6733         "." => \&main::_operator_dot,
6734         '==' => \&main::_operator_equal,
6735         '!=' => \&main::_operator_not_equal,
6736         '=' => sub { return shift },
6737         '-=' => "_minus_and_equal",
6738     ;
6739
6740     sub _operator_stringify {
6741         return "Property '" .  shift->full_name . "'";
6742     }
6743
6744     sub _minus_and_equal {
6745         # Remove a single code point from the map table of a property.
6746
6747         my $self = shift;
6748         my $other = shift;
6749         my $reversed = shift;
6750         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6751
6752         if (ref $other) {
6753             Carp::my_carp_bug("Can't cope with a "
6754                         . ref($other)
6755                         . " argument to '-='.  Subtraction ignored.");
6756             return $self;
6757         }
6758         elsif ($reversed) {   # Shouldnt happen in a -=, but just in case
6759             Carp::my_carp_bug("Can't cope with a "
6760             .  __PACKAGE__
6761             . " being the first parameter in a '-='.  Subtraction ignored.");
6762             return $self;
6763         }
6764         else {
6765             no overloading;
6766             $map{0+$self}->delete_range($other, $other);
6767         }
6768         return $self;
6769     }
6770
6771     sub add_match_table {
6772         # Add a new match table for this property, with name given by the
6773         # parameter.  It returns a pointer to the table.
6774
6775         my $self = shift;
6776         my $name = shift;
6777         my %args = @_;
6778
6779         my $addr; { no overloading; $addr = 0+$self; }
6780
6781         my $table = $table_ref{$addr}{$name};
6782         my $standard_name = main::standardize($name);
6783         if (defined $table
6784             || (defined ($table = $table_ref{$addr}{$standard_name})))
6785         {
6786             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
6787             $table_ref{$addr}{$name} = $table;
6788             return $table;
6789         }
6790         else {
6791
6792             # See if this is a perl extension, if not passed in.
6793             my $perl_extension = delete $args{'Perl_Extension'};
6794             $perl_extension
6795                         = $self->perl_extension if ! defined $perl_extension;
6796
6797             $table = Match_Table->new(
6798                                 Name => $name,
6799                                 Perl_Extension => $perl_extension,
6800                                 _Alias_Hash => $table_ref{$addr},
6801                                 _Property => $self,
6802
6803                                 # gets property's status by default
6804                                 Status => $self->status,
6805                                 _Status_Info => $self->status_info,
6806                                 %args,
6807                                 Internal_Only_Warning => 1); # Override any
6808                                                              # input param
6809             return unless defined $table;
6810         }
6811
6812         # Save the names for quick look up
6813         $table_ref{$addr}{$standard_name} = $table;
6814         $table_ref{$addr}{$name} = $table;
6815
6816         # Perhaps we can figure out the type of this property based on the
6817         # fact of adding this match table.  First, string properties don't
6818         # have match tables; second, a binary property can't have 3 match
6819         # tables
6820         if ($type{$addr} == $UNKNOWN) {
6821             $type{$addr} = $NON_STRING;
6822         }
6823         elsif ($type{$addr} == $STRING) {
6824             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
6825             $type{$addr} = $NON_STRING;
6826         }
6827         elsif ($type{$addr} != $ENUM) {
6828             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
6829                 && $type{$addr} == $BINARY)
6830             {
6831                 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
6832                 $type{$addr} = $ENUM;
6833             }
6834         }
6835
6836         return $table;
6837     }
6838
6839     sub table {
6840         # Return a pointer to the match table (with name given by the
6841         # parameter) associated with this property; undef if none.
6842
6843         my $self = shift;
6844         my $name = shift;
6845         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6846
6847         my $addr; { no overloading; $addr = 0+$self; }
6848
6849         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
6850
6851         # If quick look-up failed, try again using the standard form of the
6852         # input name.  If that succeeds, cache the result before returning so
6853         # won't have to standardize this input name again.
6854         my $standard_name = main::standardize($name);
6855         return unless defined $table_ref{$addr}{$standard_name};
6856
6857         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
6858         return $table_ref{$addr}{$name};
6859     }
6860
6861     sub tables {
6862         # Return a list of pointers to all the match tables attached to this
6863         # property
6864
6865         no overloading;
6866         return main::uniques(values %{$table_ref{0+shift}});
6867     }
6868
6869     sub directory {
6870         # Returns the directory the map table for this property should be
6871         # output in.  If a specific directory has been specified, that has
6872         # priority;  'undef' is returned if the type isn't defined;
6873         # or $map_directory for everything else.
6874
6875         my $addr; { no overloading; $addr = 0+shift; }
6876
6877         return $directory{$addr} if defined $directory{$addr};
6878         return undef if $type{$addr} == $UNKNOWN;
6879         return $map_directory;
6880     }
6881
6882     sub swash_name {
6883         # Return the name that is used to both:
6884         #   1)  Name the file that the map table is written to.
6885         #   2)  The name of swash related stuff inside that file.
6886         # The reason for this is that the Perl core historically has used
6887         # certain names that aren't the same as the Unicode property names.
6888         # To continue using these, $file is hard-coded in this file for those,
6889         # but otherwise the standard name is used.  This is different from the
6890         # external_name, so that the rest of the files, like in lib can use
6891         # the standard name always, without regard to historical precedent.
6892
6893         my $self = shift;
6894         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6895
6896         my $addr; { no overloading; $addr = 0+$self; }
6897
6898         return $file{$addr} if defined $file{$addr};
6899         return $map{$addr}->external_name;
6900     }
6901
6902     sub to_create_match_tables {
6903         # Returns a boolean as to whether or not match tables should be
6904         # created for this property.
6905
6906         my $self = shift;
6907         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6908
6909         # The whole point of this pseudo property is match tables.
6910         return 1 if $self == $perl;
6911
6912         my $addr; { no overloading; $addr = 0+$self; }
6913
6914         # Don't generate tables of code points that match the property values
6915         # of a string property.  Such a list would most likely have many
6916         # property values, each with just one or very few code points mapping
6917         # to it.
6918         return 0 if $type{$addr} == $STRING;
6919
6920         # Don't generate anything for unimplemented properties.
6921         return 0 if grep { $self->complete_name eq $_ }
6922                                                     @unimplemented_properties;
6923         # Otherwise, do.
6924         return 1;
6925     }
6926
6927     sub property_add_or_replace_non_nulls {
6928         # This adds the mappings in the property $other to $self.  Non-null
6929         # mappings from $other override those in $self.  It essentially merges
6930         # the two properties, with the second having priority except for null
6931         # mappings.
6932
6933         my $self = shift;
6934         my $other = shift;
6935         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6936
6937         if (! $other->isa(__PACKAGE__)) {
6938             Carp::my_carp_bug("$other should be a "
6939                             . __PACKAGE__
6940                             . ".  Not a '"
6941                             . ref($other)
6942                             . "'.  Not added;");
6943             return;
6944         }
6945
6946         no overloading;
6947         return $map{0+$self}->map_add_or_replace_non_nulls($map{0+$other});
6948     }
6949
6950     sub set_type {
6951         # Set the type of the property.  Mostly this is figured out by the
6952         # data in the table.  But this is used to set it explicitly.  The
6953         # reason it is not a standard accessor is that when setting a binary
6954         # property, we need to make sure that all the true/false aliases are
6955         # present, as they were omitted in early Unicode releases.
6956
6957         my $self = shift;
6958         my $type = shift;
6959         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6960
6961         if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
6962             Carp::my_carp("Unrecognized type '$type'.  Type not set");
6963             return;
6964         }
6965
6966         { no overloading; $type{0+$self} = $type; }
6967         return if $type != $BINARY;
6968
6969         my $yes = $self->table('Y');
6970         $yes = $self->table('Yes') if ! defined $yes;
6971         $yes = $self->add_match_table('Y') if ! defined $yes;
6972         $yes->add_alias('Yes');
6973         $yes->add_alias('T');
6974         $yes->add_alias('True');
6975
6976         my $no = $self->table('N');
6977         $no = $self->table('No') if ! defined $no;
6978         $no = $self->add_match_table('N') if ! defined $no;
6979         $no->add_alias('No');
6980         $no->add_alias('F');
6981         $no->add_alias('False');
6982         return;
6983     }
6984
6985     sub add_map {
6986         # Add a map to the property's map table.  This also keeps
6987         # track of the maps so that the property type can be determined from
6988         # its data.
6989
6990         my $self = shift;
6991         my $start = shift;  # First code point in range
6992         my $end = shift;    # Final code point in range
6993         my $map = shift;    # What the range maps to.
6994         # Rest of parameters passed on.
6995
6996         my $addr; { no overloading; $addr = 0+$self; }
6997
6998         # If haven't the type of the property, gather information to figure it
6999         # out.
7000         if ($type{$addr} == $UNKNOWN) {
7001
7002             # If the map contains an interior blank or dash, or most other
7003             # nonword characters, it will be a string property.  This
7004             # heuristic may actually miss some string properties.  If so, they
7005             # may need to have explicit set_types called for them.  This
7006             # happens in the Unihan properties.
7007             if ($map =~ / (?<= . ) [ -] (?= . ) /x
7008                 || $map =~ / [^\w.\/\ -]  /x)
7009             {
7010                 $self->set_type($STRING);
7011
7012                 # $unique_maps is used for disambiguating between ENUM and
7013                 # BINARY later; since we know the property is not going to be
7014                 # one of those, no point in keeping the data around
7015                 undef $unique_maps{$addr};
7016             }
7017             else {
7018
7019                 # Not necessarily a string.  The final decision has to be
7020                 # deferred until all the data are in.  We keep track of if all
7021                 # the values are code points for that eventual decision.
7022                 $has_only_code_point_maps{$addr} &=
7023                                             $map =~ / ^ $code_point_re $/x;
7024
7025                 # For the purposes of disambiguating between binary and other
7026                 # enumerations at the end, we keep track of the first three
7027                 # distinct property values.  Once we get to three, we know
7028                 # it's not going to be binary, so no need to track more.
7029                 if (scalar keys %{$unique_maps{$addr}} < 3) {
7030                     $unique_maps{$addr}{main::standardize($map)} = 1;
7031                 }
7032             }
7033         }
7034
7035         # Add the mapping by calling our map table's method
7036         return $map{$addr}->add_map($start, $end, $map, @_);
7037     }
7038
7039     sub compute_type {
7040         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
7041         # should be called after the property is mostly filled with its maps.
7042         # We have been keeping track of what the property values have been,
7043         # and now have the necessary information to figure out the type.
7044
7045         my $self = shift;
7046         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7047
7048         my $addr; { no overloading; $addr = 0+$self; }
7049
7050         my $type = $type{$addr};
7051
7052         # If already have figured these out, no need to do so again, but we do
7053         # a double check on ENUMS to make sure that a string property hasn't
7054         # improperly been classified as an ENUM, so continue on with those.
7055         return if $type == $STRING || $type == $BINARY;
7056
7057         # If every map is to a code point, is a string property.
7058         if ($type == $UNKNOWN
7059             && ($has_only_code_point_maps{$addr}
7060                 || (defined $map{$addr}->default_map
7061                     && $map{$addr}->default_map eq "")))
7062         {
7063             $self->set_type($STRING);
7064         }
7065         else {
7066
7067             # Otherwise, it is to some sort of enumeration.  (The case where
7068             # it is a Unicode miscellaneous property, and treated like a
7069             # string in this program is handled in add_map()).  Distinguish
7070             # between binary and some other enumeration type.  Of course, if
7071             # there are more than two values, it's not binary.  But more
7072             # subtle is the test that the default mapping is defined means it
7073             # isn't binary.  This in fact may change in the future if Unicode
7074             # changes the way its data is structured.  But so far, no binary
7075             # properties ever have @missing lines for them, so the default map
7076             # isn't defined for them.  The few properties that are two-valued
7077             # and aren't considered binary have the default map defined
7078             # starting in Unicode 5.0, when the @missing lines appeared; and
7079             # this program has special code to put in a default map for them
7080             # for earlier than 5.0 releases.
7081             if ($type == $ENUM
7082                 || scalar keys %{$unique_maps{$addr}} > 2
7083                 || defined $self->default_map)
7084             {
7085                 my $tables = $self->tables;
7086                 my $count = $self->count;
7087                 if ($verbosity && $count > 500 && $tables/$count > .1) {
7088                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7089                 }
7090                 $self->set_type($ENUM);
7091             }
7092             else {
7093                 $self->set_type($BINARY);
7094             }
7095         }
7096         undef $unique_maps{$addr};  # Garbage collect
7097         return;
7098     }
7099
7100     # Most of the accessors for a property actually apply to its map table.
7101     # Setup up accessor functions for those, referring to %map
7102     for my $sub qw(
7103                     add_alias
7104                     add_anomalous_entry
7105                     add_comment
7106                     add_conflicting
7107                     add_description
7108                     add_duplicate
7109                     add_note
7110                     aliases
7111                     comment
7112                     complete_name
7113                     core_access
7114                     count
7115                     default_map
7116                     delete_range
7117                     description
7118                     each_range
7119                     external_name
7120                     file_path
7121                     format
7122                     initialize
7123                     inverse_list
7124                     is_empty
7125                     name
7126                     note
7127                     perl_extension
7128                     property
7129                     range_count
7130                     ranges
7131                     range_size_1
7132                     reset_each_range
7133                     set_comment
7134                     set_core_access
7135                     set_default_map
7136                     set_file_path
7137                     set_final_comment
7138                     set_range_size_1
7139                     set_status
7140                     set_to_output_map
7141                     short_name
7142                     status
7143                     status_info
7144                     to_output_map
7145                     value_of
7146                     write
7147                 )
7148                     # 'property' above is for symmetry, so that one can take
7149                     # the property of a property and get itself, and so don't
7150                     # have to distinguish between properties and tables in
7151                     # calling code
7152     {
7153         no strict "refs";
7154         *$sub = sub {
7155             use strict "refs";
7156             my $self = shift;
7157             no overloading;
7158             return $map{0+$self}->$sub(@_);
7159         }
7160     }
7161
7162
7163 } # End closure
7164
7165 package main;
7166
7167 sub join_lines($) {
7168     # Returns lines of the input joined together, so that they can be folded
7169     # properly.
7170     # This causes continuation lines to be joined together into one long line
7171     # for folding.  A continuation line is any line that doesn't begin with a
7172     # space or "\b" (the latter is stripped from the output).  This is so
7173     # lines can be be in a HERE document so as to fit nicely in the terminal
7174     # width, but be joined together in one long line, and then folded with
7175     # indents, '#' prefixes, etc, properly handled.
7176     # A blank separates the joined lines except if there is a break; an extra
7177     # blank is inserted after a period ending a line.
7178
7179     # Intialize the return with the first line.
7180     my ($return, @lines) = split "\n", shift;
7181
7182     # If the first line is null, it was an empty line, add the \n back in
7183     $return = "\n" if $return eq "";
7184
7185     # Now join the remainder of the physical lines.
7186     for my $line (@lines) {
7187
7188         # An empty line means wanted a blank line, so add two \n's to get that
7189         # effect, and go to the next line.
7190         if (length $line == 0) {
7191             $return .= "\n\n";
7192             next;
7193         }
7194
7195         # Look at the last character of what we have so far.
7196         my $previous_char = substr($return, -1, 1);
7197
7198         # And at the next char to be output.
7199         my $next_char = substr($line, 0, 1);
7200
7201         if ($previous_char ne "\n") {
7202
7203             # Here didn't end wth a nl.  If the next char a blank or \b, it
7204             # means that here there is a break anyway.  So add a nl to the
7205             # output.
7206             if ($next_char eq " " || $next_char eq "\b") {
7207                 $previous_char = "\n";
7208                 $return .= $previous_char;
7209             }
7210
7211             # Add an extra space after periods.
7212             $return .= " " if $previous_char eq '.';
7213         }
7214
7215         # Here $previous_char is still the latest character to be output.  If
7216         # it isn't a nl, it means that the next line is to be a continuation
7217         # line, with a blank inserted between them.
7218         $return .= " " if $previous_char ne "\n";
7219
7220         # Get rid of any \b
7221         substr($line, 0, 1) = "" if $next_char eq "\b";
7222
7223         # And append this next line.
7224         $return .= $line;
7225     }
7226
7227     return $return;
7228 }
7229
7230 sub simple_fold($;$$$) {
7231     # Returns a string of the input (string or an array of strings) folded
7232     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7233     # a \n
7234     # This is tailored for the kind of text written by this program,
7235     # especially the pod file, which can have very long names with
7236     # underscores in the middle, or words like AbcDefgHij....  We allow
7237     # breaking in the middle of such constructs if the line won't fit
7238     # otherwise.  The break in such cases will come either just after an
7239     # underscore, or just before one of the Capital letters.
7240
7241     local $to_trace = 0 if main::DEBUG;
7242
7243     my $line = shift;
7244     my $prefix = shift;     # Optional string to prepend to each output
7245                             # line
7246     $prefix = "" unless defined $prefix;
7247
7248     my $hanging_indent = shift; # Optional number of spaces to indent
7249                                 # continuation lines
7250     $hanging_indent = 0 unless $hanging_indent;
7251
7252     my $right_margin = shift;   # Optional number of spaces to narrow the
7253                                 # total width by.
7254     $right_margin = 0 unless defined $right_margin;
7255
7256     # Call carp with the 'nofold' option to avoid it from trying to call us
7257     # recursively
7258     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7259
7260     # The space available doesn't include what's automatically prepended
7261     # to each line, or what's reserved on the right.
7262     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7263     # XXX Instead of using the 'nofold' perhaps better to look up the stack
7264
7265     if (DEBUG && $hanging_indent >= $max) {
7266         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
7267         $hanging_indent = 0;
7268     }
7269
7270     # First, split into the current physical lines.
7271     my @line;
7272     if (ref $line) {        # Better be an array, because not bothering to
7273                             # test
7274         foreach my $line (@{$line}) {
7275             push @line, split /\n/, $line;
7276         }
7277     }
7278     else {
7279         @line = split /\n/, $line;
7280     }
7281
7282     #local $to_trace = 1 if main::DEBUG;
7283     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7284
7285     # Look at each current physical line.
7286     for (my $i = 0; $i < @line; $i++) {
7287         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7288         #local $to_trace = 1 if main::DEBUG;
7289         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7290
7291         # Remove prefix, because will be added back anyway, don't want
7292         # doubled prefix
7293         $line[$i] =~ s/^$prefix//;
7294
7295         # Remove trailing space
7296         $line[$i] =~ s/\s+\Z//;
7297
7298         # If the line is too long, fold it.
7299         if (length $line[$i] > $max) {
7300             my $remainder;
7301
7302             # Here needs to fold.  Save the leading space in the line for
7303             # later.
7304             $line[$i] =~ /^ ( \s* )/x;
7305             my $leading_space = $1;
7306             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7307
7308             # If character at final permissible position is white space,
7309             # fold there, which will delete that white space
7310             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7311                 $remainder = substr($line[$i], $max);
7312                 $line[$i] = substr($line[$i], 0, $max - 1);
7313             }
7314             else {
7315
7316                 # Otherwise fold at an acceptable break char closest to
7317                 # the max length.  Look at just the maximal initial
7318                 # segment of the line
7319                 my $segment = substr($line[$i], 0, $max - 1);
7320                 if ($segment =~
7321                     /^ ( .{$hanging_indent}   # Don't look before the
7322                                               #  indent.
7323                         \ *                   # Don't look in leading
7324                                               #  blanks past the indent
7325                             [^ ] .*           # Find the right-most
7326                         (?:                   #  acceptable break:
7327                             [ \s = ]          # space or equal
7328                             | - (?! [.0-9] )  # or non-unary minus.
7329                         )                     # $1 includes the character
7330                     )/x)
7331                 {
7332                     # Split into the initial part that fits, and remaining
7333                     # part of the input
7334                     $remainder = substr($line[$i], length $1);
7335                     $line[$i] = $1;
7336                     trace $line[$i] if DEBUG && $to_trace;
7337                     trace $remainder if DEBUG && $to_trace;
7338                 }
7339
7340                 # If didn't find a good breaking spot, see if there is a
7341                 # not-so-good breaking spot.  These are just after
7342                 # underscores or where the case changes from lower to
7343                 # upper.  Use \a as a soft hyphen, but give up
7344                 # and don't break the line if there is actually a \a
7345                 # already in the input.  We use an ascii character for the
7346                 # soft-hyphen to avoid any attempt by miniperl to try to
7347                 # access the files that this program is creating.
7348                 elsif ($segment !~ /\a/
7349                        && ($segment =~ s/_/_\a/g
7350                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7351                 {
7352                     # Here were able to find at least one place to insert
7353                     # our substitute soft hyphen.  Find the right-most one
7354                     # and replace it by a real hyphen.
7355                     trace $segment if DEBUG && $to_trace;
7356                     substr($segment,
7357                             rindex($segment, "\a"),
7358                             1) = '-';
7359
7360                     # Then remove the soft hyphen substitutes.
7361                     $segment =~ s/\a//g;
7362                     trace $segment if DEBUG && $to_trace;
7363
7364                     # And split into the initial part that fits, and
7365                     # remainder of the line
7366                     my $pos = rindex($segment, '-');
7367                     $remainder = substr($line[$i], $pos);
7368                     trace $remainder if DEBUG && $to_trace;
7369                     $line[$i] = substr($segment, 0, $pos + 1);
7370                 }
7371             }
7372
7373             # Here we know if we can fold or not.  If we can, $remainder
7374             # is what remains to be processed in the next iteration.
7375             if (defined $remainder) {
7376                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7377
7378                 # Insert the folded remainder of the line as a new element
7379                 # of the array.  (It may still be too long, but we will
7380                 # deal with that next time through the loop.)  Omit any
7381                 # leading space in the remainder.
7382                 $remainder =~ s/^\s+//;
7383                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7384
7385                 # But then indent by whichever is larger of:
7386                 # 1) the leading space on the input line;
7387                 # 2) the hanging indent.
7388                 # This preserves indentation in the original line.
7389                 my $lead = ($leading_space)
7390                             ? length $leading_space
7391                             : $hanging_indent;
7392                 $lead = max($lead, $hanging_indent);
7393                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7394             }
7395         }
7396
7397         # Ready to output the line. Get rid of any trailing space
7398         # And prefix by the required $prefix passed in.
7399         $line[$i] =~ s/\s+$//;
7400         $line[$i] = "$prefix$line[$i]\n";
7401     } # End of looping through all the lines.
7402
7403     return join "", @line;
7404 }
7405
7406 sub property_ref {  # Returns a reference to a property object.
7407     return Property::property_ref(@_);
7408 }
7409
7410 sub force_unlink ($) {
7411     my $filename = shift;
7412     return unless file_exists($filename);
7413     return if CORE::unlink($filename);
7414
7415     # We might need write permission
7416     chmod 0777, $filename;
7417     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
7418     return;
7419 }
7420
7421 sub write ($\@) {
7422     # Given a filename and a reference to an array of lines, write the lines
7423     # to the file
7424     # Filename can be given as an arrayref of directory names
7425
7426     my $file  = shift;
7427     my $lines_ref = shift;
7428     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7429
7430     if (! defined $lines_ref) {
7431         Carp::my_carp("Missing lines to write parameter for $file.  Writing skipped;");
7432         return;
7433     }
7434
7435     # Get into a single string if an array, and get rid of, in Unix terms, any
7436     # leading '.'
7437     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7438     $file = File::Spec->canonpath($file);
7439
7440     # If has directories, make sure that they all exist
7441     (undef, my $directories, undef) = File::Spec->splitpath($file);
7442     File::Path::mkpath($directories) if $directories && ! -d $directories;
7443
7444     push @files_actually_output, $file;
7445
7446     my $text;
7447     if (@$lines_ref) {
7448         $text = join "", @$lines_ref;
7449     }
7450     else {
7451         $text = "";
7452         Carp::my_carp("Output file '$file' is empty; writing it anyway;");
7453     }
7454
7455     force_unlink ($file);
7456
7457     my $OUT;
7458     if (not open $OUT, ">", $file) {
7459         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
7460         return;
7461     }
7462     print "$file written.\n" if $verbosity >= $VERBOSE;
7463
7464     print $OUT $text;
7465     close $OUT;
7466     return;
7467 }
7468
7469
7470 sub Standardize($) {
7471     # This converts the input name string into a standardized equivalent to
7472     # use internally.
7473
7474     my $name = shift;
7475     unless (defined $name) {
7476       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
7477       return;
7478     }
7479
7480     # Remove any leading or trailing white space
7481     $name =~ s/^\s+//g;
7482     $name =~ s/\s+$//g;
7483
7484     # Convert interior white space and hypens into underscores.
7485     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7486
7487     # Capitalize the letter following an underscore, and convert a sequence of
7488     # multiple underscores to a single one
7489     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7490
7491     # And capitalize the first letter, but not for the special cjk ones.
7492     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7493     return $name;
7494 }
7495
7496 sub standardize ($) {
7497     # Returns a lower-cased standardized name, without underscores.  This form
7498     # is chosen so that it can distinguish between any real versus superficial
7499     # Unicode name differences.  It relies on the fact that Unicode doesn't
7500     # have interior underscores, white space, nor dashes in any
7501     # stricter-matched name.  It should not be used on Unicode code point
7502     # names (the Name property), as they mostly, but not always follow these
7503     # rules.
7504
7505     my $name = Standardize(shift);
7506     return if !defined $name;
7507
7508     $name =~ s/ (?<= .) _ (?= . ) //xg;
7509     return lc $name;
7510 }
7511
7512 {   # Closure
7513
7514     my $indent_increment = " " x 2;
7515     my %already_output;
7516
7517     $main::simple_dumper_nesting = 0;
7518
7519     sub simple_dumper {
7520         # Like Simple Data::Dumper. Good enough for our needs. We can't use
7521         # the real thing as we have to run under miniperl.
7522
7523         # It is designed so that on input it is at the beginning of a line,
7524         # and the final thing output in any call is a trailing ",\n".
7525
7526         my $item = shift;
7527         my $indent = shift;
7528         $indent = "" if ! defined $indent;
7529
7530         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7531
7532         # nesting level is localized, so that as the call stack pops, it goes
7533         # back to the prior value.
7534         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
7535         undef %already_output if $main::simple_dumper_nesting == 0;
7536         $main::simple_dumper_nesting++;
7537         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
7538
7539         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7540
7541         # Determine the indent for recursive calls.
7542         my $next_indent = $indent . $indent_increment;
7543
7544         my $output;
7545         if (! ref $item) {
7546
7547             # Dump of scalar: just output it in quotes if not a number.  To do
7548             # so we must escape certain characters, and therefore need to
7549             # operate on a copy to avoid changing the original
7550             my $copy = $item;
7551             $copy = $UNDEF unless defined $copy;
7552
7553             # Quote non-numbers (numbers also have optional leading '-' and
7554             # fractions)
7555             if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
7556
7557                 # Escape apostrophe and backslash
7558                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
7559                 $copy = "'$copy'";
7560             }
7561             $output = "$indent$copy,\n";
7562         }
7563         else {
7564
7565             # Keep track of cycles in the input, and refuse to infinitely loop
7566             my $addr; { no overloading; $addr = 0+$item; }
7567             if (defined $already_output{$addr}) {
7568                 return "${indent}ALREADY OUTPUT: $item\n";
7569             }
7570             $already_output{$addr} = $item;
7571
7572             if (ref $item eq 'ARRAY') {
7573                 my $using_brackets;
7574                 $output = $indent;
7575                 if ($main::simple_dumper_nesting > 1) {
7576                     $output .= '[';
7577                     $using_brackets = 1;
7578                 }
7579                 else {
7580                     $using_brackets = 0;
7581                 }
7582
7583                 # If the array is empty, put the closing bracket on the same
7584                 # line.  Otherwise, recursively add each array element
7585                 if (@$item == 0) {
7586                     $output .= " ";
7587                 }
7588                 else {
7589                     $output .= "\n";
7590                     for (my $i = 0; $i < @$item; $i++) {
7591
7592                         # Indent array elements one level
7593                         $output .= &simple_dumper($item->[$i], $next_indent);
7594                         $output =~ s/\n$//;      # Remove trailing nl so as to
7595                         $output .= " # [$i]\n";  # add a comment giving the
7596                                                  # array index
7597                     }
7598                     $output .= $indent;     # Indent closing ']' to orig level
7599                 }
7600                 $output .= ']' if $using_brackets;
7601                 $output .= ",\n";
7602             }
7603             elsif (ref $item eq 'HASH') {
7604                 my $is_first_line;
7605                 my $using_braces;
7606                 my $body_indent;
7607
7608                 # No surrounding braces at top level
7609                 $output .= $indent;
7610                 if ($main::simple_dumper_nesting > 1) {
7611                     $output .= "{\n";
7612                     $is_first_line = 0;
7613                     $body_indent = $next_indent;
7614                     $next_indent .= $indent_increment;
7615                     $using_braces = 1;
7616                 }
7617                 else {
7618                     $is_first_line = 1;
7619                     $body_indent = $indent;
7620                     $using_braces = 0;
7621                 }
7622
7623                 # Output hashes sorted alphabetically instead of apparently
7624                 # random.  Use caseless alphabetic sort
7625                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
7626                 {
7627                     if ($is_first_line) {
7628                         $is_first_line = 0;
7629                     }
7630                     else {
7631                         $output .= "$body_indent";
7632                     }
7633
7634                     # The key must be a scalar, but this recursive call quotes
7635                     # it
7636                     $output .= &simple_dumper($key);
7637
7638                     # And change the trailing comma and nl to the hash fat
7639                     # comma for clarity, and so the value can be on the same
7640                     # line
7641                     $output =~ s/,\n$/ => /;
7642
7643                     # Recursively call to get the value's dump.
7644                     my $next = &simple_dumper($item->{$key}, $next_indent);
7645
7646                     # If the value is all on one line, remove its indent, so
7647                     # will follow the => immediately.  If it takes more than
7648                     # one line, start it on a new line.
7649                     if ($next !~ /\n.*\n/) {
7650                         $next =~ s/^ *//;
7651                     }
7652                     else {
7653                         $output .= "\n";
7654                     }
7655                     $output .= $next;
7656                 }
7657
7658                 $output .= "$indent},\n" if $using_braces;
7659             }
7660             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
7661                 $output = $indent . ref($item) . "\n";
7662                 # XXX see if blessed
7663             }
7664             elsif ($item->can('dump')) {
7665
7666                 # By convention in this program, objects furnish a 'dump'
7667                 # method.  Since not doing any output at this level, just pass
7668                 # on the input indent
7669                 $output = $item->dump($indent);
7670             }
7671             else {
7672                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
7673             }
7674         }
7675         return $output;
7676     }
7677 }
7678
7679 sub dump_inside_out {
7680     # Dump inside-out hashes in an object's state by converting them to a
7681     # regular hash and then calling simple_dumper on that.
7682
7683     my $object = shift;
7684     my $fields_ref = shift;
7685     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7686
7687     my $addr; { no overloading; $addr = 0+$object; }
7688
7689     my %hash;
7690     foreach my $key (keys %$fields_ref) {
7691         $hash{$key} = $fields_ref->{$key}{$addr};
7692     }
7693
7694     return simple_dumper(\%hash, @_);
7695 }
7696
7697 sub _operator_dot {
7698     # Overloaded '.' method that is common to all packages.  It uses the
7699     # package's stringify method.
7700
7701     my $self = shift;
7702     my $other = shift;
7703     my $reversed = shift;
7704     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7705
7706     $other = "" unless defined $other;
7707
7708     foreach my $which (\$self, \$other) {
7709         next unless ref $$which;
7710         if ($$which->can('_operator_stringify')) {
7711             $$which = $$which->_operator_stringify;
7712         }
7713         else {
7714             my $ref = ref $$which;
7715             my $addr; { no overloading; $addr = 0+$$which; }
7716             $$which = "$ref ($addr)";
7717         }
7718     }
7719     return ($reversed)
7720             ? "$other$self"
7721             : "$self$other";
7722 }
7723
7724 sub _operator_equal {
7725     # Generic overloaded '==' routine.  To be equal, they must be the exact
7726     # same object
7727
7728     my $self = shift;
7729     my $other = shift;
7730
7731     return 0 unless defined $other;
7732     return 0 unless ref $other;
7733     no overloading;
7734     return 0+$self == 0+$other;
7735 }
7736
7737 sub _operator_not_equal {
7738     my $self = shift;
7739     my $other = shift;
7740
7741     return ! _operator_equal($self, $other);
7742 }
7743
7744 sub process_PropertyAliases($) {
7745     # This reads in the PropertyAliases.txt file, which contains almost all
7746     # the character properties in Unicode and their equivalent aliases:
7747     # scf       ; Simple_Case_Folding         ; sfc
7748     #
7749     # Field 0 is the preferred short name for the property.
7750     # Field 1 is the full name.
7751     # Any succeeding ones are other accepted names.
7752
7753     my $file= shift;
7754     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7755
7756     # This whole file was non-existent in early releases, so use our own
7757     # internal one.
7758     $file->insert_lines(get_old_property_aliases())
7759                                                 if ! -e 'PropertyAliases.txt';
7760
7761     # Add any cjk properties that may have been defined.
7762     $file->insert_lines(@cjk_properties);
7763
7764     while ($file->next_line) {
7765
7766         my @data = split /\s*;\s*/;
7767
7768         my $full = $data[1];
7769
7770         my $this = Property->new($data[0], Full_Name => $full);
7771
7772         # Start looking for more aliases after these two.
7773         for my $i (2 .. @data - 1) {
7774             $this->add_alias($data[$i]);
7775         }
7776
7777     }
7778     return;
7779 }
7780
7781 sub finish_property_setup {
7782     # Finishes setting up after PropertyAliases.
7783
7784     my $file = shift;
7785     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7786
7787     # This entry was missing from this file in earlier Unicode versions
7788     if (-e 'Jamo.txt') {
7789         my $jsn = property_ref('JSN');
7790         if (! defined $jsn) {
7791             $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
7792         }
7793     }
7794
7795     # This entry is still missing as of 5.2, perhaps because no short name for
7796     # it.
7797     if (-e 'NameAliases.txt') {
7798         my $aliases = property_ref('Name_Alias');
7799         if (! defined $aliases) {
7800             $aliases = Property->new('Name_Alias');
7801         }
7802     }
7803
7804     # These are used so much, that we set globals for them.
7805     $gc = property_ref('General_Category');
7806     $block = property_ref('Block');
7807
7808     # Perl adds this alias.
7809     $gc->add_alias('Category');
7810
7811     # For backwards compatibility, these property files have particular names.
7812     my $upper = property_ref('Uppercase_Mapping');
7813     $upper->set_core_access('uc()');
7814     $upper->set_file('Upper'); # This is what utf8.c calls it
7815
7816     my $lower = property_ref('Lowercase_Mapping');
7817     $lower->set_core_access('lc()');
7818     $lower->set_file('Lower');
7819
7820     my $title = property_ref('Titlecase_Mapping');
7821     $title->set_core_access('ucfirst()');
7822     $title->set_file('Title');
7823
7824     my $fold = property_ref('Case_Folding');
7825     $fold->set_file('Fold') if defined $fold;
7826
7827     # utf8.c can't currently cope with non range-size-1 for these, and even if
7828     # it were changed to do so, someone else may be using them, expecting the
7829     # old style
7830     foreach my $property (qw {
7831                                 Case_Folding
7832                                 Lowercase_Mapping
7833                                 Titlecase_Mapping
7834                                 Uppercase_Mapping
7835                             })
7836     {
7837         property_ref($property)->set_range_size_1(1);
7838     }
7839
7840     # These two properties aren't actually used in the core, but unfortunately
7841     # the names just above that are in the core interfere with these, so
7842     # choose different names.  These aren't a problem unless the map tables
7843     # for these files get written out.
7844     my $lowercase = property_ref('Lowercase');
7845     $lowercase->set_file('IsLower') if defined $lowercase;
7846     my $uppercase = property_ref('Uppercase');
7847     $uppercase->set_file('IsUpper') if defined $uppercase;
7848
7849     # Set up the hard-coded default mappings, but only on properties defined
7850     # for this release
7851     foreach my $property (keys %default_mapping) {
7852         my $property_object = property_ref($property);
7853         next if ! defined $property_object;
7854         my $default_map = $default_mapping{$property};
7855         $property_object->set_default_map($default_map);
7856
7857         # A map of <code point> implies the property is string.
7858         if ($property_object->type == $UNKNOWN
7859             && $default_map eq $CODE_POINT)
7860         {
7861             $property_object->set_type($STRING);
7862         }
7863     }
7864
7865     # The following use the Multi_Default class to create objects for
7866     # defaults.
7867
7868     # Bidi class has a complicated default, but the derived file takes care of
7869     # the complications, leaving just 'L'.
7870     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
7871         property_ref('Bidi_Class')->set_default_map('L');
7872     }
7873     else {
7874         my $default;
7875
7876         # The derived file was introduced in 3.1.1.  The values below are
7877         # taken from table 3-8, TUS 3.0
7878         my $default_R =
7879             'my $default = Range_List->new;
7880              $default->add_range(0x0590, 0x05FF);
7881              $default->add_range(0xFB1D, 0xFB4F);'
7882         ;
7883
7884         # The defaults apply only to unassigned characters
7885         $default_R .= '$gc->table("Cn") & $default;';
7886
7887         if ($v_version lt v3.0.0) {
7888             $default = Multi_Default->new(R => $default_R, 'L');
7889         }
7890         else {
7891
7892             # AL apparently not introduced until 3.0:  TUS 2.x references are
7893             # not on-line to check it out
7894             my $default_AL =
7895                 'my $default = Range_List->new;
7896                  $default->add_range(0x0600, 0x07BF);
7897                  $default->add_range(0xFB50, 0xFDFF);
7898                  $default->add_range(0xFE70, 0xFEFF);'
7899             ;
7900
7901             # Non-character code points introduced in this release; aren't AL
7902             if ($v_version ge 3.1.0) {
7903                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
7904             }
7905             $default_AL .= '$gc->table("Cn") & $default';
7906             $default = Multi_Default->new(AL => $default_AL,
7907                                           R => $default_R,
7908                                           'L');
7909         }
7910         property_ref('Bidi_Class')->set_default_map($default);
7911     }
7912
7913     # Joining type has a complicated default, but the derived file takes care
7914     # of the complications, leaving just 'U' (or Non_Joining), except the file
7915     # is bad in 3.1.0
7916     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
7917         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
7918             property_ref('Joining_Type')->set_default_map('Non_Joining');
7919         }
7920         else {
7921
7922             # Otherwise, there are not one, but two possibilities for the
7923             # missing defaults: T and U.
7924             # The missing defaults that evaluate to T are given by:
7925             # T = Mn + Cf - ZWNJ - ZWJ
7926             # where Mn and Cf are the general category values. In other words,
7927             # any non-spacing mark or any format control character, except
7928             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
7929             # WIDTH JOINER (joining type C).
7930             my $default = Multi_Default->new(
7931                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
7932                'Non_Joining');
7933             property_ref('Joining_Type')->set_default_map($default);
7934         }
7935     }
7936
7937     # Line break has a complicated default in early releases. It is 'Unknown'
7938     # for non-assigned code points; 'AL' for assigned.
7939     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
7940         my $lb = property_ref('Line_Break');
7941         if ($v_version gt 3.2.0) {
7942             $lb->set_default_map('Unknown');
7943         }
7944         else {
7945             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
7946                                               'AL');
7947             $lb->set_default_map($default);
7948         }
7949
7950         # If has the URS property, make sure that the standard aliases are in
7951         # it, since not in the input tables in some versions.
7952         my $urs = property_ref('Unicode_Radical_Stroke');
7953         if (defined $urs) {
7954             $urs->add_alias('cjkRSUnicode');
7955             $urs->add_alias('kRSUnicode');
7956         }
7957     }
7958     return;
7959 }
7960
7961 sub get_old_property_aliases() {
7962     # Returns what would be in PropertyAliases.txt if it existed in very old
7963     # versions of Unicode.  It was derived from the one in 3.2, and pared
7964     # down based on the data that was actually in the older releases.
7965     # An attempt was made to use the existence of files to mean inclusion or
7966     # not of various aliases, but if this was not sufficient, using version
7967     # numbers was resorted to.
7968
7969     my @return;
7970
7971     # These are to be used in all versions (though some are constructed by
7972     # this program if missing)
7973     push @return, split /\n/, <<'END';
7974 bc        ; Bidi_Class
7975 Bidi_M    ; Bidi_Mirrored
7976 cf        ; Case_Folding
7977 ccc       ; Canonical_Combining_Class
7978 dm        ; Decomposition_Mapping
7979 dt        ; Decomposition_Type
7980 gc        ; General_Category
7981 isc       ; ISO_Comment
7982 lc        ; Lowercase_Mapping
7983 na        ; Name
7984 na1       ; Unicode_1_Name
7985 nt        ; Numeric_Type
7986 nv        ; Numeric_Value
7987 sfc       ; Simple_Case_Folding
7988 slc       ; Simple_Lowercase_Mapping
7989 stc       ; Simple_Titlecase_Mapping
7990 suc       ; Simple_Uppercase_Mapping
7991 tc        ; Titlecase_Mapping
7992 uc        ; Uppercase_Mapping
7993 END
7994
7995     if (-e 'Blocks.txt') {
7996         push @return, "blk       ; Block\n";
7997     }
7998     if (-e 'ArabicShaping.txt') {
7999         push @return, split /\n/, <<'END';
8000 jg        ; Joining_Group
8001 jt        ; Joining_Type
8002 END
8003     }
8004     if (-e 'PropList.txt') {
8005
8006         # This first set is in the original old-style proplist.
8007         push @return, split /\n/, <<'END';
8008 Alpha     ; Alphabetic
8009 Bidi_C    ; Bidi_Control
8010 Dash      ; Dash
8011 Dia       ; Diacritic
8012 Ext       ; Extender
8013 Hex       ; Hex_Digit
8014 Hyphen    ; Hyphen
8015 IDC       ; ID_Continue
8016 Ideo      ; Ideographic
8017 Join_C    ; Join_Control
8018 Math      ; Math
8019 QMark     ; Quotation_Mark
8020 Term      ; Terminal_Punctuation
8021 WSpace    ; White_Space
8022 END
8023         # The next sets were added later
8024         if ($v_version ge v3.0.0) {
8025             push @return, split /\n/, <<'END';
8026 Upper     ; Uppercase
8027 Lower     ; Lowercase
8028 END
8029         }
8030         if ($v_version ge v3.0.1) {
8031             push @return, split /\n/, <<'END';
8032 NChar     ; Noncharacter_Code_Point
8033 END
8034         }
8035         # The next sets were added in the new-style
8036         if ($v_version ge v3.1.0) {
8037             push @return, split /\n/, <<'END';
8038 OAlpha    ; Other_Alphabetic
8039 OLower    ; Other_Lowercase
8040 OMath     ; Other_Math
8041 OUpper    ; Other_Uppercase
8042 END
8043         }
8044         if ($v_version ge v3.1.1) {
8045             push @return, "AHex      ; ASCII_Hex_Digit\n";
8046         }
8047     }
8048     if (-e 'EastAsianWidth.txt') {
8049         push @return, "ea        ; East_Asian_Width\n";
8050     }
8051     if (-e 'CompositionExclusions.txt') {
8052         push @return, "CE        ; Composition_Exclusion\n";
8053     }
8054     if (-e 'LineBreak.txt') {
8055         push @return, "lb        ; Line_Break\n";
8056     }
8057     if (-e 'BidiMirroring.txt') {
8058         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
8059     }
8060     if (-e 'Scripts.txt') {
8061         push @return, "sc        ; Script\n";
8062     }
8063     if (-e 'DNormalizationProps.txt') {
8064         push @return, split /\n/, <<'END';
8065 Comp_Ex   ; Full_Composition_Exclusion
8066 FC_NFKC   ; FC_NFKC_Closure
8067 NFC_QC    ; NFC_Quick_Check
8068 NFD_QC    ; NFD_Quick_Check
8069 NFKC_QC   ; NFKC_Quick_Check
8070 NFKD_QC   ; NFKD_Quick_Check
8071 XO_NFC    ; Expands_On_NFC
8072 XO_NFD    ; Expands_On_NFD
8073 XO_NFKC   ; Expands_On_NFKC
8074 XO_NFKD   ; Expands_On_NFKD
8075 END
8076     }
8077     if (-e 'DCoreProperties.txt') {
8078         push @return, split /\n/, <<'END';
8079 IDS       ; ID_Start
8080 XIDC      ; XID_Continue
8081 XIDS      ; XID_Start
8082 END
8083         # These can also appear in some versions of PropList.txt
8084         push @return, "Lower     ; Lowercase\n"
8085                                     unless grep { $_ =~ /^Lower\b/} @return;
8086         push @return, "Upper     ; Uppercase\n"
8087                                     unless grep { $_ =~ /^Upper\b/} @return;
8088     }
8089
8090     # This flag requires the DAge.txt file to be copied into the directory.
8091     if (DEBUG && $compare_versions) {
8092         push @return, 'age       ; Age';
8093     }
8094
8095     return @return;
8096 }
8097
8098 sub process_PropValueAliases {
8099     # This file contains values that properties look like:
8100     # bc ; AL        ; Arabic_Letter
8101     # blk; n/a       ; Greek_And_Coptic                 ; Greek
8102     #
8103     # Field 0 is the property.
8104     # Field 1 is the short name of a property value or 'n/a' if no
8105     #                short name exists;
8106     # Field 2 is the full property value name;
8107     # Any other fields are more synonyms for the property value.
8108     # Purely numeric property values are omitted from the file; as are some
8109     # others, fewer and fewer in later releases
8110
8111     # Entries for the ccc property have an extra field before the
8112     # abbreviation:
8113     # ccc;   0; NR   ; Not_Reordered
8114     # It is the numeric value that the names are synonyms for.
8115
8116     # There are comment entries for values missing from this file:
8117     # # @missing: 0000..10FFFF; ISO_Comment; <none>
8118     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8119
8120     my $file= shift;
8121     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8122
8123     # This whole file was non-existent in early releases, so use our own
8124     # internal one if necessary.
8125     if (! -e 'PropValueAliases.txt') {
8126         $file->insert_lines(get_old_property_value_aliases());
8127     }
8128
8129     # Add any explicit cjk values
8130     $file->insert_lines(@cjk_property_values);
8131
8132     # This line is used only for testing the code that checks for name
8133     # conflicts.  There is a script Inherited, and when this line is executed
8134     # it causes there to be a name conflict with the 'Inherited' that this
8135     # program generates for this block property value
8136     #$file->insert_lines('blk; n/a; Herited');
8137
8138
8139     # Process each line of the file ...
8140     while ($file->next_line) {
8141
8142         my ($property, @data) = split /\s*;\s*/;
8143
8144         # The full name for the ccc property value is in field 2 of the
8145         # remaining ones; field 1 for all other properties.  Swap ccc fields 1
8146         # and 2.  (Rightmost splice removes field 2, returning it; left splice
8147         # inserts that into field 1, thus shifting former field 1 to field 2.)
8148         splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
8149
8150         # If there is no short name, use the full one in element 1
8151         $data[0] = $data[1] if $data[0] eq "n/a";
8152
8153         # Earlier releases had the pseudo property 'qc' that should expand to
8154         # the ones that replace it below.
8155         if ($property eq 'qc') {
8156             if (lc $data[0] eq 'y') {
8157                 $file->insert_lines('NFC_QC; Y      ; Yes',
8158                                     'NFD_QC; Y      ; Yes',
8159                                     'NFKC_QC; Y     ; Yes',
8160                                     'NFKD_QC; Y     ; Yes',
8161                                     );
8162             }
8163             elsif (lc $data[0] eq 'n') {
8164                 $file->insert_lines('NFC_QC; N      ; No',
8165                                     'NFD_QC; N      ; No',
8166                                     'NFKC_QC; N     ; No',
8167                                     'NFKD_QC; N     ; No',
8168                                     );
8169             }
8170             elsif (lc $data[0] eq 'm') {
8171                 $file->insert_lines('NFC_QC; M      ; Maybe',
8172                                     'NFKC_QC; M     ; Maybe',
8173                                     );
8174             }
8175             else {
8176                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8177             }
8178             next;
8179         }
8180
8181         # The first field is the short name, 2nd is the full one.
8182         my $property_object = property_ref($property);
8183         my $table = $property_object->add_match_table($data[0],
8184                                                 Full_Name => $data[1]);
8185
8186         # Start looking for more aliases after these two.
8187         for my $i (2 .. @data - 1) {
8188             $table->add_alias($data[$i]);
8189         }
8190     } # End of looping through the file
8191
8192     # As noted in the comments early in the program, it generates tables for
8193     # the default values for all releases, even those for which the concept
8194     # didn't exist at the time.  Here we add those if missing.
8195     my $age = property_ref('age');
8196     if (defined $age && ! defined $age->table('Unassigned')) {
8197         $age->add_match_table('Unassigned');
8198     }
8199     $block->add_match_table('No_Block') if -e 'Blocks.txt'
8200                                     && ! defined $block->table('No_Block');
8201
8202
8203     # Now set the default mappings of the properties from the file.  This is
8204     # done after the loop because a number of properties have only @missings
8205     # entries in the file, and may not show up until the end.
8206     my @defaults = $file->get_missings;
8207     foreach my $default_ref (@defaults) {
8208         my $default = $default_ref->[0];
8209         my $property = property_ref($default_ref->[1]);
8210         $property->set_default_map($default);
8211     }
8212     return;
8213 }
8214
8215 sub get_old_property_value_aliases () {
8216     # Returns what would be in PropValueAliases.txt if it existed in very old
8217     # versions of Unicode.  It was derived from the one in 3.2, and pared
8218     # down.  An attempt was made to use the existence of files to mean
8219     # inclusion or not of various aliases, but if this was not sufficient,
8220     # using version numbers was resorted to.
8221
8222     my @return = split /\n/, <<'END';
8223 bc ; AN        ; Arabic_Number
8224 bc ; B         ; Paragraph_Separator
8225 bc ; CS        ; Common_Separator
8226 bc ; EN        ; European_Number
8227 bc ; ES        ; European_Separator
8228 bc ; ET        ; European_Terminator
8229 bc ; L         ; Left_To_Right
8230 bc ; ON        ; Other_Neutral
8231 bc ; R         ; Right_To_Left
8232 bc ; WS        ; White_Space
8233
8234 # The standard combining classes are very much different in v1, so only use
8235 # ones that look right (not checked thoroughly)
8236 ccc;   0; NR   ; Not_Reordered
8237 ccc;   1; OV   ; Overlay
8238 ccc;   7; NK   ; Nukta
8239 ccc;   8; KV   ; Kana_Voicing
8240 ccc;   9; VR   ; Virama
8241 ccc; 202; ATBL ; Attached_Below_Left
8242 ccc; 216; ATAR ; Attached_Above_Right
8243 ccc; 218; BL   ; Below_Left
8244 ccc; 220; B    ; Below
8245 ccc; 222; BR   ; Below_Right
8246 ccc; 224; L    ; Left
8247 ccc; 228; AL   ; Above_Left
8248 ccc; 230; A    ; Above
8249 ccc; 232; AR   ; Above_Right
8250 ccc; 234; DA   ; Double_Above
8251
8252 dt ; can       ; canonical
8253 dt ; enc       ; circle
8254 dt ; fin       ; final
8255 dt ; font      ; font
8256 dt ; fra       ; fraction
8257 dt ; init      ; initial
8258 dt ; iso       ; isolated
8259 dt ; med       ; medial
8260 dt ; n/a       ; none
8261 dt ; nb        ; noBreak
8262 dt ; sqr       ; square
8263 dt ; sub       ; sub
8264 dt ; sup       ; super
8265
8266 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
8267 gc ; Cc        ; Control
8268 gc ; Cn        ; Unassigned
8269 gc ; Co        ; Private_Use
8270 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
8271 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
8272 gc ; Ll        ; Lowercase_Letter
8273 gc ; Lm        ; Modifier_Letter
8274 gc ; Lo        ; Other_Letter
8275 gc ; Lu        ; Uppercase_Letter
8276 gc ; M         ; Mark                             # Mc | Me | Mn
8277 gc ; Mc        ; Spacing_Mark
8278 gc ; Mn        ; Nonspacing_Mark
8279 gc ; N         ; Number                           # Nd | Nl | No
8280 gc ; Nd        ; Decimal_Number
8281 gc ; No        ; Other_Number
8282 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
8283 gc ; Pd        ; Dash_Punctuation
8284 gc ; Pe        ; Close_Punctuation
8285 gc ; Po        ; Other_Punctuation
8286 gc ; Ps        ; Open_Punctuation
8287 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
8288 gc ; Sc        ; Currency_Symbol
8289 gc ; Sm        ; Math_Symbol
8290 gc ; So        ; Other_Symbol
8291 gc ; Z         ; Separator                        # Zl | Zp | Zs
8292 gc ; Zl        ; Line_Separator
8293 gc ; Zp        ; Paragraph_Separator
8294 gc ; Zs        ; Space_Separator
8295
8296 nt ; de        ; Decimal
8297 nt ; di        ; Digit
8298 nt ; n/a       ; None
8299 nt ; nu        ; Numeric
8300 END
8301
8302     if (-e 'ArabicShaping.txt') {
8303         push @return, split /\n/, <<'END';
8304 jg ; n/a       ; AIN
8305 jg ; n/a       ; ALEF
8306 jg ; n/a       ; DAL
8307 jg ; n/a       ; GAF
8308 jg ; n/a       ; LAM
8309 jg ; n/a       ; MEEM
8310 jg ; n/a       ; NO_JOINING_GROUP
8311 jg ; n/a       ; NOON
8312 jg ; n/a       ; QAF
8313 jg ; n/a       ; SAD
8314 jg ; n/a       ; SEEN
8315 jg ; n/a       ; TAH
8316 jg ; n/a       ; WAW
8317
8318 jt ; C         ; Join_Causing
8319 jt ; D         ; Dual_Joining
8320 jt ; L         ; Left_Joining
8321 jt ; R         ; Right_Joining
8322 jt ; U         ; Non_Joining
8323 jt ; T         ; Transparent
8324 END
8325         if ($v_version ge v3.0.0) {
8326             push @return, split /\n/, <<'END';
8327 jg ; n/a       ; ALAPH
8328 jg ; n/a       ; BEH
8329 jg ; n/a       ; BETH
8330 jg ; n/a       ; DALATH_RISH
8331 jg ; n/a       ; E
8332 jg ; n/a       ; FEH
8333 jg ; n/a       ; FINAL_SEMKATH
8334 jg ; n/a       ; GAMAL
8335 jg ; n/a       ; HAH
8336 jg ; n/a       ; HAMZA_ON_HEH_GOAL
8337 jg ; n/a       ; HE
8338 jg ; n/a       ; HEH
8339 jg ; n/a       ; HEH_GOAL
8340 jg ; n/a       ; HETH
8341 jg ; n/a       ; KAF
8342 jg ; n/a       ; KAPH
8343 jg ; n/a       ; KNOTTED_HEH
8344 jg ; n/a       ; LAMADH
8345 jg ; n/a       ; MIM
8346 jg ; n/a       ; NUN
8347 jg ; n/a       ; PE
8348 jg ; n/a       ; QAPH
8349 jg ; n/a       ; REH
8350 jg ; n/a       ; REVERSED_PE
8351 jg ; n/a       ; SADHE
8352 jg ; n/a       ; SEMKATH
8353 jg ; n/a       ; SHIN
8354 jg ; n/a       ; SWASH_KAF
8355 jg ; n/a       ; TAW
8356 jg ; n/a       ; TEH_MARBUTA
8357 jg ; n/a       ; TETH
8358 jg ; n/a       ; YEH
8359 jg ; n/a       ; YEH_BARREE
8360 jg ; n/a       ; YEH_WITH_TAIL
8361 jg ; n/a       ; YUDH
8362 jg ; n/a       ; YUDH_HE
8363 jg ; n/a       ; ZAIN
8364 END
8365         }
8366     }
8367
8368
8369     if (-e 'EastAsianWidth.txt') {
8370         push @return, split /\n/, <<'END';
8371 ea ; A         ; Ambiguous
8372 ea ; F         ; Fullwidth
8373 ea ; H         ; Halfwidth
8374 ea ; N         ; Neutral
8375 ea ; Na        ; Narrow
8376 ea ; W         ; Wide
8377 END
8378     }
8379
8380     if (-e 'LineBreak.txt') {
8381         push @return, split /\n/, <<'END';
8382 lb ; AI        ; Ambiguous
8383 lb ; AL        ; Alphabetic
8384 lb ; B2        ; Break_Both
8385 lb ; BA        ; Break_After
8386 lb ; BB        ; Break_Before
8387 lb ; BK        ; Mandatory_Break
8388 lb ; CB        ; Contingent_Break
8389 lb ; CL        ; Close_Punctuation
8390 lb ; CM        ; Combining_Mark
8391 lb ; CR        ; Carriage_Return
8392 lb ; EX        ; Exclamation
8393 lb ; GL        ; Glue
8394 lb ; HY        ; Hyphen
8395 lb ; ID        ; Ideographic
8396 lb ; IN        ; Inseperable
8397 lb ; IS        ; Infix_Numeric
8398 lb ; LF        ; Line_Feed
8399 lb ; NS        ; Nonstarter
8400 lb ; NU        ; Numeric
8401 lb ; OP        ; Open_Punctuation
8402 lb ; PO        ; Postfix_Numeric
8403 lb ; PR        ; Prefix_Numeric
8404 lb ; QU        ; Quotation
8405 lb ; SA        ; Complex_Context
8406 lb ; SG        ; Surrogate
8407 lb ; SP        ; Space
8408 lb ; SY        ; Break_Symbols
8409 lb ; XX        ; Unknown
8410 lb ; ZW        ; ZWSpace
8411 END
8412     }
8413
8414     if (-e 'DNormalizationProps.txt') {
8415         push @return, split /\n/, <<'END';
8416 qc ; M         ; Maybe
8417 qc ; N         ; No
8418 qc ; Y         ; Yes
8419 END
8420     }
8421
8422     if (-e 'Scripts.txt') {
8423         push @return, split /\n/, <<'END';
8424 sc ; Arab      ; Arabic
8425 sc ; Armn      ; Armenian
8426 sc ; Beng      ; Bengali
8427 sc ; Bopo      ; Bopomofo
8428 sc ; Cans      ; Canadian_Aboriginal
8429 sc ; Cher      ; Cherokee
8430 sc ; Cyrl      ; Cyrillic
8431 sc ; Deva      ; Devanagari
8432 sc ; Dsrt      ; Deseret
8433 sc ; Ethi      ; Ethiopic
8434 sc ; Geor      ; Georgian
8435 sc ; Goth      ; Gothic
8436 sc ; Grek      ; Greek
8437 sc ; Gujr      ; Gujarati
8438 sc ; Guru      ; Gurmukhi
8439 sc ; Hang      ; Hangul
8440 sc ; Hani      ; Han
8441 sc ; Hebr      ; Hebrew
8442 sc ; Hira      ; Hiragana
8443 sc ; Ital      ; Old_Italic
8444 sc ; Kana      ; Katakana
8445 sc ; Khmr      ; Khmer
8446 sc ; Knda      ; Kannada
8447 sc ; Laoo      ; Lao
8448 sc ; Latn      ; Latin
8449 sc ; Mlym      ; Malayalam
8450 sc ; Mong      ; Mongolian
8451 sc ; Mymr      ; Myanmar
8452 sc ; Ogam      ; Ogham
8453 sc ; Orya      ; Oriya
8454 sc ; Qaai      ; Inherited
8455 sc ; Runr      ; Runic
8456 sc ; Sinh      ; Sinhala
8457 sc ; Syrc      ; Syriac
8458 sc ; Taml      ; Tamil
8459 sc ; Telu      ; Telugu
8460 sc ; Thaa      ; Thaana
8461 sc ; Thai      ; Thai
8462 sc ; Tibt      ; Tibetan
8463 sc ; Yiii      ; Yi
8464 sc ; Zyyy      ; Common
8465 END
8466     }
8467
8468     if ($v_version ge v2.0.0) {
8469         push @return, split /\n/, <<'END';
8470 dt ; com       ; compat
8471 dt ; nar       ; narrow
8472 dt ; sml       ; small
8473 dt ; vert      ; vertical
8474 dt ; wide      ; wide
8475
8476 gc ; Cf        ; Format
8477 gc ; Cs        ; Surrogate
8478 gc ; Lt        ; Titlecase_Letter
8479 gc ; Me        ; Enclosing_Mark
8480 gc ; Nl        ; Letter_Number
8481 gc ; Pc        ; Connector_Punctuation
8482 gc ; Sk        ; Modifier_Symbol
8483 END
8484     }
8485     if ($v_version ge v2.1.2) {
8486         push @return, "bc ; S         ; Segment_Separator\n";
8487     }
8488     if ($v_version ge v2.1.5) {
8489         push @return, split /\n/, <<'END';
8490 gc ; Pf        ; Final_Punctuation
8491 gc ; Pi        ; Initial_Punctuation
8492 END
8493     }
8494     if ($v_version ge v2.1.8) {
8495         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
8496     }
8497
8498     if ($v_version ge v3.0.0) {
8499         push @return, split /\n/, <<'END';
8500 bc ; AL        ; Arabic_Letter
8501 bc ; BN        ; Boundary_Neutral
8502 bc ; LRE       ; Left_To_Right_Embedding
8503 bc ; LRO       ; Left_To_Right_Override
8504 bc ; NSM       ; Nonspacing_Mark
8505 bc ; PDF       ; Pop_Directional_Format
8506 bc ; RLE       ; Right_To_Left_Embedding
8507 bc ; RLO       ; Right_To_Left_Override
8508
8509 ccc; 233; DB   ; Double_Below
8510 END
8511     }
8512
8513     if ($v_version ge v3.1.0) {
8514         push @return, "ccc; 226; R    ; Right\n";
8515     }
8516
8517     return @return;
8518 }
8519
8520 { # Closure
8521     # This is used to store the range list of all the code points usable when
8522     # the little used $compare_versions feature is enabled.
8523     my $compare_versions_range_list;
8524
8525     sub process_generic_property_file {
8526         # This processes a file containing property mappings and puts them
8527         # into internal map tables.  It should be used to handle any property
8528         # files that have mappings from a code point or range thereof to
8529         # something else.  This means almost all the UCD .txt files.
8530         # each_line_handlers() should be set to adjust the lines of these
8531         # files, if necessary, to what this routine understands:
8532         #
8533         # 0374          ; NFD_QC; N
8534         # 003C..003E    ; Math
8535         #
8536         # the fields are: "codepoint range ; property; map"
8537         #
8538         # meaning the codepoints in the range all have the value 'map' under
8539         # 'property'.
8540         # Beginning and trailing white space in each field are not signficant.
8541         # Note there is not a trailing semi-colon in the above.  A trailing
8542         # semi-colon means the map is a null-string.  An omitted map, as
8543         # opposed to a null-string, is assumed to be 'Y', based on Unicode
8544         # table syntax.  (This could have been hidden from this routine by
8545         # doing it in the $file object, but that would require parsing of the
8546         # line there, so would have to parse it twice, or change the interface
8547         # to pass this an array.  So not done.)
8548         #
8549         # The map field may begin with a sequence of commands that apply to
8550         # this range.  Each such command begins and ends with $CMD_DELIM.
8551         # These are used to indicate, for example, that the mapping for a
8552         # range has a non-default type.
8553         #
8554         # This loops through the file, calling it's next_line() method, and
8555         # then taking the map and adding it to the property's table.
8556         # Complications arise because any number of properties can be in the
8557         # file, in any order, interspersed in any way.  The first time a
8558         # property is seen, it gets information about that property and
8559         # caches it for quick retrieval later.  It also normalizes the maps
8560         # so that only one of many synonym is stored.  The Unicode input files
8561         # do use some multiple synonyms.
8562
8563         my $file = shift;
8564         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8565
8566         my %property_info;               # To keep track of what properties
8567                                          # have already had entries in the
8568                                          # current file, and info about each,
8569                                          # so don't have to recompute.
8570         my $property_name;               # property currently being worked on
8571         my $property_type;               # and its type
8572         my $previous_property_name = ""; # name from last time through loop
8573         my $property_object;             # pointer to the current property's
8574                                          # object
8575         my $property_addr;               # the address of that object
8576         my $default_map;                 # the string that code points missing
8577                                          # from the file map to
8578         my $default_table;               # For non-string properties, a
8579                                          # reference to the match table that
8580                                          # will contain the list of code
8581                                          # points that map to $default_map.
8582
8583         # Get the next real non-comment line
8584         LINE:
8585         while ($file->next_line) {
8586
8587             # Default replacement type; means that if parts of the range have
8588             # already been stored in our tables, the new map overrides them if
8589             # they differ more than cosmetically
8590             my $replace = $IF_NOT_EQUIVALENT;
8591             my $map_type;            # Default type for the map of this range
8592
8593             #local $to_trace = 1 if main::DEBUG;
8594             trace $_ if main::DEBUG && $to_trace;
8595
8596             # Split the line into components
8597             my ($range, $property_name, $map, @remainder)
8598                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
8599
8600             # If more or less on the line than we are expecting, warn and skip
8601             # the line
8602             if (@remainder) {
8603                 $file->carp_bad_line('Extra fields');
8604                 next LINE;
8605             }
8606             elsif ( ! defined $property_name) {
8607                 $file->carp_bad_line('Missing property');
8608                 next LINE;
8609             }
8610
8611             # Examine the range.
8612             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
8613             {
8614                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
8615                 next LINE;
8616             }
8617             my $low = hex $1;
8618             my $high = (defined $2) ? hex $2 : $low;
8619
8620             # For the very specialized case of comparing two Unicode
8621             # versions...
8622             if (DEBUG && $compare_versions) {
8623                 if ($property_name eq 'Age') {
8624
8625                     # Only allow code points at least as old as the version
8626                     # specified.
8627                     my $age = pack "C*", split(/\./, $map);        # v string
8628                     next LINE if $age gt $compare_versions;
8629                 }
8630                 else {
8631
8632                     # Again, we throw out code points younger than those of
8633                     # the specified version.  By now, the Age property is
8634                     # populated.  We use the intersection of each input range
8635                     # with this property to find what code points in it are
8636                     # valid.   To do the intersection, we have to convert the
8637                     # Age property map to a Range_list.  We only have to do
8638                     # this once.
8639                     if (! defined $compare_versions_range_list) {
8640                         my $age = property_ref('Age');
8641                         if (! -e 'DAge.txt') {
8642                             croak "Need to have 'DAge.txt' file to do version comparison";
8643                         }
8644                         elsif ($age->count == 0) {
8645                             croak "The 'Age' table is empty, but its file exists";
8646                         }
8647                         $compare_versions_range_list
8648                                         = Range_List->new(Initialize => $age);
8649                     }
8650
8651                     # An undefined map is always 'Y'
8652                     $map = 'Y' if ! defined $map;
8653
8654                     # Calculate the intersection of the input range with the
8655                     # code points that are known in the specified version
8656                     my @ranges = ($compare_versions_range_list
8657                                   & Range->new($low, $high))->ranges;
8658
8659                     # If the intersection is empty, throw away this range
8660                     next LINE unless @ranges;
8661
8662                     # Only examine the first range this time through the loop.
8663                     my $this_range = shift @ranges;
8664
8665                     # Put any remaining ranges in the queue to be processed
8666                     # later.  Note that there is unnecessary work here, as we
8667                     # will do the intersection again for each of these ranges
8668                     # during some future iteration of the LINE loop, but this
8669                     # code is not used in production.  The later intersections
8670                     # are guaranteed to not splinter, so this will not become
8671                     # an infinite loop.
8672                     my $line = join ';', $property_name, $map;
8673                     foreach my $range (@ranges) {
8674                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
8675                                                             $range->start,
8676                                                             $range->end,
8677                                                             $line));
8678                     }
8679
8680                     # And process the first range, like any other.
8681                     $low = $this_range->start;
8682                     $high = $this_range->end;
8683                 }
8684             } # End of $compare_versions
8685
8686             # If changing to a new property, get the things constant per
8687             # property
8688             if ($previous_property_name ne $property_name) {
8689
8690                 $property_object = property_ref($property_name);
8691                 if (! defined $property_object) {
8692                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
8693                     next LINE;
8694                 }
8695                 { no overloading; $property_addr = 0+($property_object); }
8696
8697                 # Defer changing names until have a line that is acceptable
8698                 # (the 'next' statement above means is unacceptable)
8699                 $previous_property_name = $property_name;
8700
8701                 # If not the first time for this property, retrieve info about
8702                 # it from the cache
8703                 if (defined ($property_info{$property_addr}{'type'})) {
8704                     $property_type = $property_info{$property_addr}{'type'};
8705                     $default_map = $property_info{$property_addr}{'default'};
8706                     $map_type
8707                         = $property_info{$property_addr}{'pseudo_map_type'};
8708                     $default_table
8709                             = $property_info{$property_addr}{'default_table'};
8710                 }
8711                 else {
8712
8713                     # Here, is the first time for this property.  Set up the
8714                     # cache.
8715                     $property_type = $property_info{$property_addr}{'type'}
8716                                    = $property_object->type;
8717                     $map_type
8718                         = $property_info{$property_addr}{'pseudo_map_type'}
8719                         = $property_object->pseudo_map_type;
8720
8721                     # The Unicode files are set up so that if the map is not
8722                     # defined, it is a binary property
8723                     if (! defined $map && $property_type != $BINARY) {
8724                         if ($property_type != $UNKNOWN
8725                             && $property_type != $NON_STRING)
8726                         {
8727                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
8728                         }
8729                         else {
8730                             $property_object->set_type($BINARY);
8731                             $property_type
8732                                 = $property_info{$property_addr}{'type'}
8733                                 = $BINARY;
8734                         }
8735                     }
8736
8737                     # Get any @missings default for this property.  This
8738                     # should precede the first entry for the property in the
8739                     # input file, and is located in a comment that has been
8740                     # stored by the Input_file class until we access it here.
8741                     # It's possible that there is more than one such line
8742                     # waiting for us; collect them all, and parse
8743                     my @missings_list = $file->get_missings
8744                                             if $file->has_missings_defaults;
8745                     foreach my $default_ref (@missings_list) {
8746                         my $default = $default_ref->[0];
8747                         my $addr; { no overloading; $addr = 0+property_ref($default_ref->[1]); }
8748
8749                         # For string properties, the default is just what the
8750                         # file says, but non-string properties should already
8751                         # have set up a table for the default property value;
8752                         # use the table for these, so can resolve synonyms
8753                         # later to a single standard one.
8754                         if ($property_type == $STRING
8755                             || $property_type == $UNKNOWN)
8756                         {
8757                             $property_info{$addr}{'missings'} = $default;
8758                         }
8759                         else {
8760                             $property_info{$addr}{'missings'}
8761                                         = $property_object->table($default);
8762                         }
8763                     }
8764
8765                     # Finished storing all the @missings defaults in the input
8766                     # file so far.  Get the one for the current property.
8767                     my $missings = $property_info{$property_addr}{'missings'};
8768
8769                     # But we likely have separately stored what the default
8770                     # should be.  (This is to accommodate versions of the
8771                     # standard where the @missings lines are absent or
8772                     # incomplete.)  Hopefully the two will match.  But check
8773                     # it out.
8774                     $default_map = $property_object->default_map;
8775
8776                     # If the map is a ref, it means that the default won't be
8777                     # processed until later, so undef it, so next few lines
8778                     # will redefine it to something that nothing will match
8779                     undef $default_map if ref $default_map;
8780
8781                     # Create a $default_map if don't have one; maybe a dummy
8782                     # that won't match anything.
8783                     if (! defined $default_map) {
8784
8785                         # Use any @missings line in the file.
8786                         if (defined $missings) {
8787                             if (ref $missings) {
8788                                 $default_map = $missings->full_name;
8789                                 $default_table = $missings;
8790                             }
8791                             else {
8792                                 $default_map = $missings;
8793                             }
8794
8795                             # And store it with the property for outside use.
8796                             $property_object->set_default_map($default_map);
8797                         }
8798                         else {
8799
8800                             # Neither an @missings nor a default map.  Create
8801                             # a dummy one, so won't have to test definedness
8802                             # in the main loop.
8803                             $default_map = '_Perl This will never be in a file
8804                                             from Unicode';
8805                         }
8806                     }
8807
8808                     # Here, we have $default_map defined, possibly in terms of
8809                     # $missings, but maybe not, and possibly is a dummy one.
8810                     if (defined $missings) {
8811
8812                         # Make sure there is no conflict between the two.
8813                         # $missings has priority.
8814                         if (ref $missings) {
8815                             $default_table
8816                                         = $property_object->table($default_map);
8817                             if (! defined $default_table
8818                                 || $default_table != $missings)
8819                             {
8820                                 if (! defined $default_table) {
8821                                     $default_table = $UNDEF;
8822                                 }
8823                                 $file->carp_bad_line(<<END
8824 The \@missings line for $property_name in $file says that missings default to
8825 $missings, but we expect it to be $default_table.  $missings used.
8826 END
8827                                 );
8828                                 $default_table = $missings;
8829                                 $default_map = $missings->full_name;
8830                             }
8831                             $property_info{$property_addr}{'default_table'}
8832                                                         = $default_table;
8833                         }
8834                         elsif ($default_map ne $missings) {
8835                             $file->carp_bad_line(<<END
8836 The \@missings line for $property_name in $file says that missings default to
8837 $missings, but we expect it to be $default_map.  $missings used.
8838 END
8839                             );
8840                             $default_map = $missings;
8841                         }
8842                     }
8843
8844                     $property_info{$property_addr}{'default'}
8845                                                     = $default_map;
8846
8847                     # If haven't done so already, find the table corresponding
8848                     # to this map for non-string properties.
8849                     if (! defined $default_table
8850                         && $property_type != $STRING
8851                         && $property_type != $UNKNOWN)
8852                     {
8853                         $default_table = $property_info{$property_addr}
8854                                                         {'default_table'}
8855                                     = $property_object->table($default_map);
8856                     }
8857                 } # End of is first time for this property
8858             } # End of switching properties.
8859
8860             # Ready to process the line.
8861             # The Unicode files are set up so that if the map is not defined,
8862             # it is a binary property with value 'Y'
8863             if (! defined $map) {
8864                 $map = 'Y';
8865             }
8866             else {
8867
8868                 # If the map begins with a special command to us (enclosed in
8869                 # delimiters), extract the command(s).
8870                 if (substr($map, 0, 1) eq $CMD_DELIM) {
8871                     while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
8872                         my $command = $1;
8873                         if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
8874                             $replace = $1;
8875                         }
8876                         elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
8877                             $map_type = $1;
8878                         }
8879                         else {
8880                            $file->carp_bad_line("Unknown command line: '$1'");
8881                            next LINE;
8882                         }
8883                     }
8884                 }
8885             }
8886
8887             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
8888             {
8889
8890                 # Here, we have a map to a particular code point, and the
8891                 # default map is to a code point itself.  If the range
8892                 # includes the particular code point, change that portion of
8893                 # the range to the default.  This makes sure that in the final
8894                 # table only the non-defaults are listed.
8895                 my $decimal_map = hex $map;
8896                 if ($low <= $decimal_map && $decimal_map <= $high) {
8897
8898                     # If the range includes stuff before or after the map
8899                     # we're changing, split it and process the split-off parts
8900                     # later.
8901                     if ($low < $decimal_map) {
8902                         $file->insert_adjusted_lines(
8903                                             sprintf("%04X..%04X; %s; %s",
8904                                                     $low,
8905                                                     $decimal_map - 1,
8906                                                     $property_name,
8907                                                     $map));
8908                     }
8909                     if ($high > $decimal_map) {
8910                         $file->insert_adjusted_lines(
8911                                             sprintf("%04X..%04X; %s; %s",
8912                                                     $decimal_map + 1,
8913                                                     $high,
8914                                                     $property_name,
8915                                                     $map));
8916                     }
8917                     $low = $high = $decimal_map;
8918                     $map = $CODE_POINT;
8919                 }
8920             }
8921
8922             # If we can tell that this is a synonym for the default map, use
8923             # the default one instead.
8924             if ($property_type != $STRING
8925                 && $property_type != $UNKNOWN)
8926             {
8927                 my $table = $property_object->table($map);
8928                 if (defined $table && $table == $default_table) {
8929                     $map = $default_map;
8930                 }
8931             }
8932
8933             # And figure out the map type if not known.
8934             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
8935                 if ($map eq "") {   # Nulls are always $NULL map type
8936                     $map_type = $NULL;
8937                 } # Otherwise, non-strings, and those that don't allow
8938                   # $MULTI_CP, and those that aren't multiple code points are
8939                   # 0
8940                 elsif
8941                    (($property_type != $STRING && $property_type != $UNKNOWN)
8942                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
8943                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
8944                 {
8945                     $map_type = 0;
8946                 }
8947                 else {
8948                     $map_type = $MULTI_CP;
8949                 }
8950             }
8951
8952             $property_object->add_map($low, $high,
8953                                         $map,
8954                                         Type => $map_type,
8955                                         Replace => $replace);
8956         } # End of loop through file's lines
8957
8958         return;
8959     }
8960 }
8961
8962 # XXX Unused until revise charnames;
8963 #sub check_and_handle_compound_name {
8964 #    This looks at Name properties for parenthesized components and splits
8965 #    them off.  Thus it finds FF as an equivalent to Form Feed.
8966 #    my $code_point = shift;
8967 #    my $name = shift;
8968 #    if ($name =~ /^ ( .*? ) ( \s* ) \( ( [^)]* ) \) (.*) $/x) {
8969 #        #local $to_trace = 1 if main::DEBUG;
8970 #        trace $1, $2, $3, $4 if main::DEBUG && $to_trace;
8971 #        push @more_Names, "$code_point; $1";
8972 #        push @more_Names, "$code_point; $3";
8973 #        Carp::my_carp_bug("Expecting blank space before left parenthesis in '$_'.  Proceeding and assuming it was there;") if $2 ne " ";
8974 #        Carp::my_carp_bug("Not expecting anything after the right parenthesis in '$_'.  Proceeding and ignoring that;") if $4 ne "";
8975 #    }
8976 #    return;
8977 #}
8978
8979 { # Closure for UnicodeData.txt handling
8980
8981     # This file was the first one in the UCD; its design leads to some
8982     # awkwardness in processing.  Here is a sample line:
8983     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
8984     # The fields in order are:
8985     my $i = 0;            # The code point is in field 0, and is shifted off.
8986     my $NAME = $i++;      # character name (e.g. "LATIN CAPITAL LETTER A")
8987     my $CATEGORY = $i++;  # category (e.g. "Lu")
8988     my $CCC = $i++;       # Canonical combining class (e.g. "230")
8989     my $BIDI = $i++;      # directional class (e.g. "L")
8990     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
8991     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
8992     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
8993                                          # Dual-use in this program; see below
8994     my $NUMERIC = $i++;   # numeric value
8995     my $MIRRORED = $i++;  # ? mirrored
8996     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
8997     my $COMMENT = $i++;   # iso comment
8998     my $UPPER = $i++;     # simple uppercase mapping
8999     my $LOWER = $i++;     # simple lowercase mapping
9000     my $TITLE = $i++;     # simple titlecase mapping
9001     my $input_field_count = $i;
9002
9003     # This routine in addition outputs these extra fields:
9004     my $DECOMP_TYPE = $i++; # Decomposition type
9005     my $DECOMP_MAP = $i++;  # Must be last; another decomposition mapping
9006     my $last_field = $i - 1;
9007
9008     # All these are read into an array for each line, with the indices defined
9009     # above.  The empty fields in the example line above indicate that the
9010     # value is defaulted.  The handler called for each line of the input
9011     # changes these to their defaults.
9012
9013     # Here are the official names of the properties, in a parallel array:
9014     my @field_names;
9015     $field_names[$BIDI] = 'Bidi_Class';
9016     $field_names[$CATEGORY] = 'General_Category';
9017     $field_names[$CCC] = 'Canonical_Combining_Class';
9018     $field_names[$COMMENT] = 'ISO_Comment';
9019     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9020     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
9021     $field_names[$LOWER] = 'Lowercase_Mapping';
9022     $field_names[$MIRRORED] = 'Bidi_Mirrored';
9023     $field_names[$NAME] = 'Name';
9024     $field_names[$NUMERIC] = 'Numeric_Value';
9025     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9026     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9027     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
9028     $field_names[$TITLE] = 'Titlecase_Mapping';
9029     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
9030     $field_names[$UPPER] = 'Uppercase_Mapping';
9031
9032     # Some of these need a little more explanation.  The $PERL_DECIMAL_DIGIT
9033     # field does not lead to an official Unicode property, but is used in
9034     # calculating the Numeric_Type.  Perl however, creates a file from this
9035     # field, so a Perl property is created from it.  Similarly, the Other
9036     # Digit field is used only for calculating the Numeric_Type, and so it can
9037     # be safely re-used as the place to store the value for Numeric_Type;
9038     # hence it is referred to as $NUMERIC_TYPE_OTHER_DIGIT.  The input field
9039     # named $PERL_DECOMPOSITION is a combination of both the decomposition
9040     # mapping and its type.  Perl creates a file containing exactly this
9041     # field, so it is used for that.  The two properties are separated into
9042     # two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9043
9044     # This file is processed like most in this program.  Control is passed to
9045     # process_generic_property_file() which calls filter_UnicodeData_line()
9046     # for each input line.  This filter converts the input into line(s) that
9047     # process_generic_property_file() understands.  There is also a setup
9048     # routine called before any of the file is processed, and a handler for
9049     # EOF processing, all in this closure.
9050
9051     # A huge speed-up occurred at the cost of some added complexity when these
9052     # routines were altered to buffer the outputs into ranges.  Almost all the
9053     # lines of the input file apply to just one code point, and for most
9054     # properties, the map for the next code point up is the same as the
9055     # current one.  So instead of creating a line for each property for each
9056     # input line, filter_UnicodeData_line() remembers what the previous map
9057     # of a property was, and doesn't generate a line to pass on until it has
9058     # to, as when the map changes; and that passed-on line encompasses the
9059     # whole contiguous range of code points that have the same map for that
9060     # property.  This means a slight amount of extra setup, and having to
9061     # flush these buffers on EOF, testing if the maps have changed, plus
9062     # remembering state information in the closure.  But it means a lot less
9063     # real time in not having to change the data base for each property on
9064     # each line.
9065
9066     # Another complication is that there are already a few ranges designated
9067     # in the input.  There are two lines for each, with the same maps except
9068     # the code point and name on each line.  This was actually the hardest
9069     # thing to design around.  The code points in those ranges may actually
9070     # have real maps not given by these two lines.  These maps will either
9071     # be algorthimically determinable, or in the extracted files furnished
9072     # with the UCD.  In the event of conflicts between these extracted files,
9073     # and this one, Unicode says that this one prevails.  But it shouldn't
9074     # prevail for conflicts that occur in these ranges.  The data from the
9075     # extracted files prevails in those cases.  So, this program is structured
9076     # so that those files are processed first, storing maps.  Then the other
9077     # files are processed, generally overwriting what the extracted files
9078     # stored.  But just the range lines in this input file are processed
9079     # without overwriting.  This is accomplished by adding a special string to
9080     # the lines output to tell process_generic_property_file() to turn off the
9081     # overwriting for just this one line.
9082     # A similar mechanism is used to tell it that the map is of a non-default
9083     # type.
9084
9085     sub setup_UnicodeData { # Called before any lines of the input are read
9086         my $file = shift;
9087         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9088
9089         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
9090                                         Directory => File::Spec->curdir(),
9091                                         File => 'Decomposition',
9092                                         Format => $STRING_FORMAT,
9093                                         Internal_Only_Warning => 1,
9094                                         Perl_Extension => 1,
9095                                         Default_Map => $CODE_POINT,
9096
9097                                         # normalize.pm can't cope with these
9098                                         Output_Range_Counts => 0,
9099
9100                                         # This is a specially formatted table
9101                                         # explicitly for normalize.pm, which
9102                                         # is expecting a particular format,
9103                                         # which means that mappings containing
9104                                         # multiple code points are in the main
9105                                         # body of the table
9106                                         Map_Type => $COMPUTE_NO_MULTI_CP,
9107                                         Type => $STRING,
9108                                         );
9109         $Perl_decomp->add_comment(join_lines(<<END
9110 This mapping is a combination of the Unicode 'Decomposition_Type' and
9111 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
9112 identical to the official Unicode 'Decomposition_Mapping'  property except for
9113 two things:
9114  1) It omits the algorithmically determinable Hangul syllable decompositions,
9115 which normalize.pm handles algorithmically.
9116  2) It contains the decomposition type as well.  Non-canonical decompositions
9117 begin with a word in angle brackets, like <super>, which denotes the
9118 compatible decomposition type.  If the map does not begin with the <angle
9119 brackets>, the decomposition is canonical.
9120 END
9121         ));
9122
9123         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9124                                         Default_Map => "",
9125                                         Perl_Extension => 1,
9126                                         File => 'Digit',    # Trad. location
9127                                         Directory => $map_directory,
9128                                         Type => $STRING,
9129                                         Range_Size_1 => 1,
9130                                         );
9131         $Decimal_Digit->add_comment(join_lines(<<END
9132 This file gives the mapping of all code points which represent a single
9133 decimal digit [0-9] to their respective digits.  For example, the code point
9134 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
9135 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9136 numerals.
9137 END
9138         ));
9139
9140         # This property is not used for generating anything else, and is
9141         # usually not output.  By making it last in the list, we can just
9142         # change the high end of the loop downwards to avoid the work of
9143         # generating a table that is just going to get thrown away.
9144         if (! property_ref('Decomposition_Mapping')->to_output_map) {
9145             $last_field--;
9146         }
9147         return;
9148     }
9149
9150     my $first_time = 1;                 # ? Is this the first line of the file
9151     my $in_range = 0;                   # ? Are we in one of the file's ranges
9152     my $previous_cp;                    # hex code point of previous line
9153     my $decimal_previous_cp = -1;       # And its decimal equivalent
9154     my @start;                          # For each field, the current starting
9155                                         # code point in hex for the range
9156                                         # being accumulated.
9157     my @fields;                         # The input fields;
9158     my @previous_fields;                # And those from the previous call
9159
9160     sub filter_UnicodeData_line {
9161         # Handle a single input line from UnicodeData.txt; see comments above
9162         # Conceptually this takes a single line from the file containing N
9163         # properties, and converts it into N lines with one property per line,
9164         # which is what the final handler expects.  But there are
9165         # complications due to the quirkiness of the input file, and to save
9166         # time, it accumulates ranges where the property values don't change
9167         # and only emits lines when necessary.  This is about an order of
9168         # magnitude fewer lines emitted.
9169
9170         my $file = shift;
9171         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9172
9173         # $_ contains the input line.
9174         # -1 in split means retain trailing null fields
9175         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9176
9177         #local $to_trace = 1 if main::DEBUG;
9178         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9179         if (@fields > $input_field_count) {
9180             $file->carp_bad_line('Extra fields');
9181             $_ = "";
9182             return;
9183         }
9184
9185         my $decimal_cp = hex $cp;
9186
9187         # We have to output all the buffered ranges when the next code point
9188         # is not exactly one after the previous one, which means there is a
9189         # gap in the ranges.
9190         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9191
9192         # The decomposition mapping field requires special handling.  It looks
9193         # like either:
9194         #
9195         # <compat> 0032 0020
9196         # 0041 0300
9197         #
9198         # The decomposition type is enclosed in <brackets>; if missing, it
9199         # means the type is canonical.  There are two decomposition mapping
9200         # tables: the one for use by Perl's normalize.pm has a special format
9201         # which is this field intact; the other, for general use is of
9202         # standard format.  In either case we have to find the decomposition
9203         # type.  Empty fields have None as their type, and map to the code
9204         # point itself
9205         if ($fields[$PERL_DECOMPOSITION] eq "") {
9206             $fields[$DECOMP_TYPE] = 'None';
9207             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9208         }
9209         else {
9210             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9211                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
9212             if (! defined $fields[$DECOMP_TYPE]) {
9213                 $fields[$DECOMP_TYPE] = 'Canonical';
9214                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9215             }
9216             else {
9217                 $fields[$DECOMP_MAP] = $map;
9218             }
9219         }
9220
9221         # The 3 numeric fields also require special handling.  The 2 digit
9222         # fields must be either empty or match the number field.  This means
9223         # that if it is empty, they must be as well, and the numeric type is
9224         # None, and the numeric value is 'Nan'.
9225         # The decimal digit field must be empty or match the other digit
9226         # field.  If the decimal digit field is non-empty, the code point is
9227         # a decimal digit, and the other two fields will have the same value.
9228         # If it is empty, but the other digit field is non-empty, the code
9229         # point is an 'other digit', and the number field will have the same
9230         # value as the other digit field.  If the other digit field is empty,
9231         # but the number field is non-empty, the code point is a generic
9232         # numeric type.
9233         if ($fields[$NUMERIC] eq "") {
9234             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9235                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9236             ) {
9237                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
9238             }
9239             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9240             $fields[$NUMERIC] = 'NaN';
9241         }
9242         else {
9243             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9244             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9245                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9246                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9247             }
9248             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9249                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9250                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9251             }
9252             else {
9253                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9254
9255                 # Rationals require extra effort.
9256                 register_fraction($fields[$NUMERIC])
9257                                                 if $fields[$NUMERIC] =~ qr{/};
9258             }
9259         }
9260
9261         # For the properties that have empty fields in the file, and which
9262         # mean something different from empty, change them to that default.
9263         # Certain fields just haven't been empty so far in any Unicode
9264         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9265         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
9266         # the defaults; which are verly unlikely to ever change.
9267         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9268         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9269
9270         # UAX44 says that if title is empty, it is the same as whatever upper
9271         # is,
9272         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9273
9274         # There are a few pairs of lines like:
9275         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9276         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9277         # that define ranges.  These should be processed after the fields are
9278         # adjusted above, as they may override some of them; but mostly what
9279         # is left is to possibly adjust the $NAME field.  The names of all the
9280         # paired lines start with a '<', but this is also true of '<control>,
9281         # which isn't one of these special ones.
9282         if ($fields[$NAME] eq '<control>') {
9283
9284             # Some code points in this file have the pseudo-name
9285             # '<control>', but the official name for such ones is the null
9286             # string.
9287             $fields[$NAME] = "";
9288
9289             # We had better not be in between range lines.
9290             if ($in_range) {
9291                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'.  Trying anyway");
9292                 $in_range = 0;
9293             }
9294         }
9295         elsif (substr($fields[$NAME], 0, 1) ne '<') {
9296
9297             # Here is a non-range line.  We had better not be in between range
9298             # lines.
9299             if ($in_range) {
9300                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'.  Trying anyway");
9301                 $in_range = 0;
9302             }
9303             # XXX until charnames catches up.
9304 #            if ($fields[$NAME] =~ s/- $cp $//x) {
9305 #
9306 #                # These are code points whose names end in their code points,
9307 #                # which means the names are algorithmically derivable from the
9308 #                # code points.  To shorten the output Name file, the algorithm
9309 #                # for deriving these is placed in the file instead of each
9310 #                # code point, so they have map type $CP_IN_NAME
9311 #                $fields[$NAME] = $CMD_DELIM
9312 #                                 . $MAP_TYPE_CMD
9313 #                                 . '='
9314 #                                 . $CP_IN_NAME
9315 #                                 . $CMD_DELIM
9316 #                                 . $fields[$NAME];
9317 #            }
9318
9319             # Some official names are really two alternate names with one in
9320             # parentheses.  What we do here is use the full official one for
9321             # the standard property (stored just above), but for the charnames
9322             # table, we add two more entries, one for each of the alternate
9323             # ones.
9324             # elsif name ne ""
9325             #check_and_handle_compound_name($cp, $fields[$NAME]);
9326             #check_and_handle_compound_name($cp, $unicode_1_name);
9327             # XXX until charnames catches up.
9328         }
9329         elsif ($fields[$NAME] =~ /^<(.+), First>$/) {
9330             $fields[$NAME] = $1;
9331
9332             # Here we are at the beginning of a range pair.
9333             if ($in_range) {
9334                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$NAME]'.  Trying anyway");
9335             }
9336             $in_range = 1;
9337
9338             # Because the properties in the range do not overwrite any already
9339             # in the db, we must flush the buffers of what's already there, so
9340             # they get handled in the normal scheme.
9341             $force_output = 1;
9342
9343         }
9344         elsif ($fields[$NAME] !~ s/^<(.+), Last>$/$1/) {
9345             $file->carp_bad_line("Unexpected name starting with '<' $fields[$NAME].  Ignoring this line.");
9346             $_ = "";
9347             return;
9348         }
9349         else { # Here, we are at the last line of a range pair.
9350
9351             if (! $in_range) {
9352                 $file->carp_bad_line("Unexpected end of range $fields[$NAME] when not in one.  Ignoring this line.");
9353                 $_ = "";
9354                 return;
9355             }
9356             $in_range = 0;
9357
9358             # Check that the input is valid: that the closing of the range is
9359             # the same as the beginning.
9360             foreach my $i (0 .. $last_field) {
9361                 next if $fields[$i] eq $previous_fields[$i];
9362                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
9363             }
9364
9365             # The processing differs depending on the type of range,
9366             # determined by its $NAME
9367             if ($fields[$NAME] =~ /^Hangul Syllable/) {
9368
9369                 # Check that the data looks right.
9370                 if ($decimal_previous_cp != $SBase) {
9371                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
9372                 }
9373                 if ($decimal_cp != $SBase + $SCount - 1) {
9374                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
9375                 }
9376
9377                 # The Hangul syllable range has a somewhat complicated name
9378                 # generation algorithm.  Each code point in it has a canonical
9379                 # decomposition also computable by an algorithm.  The
9380                 # perl decomposition map table built from these is used only
9381                 # by normalize.pm, which has the algorithm built in it, so the
9382                 # decomposition maps are not needed, and are large, so are
9383                 # omitted from it.  If the full decomposition map table is to
9384                 # be output, the decompositions are generated for it, in the
9385                 # EOF handling code for this input file.
9386
9387                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9388
9389                 # This range is stored in our internal structure with its
9390                 # own map type, different from all others.
9391                 $previous_fields[$NAME] = $CMD_DELIM
9392                                           . $MAP_TYPE_CMD
9393                                           . '='
9394                                           . $HANGUL_SYLLABLE
9395                                           . $CMD_DELIM
9396                                           . $fields[$NAME];
9397             }
9398             elsif ($fields[$NAME] =~ /^CJK/) {
9399
9400                 # The name for these contains the code point itself, and all
9401                 # are defined to have the same base name, regardless of what
9402                 # is in the file.  They are stored in our internal structure
9403                 # with a map type of $CP_IN_NAME
9404                 $previous_fields[$NAME] = $CMD_DELIM
9405                                            . $MAP_TYPE_CMD
9406                                            . '='
9407                                            . $CP_IN_NAME
9408                                            . $CMD_DELIM
9409                                            . 'CJK UNIFIED IDEOGRAPH';
9410
9411             }
9412             elsif ($fields[$CATEGORY] eq 'Co'
9413                      || $fields[$CATEGORY] eq 'Cs')
9414             {
9415                 # The names of all the code points in these ranges are set to
9416                 # null, as there are no names for the private use and
9417                 # surrogate code points.
9418
9419                 $previous_fields[$NAME] = "";
9420             }
9421             else {
9422                 $file->carp_bad_line("Unexpected code point range $fields[$NAME] because category is $fields[$CATEGORY].  Attempting to process it.");
9423             }
9424
9425             # The first line of the range caused everything else to be output,
9426             # and then its values were stored as the beginning values for the
9427             # next set of ranges, which this one ends.  Now, for each value,
9428             # add a command to tell the handler that these values should not
9429             # replace any existing ones in our database.
9430             foreach my $i (0 .. $last_field) {
9431                 $previous_fields[$i] = $CMD_DELIM
9432                                         . $REPLACE_CMD
9433                                         . '='
9434                                         . $NO
9435                                         . $CMD_DELIM
9436                                         . $previous_fields[$i];
9437             }
9438
9439             # And change things so it looks like the entire range has been
9440             # gone through with this being the final part of it.  Adding the
9441             # command above to each field will cause this range to be flushed
9442             # during the next iteration, as it guaranteed that the stored
9443             # field won't match whatever value the next one has.
9444             $previous_cp = $cp;
9445             $decimal_previous_cp = $decimal_cp;
9446
9447             # We are now set up for the next iteration; so skip the remaining
9448             # code in this subroutine that does the same thing, but doesn't
9449             # know about these ranges.
9450             $_ = "";
9451             return;
9452         }
9453
9454         # On the very first line, we fake it so the code below thinks there is
9455         # nothing to output, and initialize so that when it does get output it
9456         # uses the first line's values for the lowest part of the range.
9457         # (One could avoid this by using peek(), but then one would need to
9458         # know the adjustments done above and do the same ones in the setup
9459         # routine; not worth it)
9460         if ($first_time) {
9461             $first_time = 0;
9462             @previous_fields = @fields;
9463             @start = ($cp) x scalar @fields;
9464             $decimal_previous_cp = $decimal_cp - 1;
9465         }
9466
9467         # For each field, output the stored up ranges that this code point
9468         # doesn't fit in.  Earlier we figured out if all ranges should be
9469         # terminated because of changing the replace or map type styles, or if
9470         # there is a gap between this new code point and the previous one, and
9471         # that is stored in $force_output.  But even if those aren't true, we
9472         # need to output the range if this new code point's value for the
9473         # given property doesn't match the stored range's.
9474         #local $to_trace = 1 if main::DEBUG;
9475         foreach my $i (0 .. $last_field) {
9476             my $field = $fields[$i];
9477             if ($force_output || $field ne $previous_fields[$i]) {
9478
9479                 # Flush the buffer of stored values.
9480                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9481
9482                 # Start a new range with this code point and its value
9483                 $start[$i] = $cp;
9484                 $previous_fields[$i] = $field;
9485             }
9486         }
9487
9488         # Set the values for the next time.
9489         $previous_cp = $cp;
9490         $decimal_previous_cp = $decimal_cp;
9491
9492         # The input line has generated whatever adjusted lines are needed, and
9493         # should not be looked at further.
9494         $_ = "";
9495         return;
9496     }
9497
9498     sub EOF_UnicodeData {
9499         # Called upon EOF to flush the buffers, and create the Hangul
9500         # decomposition mappings if needed.
9501
9502         my $file = shift;
9503         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9504
9505         # Flush the buffers.
9506         foreach my $i (1 .. $last_field) {
9507             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9508         }
9509
9510         if (-e 'Jamo.txt') {
9511
9512             # The algorithm is published by Unicode, based on values in
9513             # Jamo.txt, (which should have been processed before this
9514             # subroutine), and the results left in %Jamo
9515             unless (%Jamo) {
9516                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
9517                 return;
9518             }
9519
9520             # If the full decomposition map table is being output, insert
9521             # into it the Hangul syllable mappings.  This is to avoid having
9522             # to publish a subroutine in it to compute them.  (which would
9523             # essentially be this code.)  This uses the algorithm published by
9524             # Unicode.
9525             if (property_ref('Decomposition_Mapping')->to_output_map) {
9526                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
9527                     use integer;
9528                     my $SIndex = $S - $SBase;
9529                     my $L = $LBase + $SIndex / $NCount;
9530                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
9531                     my $T = $TBase + $SIndex % $TCount;
9532
9533                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
9534                     my $decomposition = sprintf("%04X %04X", $L, $V);
9535                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
9536                     $file->insert_adjusted_lines(
9537                                 sprintf("%04X; Decomposition_Mapping; %s",
9538                                         $S,
9539                                         $decomposition));
9540                 }
9541             }
9542         }
9543
9544         return;
9545     }
9546
9547     sub filter_v1_ucd {
9548         # Fix UCD lines in version 1.  This is probably overkill, but this
9549         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
9550         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
9551         #       removed.  This program retains them
9552         # 2)    didn't include ranges, which it should have, and which are now
9553         #       added in @corrected_lines below.  It was hand populated by
9554         #       taking the data from Version 2, verified by analyzing
9555         #       DAge.txt.
9556         # 3)    There is a syntax error in the entry for U+09F8 which could
9557         #       cause problems for utf8_heavy, and so is changed.  It's
9558         #       numeric value was simply a minus sign, without any number.
9559         #       (Eventually Unicode changed the code point to non-numeric.)
9560         # 4)    The decomposition types often don't match later versions
9561         #       exactly, and the whole syntax of that field is different; so
9562         #       the syntax is changed as well as the types to their later
9563         #       terminology.  Otherwise normalize.pm would be very unhappy
9564         # 5)    Many ccc classes are different.  These are left intact.
9565         # 6)    U+FF10 - U+FF19 are missing their numeric values in all three
9566         #       fields.  These are unchanged because it doesn't really cause
9567         #       problems for Perl.
9568         # 7)    A number of code points, such as controls, don't have their
9569         #       Unicode Version 1 Names in this file.  These are unchanged.
9570
9571         my @corrected_lines = split /\n/, <<'END';
9572 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9573 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9574 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
9575 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
9576 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
9577 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9578 END
9579
9580         my $file = shift;
9581         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9582
9583         #local $to_trace = 1 if main::DEBUG;
9584         trace $_ if main::DEBUG && $to_trace;
9585
9586         # -1 => retain trailing null fields
9587         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9588
9589         # At the first place that is wrong in the input, insert all the
9590         # corrections, replacing the wrong line.
9591         if ($code_point eq '4E00') {
9592             my @copy = @corrected_lines;
9593             $_ = shift @copy;
9594             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9595
9596             $file->insert_lines(@copy);
9597         }
9598
9599
9600         if ($fields[$NUMERIC] eq '-') {
9601             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
9602         }
9603
9604         if  ($fields[$PERL_DECOMPOSITION] ne "") {
9605
9606             # Several entries have this change to superscript 2 or 3 in the
9607             # middle.  Convert these to the modern version, which is to use
9608             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
9609             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
9610             # 'HHHH HHHH 00B3 HHHH'.
9611             # It turns out that all of these that don't have another
9612             # decomposition defined at the beginning of the line have the
9613             # <square> decomposition in later releases.
9614             if ($code_point ne '00B2' && $code_point ne '00B3') {
9615                 if  ($fields[$PERL_DECOMPOSITION]
9616                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
9617                 {
9618                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
9619                         $fields[$PERL_DECOMPOSITION] = '<square> '
9620                         . $fields[$PERL_DECOMPOSITION];
9621                     }
9622                 }
9623             }
9624
9625             # If is like '<+circled> 0052 <-circled>', convert to
9626             # '<circled> 0052'
9627             $fields[$PERL_DECOMPOSITION] =~
9628                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
9629
9630             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
9631             $fields[$PERL_DECOMPOSITION] =~
9632                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
9633             or $fields[$PERL_DECOMPOSITION] =~
9634                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
9635             or $fields[$PERL_DECOMPOSITION] =~
9636                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
9637             or $fields[$PERL_DECOMPOSITION] =~
9638                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
9639
9640             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
9641             $fields[$PERL_DECOMPOSITION] =~
9642                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
9643
9644             # Change names to modern form.
9645             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
9646             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
9647             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
9648             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
9649
9650             # One entry has weird braces
9651             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
9652         }
9653
9654         $_ = join ';', $code_point, @fields;
9655         trace $_ if main::DEBUG && $to_trace;
9656         return;
9657     }
9658
9659     sub filter_v2_1_5_ucd {
9660         # A dozen entries in this 2.1.5 file had the mirrored and numeric
9661         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
9662         # column appears to be N, swap it back.
9663
9664         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9665         if ($fields[$NUMERIC] eq 'N') {
9666             $fields[$NUMERIC] = $fields[$MIRRORED];
9667             $fields[$MIRRORED] = 'N';
9668             $_ = join ';', $code_point, @fields;
9669         }
9670         return;
9671     }
9672 } # End closure for UnicodeData
9673
9674 sub process_GCB_test {
9675
9676     my $file = shift;
9677     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9678
9679     while ($file->next_line) {
9680         push @backslash_X_tests, $_;
9681     }
9682
9683     return;
9684 }
9685
9686 sub process_NamedSequences {
9687     # NamedSequences.txt entries are just added to an array.  Because these
9688     # don't look like the other tables, they have their own handler.
9689     # An example:
9690     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
9691     #
9692     # This just adds the sequence to an array for later handling
9693
9694     return; # XXX Until charnames catches up
9695     my $file = shift;
9696     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9697
9698     while ($file->next_line) {
9699         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
9700         if (@remainder) {
9701             $file->carp_bad_line(
9702                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
9703             next;
9704         }
9705         push @named_sequences, "$sequence\t\t$name";
9706     }
9707     return;
9708 }
9709
9710 { # Closure
9711
9712     my $first_range;
9713
9714     sub  filter_early_ea_lb {
9715         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
9716         # third field be the name of the code point, which can be ignored in
9717         # most cases.  But it can be meaningful if it marks a range:
9718         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
9719         # 3400;W;<CJK Ideograph Extension A, First>
9720         #
9721         # We need to see the First in the example above to know it's a range.
9722         # They did not use the later range syntaxes.  This routine changes it
9723         # to use the modern syntax.
9724         # $1 is the Input_file object.
9725
9726         my @fields = split /\s*;\s*/;
9727         if ($fields[2] =~ /^<.*, First>/) {
9728             $first_range = $fields[0];
9729             $_ = "";
9730         }
9731         elsif ($fields[2] =~ /^<.*, Last>/) {
9732             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
9733         }
9734         else {
9735             undef $first_range;
9736             $_ = "$fields[0]; $fields[1]";
9737         }
9738
9739         return;
9740     }
9741 }
9742
9743 sub filter_old_style_arabic_shaping {
9744     # Early versions used a different term for the later one.
9745
9746     my @fields = split /\s*;\s*/;
9747     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
9748     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
9749     $_ = join ';', @fields;
9750     return;
9751 }
9752
9753 sub filter_arabic_shaping_line {
9754     # ArabicShaping.txt has entries that look like:
9755     # 062A; TEH; D; BEH
9756     # The field containing 'TEH' is not used.  The next field is Joining_Type
9757     # and the last is Joining_Group
9758     # This generates two lines to pass on, one for each property on the input
9759     # line.
9760
9761     my $file = shift;
9762     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9763
9764     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9765
9766     if (@fields > 4) {
9767         $file->carp_bad_line('Extra fields');
9768         $_ = "";
9769         return;
9770     }
9771
9772     $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
9773     $_ = "$fields[0]; Joining_Type; $fields[2]";
9774
9775     return;
9776 }
9777
9778 sub setup_special_casing {
9779     # SpecialCasing.txt contains the non-simple case change mappings.  The
9780     # simple ones are in UnicodeData.txt, which should already have been read
9781     # in to the full property data structures, so as to initialize these with
9782     # the simple ones.  Then the SpecialCasing.txt entries overwrite the ones
9783     # which have different full mappings.
9784
9785     # This routine sees if the simple mappings are to be output, and if so,
9786     # copies what has already been put into the full mapping tables, while
9787     # they still contain only the simple mappings.
9788
9789     # The reason it is done this way is that the simple mappings are probably
9790     # not going to be output, so it saves work to initialize the full tables
9791     # with the simple mappings, and then overwrite those relatively few
9792     # entries in them that have different full mappings, and thus skip the
9793     # simple mapping tables altogether.
9794
9795     my $file= shift;
9796     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9797
9798     # For each of the case change mappings...
9799     foreach my $case ('lc', 'tc', 'uc') {
9800         my $full = property_ref($case);
9801         unless (defined $full && ! $full->is_empty) {
9802             Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
9803         }
9804
9805         # The simple version's name in each mapping merely has an 's' in front
9806         # of the full one's
9807         my $simple = property_ref('s' . $case);
9808         $simple->initialize($case) if $simple->to_output_map();
9809     }
9810
9811     return;
9812 }
9813
9814 sub filter_special_casing_line {
9815     # Change the format of $_ from SpecialCasing.txt into something that the
9816     # generic handler understands.  Each input line contains three case
9817     # mappings.  This will generate three lines to pass to the generic handler
9818     # for each of those.
9819
9820     # The input syntax (after stripping comments and trailing white space is
9821     # like one of the following (with the final two being entries that we
9822     # ignore):
9823     # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
9824     # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
9825     # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
9826     # Note the trailing semi-colon, unlike many of the input files.  That
9827     # means that there will be an extra null field generated by the split
9828
9829     my $file = shift;
9830     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9831
9832     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9833
9834     # field #4 is when this mapping is conditional.  If any of these get
9835     # implemented, it would be by hard-coding in the casing functions in the
9836     # Perl core, not through tables.  But if there is a new condition we don't
9837     # know about, output a warning.  We know about all the conditions through
9838     # 5.2
9839     if ($fields[4] ne "") {
9840         my @conditions = split ' ', $fields[4];
9841         if ($conditions[0] ne 'tr'  # We know that these languages have
9842                                     # conditions, and some are multiple
9843             && $conditions[0] ne 'az'
9844             && $conditions[0] ne 'lt'
9845
9846             # And, we know about a single condition Final_Sigma, but
9847             # nothing else.
9848             && ($v_version gt v5.2.0
9849                 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
9850         {
9851             $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
9852         }
9853         elsif ($conditions[0] ne 'Final_Sigma') {
9854
9855                 # Don't print out a message for Final_Sigma, because we have
9856                 # hard-coded handling for it.  (But the standard could change
9857                 # what the rule should be, but it wouldn't show up here
9858                 # anyway.
9859
9860                 print "# SKIPPING Special Casing: $_\n"
9861                                                     if $verbosity >= $VERBOSE;
9862         }
9863         $_ = "";
9864         return;
9865     }
9866     elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
9867         $file->carp_bad_line('Extra fields');
9868         $_ = "";
9869         return;
9870     }
9871
9872     $_ = "$fields[0]; lc; $fields[1]";
9873     $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
9874     $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
9875
9876     return;
9877 }
9878
9879 sub filter_old_style_case_folding {
9880     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
9881     # and later style.  Different letters were used in the earlier.
9882
9883     my $file = shift;
9884     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9885
9886     my @fields = split /\s*;\s*/;
9887     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
9888         $fields[1] = 'I';
9889     }
9890     elsif ($fields[1] eq 'L') {
9891         $fields[1] = 'C';             # L => C always
9892     }
9893     elsif ($fields[1] eq 'E') {
9894         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
9895             $fields[1] = 'F'
9896         }
9897         else {
9898             $fields[1] = 'C'
9899         }
9900     }
9901     else {
9902         $file->carp_bad_line("Expecting L or E in second field");
9903         $_ = "";
9904         return;
9905     }
9906     $_ = join("; ", @fields) . ';';
9907     return;
9908 }
9909
9910 { # Closure for case folding
9911
9912     # Create the map for simple only if are going to output it, for otherwise
9913     # it takes no part in anything we do.
9914     my $to_output_simple;
9915
9916     # These are experimental, perhaps will need these to pass to regcomp.c to
9917     # handle the cases where for example the Kelvin sign character folds to k,
9918     # and in regcomp, we need to know which of the characters can have a
9919     # non-latin1 char fold to it, so it doesn't do the optimizations it might
9920     # otherwise.
9921     my @latin1_singly_folded;
9922     my @latin1_folded;
9923
9924     sub setup_case_folding($) {
9925         # Read in the case foldings in CaseFolding.txt.  This handles both
9926         # simple and full case folding.
9927
9928         $to_output_simple
9929                         = property_ref('Simple_Case_Folding')->to_output_map;
9930
9931         return;
9932     }
9933
9934     sub filter_case_folding_line {
9935         # Called for each line in CaseFolding.txt
9936         # Input lines look like:
9937         # 0041; C; 0061; # LATIN CAPITAL LETTER A
9938         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
9939         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
9940         #
9941         # 'C' means that folding is the same for both simple and full
9942         # 'F' that it is only for full folding
9943         # 'S' that it is only for simple folding
9944         # 'T' is locale-dependent, and ignored
9945         # 'I' is a type of 'F' used in some early releases.
9946         # Note the trailing semi-colon, unlike many of the input files.  That
9947         # means that there will be an extra null field generated by the split
9948         # below, which we ignore and hence is not an error.
9949
9950         my $file = shift;
9951         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9952
9953         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
9954         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
9955             $file->carp_bad_line('Extra fields');
9956             $_ = "";
9957             return;
9958         }
9959
9960         if ($type eq 'T') {   # Skip Turkic case folding, is locale dependent
9961             $_ = "";
9962             return;
9963         }
9964
9965         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
9966         # I are all full foldings
9967         if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
9968             $_ = "$range; Case_Folding; $map";
9969         }
9970         else {
9971             $_ = "";
9972             if ($type ne 'S') {
9973                $file->carp_bad_line('Expecting C F I S or T in second field');
9974                return;
9975             }
9976         }
9977
9978         # C and S are simple foldings, but simple case folding is not needed
9979         # unless we explicitly want its map table output.
9980         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
9981             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
9982         }
9983
9984         # Experimental, see comment above
9985         if ($type ne 'S' && hex($range) >= 256) {   # assumes range is 1 point
9986             my @folded = split ' ', $map;
9987             if (hex $folded[0] < 256 && @folded == 1) {
9988                 push @latin1_singly_folded, hex $folded[0];
9989             }
9990             foreach my $folded (@folded) {
9991                 push @latin1_folded, hex $folded if hex $folded < 256;
9992             }
9993         }
9994
9995         return;
9996     }
9997
9998     sub post_fold {
9999         # Experimental, see comment above
10000         return;
10001
10002         #local $to_trace = 1 if main::DEBUG;
10003         @latin1_singly_folded = uniques(@latin1_singly_folded);
10004         @latin1_folded = uniques(@latin1_folded);
10005         trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
10006         trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
10007         return;
10008     }
10009 } # End case fold closure
10010
10011 sub filter_jamo_line {
10012     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
10013     # from this file that is used in generating the Name property for Jamo
10014     # code points.  But, it also is used to convert early versions' syntax
10015     # into the modern form.  Here are two examples:
10016     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
10017     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
10018     #
10019     # The input is $_, the output is $_ filtered.
10020
10021     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
10022
10023     # Let the caller handle unexpected input.  In earlier versions, there was
10024     # a third field which is supposed to be a comment, but did not have a '#'
10025     # before it.
10026     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10027
10028     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
10029                                 # beginning.
10030
10031     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
10032     $fields[1] = 'R' if $fields[0] eq '1105';
10033
10034     # Add to structure so can generate Names from it.
10035     my $cp = hex $fields[0];
10036     my $short_name = $fields[1];
10037     $Jamo{$cp} = $short_name;
10038     if ($cp <= $LBase + $LCount) {
10039         $Jamo_L{$short_name} = $cp - $LBase;
10040     }
10041     elsif ($cp <= $VBase + $VCount) {
10042         $Jamo_V{$short_name} = $cp - $VBase;
10043     }
10044     elsif ($cp <= $TBase + $TCount) {
10045         $Jamo_T{$short_name} = $cp - $TBase;
10046     }
10047     else {
10048         Carp::my_carp_bug("Unexpected Jamo code point in $_");
10049     }
10050
10051
10052     # Reassemble using just the first two fields to look like a typical
10053     # property file line
10054     $_ = "$fields[0]; $fields[1]";
10055
10056     return;
10057 }
10058
10059 sub register_fraction($) {
10060     # This registers the input rational number so that it can be passed on to
10061     # utf8_heavy.pl, both in rational and floating forms.
10062
10063     my $rational = shift;
10064
10065     my $float = eval $rational;
10066     $nv_floating_to_rational{$float} = $rational;
10067     return;
10068 }
10069
10070 sub filter_numeric_value_line {
10071     # DNumValues contains lines of a different syntax than the typical
10072     # property file:
10073     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
10074     #
10075     # This routine transforms $_ containing the anomalous syntax to the
10076     # typical, by filtering out the extra columns, and convert early version
10077     # decimal numbers to strings that look like rational numbers.
10078
10079     my $file = shift;
10080     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10081
10082     # Starting in 5.1, there is a rational field.  Just use that, omitting the
10083     # extra columns.  Otherwise convert the decimal number in the second field
10084     # to a rational, and omit extraneous columns.
10085     my @fields = split /\s*;\s*/, $_, -1;
10086     my $rational;
10087
10088     if ($v_version ge v5.1.0) {
10089         if (@fields != 4) {
10090             $file->carp_bad_line('Not 4 semi-colon separated fields');
10091             $_ = "";
10092             return;
10093         }
10094         $rational = $fields[3];
10095         $_ = join '; ', @fields[ 0, 3 ];
10096     }
10097     else {
10098
10099         # Here, is an older Unicode file, which has decimal numbers instead of
10100         # rationals in it.  Use the fraction to calculate the denominator and
10101         # convert to rational.
10102
10103         if (@fields != 2 && @fields != 3) {
10104             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10105             $_ = "";
10106             return;
10107         }
10108
10109         my $codepoints = $fields[0];
10110         my $decimal = $fields[1];
10111         if ($decimal =~ s/\.0+$//) {
10112
10113             # Anything ending with a decimal followed by nothing but 0's is an
10114             # integer
10115             $_ = "$codepoints; $decimal";
10116             $rational = $decimal;
10117         }
10118         else {
10119
10120             my $denominator;
10121             if ($decimal =~ /\.50*$/) {
10122                 $denominator = 2;
10123             }
10124
10125             # Here have the hardcoded repeating decimals in the fraction, and
10126             # the denominator they imply.  There were only a few denominators
10127             # in the older Unicode versions of this file which this code
10128             # handles, so it is easy to convert them.
10129
10130             # The 4 is because of a round-off error in the Unicode 3.2 files
10131             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10132                 $denominator = 3;
10133             }
10134             elsif ($decimal =~ /\.[27]50*$/) {
10135                 $denominator = 4;
10136             }
10137             elsif ($decimal =~ /\.[2468]0*$/) {
10138                 $denominator = 5;
10139             }
10140             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10141                 $denominator = 6;
10142             }
10143             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10144                 $denominator = 8;
10145             }
10146             if ($denominator) {
10147                 my $sign = ($decimal < 0) ? "-" : "";
10148                 my $numerator = int((abs($decimal) * $denominator) + .5);
10149                 $rational = "$sign$numerator/$denominator";
10150                 $_ = "$codepoints; $rational";
10151             }
10152             else {
10153                 $file->carp_bad_line("Can't cope with number '$decimal'.");
10154                 $_ = "";
10155                 return;
10156             }
10157         }
10158     }
10159
10160     register_fraction($rational) if $rational =~ qr{/};
10161     return;
10162 }
10163
10164 { # Closure
10165     my %unihan_properties;
10166     my $iicore;
10167
10168
10169     sub setup_unihan {
10170         # Do any special setup for Unihan properties.
10171
10172         # This property gives the wrong computed type, so override.
10173         my $usource = property_ref('kIRG_USource');
10174         $usource->set_type($STRING) if defined $usource;
10175
10176         # This property is to be considered binary, so change all the values
10177         # to Y.
10178         $iicore = property_ref('kIICore');
10179         if (defined $iicore) {
10180             $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
10181
10182             # We have to change the default map, because the @missing line is
10183             # misleading, given that we are treating it as binary.
10184             $iicore->set_default_map('N');
10185             $iicore->set_type($BINARY);
10186         }
10187
10188         return;
10189     }
10190
10191     sub filter_unihan_line {
10192         # Change unihan db lines to look like the others in the db.  Here is
10193         # an input sample:
10194         #   U+341C        kCangjie        IEKN
10195
10196         # Tabs are used instead of semi-colons to separate fields; therefore
10197         # they may have semi-colons embedded in them.  Change these to periods
10198         # so won't screw up the rest of the code.
10199         s/;/./g;
10200
10201         # Remove lines that don't look like ones we accept.
10202         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10203             $_ = "";
10204             return;
10205         }
10206
10207         # Extract the property, and save a reference to its object.
10208         my $property = $1;
10209         if (! exists $unihan_properties{$property}) {
10210             $unihan_properties{$property} = property_ref($property);
10211         }
10212
10213         # Don't do anything unless the property is one we're handling, which
10214         # we determine by seeing if there is an object defined for it or not
10215         if (! defined $unihan_properties{$property}) {
10216             $_ = "";
10217             return;
10218         }
10219
10220         # The iicore property is supposed to be a boolean, so convert to our
10221         # standard boolean form.
10222         if (defined $iicore && $unihan_properties{$property} == $iicore) {
10223             $_ =~ s/$property.*/$property\tY/
10224         }
10225
10226         # Convert the tab separators to our standard semi-colons, and convert
10227         # the U+HHHH notation to the rest of the standard's HHHH
10228         s/\t/;/g;
10229         s/\b U \+ (?= $code_point_re )//xg;
10230
10231         #local $to_trace = 1 if main::DEBUG;
10232         trace $_ if main::DEBUG && $to_trace;
10233
10234         return;
10235     }
10236 }
10237
10238 sub filter_blocks_lines {
10239     # In the Blocks.txt file, the names of the blocks don't quite match the
10240     # names given in PropertyValueAliases.txt, so this changes them so they
10241     # do match:  Blanks and hyphens are changed into underscores.  Also makes
10242     # early release versions look like later ones
10243     #
10244     # $_ is transformed to the correct value.
10245
10246     my $file = shift;
10247         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10248
10249     if ($v_version lt v3.2.0) {
10250         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10251             $_ = "";
10252             return;
10253         }
10254
10255         # Old versions used a different syntax to mark the range.
10256         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10257     }
10258
10259     my @fields = split /\s*;\s*/, $_, -1;
10260     if (@fields != 2) {
10261         $file->carp_bad_line("Expecting exactly two fields");
10262         $_ = "";
10263         return;
10264     }
10265
10266     # Change hyphens and blanks in the block name field only
10267     $fields[1] =~ s/[ -]/_/g;
10268     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
10269
10270     $_ = join("; ", @fields);
10271     return;
10272 }
10273
10274 { # Closure
10275     my $current_property;
10276
10277     sub filter_old_style_proplist {
10278         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
10279         # was in a completely different syntax.  Ken Whistler of Unicode says
10280         # that it was something he used as an aid for his own purposes, but
10281         # was never an official part of the standard.  However, comments in
10282         # DAge.txt indicate that non-character code points were available in
10283         # the UCD as of 3.1.  It is unclear to me (khw) how they could be
10284         # there except through this file (but on the other hand, they first
10285         # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10286         # not.  But the claim is that it was published as an aid to others who
10287         # might want some more information than was given in the official UCD
10288         # of the time.  Many of the properties in it were incorporated into
10289         # the later PropList.txt, but some were not.  This program uses this
10290         # early file to generate property tables that are otherwise not
10291         # accessible in the early UCD's, and most were probably not really
10292         # official at that time, so one could argue that it should be ignored,
10293         # and you can easily modify things to skip this.  And there are bugs
10294         # in this file in various versions.  (For example, the 2.1.9 version
10295         # removes from Alphabetic the CJK range starting at 4E00, and they
10296         # weren't added back in until 3.1.0.)  Many of this file's properties
10297         # were later sanctioned, so this code generates tables for those
10298         # properties that aren't otherwise in the UCD of the time but
10299         # eventually did become official, and throws away the rest.  Here is a
10300         # list of all the ones that are thrown away:
10301         #   Bidi=*                       duplicates UnicodeData.txt
10302         #   Combining                    never made into official property;
10303         #                                is \P{ccc=0}
10304         #   Composite                    never made into official property.
10305         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
10306         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
10307         #   Delimiter                    never made into official property;
10308         #                                removed in 3.0.1
10309         #   Format Control               never made into official property;
10310         #                                similar to gc=cf
10311         #   High Surrogate               duplicates Blocks.txt
10312         #   Ignorable Control            never made into official property;
10313         #                                similar to di=y
10314         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
10315         #   Left of Pair                 never made into official property;
10316         #   Line Separator               duplicates UnicodeData.txt: gc=zl
10317         #   Low Surrogate                duplicates Blocks.txt
10318         #   Non-break                    was actually listed as a property
10319         #                                in 3.2, but without any code
10320         #                                points.  Unicode denies that this
10321         #                                was ever an official property
10322         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
10323         #   Numeric                      duplicates UnicodeData.txt: gc=cc
10324         #   Paired Punctuation           never made into official property;
10325         #                                appears to be gc=ps + gc=pe
10326         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
10327         #   Private Use                  duplicates UnicodeData.txt: gc=co
10328         #   Private Use High Surrogate   duplicates Blocks.txt
10329         #   Punctuation                  duplicates UnicodeData.txt: gc=p
10330         #   Space                        different definition than eventual
10331         #                                one.
10332         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
10333         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cc
10334         #   Zero-width                   never made into offical property;
10335         #                                subset of gc=cf
10336         # Most of the properties have the same names in this file as in later
10337         # versions, but a couple do not.
10338         #
10339         # This subroutine filters $_, converting it from the old style into
10340         # the new style.  Here's a sample of the old-style
10341         #
10342         #   *******************************************
10343         #
10344         #   Property dump for: 0x100000A0 (Join Control)
10345         #
10346         #   200C..200D  (2 chars)
10347         #
10348         # In the example, the property is "Join Control".  It is kept in this
10349         # closure between calls to the subroutine.  The numbers beginning with
10350         # 0x were internal to Ken's program that generated this file.
10351
10352         # If this line contains the property name, extract it.
10353         if (/^Property dump for: [^(]*\((.*)\)/) {
10354             $_ = $1;
10355
10356             # Convert white space to underscores.
10357             s/ /_/g;
10358
10359             # Convert the few properties that don't have the same name as
10360             # their modern counterparts
10361             s/Identifier_Part/ID_Continue/
10362             or s/Not_a_Character/NChar/;
10363
10364             # If the name matches an existing property, use it.
10365             if (defined property_ref($_)) {
10366                 trace "new property=", $_ if main::DEBUG && $to_trace;
10367                 $current_property = $_;
10368             }
10369             else {        # Otherwise discard it
10370                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10371                 undef $current_property;
10372             }
10373             $_ = "";    # The property is saved for the next lines of the
10374                         # file, but this defining line is of no further use,
10375                         # so clear it so that the caller won't process it
10376                         # further.
10377         }
10378         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10379
10380             # Here, the input line isn't a header defining a property for the
10381             # following section, and either we aren't in such a section, or
10382             # the line doesn't look like one that defines the code points in
10383             # such a section.  Ignore this line.
10384             $_ = "";
10385         }
10386         else {
10387
10388             # Here, we have a line defining the code points for the current
10389             # stashed property.  Anything starting with the first blank is
10390             # extraneous.  Otherwise, it should look like a normal range to
10391             # the caller.  Append the property name so that it looks just like
10392             # a modern PropList entry.
10393
10394             $_ =~ s/\s.*//;
10395             $_ .= "; $current_property";
10396         }
10397         trace $_ if main::DEBUG && $to_trace;
10398         return;
10399     }
10400 } # End closure for old style proplist
10401
10402 sub filter_old_style_normalization_lines {
10403     # For early releases of Unicode, the lines were like:
10404     #        74..2A76    ; NFKD_NO
10405     # For later releases this became:
10406     #        74..2A76    ; NFKD_QC; N
10407     # Filter $_ to look like those in later releases.
10408     # Similarly for MAYBEs
10409
10410     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10411
10412     # Also, the property FC_NFKC was abbreviated to FNC
10413     s/FNC/FC_NFKC/;
10414     return;
10415 }
10416
10417 sub finish_Unicode() {
10418     # This routine should be called after all the Unicode files have been read
10419     # in.  It:
10420     # 1) Adds the mappings for code points missing from the files which have
10421     #    defaults specified for them.
10422     # 2) At this this point all mappings are known, so it computes the type of
10423     #    each property whose type hasn't been determined yet.
10424     # 3) Calculates all the regular expression match tables based on the
10425     #    mappings.
10426     # 3) Calculates and adds the tables which are defined by Unicode, but
10427     #    which aren't derived by them
10428
10429     # For each property, fill in any missing mappings, and calculate the re
10430     # match tables.  If a property has more than one missing mapping, the
10431     # default is a reference to a data structure, and requires data from other
10432     # properties to resolve.  The sort is used to cause these to be processed
10433     # last, after all the other properties have been calculated.
10434     # (Fortunately, the missing properties so far don't depend on each other.)
10435     foreach my $property
10436         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10437         property_ref('*'))
10438     {
10439         # $perl has been defined, but isn't one of the Unicode properties that
10440         # need to be finished up.
10441         next if $property == $perl;
10442
10443         # Handle the properties that have more than one possible default
10444         if (ref $property->default_map) {
10445             my $default_map = $property->default_map;
10446
10447             # These properties have stored in the default_map:
10448             # One or more of:
10449             #   1)  A default map which applies to all code points in a
10450             #       certain class
10451             #   2)  an expression which will evaluate to the list of code
10452             #       points in that class
10453             # And
10454             #   3) the default map which applies to every other missing code
10455             #      point.
10456             #
10457             # Go through each list.
10458             while (my ($default, $eval) = $default_map->get_next_defaults) {
10459
10460                 # Get the class list, and intersect it with all the so-far
10461                 # unspecified code points yielding all the code points
10462                 # in the class that haven't been specified.
10463                 my $list = eval $eval;
10464                 if ($@) {
10465                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10466                     last;
10467                 }
10468
10469                 # Narrow down the list to just those code points we don't have
10470                 # maps for yet.
10471                 $list = $list & $property->inverse_list;
10472
10473                 # Add mappings to the property for each code point in the list
10474                 foreach my $range ($list->ranges) {
10475                     $property->add_map($range->start, $range->end, $default);
10476                 }
10477             }
10478
10479             # All remaining code points have the other mapping.  Set that up
10480             # so the normal single-default mapping code will work on them
10481             $property->set_default_map($default_map->other_default);
10482
10483             # And fall through to do that
10484         }
10485
10486         # We should have enough data now to compute the type of the property.
10487         $property->compute_type;
10488         my $property_type = $property->type;
10489
10490         next if ! $property->to_create_match_tables;
10491
10492         # Here want to create match tables for this property
10493
10494         # The Unicode db always (so far, and they claim into the future) have
10495         # the default for missing entries in binary properties be 'N' (unless
10496         # there is a '@missing' line that specifies otherwise)
10497         if ($property_type == $BINARY && ! defined $property->default_map) {
10498             $property->set_default_map('N');
10499         }
10500
10501         # Add any remaining code points to the mapping, using the default for
10502         # missing code points
10503         if (defined (my $default_map = $property->default_map)) {
10504             foreach my $range ($property->inverse_list->ranges) {
10505                 $property->add_map($range->start, $range->end, $default_map);
10506             }
10507
10508             # Make sure there is a match table for the default
10509             if (! defined $property->table($default_map)) {
10510                 $property->add_match_table($default_map);
10511             }
10512         }
10513
10514         # Have all we need to populate the match tables.
10515         my $property_name = $property->name;
10516         foreach my $range ($property->ranges) {
10517             my $map = $range->value;
10518             my $table = property_ref($property_name)->table($map);
10519             if (! defined $table) {
10520
10521                 # Integral and rational property values are not necessarily
10522                 # defined in PropValueAliases, but all other ones should be,
10523                 # starting in 5.1
10524                 if ($v_version ge v5.1.0
10525                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
10526                 {
10527                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
10528                 }
10529                 $table = property_ref($property_name)->add_match_table($map);
10530             }
10531
10532             $table->add_range($range->start, $range->end);
10533         }
10534
10535         # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
10536         # all properties have this optional prefix.  These do not get a
10537         # separate entry in the pod file, because are covered by a wild-card
10538         # entry
10539         foreach my $alias ($property->aliases) {
10540             my $Is_name = 'Is_' . $alias->name;
10541             if (! defined (my $pre_existing = property_ref($Is_name))) {
10542                 $property->add_alias($Is_name,
10543                                      Pod_Entry => 0,
10544                                      Status => $alias->status,
10545                                      Externally_Ok => 0);
10546             }
10547             else {
10548
10549                 # It seemed too much work to add in these warnings when it
10550                 # appears that Unicode has made a decision never to begin a
10551                 # property name with 'Is_', so this shouldn't happen, but just
10552                 # in case, it is a warning.
10553                 Carp::my_carp(<<END
10554 There is already an alias named $Is_name (from " . $pre_existing . "), so not
10555 creating this alias for $property.  The generated table and pod files do not
10556 warn users of this conflict.
10557 END
10558                 );
10559                 $has_Is_conflicts++;
10560             }
10561         } # End of loop through aliases for this property
10562     } # End of loop through all Unicode properties.
10563
10564     # Fill in the mappings that Unicode doesn't completely furnish.  First the
10565     # single letter major general categories.  If Unicode were to start
10566     # delivering the values, this would be redundant, but better that than to
10567     # try to figure out if should skip and not get it right.  Ths could happen
10568     # if a new major category were to be introduced, and the hard-coded test
10569     # wouldn't know about it.
10570     # This routine depends on the standard names for the general categories
10571     # being what it thinks they are, like 'Cn'.  The major categories are the
10572     # union of all the general category tables which have the same first
10573     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
10574     foreach my $minor_table ($gc->tables) {
10575         my $minor_name = $minor_table->name;
10576         next if length $minor_name == 1;
10577         if (length $minor_name != 2) {
10578             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
10579             next;
10580         }
10581
10582         my $major_name = uc(substr($minor_name, 0, 1));
10583         my $major_table = $gc->table($major_name);
10584         $major_table += $minor_table;
10585     }
10586
10587     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
10588     # defines it as LC)
10589     my $LC = $gc->table('LC');
10590     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
10591     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
10592
10593
10594     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
10595                          # deliver the correct values in it
10596         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
10597
10598         # Lt not in release 1.
10599         $LC += $gc->table('Lt') if defined $gc->table('Lt');
10600     }
10601     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
10602
10603     my $Cs = $gc->table('Cs');
10604     if (defined $Cs) {
10605         $Cs->add_note('Mostly not usable in Perl.');
10606         $Cs->add_comment(join_lines(<<END
10607 Surrogates are used exclusively for I/O in UTF-16, and should not appear in
10608 Unicode text, and hence their use will generate (usually fatal) messages
10609 END
10610         ));
10611     }
10612
10613
10614     # Folding information was introduced later into Unicode data.  To get
10615     # Perl's case ignore (/i) to work at all in releases that don't have
10616     # folding, use the best available alternative, which is lower casing.
10617     my $fold = property_ref('Simple_Case_Folding');
10618     if ($fold->is_empty) {
10619         $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
10620         $fold->add_note(join_lines(<<END
10621 WARNING: This table uses lower case as a substitute for missing fold
10622 information
10623 END
10624         ));
10625     }
10626
10627     # Multiple-character mapping was introduced later into Unicode data.  If
10628     # missing, use the single-characters maps as best available alternative
10629     foreach my $map (qw {   Uppercase_Mapping
10630                             Lowercase_Mapping
10631                             Titlecase_Mapping
10632                             Case_Folding
10633                         } ) {
10634         my $full = property_ref($map);
10635         if ($full->is_empty) {
10636             my $simple = property_ref('Simple_' . $map);
10637             $full->initialize($simple);
10638             $full->add_comment($simple->comment) if ($simple->comment);
10639             $full->add_note(join_lines(<<END
10640 WARNING: This table uses simple mapping (single-character only) as a
10641 substitute for missing multiple-character information
10642 END
10643             ));
10644         }
10645     }
10646     return
10647 }
10648
10649 sub compile_perl() {
10650     # Create perl-defined tables.  Almost all are part of the pseudo-property
10651     # named 'perl' internally to this program.  Many of these are recommended
10652     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
10653     # on those found there.
10654     # Almost all of these are equivalent to some Unicode property.
10655     # A number of these properties have equivalents restricted to the ASCII
10656     # range, with their names prefaced by 'Posix', to signify that these match
10657     # what the Posix standard says they should match.  A couple are
10658     # effectively this, but the name doesn't have 'Posix' in it because there
10659     # just isn't any Posix equivalent.
10660
10661     # 'Any' is all code points.  As an error check, instead of just setting it
10662     # to be that, construct it to be the union of all the major categories
10663     my $Any = $perl->add_match_table('Any',
10664             Description  => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
10665             Matches_All => 1);
10666
10667     foreach my $major_table ($gc->tables) {
10668
10669         # Major categories are the ones with single letter names.
10670         next if length($major_table->name) != 1;
10671
10672         $Any += $major_table;
10673     }
10674
10675     if ($Any->max != $LAST_UNICODE_CODEPOINT) {
10676         Carp::my_carp_bug("Generated highest code point ("
10677            . sprintf("%X", $Any->max)
10678            . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
10679     }
10680     if ($Any->range_count != 1 || $Any->min != 0) {
10681      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
10682     }
10683
10684     $Any->add_alias('All');
10685
10686     # Assigned is the opposite of gc=unassigned
10687     my $Assigned = $perl->add_match_table('Assigned',
10688                                 Description  => "All assigned code points",
10689                                 Initialize => ~ $gc->table('Unassigned'),
10690                                 );
10691
10692     # Our internal-only property should be treated as more than just a
10693     # synonym.
10694     $perl->add_match_table('_CombAbove')
10695             ->set_equivalent_to(property_ref('ccc')->table('Above'),
10696                                                                 Related => 1);
10697
10698     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
10699     if (defined $block) {   # This is equivalent to the block if have it.
10700         my $Unicode_ASCII = $block->table('Basic_Latin');
10701         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
10702             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
10703         }
10704     }
10705
10706     # Very early releases didn't have blocks, so initialize ASCII ourselves if
10707     # necessary
10708     if ($ASCII->is_empty) {
10709         $ASCII->initialize([ 0..127 ]);
10710     }
10711
10712     # Get the best available case definitions.  Early Unicode versions didn't
10713     # have Uppercase and Lowercase defined, so use the general category
10714     # instead for them.
10715     my $Lower = $perl->add_match_table('Lower');
10716     my $Unicode_Lower = property_ref('Lowercase');
10717     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
10718         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
10719     }
10720     else {
10721         $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
10722                                                                 Related => 1);
10723     }
10724     $perl->add_match_table("PosixLower",
10725                             Description => "[a-z]",
10726                             Initialize => $Lower & $ASCII,
10727                             );
10728
10729     my $Upper = $perl->add_match_table('Upper');
10730     my $Unicode_Upper = property_ref('Uppercase');
10731     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
10732         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
10733     }
10734     else {
10735         $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
10736                                                                 Related => 1);
10737     }
10738     $perl->add_match_table("PosixUpper",
10739                             Description => "[A-Z]",
10740                             Initialize => $Upper & $ASCII,
10741                             );
10742
10743     # Earliest releases didn't have title case.  Initialize it to empty if not
10744     # otherwise present
10745     my $Title = $perl->add_match_table('Title');
10746     my $lt = $gc->table('Lt');
10747     if (defined $lt) {
10748         $Title->set_equivalent_to($lt, Related => 1);
10749     }
10750
10751     # If this Unicode version doesn't have Cased, set up our own.  From
10752     # Unicode 5.1: Definition D120: A character C is defined to be cased if
10753     # and only if C has the Lowercase or Uppercase property or has a
10754     # General_Category value of Titlecase_Letter.
10755     unless (defined property_ref('Cased')) {
10756         my $cased = $perl->add_match_table('Cased',
10757                         Initialize => $Lower + $Upper + $Title,
10758                         Description => 'Uppercase or Lowercase or Titlecase',
10759                         );
10760     }
10761
10762     # Similarly, set up our own Case_Ignorable property if this Unicode
10763     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
10764     # C is defined to be case-ignorable if C has the value MidLetter or the
10765     # value MidNumLet for the Word_Break property or its General_Category is
10766     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
10767     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10768
10769     # Perl has long had an internal-only alias for this property.
10770     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
10771     my $case_ignorable = property_ref('Case_Ignorable');
10772     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
10773         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
10774                                                                 Related => 1);
10775     }
10776     else {
10777
10778         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
10779
10780         # The following three properties are not in early releases
10781         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
10782         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
10783         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
10784
10785         # For versions 4.1 - 5.0, there is no MidNumLet property, and
10786         # correspondingly the case-ignorable definition lacks that one.  For
10787         # 4.0, it appears that it was meant to be the same definition, but was
10788         # inadvertently omitted from the standard's text, so add it if the
10789         # property actually is there
10790         my $wb = property_ref('Word_Break');
10791         if (defined $wb) {
10792             my $midlet = $wb->table('MidLetter');
10793             $perl_case_ignorable += $midlet if defined $midlet;
10794             my $midnumlet = $wb->table('MidNumLet');
10795             $perl_case_ignorable += $midnumlet if defined $midnumlet;
10796         }
10797         else {
10798
10799             # In earlier versions of the standard, instead of the above two
10800             # properties , just the following characters were used:
10801             $perl_case_ignorable +=  0x0027  # APOSTROPHE
10802                                 +   0x00AD  # SOFT HYPHEN (SHY)
10803                                 +   0x2019; # RIGHT SINGLE QUOTATION MARK
10804         }
10805     }
10806
10807     # The remaining perl defined tables are mostly based on Unicode TR 18,
10808     # "Annex C: Compatibility Properties".  All of these have two versions,
10809     # one whose name generally begins with Posix that is posix-compliant, and
10810     # one that matches Unicode characters beyond the Posix, ASCII range
10811
10812     my $Alpha = $perl->add_match_table('Alpha');
10813
10814     # Alphabetic was not present in early releases
10815     my $Alphabetic = property_ref('Alphabetic');
10816     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
10817         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
10818     }
10819     else {
10820
10821         # For early releases, we don't get it exactly right.  The below
10822         # includes more than it should, which in 5.2 terms is: L + Nl +
10823         # Other_Alphabetic.  Other_Alphabetic contains many characters from
10824         # Mn and Mc.  It's better to match more than we should, than less than
10825         # we should.
10826         $Alpha->initialize($gc->table('Letter')
10827                             + $gc->table('Mn')
10828                             + $gc->table('Mc'));
10829         $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
10830         $Alpha->add_description('Alphabetic');
10831     }
10832     $perl->add_match_table("PosixAlpha",
10833                             Description => "[A-Za-z]",
10834                             Initialize => $Alpha & $ASCII,
10835                             );
10836
10837     my $Alnum = $perl->add_match_table('Alnum',
10838                         Description => 'Alphabetic and (Decimal) Numeric',
10839                         Initialize => $Alpha + $gc->table('Decimal_Number'),
10840                         );
10841     $perl->add_match_table("PosixAlnum",
10842                             Description => "[A-Za-z0-9]",
10843                             Initialize => $Alnum & $ASCII,
10844                             );
10845
10846     my $Word = $perl->add_match_table('Word',
10847                                 Description => '\w, including beyond ASCII',
10848                                 Initialize => $Alnum + $gc->table('Mark'),
10849                                 );
10850     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
10851     $Word += $Pc if defined $Pc;
10852
10853     # This is a Perl extension, so the name doesn't begin with Posix.
10854     $perl->add_match_table('PerlWord',
10855                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
10856                     Initialize => $Word & $ASCII,
10857                     );
10858
10859     my $Blank = $perl->add_match_table('Blank',
10860                                 Description => '\h, Horizontal white space',
10861
10862                                 # 200B is Zero Width Space which is for line
10863                                 # break control, and was listed as
10864                                 # Space_Separator in early releases
10865                                 Initialize => $gc->table('Space_Separator')
10866                                             +   0x0009  # TAB
10867                                             -   0x200B, # ZWSP
10868                                 );
10869     $Blank->add_alias('HorizSpace');        # Another name for it.
10870     $perl->add_match_table("PosixBlank",
10871                             Description => "\\t and ' '",
10872                             Initialize => $Blank & $ASCII,
10873                             );
10874
10875     my $VertSpace = $perl->add_match_table('VertSpace',
10876                             Description => '\v',
10877                             Initialize => $gc->table('Line_Separator')
10878                                         + $gc->table('Paragraph_Separator')
10879                                         + 0x000A  # LINE FEED
10880                                         + 0x000B  # VERTICAL TAB
10881                                         + 0x000C  # FORM FEED
10882                                         + 0x000D  # CARRIAGE RETURN
10883                                         + 0x0085, # NEL
10884                             );
10885     # No Posix equivalent for vertical space
10886
10887     my $Space = $perl->add_match_table('Space',
10888                 Description => '\s including beyond ASCII plus vertical tab',
10889                 Initialize => $Blank + $VertSpace,
10890     );
10891     $perl->add_match_table("PosixSpace",
10892                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
10893                             Initialize => $Space & $ASCII,
10894                             );
10895
10896     # Perl's traditional space doesn't include Vertical Tab
10897     my $SpacePerl = $perl->add_match_table('SpacePerl',
10898                                   Description => '\s, including beyond ASCII',
10899                                   Initialize => $Space - 0x000B,
10900                                 );
10901     $perl->add_match_table('PerlSpace',
10902                             Description => '\s, restricted to ASCII',
10903                             Initialize => $SpacePerl & $ASCII,
10904                             );
10905
10906     my $Cntrl = $perl->add_match_table('Cntrl',
10907                                         Description => 'Control characters');
10908     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
10909     $perl->add_match_table("PosixCntrl",
10910                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
10911                             Initialize => $Cntrl & $ASCII,
10912                             );
10913
10914     # $controls is a temporary used to construct Graph.
10915     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
10916                                                 + $gc->table('Control'));
10917     # Cs not in release 1
10918     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
10919
10920     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
10921     my $Graph = $perl->add_match_table('Graph',
10922                         Description => 'Characters that are graphical',
10923                         Initialize => ~ ($Space + $controls),
10924                         );
10925     $perl->add_match_table("PosixGraph",
10926                             Description =>
10927                                 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
10928                             Initialize => $Graph & $ASCII,
10929                             );
10930
10931     my $Print = $perl->add_match_table('Print',
10932                         Description => 'Characters that are graphical plus space characters (but no controls)',
10933                         Initialize => $Blank + $Graph - $gc->table('Control'),
10934                         );
10935     $perl->add_match_table("PosixPrint",
10936                             Description =>
10937                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
10938                             Initialize => $Print & $ASCII,
10939                             );
10940
10941     my $Punct = $perl->add_match_table('Punct');
10942     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
10943
10944     # \p{punct} doesn't include the symbols, which posix does
10945     $perl->add_match_table('PosixPunct',
10946         Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
10947         Initialize => $ASCII & ($gc->table('Punctuation')
10948                                 + $gc->table('Symbol')),
10949         );
10950
10951     my $Digit = $perl->add_match_table('Digit',
10952                             Description => '\d, extended beyond just [0-9]');
10953     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
10954     my $PosixDigit = $perl->add_match_table("PosixDigit",
10955                                             Description => '[0-9]',
10956                                             Initialize => $Digit & $ASCII,
10957                                             );
10958
10959     # Hex_Digit was not present in first release
10960     my $Xdigit = $perl->add_match_table('XDigit');
10961     my $Hex = property_ref('Hex_Digit');
10962     if (defined $Hex && ! $Hex->is_empty) {
10963         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
10964     }
10965     else {
10966         # (Have to use hex instead of e.g. '0', because could be running on an
10967         # non-ASCII machine, and we want the Unicode (ASCII) values)
10968         $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
10969                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
10970         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
10971     }
10972
10973     my $dt = property_ref('Decomposition_Type');
10974     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
10975         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
10976         Perl_Extension => 1,
10977         Note => 'Union of all non-canonical decompositions',
10978         );
10979
10980     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
10981     # than SD appeared, construct it ourselves, based on the first release SD
10982     # was in.
10983     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
10984     my $soft_dotted = property_ref('Soft_Dotted');
10985     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
10986         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
10987     }
10988     else {
10989
10990         # This list came from 3.2 Soft_Dotted.
10991         $CanonDCIJ->initialize([ 0x0069,
10992                                  0x006A,
10993                                  0x012F,
10994                                  0x0268,
10995                                  0x0456,
10996                                  0x0458,
10997                                  0x1E2D,
10998                                  0x1ECB,
10999                                ]);
11000         $CanonDCIJ = $CanonDCIJ & $Assigned;
11001     }
11002
11003     # These are used in Unicode's definition of \X
11004     my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11005     my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11006
11007     my $gcb = property_ref('Grapheme_Cluster_Break');
11008
11009     # The 'extended' grapheme cluster came in 5.1.  The non-extended
11010     # definition differs too much from the traditional Perl one to use.
11011     if (defined $gcb && defined $gcb->table('SpacingMark')) {
11012
11013         # Note that assumes HST is defined; it came in an earlier release than
11014         # GCB.  In the line below, two negatives means: yes hangul
11015         $begin += ~ property_ref('Hangul_Syllable_Type')
11016                                                     ->table('Not_Applicable')
11017                + ~ ($gcb->table('Control')
11018                     + $gcb->table('CR')
11019                     + $gcb->table('LF'));
11020         $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
11021
11022         $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
11023         $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
11024     }
11025     else {    # Old definition, used on early releases.
11026         $extend += $gc->table('Mark')
11027                 + 0x200C    # ZWNJ
11028                 + 0x200D;   # ZWJ
11029         $begin += ~ $extend;
11030
11031         # Here we may have a release that has the regular grapheme cluster
11032         # defined, or a release that doesn't have anything defined.
11033         # We set things up so the Perl core degrades gracefully, possibly with
11034         # placeholders that match nothing.
11035
11036         if (! defined $gcb) {
11037             $gcb = Property->new('GCB', Status => $PLACEHOLDER);
11038         }
11039         my $hst = property_ref('HST');
11040         if (!defined $hst) {
11041             $hst = Property->new('HST', Status => $PLACEHOLDER);
11042             $hst->add_match_table('Not_Applicable',
11043                                 Initialize => $Any,
11044                                 Matches_All => 1);
11045         }
11046
11047         # On some releases, here we may not have the needed tables for the
11048         # perl core, in some releases we may.
11049         foreach my $name (qw{ L LV LVT T V prepend }) {
11050             my $table = $gcb->table($name);
11051             if (! defined $table) {
11052                 $table = $gcb->add_match_table($name);
11053                 push @tables_that_may_be_empty, $table->complete_name;
11054             }
11055
11056             # The HST property predates the GCB one, and has identical tables
11057             # for some of them, so use it if we can.
11058             if ($table->is_empty
11059                 && defined $hst
11060                 && defined $hst->table($name))
11061             {
11062                 $table += $hst->table($name);
11063             }
11064         }
11065     }
11066
11067     # More GCB.  If we found some hangul syllables, populate a combined
11068     # table.
11069     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
11070     my $LV = $gcb->table('LV');
11071     if ($LV->is_empty) {
11072         push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
11073     } else {
11074         $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
11075         $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
11076     }
11077
11078     # Create a new property specially located that is a combination of the
11079     # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11080     # Name_Alias properties.  (The final duplicates elements of the first.)  A
11081     # comment for it is constructed based on the actual properties present and
11082     # used
11083     my $perl_charname = Property->new('Perl_Charnames',
11084                                 Core_Access => '\N{...} and charnames.pm',
11085                                 Default_Map => "",
11086                                 Directory => File::Spec->curdir(),
11087                                 File => 'Name',
11088                                 Internal_Only_Warning => 1,
11089                                 Perl_Extension => 1,
11090                                 Range_Size_1 => 1,
11091                                 Type => $STRING,
11092                                 Initialize => property_ref('Unicode_1_Name'),
11093                                 );
11094     # Name overrides Unicode_1_Name
11095     $perl_charname->property_add_or_replace_non_nulls(property_ref('Name'));
11096     my @composition = ('Name', 'Unicode_1_Name');
11097
11098     if (@named_sequences) {
11099         push @composition, 'Named_Sequence';
11100         foreach my $sequence (@named_sequences) {
11101             $perl_charname->add_anomalous_entry($sequence);
11102         }
11103     }
11104
11105     my $alias_sentence = "";
11106     my $alias = property_ref('Name_Alias');
11107     if (defined $alias) {
11108         push @composition, 'Name_Alias';
11109         $alias->reset_each_range;
11110         while (my ($range) = $alias->each_range) {
11111             next if $range->value eq "";
11112             if ($range->start != $range->end) {
11113                 Carp::my_carp("Expecting only one code point in the range $range.  Just to keep going, using just the first code point;");
11114             }
11115             $perl_charname->add_duplicate($range->start, $range->value);
11116         }
11117         $alias_sentence = <<END;
11118 The Name_Alias property adds duplicate code point entries with a corrected
11119 name.  The original (less correct, but still valid) name will be physically
11120 first.
11121 END
11122     }
11123     my $comment;
11124     if (@composition <= 2) { # Always at least 2
11125         $comment = join " and ", @composition;
11126     }
11127     else {
11128         $comment = join ", ", @composition[0 .. scalar @composition - 2];
11129         $comment .= ", and $composition[-1]";
11130     }
11131
11132     # Wait for charnames to catch up
11133 #    foreach my $entry (@more_Names,
11134 #                        split "\n", <<"END"
11135 #000A; LF
11136 #000C; FF
11137 #000D; CR
11138 #0085; NEL
11139 #200C; ZWNJ
11140 #200D; ZWJ
11141 #FEFF; BOM
11142 #FEFF; BYTE ORDER MARK
11143 #END
11144 #    ) {
11145 #        #local $to_trace = 1 if main::DEBUG;
11146 #        trace $entry if main::DEBUG && $to_trace;
11147 #        my ($code_point, $name) = split /\s*;\s*/, $entry;
11148 #        $code_point = hex $code_point;
11149 #        trace $code_point, $name if main::DEBUG && $to_trace;
11150 #        $perl_charname->add_duplicate($code_point, $name);
11151 #    }
11152 #    #$perl_charname->add_comment("This file is for charnames.pm.  It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'.  Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence");
11153     $perl_charname->add_comment(join_lines( <<END
11154 This file is for charnames.pm.  It is the union of the $comment properties.
11155 Unicode_1_Name entries are used only for otherwise nameless code
11156 points.
11157 $alias_sentence
11158 END
11159     ));
11160
11161     # The combining class property used by Perl's normalize.pm is not located
11162     # in the normal mapping directory; create a copy for it.
11163     my $ccc = property_ref('Canonical_Combining_Class');
11164     my $perl_ccc = Property->new('Perl_ccc',
11165                             Default_Map => $ccc->default_map,
11166                             Full_Name => 'Perl_Canonical_Combining_Class',
11167                             Internal_Only_Warning => 1,
11168                             Perl_Extension => 1,
11169                             Pod_Entry =>0,
11170                             Type => $ENUM,
11171                             Initialize => $ccc,
11172                             File => 'CombiningClass',
11173                             Directory => File::Spec->curdir(),
11174                             );
11175     $perl_ccc->set_to_output_map(1);
11176     $perl_ccc->add_comment(join_lines(<<END
11177 This mapping is for normalize.pm.  It is currently identical to the Unicode
11178 Canonical_Combining_Class property.
11179 END
11180     ));
11181
11182     # This one match table for it is needed for calculations on output
11183     my $default = $perl_ccc->add_match_table($ccc->default_map,
11184                         Initialize => $ccc->table($ccc->default_map),
11185                         Status => $SUPPRESSED);
11186
11187     # Construct the Present_In property from the Age property.
11188     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
11189         my $default_map = $age->default_map;
11190         my $in = Property->new('In',
11191                                 Default_Map => $default_map,
11192                                 Full_Name => "Present_In",
11193                                 Internal_Only_Warning => 1,
11194                                 Perl_Extension => 1,
11195                                 Type => $ENUM,
11196                                 Initialize => $age,
11197                                 );
11198         $in->add_comment(join_lines(<<END
11199 This file should not be used for any purpose.  The values in this file are the
11200 same as for $age, and not for what $in really means.  This is because anything
11201 defined in a given release should have multiple values: that release and all
11202 higher ones.  But only one value per code point can be represented in a table
11203 like this.
11204 END
11205         ));
11206
11207         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
11208         # lowest numbered (earliest) come first, with the non-numeric one
11209         # last.
11210         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
11211                                             ? 1
11212                                             : ($b->name !~ /^[\d.]*$/)
11213                                                 ? -1
11214                                                 : $a->name <=> $b->name
11215                                             } $age->tables;
11216
11217         # The Present_In property is the cumulative age properties.  The first
11218         # one hence is identical to the first age one.
11219         my $previous_in = $in->add_match_table($first_age->name);
11220         $previous_in->set_equivalent_to($first_age, Related => 1);
11221
11222         my $description_start = "Code point's usage introduced in version ";
11223         $first_age->add_description($description_start . $first_age->name);
11224
11225         # To construct the accumlated values, for each of the age tables
11226         # starting with the 2nd earliest, merge the earliest with it, to get
11227         # all those code points existing in the 2nd earliest.  Repeat merging
11228         # the new 2nd earliest with the 3rd earliest to get all those existing
11229         # in the 3rd earliest, and so on.
11230         foreach my $current_age (@rest_ages) {
11231             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
11232
11233             my $current_in = $in->add_match_table(
11234                                     $current_age->name,
11235                                     Initialize => $current_age + $previous_in,
11236                                     Description => $description_start
11237                                                     . $current_age->name
11238                                                     . ' or earlier',
11239                                     );
11240             $previous_in = $current_in;
11241
11242             # Add clarifying material for the corresponding age file.  This is
11243             # in part because of the confusing and contradictory information
11244             # given in the Standard's documentation itself, as of 5.2.
11245             $current_age->add_description(
11246                             "Code point's usage was introduced in version "
11247                             . $current_age->name);
11248             $current_age->add_note("See also $in");
11249
11250         }
11251
11252         # And finally the code points whose usages have yet to be decided are
11253         # the same in both properties.  Note that permanently unassigned code
11254         # points actually have their usage assigned (as being permanently
11255         # unassigned), so that these tables are not the same as gc=cn.
11256         my $unassigned = $in->add_match_table($default_map);
11257         my $age_default = $age->table($default_map);
11258         $age_default->add_description(<<END
11259 Code point's usage has not been assigned in any Unicode release thus far.
11260 END
11261         );
11262         $unassigned->set_equivalent_to($age_default, Related => 1);
11263     }
11264
11265
11266     # Finished creating all the perl properties.  All non-internal non-string
11267     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
11268     # an underscore.)  These do not get a separate entry in the pod file
11269     foreach my $table ($perl->tables) {
11270         foreach my $alias ($table->aliases) {
11271             next if $alias->name =~ /^_/;
11272             $table->add_alias('Is_' . $alias->name,
11273                                Pod_Entry => 0,
11274                                Status => $alias->status,
11275                                Externally_Ok => 0);
11276         }
11277     }
11278
11279     return;
11280 }
11281
11282 sub add_perl_synonyms() {
11283     # A number of Unicode tables have Perl synonyms that are expressed in
11284     # the single-form, \p{name}.  These are:
11285     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11286     #       \p{Is_Name} as synonyms
11287     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11288     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11289     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11290     #       conflict, \p{Value} and \p{Is_Value} as well
11291     #
11292     # This routine generates these synonyms, warning of any unexpected
11293     # conflicts.
11294
11295     # Construct the list of tables to get synonyms for.  Start with all the
11296     # binary and the General_Category ones.
11297     my @tables = grep { $_->type == $BINARY } property_ref('*');
11298     push @tables, $gc->tables;
11299
11300     # If the version of Unicode includes the Script property, add its tables
11301     if (defined property_ref('Script')) {
11302         push @tables, property_ref('Script')->tables;
11303     }
11304
11305     # The Block tables are kept separate because they are treated differently.
11306     # And the earliest versions of Unicode didn't include them, so add only if
11307     # there are some.
11308     my @blocks;
11309     push @blocks, $block->tables if defined $block;
11310
11311     # Here, have the lists of tables constructed.  Process blocks last so that
11312     # if there are name collisions with them, blocks have lowest priority.
11313     # Should there ever be other collisions, manual intervention would be
11314     # required.  See the comments at the beginning of the program for a
11315     # possible way to handle those semi-automatically.
11316     foreach my $table (@tables,  @blocks) {
11317
11318         # For non-binary properties, the synonym is just the name of the
11319         # table, like Greek, but for binary properties the synonym is the name
11320         # of the property, and means the code points in its 'Y' table.
11321         my $nominal = $table;
11322         my $nominal_property = $nominal->property;
11323         my $actual;
11324         if (! $nominal->isa('Property')) {
11325             $actual = $table;
11326         }
11327         else {
11328
11329             # Here is a binary property.  Use the 'Y' table.  Verify that is
11330             # there
11331             my $yes = $nominal->table('Y');
11332             unless (defined $yes) {  # Must be defined, but is permissible to
11333                                      # be empty.
11334                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
11335                 next;
11336             }
11337             $actual = $yes;
11338         }
11339
11340         foreach my $alias ($nominal->aliases) {
11341
11342             # Attempt to create a table in the perl directory for the
11343             # candidate table, using whatever aliases in it that don't
11344             # conflict.  Also add non-conflicting aliases for all these
11345             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11346             PREFIX:
11347             foreach my $prefix ("", 'Is_', 'In_') {
11348
11349                 # Only Block properties can have added 'In_' aliases.
11350                 next if $prefix eq 'In_' and $nominal_property != $block;
11351
11352                 my $proposed_name = $prefix . $alias->name;
11353
11354                 # No Is_Is, In_In, nor combinations thereof
11355                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11356                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11357
11358                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11359
11360                 # Get a reference to any existing table in the perl
11361                 # directory with the desired name.
11362                 my $pre_existing = $perl->table($proposed_name);
11363
11364                 if (! defined $pre_existing) {
11365
11366                     # No name collision, so ok to add the perl synonym.
11367
11368                     my $make_pod_entry;
11369                     my $externally_ok;
11370                     my $status = $actual->status;
11371                     if ($nominal_property == $block) {
11372
11373                         # For block properties, the 'In' form is preferred for
11374                         # external use; the pod file contains wild cards for
11375                         # this and the 'Is' form so no entries for those; and
11376                         # we don't want people using the name without the
11377                         # 'In', so discourage that.
11378                         if ($prefix eq "") {
11379                             $make_pod_entry = 1;
11380                             $status = $status || $DISCOURAGED;
11381                             $externally_ok = 0;
11382                         }
11383                         elsif ($prefix eq 'In_') {
11384                             $make_pod_entry = 0;
11385                             $status = $status || $NORMAL;
11386                             $externally_ok = 1;
11387                         }
11388                         else {
11389                             $make_pod_entry = 0;
11390                             $status = $status || $DISCOURAGED;
11391                             $externally_ok = 0;
11392                         }
11393                     }
11394                     elsif ($prefix ne "") {
11395
11396                         # The 'Is' prefix is handled in the pod by a wild
11397                         # card, and we won't use it for an external name
11398                         $make_pod_entry = 0;
11399                         $status = $status || $NORMAL;
11400                         $externally_ok = 0;
11401                     }
11402                     else {
11403
11404                         # Here, is an empty prefix, non block.  This gets its
11405                         # own pod entry and can be used for an external name.
11406                         $make_pod_entry = 1;
11407                         $status = $status || $NORMAL;
11408                         $externally_ok = 1;
11409                     }
11410
11411                     # Here, there isn't a perl pre-existing table with the
11412                     # name.  Look through the list of equivalents of this
11413                     # table to see if one is a perl table.
11414                     foreach my $equivalent ($actual->leader->equivalents) {
11415                         next if $equivalent->property != $perl;
11416
11417                         # Here, have found a table for $perl.  Add this alias
11418                         # to it, and are done with this prefix.
11419                         $equivalent->add_alias($proposed_name,
11420                                         Pod_Entry => $make_pod_entry,
11421                                         Status => $status,
11422                                         Externally_Ok => $externally_ok);
11423                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11424                         next PREFIX;
11425                     }
11426
11427                     # Here, $perl doesn't already have a table that is a
11428                     # synonym for this property, add one.
11429                     my $added_table = $perl->add_match_table($proposed_name,
11430                                             Pod_Entry => $make_pod_entry,
11431                                             Status => $status,
11432                                             Externally_Ok => $externally_ok);
11433                     # And it will be related to the actual table, since it is
11434                     # based on it.
11435                     $added_table->set_equivalent_to($actual, Related => 1);
11436                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11437                     next;
11438                 } # End of no pre-existing.
11439
11440                 # Here, there is a pre-existing table that has the proposed
11441                 # name.  We could be in trouble, but not if this is just a
11442                 # synonym for another table that we have already made a child
11443                 # of the pre-existing one.
11444                 if ($pre_existing->is_equivalent_to($actual)) {
11445                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11446                     $pre_existing->add_alias($proposed_name);
11447                     next;
11448                 }
11449
11450                 # Here, there is a name collision, but it still could be ok if
11451                 # the tables match the identical set of code points, in which
11452                 # case, we can combine the names.  Compare each table's code
11453                 # point list to see if they are identical.
11454                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11455                 if ($pre_existing->matches_identically_to($actual)) {
11456
11457                     # Here, they do match identically.  Not a real conflict.
11458                     # Make the perl version a child of the Unicode one, except
11459                     # in the non-obvious case of where the perl name is
11460                     # already a synonym of another Unicode property.  (This is
11461                     # excluded by the test for it being its own parent.)  The
11462                     # reason for this exclusion is that then the two Unicode
11463                     # properties become related; and we don't really know if
11464                     # they are or not.  We generate documentation based on
11465                     # relatedness, and this would be misleading.  Code
11466                     # later executed in the process will cause the tables to
11467                     # be represented by a single file anyway, without making
11468                     # it look in the pod like they are necessarily related.
11469                     if ($pre_existing->parent == $pre_existing
11470                         && ($pre_existing->property == $perl
11471                             || $actual->property == $perl))
11472                     {
11473                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
11474                         $pre_existing->set_equivalent_to($actual, Related => 1);
11475                     }
11476                     elsif (main::DEBUG && $to_trace) {
11477                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
11478                         trace $pre_existing->parent;
11479                     }
11480                     next PREFIX;
11481                 }
11482
11483                 # Here they didn't match identically, there is a real conflict
11484                 # between our new name and a pre-existing property.
11485                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
11486                 $pre_existing->add_conflicting($nominal->full_name,
11487                                                'p',
11488                                                $actual);
11489
11490                 # Don't output a warning for aliases for the block
11491                 # properties (unless they start with 'In_') as it is
11492                 # expected that there will be conflicts and the block
11493                 # form loses.
11494                 if ($verbosity >= $NORMAL_VERBOSITY
11495                     && ($actual->property != $block || $prefix eq 'In_'))
11496                 {
11497                     print simple_fold(join_lines(<<END
11498 There is already an alias named $proposed_name (from " . $pre_existing . "),
11499 so not creating this alias for " . $actual
11500 END
11501                     ), "", 4);
11502                 }
11503
11504                 # Keep track for documentation purposes.
11505                 $has_In_conflicts++ if $prefix eq 'In_';
11506                 $has_Is_conflicts++ if $prefix eq 'Is_';
11507             }
11508         }
11509     }
11510
11511     # There are some properties which have No and Yes (and N and Y) as
11512     # property values, but aren't binary, and could possibly be confused with
11513     # binary ones.  So create caveats for them.  There are tables that are
11514     # named 'No', and tables that are named 'N', but confusion is not likely
11515     # unless they are the same table.  For example, N meaning Number or
11516     # Neutral is not likely to cause confusion, so don't add caveats to things
11517     # like them.
11518     foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
11519         my $yes = $property->table('Yes');
11520         if (defined $yes) {
11521             my $y = $property->table('Y');
11522             if (defined $y && $yes == $y) {
11523                 foreach my $alias ($property->aliases) {
11524                     $yes->add_conflicting($alias->name);
11525                 }
11526             }
11527         }
11528         my $no = $property->table('No');
11529         if (defined $no) {
11530             my $n = $property->table('N');
11531             if (defined $n && $no == $n) {
11532                 foreach my $alias ($property->aliases) {
11533                     $no->add_conflicting($alias->name, 'P');
11534                 }
11535             }
11536         }
11537     }
11538
11539     return;
11540 }
11541
11542 sub register_file_for_name($$$) {
11543     # Given info about a table and a datafile that it should be associated
11544     # with, register that assocation
11545
11546     my $table = shift;
11547     my $directory_ref = shift;   # Array of the directory path for the file
11548     my $file = shift;            # The file name in the final directory, [-1].
11549     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11550
11551     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
11552
11553     if ($table->isa('Property')) {
11554         $table->set_file_path(@$directory_ref, $file);
11555         push @map_properties, $table
11556                                     if $directory_ref->[0] eq $map_directory;
11557         return;
11558     }
11559
11560     # Do all of the work for all equivalent tables when called with the leader
11561     # table, so skip if isn't the leader.
11562     return if $table->leader != $table;
11563
11564     # Join all the file path components together, using slashes.
11565     my $full_filename = join('/', @$directory_ref, $file);
11566
11567     # All go in the same subdirectory of unicore
11568     if ($directory_ref->[0] ne $matches_directory) {
11569         Carp::my_carp("Unexpected directory in "
11570                 .  join('/', @{$directory_ref}, $file));
11571     }
11572
11573     # For this table and all its equivalents ...
11574     foreach my $table ($table, $table->equivalents) {
11575
11576         # Associate it with its file internally.  Don't include the
11577         # $matches_directory first component
11578         $table->set_file_path(@$directory_ref, $file);
11579         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
11580
11581         my $property = $table->property;
11582         $property = ($property == $perl)
11583                     ? ""                # 'perl' is never explicitly stated
11584                     : standardize($property->name) . '=';
11585
11586         my $deprecated = ($table->status eq $DEPRECATED)
11587                          ? $table->status_info
11588                          : "";
11589
11590         # And for each of the table's aliases...  This inner loop eventually
11591         # goes through all aliases in the UCD that we generate regex match
11592         # files for
11593         foreach my $alias ($table->aliases) {
11594             my $name = $alias->name;
11595
11596             # Generate an entry in either the loose or strict hashes, which
11597             # will translate the property and alias names combination into the
11598             # file where the table for them is stored.
11599             my $standard;
11600             if ($alias->loose_match) {
11601                 $standard = $property . standardize($alias->name);
11602                 if (exists $loose_to_file_of{$standard}) {
11603                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
11604                 }
11605                 else {
11606                     $loose_to_file_of{$standard} = $sub_filename;
11607                 }
11608             }
11609             else {
11610                 $standard = lc ($property . $name);
11611                 if (exists $stricter_to_file_of{$standard}) {
11612                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
11613                 }
11614                 else {
11615                     $stricter_to_file_of{$standard} = $sub_filename;
11616
11617                     # Tightly coupled with how utf8_heavy.pl works, for a
11618                     # floating point number that is a whole number, get rid of
11619                     # the trailing decimal point and 0's, so that utf8_heavy
11620                     # will work.  Also note that this assumes that such a
11621                     # number is matched strictly; so if that were to change,
11622                     # this would be wrong.
11623                     if ((my $integer_name = $name)
11624                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
11625                     {
11626                         $stricter_to_file_of{$property . $integer_name}
11627                             = $sub_filename;
11628                     }
11629                 }
11630             }
11631
11632             # Keep a list of the deprecated properties and their filenames
11633             if ($deprecated) {
11634                 $utf8::why_deprecated{$sub_filename} = $deprecated;
11635             }
11636         }
11637     }
11638
11639     return;
11640 }
11641
11642 {   # Closure
11643     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
11644                      # conflicts
11645     my %full_dir_name_of;   # Full length names of directories used.
11646
11647     sub construct_filename($$$) {
11648         # Return a file name for a table, based on the table name, but perhaps
11649         # changed to get rid of non-portable characters in it, and to make
11650         # sure that it is unique on a file system that allows the names before
11651         # any period to be at most 8 characters (DOS).  While we're at it
11652         # check and complain if there are any directory conflicts.
11653
11654         my $name = shift;       # The name to start with
11655         my $mutable = shift;    # Boolean: can it be changed?  If no, but
11656                                 # yet it must be to work properly, a warning
11657                                 # is given
11658         my $directories_ref = shift;  # A reference to an array containing the
11659                                 # path to the file, with each element one path
11660                                 # component.  This is used because the same
11661                                 # name can be used in different directories.
11662         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11663
11664         my $warn = ! defined wantarray;  # If true, then if the name is
11665                                 # changed, a warning is issued as well.
11666
11667         if (! defined $name) {
11668             Carp::my_carp("Undefined name in directory "
11669                           . File::Spec->join(@$directories_ref)
11670                           . ". '_' used");
11671             return '_';
11672         }
11673
11674         # Make sure that no directory names conflict with each other.  Look at
11675         # each directory in the input file's path.  If it is already in use,
11676         # assume it is correct, and is merely being re-used, but if we
11677         # truncate it to 8 characters, and find that there are two directories
11678         # that are the same for the first 8 characters, but differ after that,
11679         # then that is a problem.
11680         foreach my $directory (@$directories_ref) {
11681             my $short_dir = substr($directory, 0, 8);
11682             if (defined $full_dir_name_of{$short_dir}) {
11683                 next if $full_dir_name_of{$short_dir} eq $directory;
11684                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
11685             }
11686             else {
11687                 $full_dir_name_of{$short_dir} = $directory;
11688             }
11689         }
11690
11691         my $path = join '/', @$directories_ref;
11692         $path .= '/' if $path;
11693
11694         # Remove interior underscores.
11695         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
11696
11697         # Change any non-word character into an underscore, and truncate to 8.
11698         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
11699         substr($filename, 8) = "" if length($filename) > 8;
11700
11701         # Make sure the basename doesn't conflict with something we
11702         # might have already written. If we have, say,
11703         #     InGreekExtended1
11704         #     InGreekExtended2
11705         # they become
11706         #     InGreekE
11707         #     InGreek2
11708         my $warned = 0;
11709         while (my $num = $base_names{$path}{lc $filename}++) {
11710             $num++; # so basenames with numbers start with '2', which
11711                     # just looks more natural.
11712
11713             # Want to append $num, but if it'll make the basename longer
11714             # than 8 characters, pre-truncate $filename so that the result
11715             # is acceptable.
11716             my $delta = length($filename) + length($num) - 8;
11717             if ($delta > 0) {
11718                 substr($filename, -$delta) = $num;
11719             }
11720             else {
11721                 $filename .= $num;
11722             }
11723             if ($warn && ! $warned) {
11724                 $warned = 1;
11725                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
11726             }
11727         }
11728
11729         return $filename if $mutable;
11730
11731         # If not changeable, must return the input name, but warn if needed to
11732         # change it beyond shortening it.
11733         if ($name ne $filename
11734             && substr($name, 0, length($filename)) ne $filename) {
11735             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
11736         }
11737         return $name;
11738     }
11739 }
11740
11741 # The pod file contains a very large table.  Many of the lines in that table
11742 # would exceed a typical output window's size, and so need to be wrapped with
11743 # a hanging indent to make them look good.  The pod language is really
11744 # insufficient here.  There is no general construct to do that in pod, so it
11745 # is done here by beginning each such line with a space to cause the result to
11746 # be output without formatting, and doing all the formatting here.  This leads
11747 # to the result that if the eventual display window is too narrow it won't
11748 # look good, and if the window is too wide, no advantage is taken of that
11749 # extra width.  A further complication is that the output may be indented by
11750 # the formatter so that there is less space than expected.  What I (khw) have
11751 # done is to assume that that indent is a particular number of spaces based on
11752 # what it is in my Linux system;  people can always resize their windows if
11753 # necessary, but this is obviously less than desirable, but the best that can
11754 # be expected.
11755 my $automatic_pod_indent = 8;
11756
11757 # Try to format so that uses fewest lines, but few long left column entries
11758 # slide into the right column.  An experiment on 5.1 data yielded the
11759 # following percentages that didn't cut into the other side along with the
11760 # associated first-column widths
11761 # 69% = 24
11762 # 80% not too bad except for a few blocks
11763 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
11764 # 95% = 37;
11765 my $indent_info_column = 27;    # 75% of lines didn't have overlap
11766
11767 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
11768                     # The 3 is because of:
11769                     #   1   for the leading space to tell the pod formatter to
11770                     #       output as-is
11771                     #   1   for the flag
11772                     #   1   for the space between the flag and the main data
11773
11774 sub format_pod_line ($$$;$$) {
11775     # Take a pod line and return it, formatted properly
11776
11777     my $first_column_width = shift;
11778     my $entry = shift;  # Contents of left column
11779     my $info = shift;   # Contents of right column
11780
11781     my $status = shift || "";   # Any flag
11782
11783     my $loose_match = shift;    # Boolean.
11784     $loose_match = 1 unless defined $loose_match;
11785
11786     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11787
11788     my $flags = "";
11789     $flags .= $STRICTER if ! $loose_match;
11790
11791     $flags .= $status if $status;
11792
11793     # There is a blank in the left column to cause the pod formatter to
11794     # output the line as-is.
11795     return sprintf " %-*s%-*s %s\n",
11796                     # The first * in the format is replaced by this, the -1 is
11797                     # to account for the leading blank.  There isn't a
11798                     # hard-coded blank after this to separate the flags from
11799                     # the rest of the line, so that in the unlikely event that
11800                     # multiple flags are shown on the same line, they both
11801                     # will get displayed at the expense of that separation,
11802                     # but since they are left justified, a blank will be
11803                     # inserted in the normal case.
11804                     $FILLER - 1,
11805                     $flags,
11806
11807                     # The other * in the format is replaced by this number to
11808                     # cause the first main column to right fill with blanks.
11809                     # The -1 is for the guaranteed blank following it.
11810                     $first_column_width - $FILLER - 1,
11811                     $entry,
11812                     $info;
11813 }
11814
11815 my @zero_match_tables;  # List of tables that have no matches in this release
11816
11817 sub make_table_pod_entries($) {
11818     # This generates the entries for the pod file for a given table.
11819     # Also done at this time are any children tables.  The output looks like:
11820     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
11821
11822     my $input_table = shift;        # Table the entry is for
11823     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11824
11825     # Generate parent and all its children at the same time.
11826     return if $input_table->parent != $input_table;
11827
11828     my $property = $input_table->property;
11829     my $type = $property->type;
11830     my $full_name = $property->full_name;
11831
11832     my $count = $input_table->count;
11833     my $string_count = clarify_number($count);
11834     my $status = $input_table->status;
11835     my $status_info = $input_table->status_info;
11836
11837     my $entry_for_first_table; # The entry for the first table output.
11838                            # Almost certainly, it is the parent.
11839
11840     # For each related table (including itself), we will generate a pod entry
11841     # for each name each table goes by
11842     foreach my $table ($input_table, $input_table->children) {
11843
11844         # utf8_heavy.pl cannot deal with null string property values, so don't
11845         # output any.
11846         next if $table->name eq "";
11847
11848         # First, gather all the info that applies to this table as a whole.
11849
11850         push @zero_match_tables, $table if $count == 0;
11851
11852         my $table_property = $table->property;
11853
11854         # The short name has all the underscores removed, while the full name
11855         # retains them.  Later, we decide whether to output a short synonym
11856         # for the full one, we need to compare apples to apples, so we use the
11857         # short name's length including underscores.
11858         my $table_property_short_name_length;
11859         my $table_property_short_name
11860             = $table_property->short_name(\$table_property_short_name_length);
11861         my $table_property_full_name = $table_property->full_name;
11862
11863         # Get how much savings there is in the short name over the full one
11864         # (delta will always be <= 0)
11865         my $table_property_short_delta = $table_property_short_name_length
11866                                          - length($table_property_full_name);
11867         my @table_description = $table->description;
11868         my @table_note = $table->note;
11869
11870         # Generate an entry for each alias in this table.
11871         my $entry_for_first_alias;  # saves the first one encountered.
11872         foreach my $alias ($table->aliases) {
11873
11874             # Skip if not to go in pod.
11875             next unless $alias->make_pod_entry;
11876
11877             # Start gathering all the components for the entry
11878             my $name = $alias->name;
11879
11880             my $entry;      # Holds the left column, may include extras
11881             my $entry_ref;  # To refer to the left column's contents from
11882                             # another entry; has no extras
11883
11884             # First the left column of the pod entry.  Tables for the $perl
11885             # property always use the single form.
11886             if ($table_property == $perl) {
11887                 $entry = "\\p{$name}";
11888                 $entry_ref = "\\p{$name}";
11889             }
11890             else {    # Compound form.
11891
11892                 # Only generate one entry for all the aliases that mean true
11893                 # or false in binary properties.  Append a '*' to indicate
11894                 # some are missing.  (The heading comment notes this.)
11895                 my $wild_card_mark;
11896                 if ($type == $BINARY) {
11897                     next if $name ne 'N' && $name ne 'Y';
11898                     $wild_card_mark = '*';
11899                 }
11900                 else {
11901                     $wild_card_mark = "";
11902                 }
11903
11904                 # Colon-space is used to give a little more space to be easier
11905                 # to read;
11906                 $entry = "\\p{"
11907                         . $table_property_full_name
11908                         . ": $name$wild_card_mark}";
11909
11910                 # But for the reference to this entry, which will go in the
11911                 # right column, where space is at a premium, use equals
11912                 # without a space
11913                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
11914             }
11915
11916             # Then the right (info) column.  This is stored as components of
11917             # an array for the moment, then joined into a string later.  For
11918             # non-internal only properties, begin the info with the entry for
11919             # the first table we encountered (if any), as things are ordered
11920             # so that that one is the most descriptive.  This leads to the
11921             # info column of an entry being a more descriptive version of the
11922             # name column
11923             my @info;
11924             if ($name =~ /^_/) {
11925                 push @info,
11926                         '(For internal use by Perl, not necessarily stable)';
11927             }
11928             elsif ($entry_for_first_alias) {
11929                 push @info, $entry_for_first_alias;
11930             }
11931
11932             # If this entry is equivalent to another, add that to the info,
11933             # using the first such table we encountered
11934             if ($entry_for_first_table) {
11935                 if (@info) {
11936                     push @info, "(= $entry_for_first_table)";
11937                 }
11938                 else {
11939                     push @info, $entry_for_first_table;
11940                 }
11941             }
11942
11943             # If the name is a large integer, add an equivalent with an
11944             # exponent for better readability
11945             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
11946                 push @info, sprintf "(= %.1e)", $name
11947             }
11948
11949             my $parenthesized = "";
11950             if (! $entry_for_first_alias) {
11951
11952                 # This is the first alias for the current table.  The alias
11953                 # array is ordered so that this is the fullest, most
11954                 # descriptive alias, so it gets the fullest info.  The other
11955                 # aliases are mostly merely pointers to this one, using the
11956                 # information already added above.
11957
11958                 # Display any status message, but only on the parent table
11959                 if ($status && ! $entry_for_first_table) {
11960                     push @info, $status_info;
11961                 }
11962
11963                 # Put out any descriptive info
11964                 if (@table_description || @table_note) {
11965                     push @info, join "; ", @table_description, @table_note;
11966                 }
11967
11968                 # Look to see if there is a shorter name we can point people
11969                 # at
11970                 my $standard_name = standardize($name);
11971                 my $short_name;
11972                 my $proposed_short = $table->short_name;
11973                 if (defined $proposed_short) {
11974                     my $standard_short = standardize($proposed_short);
11975
11976                     # If the short name is shorter than the standard one, or
11977                     # even it it's not, but the combination of it and its
11978                     # short property name (as in \p{prop=short} ($perl doesn't
11979                     # have this form)) saves at least two characters, then,
11980                     # cause it to be listed as a shorter synonym.
11981                     if (length $standard_short < length $standard_name
11982                         || ($table_property != $perl
11983                             && (length($standard_short)
11984                                 - length($standard_name)
11985                                 + $table_property_short_delta)  # (<= 0)
11986                                 < -2))
11987                     {
11988                         $short_name = $proposed_short;
11989                         if ($table_property != $perl) {
11990                             $short_name = $table_property_short_name
11991                                           . "=$short_name";
11992                         }
11993                         $short_name = "\\p{$short_name}";
11994                     }
11995                 }
11996
11997                 # And if this is a compound form name, see if there is a
11998                 # single form equivalent
11999                 my $single_form;
12000                 if ($table_property != $perl) {
12001
12002                     # Special case the binary N tables, so that will print
12003                     # \P{single}, but use the Y table values to populate
12004                     # 'single', as we haven't populated the N table.
12005                     my $test_table;
12006                     my $p;
12007                     if ($type == $BINARY
12008                         && $input_table == $property->table('No'))
12009                     {
12010                         $test_table = $property->table('Yes');
12011                         $p = 'P';
12012                     }
12013                     else {
12014                         $test_table = $input_table;
12015                         $p = 'p';
12016                     }
12017
12018                     # Look for a single form amongst all the children.
12019                     foreach my $table ($test_table->children) {
12020                         next if $table->property != $perl;
12021                         my $proposed_name = $table->short_name;
12022                         next if ! defined $proposed_name;
12023
12024                         # Don't mention internal-only properties as a possible
12025                         # single form synonym
12026                         next if substr($proposed_name, 0, 1) eq '_';
12027
12028                         $proposed_name = "\\$p\{$proposed_name}";
12029                         if (! defined $single_form
12030                             || length($proposed_name) < length $single_form)
12031                         {
12032                             $single_form = $proposed_name;
12033
12034                             # The goal here is to find a single form; not the
12035                             # shortest possible one.  We've already found a
12036                             # short name.  So, stop at the first single form
12037                             # found, which is likely to be closer to the
12038                             # original.
12039                             last;
12040                         }
12041                     }
12042                 }
12043
12044                 # Ouput both short and single in the same parenthesized
12045                 # expression, but with only one of 'Single', 'Short' if there
12046                 # are both items.
12047                 if ($short_name || $single_form || $table->conflicting) {
12048                     $parenthesized .= '(';
12049                     $parenthesized .= "Short: $short_name" if $short_name;
12050                     if ($short_name && $single_form) {
12051                         $parenthesized .= ', ';
12052                     }
12053                     elsif ($single_form) {
12054                         $parenthesized .= 'Single: ';
12055                     }
12056                     $parenthesized .= $single_form if $single_form;
12057                 }
12058             }
12059
12060
12061             # Warn if this property isn't the same as one that a
12062             # semi-casual user might expect.  The other components of this
12063             # parenthesized structure are calculated only for the first entry
12064             # for this table, but the conflicting is deemed important enough
12065             # to go on every entry.
12066             my $conflicting = join " NOR ", $table->conflicting;
12067             if ($conflicting) {
12068                 $parenthesized .= '(' if ! $parenthesized;
12069                 $parenthesized .=  '; ' if $parenthesized ne '(';
12070                 $parenthesized .= "NOT $conflicting";
12071             }
12072             $parenthesized .= ')' if $parenthesized;
12073
12074             push @info, $parenthesized if $parenthesized;
12075
12076             if ($table_property != $perl && $table->perl_extension) {
12077                 push @info, '(Perl extension)';
12078             }
12079             push @info, "($string_count)" if $output_range_counts;
12080
12081             # Now, we have both the entry and info so add them to the
12082             # list of all the properties.
12083             push @match_properties,
12084                 format_pod_line($indent_info_column,
12085                                 $entry,
12086                                 join( " ", @info),
12087                                 $alias->status,
12088                                 $alias->loose_match);
12089
12090             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
12091         } # End of looping through the aliases for this table.
12092
12093         if (! $entry_for_first_table) {
12094             $entry_for_first_table = $entry_for_first_alias;
12095         }
12096     } # End of looping through all the related tables
12097     return;
12098 }
12099
12100 sub pod_alphanumeric_sort {
12101     # Sort pod entries alphanumerically.
12102
12103     # The first few character columns are filler, plus the '\p{'; and get rid
12104     # of all the trailing stuff, starting with the trailing '}', so as to sort
12105     # on just 'Name=Value'
12106     (my $a = lc $a) =~ s/^ .*? { //x;
12107     $a =~ s/}.*//;
12108     (my $b = lc $b) =~ s/^ .*? { //x;
12109     $b =~ s/}.*//;
12110
12111     # Determine if the two operands are both internal only or both not.
12112     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
12113     # should be the underscore that begins internal only
12114     my $a_is_internal = (substr($a, 0, 1) eq '_');
12115     my $b_is_internal = (substr($b, 0, 1) eq '_');
12116
12117     # Sort so the internals come last in the table instead of first (which the
12118     # leading underscore would otherwise indicate).
12119     if ($a_is_internal != $b_is_internal) {
12120         return 1 if $a_is_internal;
12121         return -1
12122     }
12123
12124     # Determine if the two operands are numeric property values or not.
12125     # A numeric property will look like xyz: 3.  But the number
12126     # can begin with an optional minus sign, and may have a
12127     # fraction or rational component, like xyz: 3/2.  If either
12128     # isn't numeric, use alphabetic sort.
12129     my ($a_initial, $a_number) =
12130         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
12131     return $a cmp $b unless defined $a_number;
12132     my ($b_initial, $b_number) =
12133         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
12134     return $a cmp $b unless defined $b_number;
12135
12136     # Here they are both numeric, but use alphabetic sort if the
12137     # initial parts don't match
12138     return $a cmp $b if $a_initial ne $b_initial;
12139
12140     # Convert rationals to floating for the comparison.
12141     $a_number = eval $a_number if $a_number =~ qr{/};
12142     $b_number = eval $b_number if $b_number =~ qr{/};
12143
12144     return $a_number <=> $b_number;
12145 }
12146
12147 sub make_pod () {
12148     # Create the .pod file.  This generates the various subsections and then
12149     # combines them in one big HERE document.
12150
12151     return unless defined $pod_directory;
12152     print "Making pod file\n" if $verbosity >= $PROGRESS;
12153
12154     my $exception_message =
12155     '(Any exceptions are individually noted beginning with the word NOT.)';
12156     my @block_warning;
12157     if (-e 'Blocks.txt') {
12158
12159         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
12160         # if the global $has_In_conflicts indicates we have them.
12161         push @match_properties, format_pod_line($indent_info_column,
12162                                                 '\p{In_*}',
12163                                                 '\p{Block: *}'
12164                                                     . (($has_In_conflicts)
12165                                                       ? " $exception_message"
12166                                                       : ""));
12167         @block_warning = << "END";
12168
12169 Matches in the Block property have shortcuts that begin with 'In_'.  For
12170 example, \\p{Block=Latin1} can be written as \\p{In_Latin1}.  For backward
12171 compatibility, if there is no conflict with another shortcut, these may also
12172 be written as \\p{Latin1} or \\p{Is_Latin1}.  But, N.B., there are numerous
12173 such conflicting shortcuts.  Use of these forms for Block is discouraged, and
12174 are flagged as such, not only because of the potential confusion as to what is
12175 meant, but also because a later release of Unicode may preempt the shortcut,
12176 and your program would no longer be correct.  Use the 'In_' form instead to
12177 avoid this, or even more clearly, use the compound form, e.g.,
12178 \\p{blk:latin1}.  See L<perlunicode/"Blocks"> for more information about this.
12179 END
12180     }
12181     my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
12182     $text = "$exception_message $text" if $has_Is_conflicts;
12183
12184     # And the 'Is_ line';
12185     push @match_properties, format_pod_line($indent_info_column,
12186                                             '\p{Is_*}',
12187                                             "\\p{*} $text");
12188
12189     # Sort the properties array for output.  It is sorted alphabetically
12190     # except numerically for numeric properties, and only output unique lines.
12191     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
12192
12193     my $formatted_properties = simple_fold(\@match_properties,
12194                                         "",
12195                                         # indent succeeding lines by two extra
12196                                         # which looks better
12197                                         $indent_info_column + 2,
12198
12199                                         # shorten the line length by how much
12200                                         # the formatter indents, so the folded
12201                                         # line will fit in the space
12202                                         # presumably available
12203                                         $automatic_pod_indent);
12204     # Add column headings, indented to be a little more centered, but not
12205     # exactly
12206     $formatted_properties =  format_pod_line($indent_info_column,
12207                                                     '    NAME',
12208                                                     '           INFO')
12209                                     . "\n"
12210                                     . $formatted_properties;
12211
12212     # Generate pod documentation lines for the tables that match nothing
12213     my $zero_matches;
12214     if (@zero_match_tables) {
12215         @zero_match_tables = uniques(@zero_match_tables);
12216         $zero_matches = join "\n\n",
12217                         map { $_ = '=item \p{' . $_->complete_name . "}" }
12218                             sort { $a->complete_name cmp $b->complete_name }
12219                             uniques(@zero_match_tables);
12220
12221         $zero_matches = <<END;
12222
12223 =head2 Legal \\p{} and \\P{} constructs that match no characters
12224
12225 Unicode has some property-value pairs that currently don't match anything.
12226 This happens generally either because they are obsolete, or for symmetry with
12227 other forms, but no language has yet been encoded that uses them.  In this
12228 version of Unicode, the following match zero code points:
12229
12230 =over 4
12231
12232 $zero_matches
12233
12234 =back
12235
12236 END
12237     }
12238
12239     # Generate list of properties that we don't accept, grouped by the reasons
12240     # why.  This is so only put out the 'why' once, and then list all the
12241     # properties that have that reason under it.
12242
12243     my %why_list;   # The keys are the reasons; the values are lists of
12244                     # properties that have the key as their reason
12245
12246     # For each property, add it to the list that are suppressed for its reason
12247     # The sort will cause the alphabetically first properties to be added to
12248     # each list first, so each list will be sorted.
12249     foreach my $property (sort keys %why_suppressed) {
12250         push @{$why_list{$why_suppressed{$property}}}, $property;
12251     }
12252
12253     # For each reason (sorted by the first property that has that reason)...
12254     my @bad_re_properties;
12255     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
12256                      keys %why_list)
12257     {
12258         # Add to the output, all the properties that have that reason.  Start
12259         # with an empty line.
12260         push @bad_re_properties, "\n\n";
12261
12262         my $has_item = 0;   # Flag if actually output anything.
12263         foreach my $name (@{$why_list{$why}}) {
12264
12265             # Split compound names into $property and $table components
12266             my $property = $name;
12267             my $table;
12268             if ($property =~ / (.*) = (.*) /x) {
12269                 $property = $1;
12270                 $table = $2;
12271             }
12272
12273             # This release of Unicode may not have a property that is
12274             # suppressed, so don't reference a non-existent one.
12275             $property = property_ref($property);
12276             next if ! defined $property;
12277
12278             # And since this list is only for match tables, don't list the
12279             # ones that don't have match tables.
12280             next if ! $property->to_create_match_tables;
12281
12282             # Find any abbreviation, and turn it into a compound name if this
12283             # is a property=value pair.
12284             my $short_name = $property->name;
12285             $short_name .= '=' . $property->table($table)->name if $table;
12286
12287             # And add the property as an item for the reason.
12288             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12289             $has_item = 1;
12290         }
12291
12292         # And add the reason under the list of properties, if such a list
12293         # actually got generated.  Note that the header got added
12294         # unconditionally before.  But pod ignores extra blank lines, so no
12295         # harm.
12296         push @bad_re_properties, "\n$why\n" if $has_item;
12297
12298     } # End of looping through each reason.
12299
12300     # Generate a list of the properties whose map table we output, from the
12301     # global @map_properties.
12302     my @map_tables_actually_output;
12303     my $info_indent = 20;       # Left column is narrower than \p{} table.
12304     foreach my $property (@map_properties) {
12305
12306         # Get the path to the file; don't output any not in the standard
12307         # directory.
12308         my @path = $property->file_path;
12309         next if $path[0] ne $map_directory;
12310         shift @path;    # Remove the standard name
12311
12312         my $file = join '/', @path; # In case is in sub directory
12313         my $info = $property->full_name;
12314         my $short_name = $property->name;
12315         if ($info ne $short_name) {
12316             $info .= " ($short_name)";
12317         }
12318         foreach my $more_info ($property->description,
12319                                $property->note,
12320                                $property->status_info)
12321         {
12322             next unless $more_info;
12323             $info =~ s/\.\Z//;
12324             $info .= ".  $more_info";
12325         }
12326         push @map_tables_actually_output, format_pod_line($info_indent,
12327                                                           $file,
12328                                                           $info,
12329                                                           $property->status);
12330     }
12331
12332     # Sort alphabetically, and fold for output
12333     @map_tables_actually_output = sort
12334                             pod_alphanumeric_sort @map_tables_actually_output;
12335     @map_tables_actually_output
12336                         = simple_fold(\@map_tables_actually_output,
12337                                         ' ',
12338                                         $info_indent,
12339                                         $automatic_pod_indent);
12340
12341     # Generate a list of the formats that can appear in the map tables.
12342     my @map_table_formats;
12343     foreach my $format (sort keys %map_table_formats) {
12344         push @map_table_formats, " $format    $map_table_formats{$format}\n";
12345     }
12346
12347     # Everything is ready to assemble.
12348     my @OUT = << "END";
12349 =begin comment
12350
12351 $HEADER
12352
12353 To change this file, edit $0 instead.
12354
12355 =end comment
12356
12357 =head1 NAME
12358
12359 $pod_file - Index of Unicode Version $string_version properties in Perl
12360
12361 =head1 DESCRIPTION
12362
12363 There are many properties in Unicode, and Perl provides access to almost all of
12364 them, as well as some additional extensions and short-cut synonyms.
12365
12366 And just about all of the few that aren't accessible through the Perl
12367 core are accessible through the modules: Unicode::Normalize and
12368 Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12369
12370 This document merely lists all available properties and does not attempt to
12371 explain what each property really means.  There is a brief description of each
12372 Perl extension.  There is some detail about Blocks, Scripts, General_Category,
12373 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12374 Unicode properties, refer to the Unicode standard.  A good starting place is
12375 L<$unicode_reference_url>.  More information on the Perl extensions is in
12376 L<perlrecharclass>.
12377
12378 Note that you can define your own properties; see
12379 L<perlunicode/"User-Defined Character Properties">.
12380
12381 =head1 Properties accessible through \\p{} and \\P{}
12382
12383 The Perl regular expression \\p{} and \\P{} constructs give access to most of
12384 the Unicode character properties.  The table below shows all these constructs,
12385 both single and compound forms.
12386
12387 B<Compound forms> consist of two components, separated by an equals sign or a
12388 colon.  The first component is the property name, and the second component is
12389 the particular value of the property to match against, for example,
12390 '\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
12391 whose Script property is Greek.
12392
12393 B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12394 their equivalent compound forms.  The table shows these equivalences.  (In our
12395 example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.)
12396 There are also a few Perl-defined single forms that are not shortcuts for a
12397 compound form.  One such is \\p{Word}.  These are also listed in the table.
12398
12399 In parsing these constructs, Perl always ignores Upper/lower case differences
12400 everywhere within the {braces}.  Thus '\\p{Greek}' means the same thing as
12401 '\\p{greek}'.  But note that changing the case of the 'p' or 'P' before the
12402 left brace completely changes the meaning of the construct, from "match" (for
12403 '\\p{}') to "doesn't match" (for '\\P{}').  Casing in this document is for
12404 improved legibility.
12405
12406 Also, white space, hyphens, and underscores are also normally ignored
12407 everywhere between the {braces}, and hence can be freely added or removed
12408 even if the C</x> modifier hasn't been specified on the regular expression.
12409 But $a_bold_stricter at the beginning of an entry in the table below
12410 means that tighter (stricter) rules are used for that entry:
12411
12412 =over 4
12413
12414 =item Single form (\\p{name}) tighter rules:
12415
12416 White space, hyphens, and underscores ARE significant
12417 except for:
12418
12419 =over 4
12420
12421 =item * white space adjacent to a non-word character
12422
12423 =item * underscores separating digits in numbers
12424
12425 =back
12426
12427 That means, for example, that you can freely add or remove white space
12428 adjacent to (but within) the braces without affecting the meaning.
12429
12430 =item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12431
12432 The tighter rules given above for the single form apply to everything to the
12433 right of the colon or equals; the looser rules still apply to everything to
12434 the left.
12435
12436 That means, for example, that you can freely add or remove white space
12437 adjacent to (but within) the braces and the colon or equal sign.
12438
12439 =back
12440
12441 Some properties are considered obsolete, but still available.  There are
12442 several varieties of obsolesence:
12443
12444 =over 4
12445
12446 =item Obsolete
12447
12448 Properties marked with $a_bold_obsolete in the table are considered
12449 obsolete.  At the time of this writing (Unicode version 5.2) there is no
12450 information in the Unicode standard about the implications of a property being
12451 obsolete.
12452
12453 =item Stabilized
12454
12455 Obsolete properties may be stabilized.  This means that they are not actively
12456 maintained by Unicode, and will not be extended as new characters are added to
12457 the standard.  Such properties are marked with $a_bold_stabilized in the
12458 table.  At the time of this writing (Unicode version 5.2) there is no further
12459 information in the Unicode standard about the implications of a property being
12460 stabilized.
12461
12462 =item Deprecated
12463
12464 Obsolete properties may be deprecated.  This means that their use is strongly
12465 discouraged, so much so that a warning will be issued if used, unless the
12466 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
12467 statement.  $A_bold_deprecated flags each such entry in the table, and
12468 the entry there for the longest, most descriptive version of the property will
12469 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
12470 warning, even for properties that aren't officially deprecated by Unicode,
12471 when there used to be characters or code points that were matched by them, but
12472 no longer.  This is to warn you that your program may not work like it did on
12473 earlier Unicode releases.
12474
12475 A deprecated property may be made unavailable in a future Perl version, so it
12476 is best to move away from them.
12477
12478 =back
12479
12480 Some Perl extensions are present for backwards compatibility and are
12481 discouraged from being used, but not obsolete.  $A_bold_discouraged
12482 flags each such entry in the table.
12483
12484 @block_warning
12485
12486 The table below has two columns.  The left column contains the \\p{}
12487 constructs to look up, possibly preceeded by the flags mentioned above; and
12488 the right column contains information about them, like a description, or
12489 synonyms.  It shows both the single and compound forms for each property that
12490 has them.  If the left column is a short name for a property, the right column
12491 will give its longer, more descriptive name; and if the left column is the
12492 longest name, the right column will show any equivalent shortest name, in both
12493 single and compound forms if applicable.
12494
12495 The right column will also caution you if a property means something different
12496 than what might normally be expected.
12497
12498 All single forms are Perl extensions; a few compound forms are as well, and
12499 are noted as such.
12500
12501 Numbers in (parentheses) indicate the total number of code points matched by
12502 the property.  For emphasis, those properties that match no code points at all
12503 are listed as well in a separate section following the table.
12504
12505 There is no description given for most non-Perl defined properties (See
12506 $unicode_reference_url for that).
12507
12508 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
12509 combinations.  For example, entries like:
12510
12511  \\p{Gc: *}                                  \\p{General_Category: *}
12512
12513 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
12514 for the latter is also valid for the former.  Similarly,
12515
12516  \\p{Is_*}                                   \\p{*}
12517
12518 means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
12519 \\p{IsFoo} are also valid and all mean the same thing.  And similarly,
12520 \\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}.  '*' here
12521 is restricted to something not beginning with an underscore.
12522
12523 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
12524 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
12525 'N*' to indicate this, and doesn't have separate entries for the other
12526 possibilities.  Note that not all properties which have values 'Yes' and 'No'
12527 are binary, and they have all their values spelled out without using this wild
12528 card, and a C<NOT> clause in their description that highlights their not being
12529 binary.  These also require the compound form to match them, whereas true
12530 binary properties have both single and compound forms available.
12531
12532 Note that all non-essential underscores are removed in the display of the
12533 short names below.
12534
12535 B<Summary legend:>
12536
12537 =over 4
12538
12539 =item B<*> is a wild-card
12540
12541 =item B<(\\d+)> in the info column gives the number of code points matched by
12542 this property.
12543
12544 =item B<$DEPRECATED> means this is deprecated.
12545
12546 =item B<$OBSOLETE> means this is obsolete.
12547
12548 =item B<$STABILIZED> means this is stabilized.
12549
12550 =item B<$STRICTER> means tighter (stricter) name matching applies.
12551
12552 =item B<$DISCOURAGED> means use of this form is discouraged.
12553
12554 =back
12555
12556 $formatted_properties
12557
12558 $zero_matches
12559
12560 =head1 Properties not accessible through \\p{} and \\P{}
12561
12562 A few properties are accessible in Perl via various function calls only.
12563 These are:
12564  Lowercase_Mapping          lc() and lcfirst()
12565  Titlecase_Mapping          ucfirst()
12566  Uppercase_Mapping          uc()
12567
12568 Case_Folding is accessible through the /i modifier in regular expressions.
12569
12570 The Name property is accessible through the \\N{} interpolation in
12571 double-quoted strings and regular expressions, but both usages require a C<use
12572 charnames;> to be specified, which also contains related functions viacode()
12573 and vianame().
12574
12575 =head1 Unicode regular expression properties that are NOT accepted by Perl
12576
12577 Perl will generate an error for a few character properties in Unicode when
12578 used in a regular expression.  The non-Unihan ones are listed below, with the
12579 reasons they are not accepted, perhaps with work-arounds.  The short names for
12580 the properties are listed enclosed in (parentheses).
12581
12582 =over 4
12583
12584 @bad_re_properties
12585
12586 =back
12587
12588 An installation can choose to allow any of these to be matched by changing the
12589 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12590 and then re-running F<$0>.  (C<\%Config> is available from the Config module).
12591
12592 =head1 Files in the I<To> directory (for serious hackers only)
12593
12594 All Unicode properties are really mappings (in the mathematical sense) from
12595 code points to their respective values.  As part of its build process,
12596 Perl constructs tables containing these mappings for all properties that it
12597 deals with.  But only a few of these are written out into files.
12598 Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
12599 (%Config is available from the Config module).
12600
12601 Those ones written are ones needed by Perl internally during execution, or for
12602 which there is some demand, and those for which there is no access through the
12603 Perl core.  Generally, properties that can be used in regular expression
12604 matching do not have their map tables written, like Script.  Nor are the
12605 simplistic properties that have a better, more complete version, such as
12606 Simple_Uppercase_Mapping  (Uppercase_Mapping is written instead).
12607
12608 None of the properties in the I<To> directory are currently directly
12609 accessible through the Perl core, although some may be accessed indirectly.
12610 For example, the uc() function implements the Uppercase_Mapping property and
12611 uses the F<Upper.pl> file found in this directory.
12612
12613 The available files with their properties (short names in parentheses),
12614 and any flags or comments about them, are:
12615
12616 @map_tables_actually_output
12617
12618 An installation can choose to change which files are generated by changing the
12619 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12620 and then re-running F<$0>.
12621
12622 Each of these files defines two hash entries to help reading programs decipher
12623 it.  One of them looks like this:
12624
12625     \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
12626
12627 where 'NAME' is a name to indicate the property.  For backwards compatibility,
12628 this is not necessarily the property's official Unicode name.  (The 'To' is
12629 also for backwards compatibility.)  The hash entry gives the format of the
12630 mapping fields of the table, currently one of the following:
12631
12632  @map_table_formats
12633
12634 This format applies only to the entries in the main body of the table.
12635 Entries defined in hashes or ones that are missing from the list can have a
12636 different format.
12637
12638 The value that the missing entries have is given by the other SwashInfo hash
12639 entry line; it looks like this:
12640
12641     \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
12642
12643 This example line says that any Unicode code points not explicitly listed in
12644 the file have the value 'NaN' under the property indicated by NAME.  If the
12645 value is the special string C<< <code point> >>, it means that the value for
12646 any missing code point is the code point itself.  This happens, for example,
12647 in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
12648 character 'A', are missing because the uppercase of 'A' is itself.
12649
12650 =head1 SEE ALSO
12651
12652 L<$unicode_reference_url>
12653
12654 L<perlrecharclass>
12655
12656 L<perlunicode>
12657
12658 END
12659
12660     # And write it.
12661     main::write([ $pod_directory, "$pod_file.pod" ], @OUT);
12662     return;
12663 }
12664
12665 sub make_Heavy () {
12666     # Create and write Heavy.pl, which passes info about the tables to
12667     # utf8_heavy.pl
12668
12669     my @heavy = <<END;
12670 $HEADER
12671 $INTERNAL_ONLY
12672
12673 # This file is for the use of utf8_heavy.pl
12674
12675 # Maps property names in loose standard form to its standard name
12676 \%utf8::loose_property_name_of = (
12677 END
12678
12679     push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
12680     push @heavy, <<END;
12681 );
12682
12683 # Maps property, table to file for those using stricter matching
12684 \%utf8::stricter_to_file_of = (
12685 END
12686     push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
12687     push @heavy, <<END;
12688 );
12689
12690 # Maps property, table to file for those using loose matching
12691 \%utf8::loose_to_file_of = (
12692 END
12693     push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
12694     push @heavy, <<END;
12695 );
12696
12697 # Maps floating point to fractional form
12698 \%utf8::nv_floating_to_rational = (
12699 END
12700     push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
12701     push @heavy, <<END;
12702 );
12703
12704 # If a floating point number doesn't have enough digits in it to get this
12705 # close to a fraction, it isn't considered to be that fraction even if all the
12706 # digits it does have match.
12707 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12708
12709 # Deprecated tables to generate a warning for.  The key is the file containing
12710 # the table, so as to avoid duplication, as many property names can map to the
12711 # file, but we only need one entry for all of them.
12712 \%utf8::why_deprecated = (
12713 END
12714
12715     push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
12716     push @heavy, <<END;
12717 );
12718
12719 1;
12720 END
12721
12722     main::write("Heavy.pl", @heavy);
12723     return;
12724 }
12725
12726 sub write_all_tables() {
12727     # Write out all the tables generated by this program to files, as well as
12728     # the supporting data structures, pod file, and .t file.
12729
12730     my @writables;              # List of tables that actually get written
12731     my %match_tables_to_write;  # Used to collapse identical match tables
12732                                 # into one file.  Each key is a hash function
12733                                 # result to partition tables into buckets.
12734                                 # Each value is an array of the tables that
12735                                 # fit in the bucket.
12736
12737     # For each property ...
12738     # (sort so that if there is an immutable file name, it has precedence, so
12739     # some other property can't come in and take over its file name.  If b's
12740     # file name is defined, will return 1, meaning to take it first; don't
12741     # care if both defined, as they had better be different anyway)
12742     PROPERTY:
12743     foreach my $property (sort { defined $b->file } property_ref('*')) {
12744         my $type = $property->type;
12745
12746         # And for each table for that property, starting with the mapping
12747         # table for it ...
12748         TABLE:
12749         foreach my $table($property,
12750
12751                         # and all the match tables for it (if any), sorted so
12752                         # the ones with the shortest associated file name come
12753                         # first.  The length sorting prevents problems of a
12754                         # longer file taking a name that might have to be used
12755                         # by a shorter one.  The alphabetic sorting prevents
12756                         # differences between releases
12757                         sort {  my $ext_a = $a->external_name;
12758                                 return 1 if ! defined $ext_a;
12759                                 my $ext_b = $b->external_name;
12760                                 return -1 if ! defined $ext_b;
12761                                 my $cmp = length $ext_a <=> length $ext_b;
12762
12763                                 # Return result if lengths not equal
12764                                 return $cmp if $cmp;
12765
12766                                 # Alphabetic if lengths equal
12767                                 return $ext_a cmp $ext_b
12768                         } $property->tables
12769                     )
12770         {
12771
12772             # Here we have a table associated with a property.  It could be
12773             # the map table (done first for each property), or one of the
12774             # other tables.  Determine which type.
12775             my $is_property = $table->isa('Property');
12776
12777             my $name = $table->name;
12778             my $complete_name = $table->complete_name;
12779
12780             # See if should suppress the table if is empty, but warn if it
12781             # contains something.
12782             my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
12783                                     keys %why_suppress_if_empty_warn_if_not;
12784
12785             # Calculate if this table should have any code points associated
12786             # with it or not.
12787             my $expected_empty =
12788
12789                 # $perl should be empty, as well as properties that we just
12790                 # don't do anything with
12791                 ($is_property
12792                     && ($table == $perl
12793                         || grep { $complete_name eq $_ }
12794                                                     @unimplemented_properties
12795                     )
12796                 )
12797
12798                 # Match tables in properties we skipped populating should be
12799                 # empty
12800                 || (! $is_property && ! $property->to_create_match_tables)
12801
12802                 # Tables and properties that are expected to have no code
12803                 # points should be empty
12804                 || $suppress_if_empty_warn_if_not
12805             ;
12806
12807             # Set a boolean if this table is the complement of an empty binary
12808             # table
12809             my $is_complement_of_empty_binary =
12810                 $type == $BINARY &&
12811                 (($table == $property->table('Y')
12812                     && $property->table('N')->is_empty)
12813                 || ($table == $property->table('N')
12814                     && $property->table('Y')->is_empty));
12815
12816
12817             # Some tables should match everything
12818             my $expected_full =
12819                 ($is_property)
12820                 ? # All these types of map tables will be full because
12821                   # they will have been populated with defaults
12822                   ($type == $ENUM || $type == $BINARY)
12823
12824                 : # A match table should match everything if its method
12825                   # shows it should
12826                   ($table->matches_all
12827
12828                   # The complement of an empty binary table will match
12829                   # everything
12830                   || $is_complement_of_empty_binary
12831                   )
12832             ;
12833
12834             if ($table->is_empty) {
12835
12836
12837                 if ($suppress_if_empty_warn_if_not) {
12838                     $table->set_status($SUPPRESSED,
12839                         $why_suppress_if_empty_warn_if_not{$complete_name});
12840                 }
12841
12842                 # Suppress expected empty tables.
12843                 next TABLE if $expected_empty;
12844
12845                 # And setup to later output a warning for those that aren't
12846                 # known to be allowed to be empty.  Don't do the warning if
12847                 # this table is a child of another one to avoid duplicating
12848                 # the warning that should come from the parent one.
12849                 if (($table == $property || $table->parent == $table)
12850                     && $table->status ne $SUPPRESSED
12851                     && ! grep { $complete_name =~ /^$_$/ }
12852                                                     @tables_that_may_be_empty)
12853                 {
12854                     push @unhandled_properties, "$table";
12855                 }
12856             }
12857             elsif ($expected_empty) {
12858                 my $because = "";
12859                 if ($suppress_if_empty_warn_if_not) {
12860                     $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
12861                 }
12862
12863                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
12864             }
12865
12866             my $count = $table->count;
12867             if ($expected_full) {
12868                 if ($count != $MAX_UNICODE_CODEPOINTS) {
12869                     Carp::my_carp("$table matches only "
12870                     . clarify_number($count)
12871                     . " Unicode code points but should match "
12872                     . clarify_number($MAX_UNICODE_CODEPOINTS)
12873                     . " (off by "
12874                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
12875                     . ").  Proceeding anyway.");
12876                 }
12877
12878                 # Here is expected to be full.  If it is because it is the
12879                 # complement of an (empty) binary table that is to be
12880                 # suppressed, then suppress this one as well.
12881                 if ($is_complement_of_empty_binary) {
12882                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
12883                     my $opposing = $property->table($opposing_name);
12884                     my $opposing_status = $opposing->status;
12885                     if ($opposing_status) {
12886                         $table->set_status($opposing_status,
12887                                            $opposing->status_info);
12888                     }
12889                 }
12890             }
12891             elsif ($count == $MAX_UNICODE_CODEPOINTS) {
12892                 if ($table == $property || $table->leader == $table) {
12893                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
12894                 }
12895             }
12896
12897             if ($table->status eq $SUPPRESSED) {
12898                 if (! $is_property) {
12899                     my @children = $table->children;
12900                     foreach my $child (@children) {
12901                         if ($child->status ne $SUPPRESSED) {
12902                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
12903                         }
12904                     }
12905                 }
12906                 next TABLE;
12907
12908             }
12909             if (! $is_property) {
12910
12911                 # Several things need to be done just once for each related
12912                 # group of match tables.  Do them on the parent.
12913                 if ($table->parent == $table) {
12914
12915                     # Add an entry in the pod file for the table; it also does
12916                     # the children.
12917                     make_table_pod_entries($table) if defined $pod_directory;
12918
12919                     # See if the the table matches identical code points with
12920                     # something that has already been output.  In that case,
12921                     # no need to have two files with the same code points in
12922                     # them.  We use the table's hash() method to store these
12923                     # in buckets, so that it is quite likely that if two
12924                     # tables are in the same bucket they will be identical, so
12925                     # don't have to compare tables frequently.  The tables
12926                     # have to have the same status to share a file, so add
12927                     # this to the bucket hash.  (The reason for this latter is
12928                     # that Heavy.pl associates a status with a file.)
12929                     my $hash = $table->hash . ';' . $table->status;
12930
12931                     # Look at each table that is in the same bucket as this
12932                     # one would be.
12933                     foreach my $comparison (@{$match_tables_to_write{$hash}})
12934                     {
12935                         if ($table->matches_identically_to($comparison)) {
12936                             $table->set_equivalent_to($comparison,
12937                                                                 Related => 0);
12938                             next TABLE;
12939                         }
12940                     }
12941
12942                     # Here, not equivalent, add this table to the bucket.
12943                     push @{$match_tables_to_write{$hash}}, $table;
12944                 }
12945             }
12946             else {
12947
12948                 # Here is the property itself.
12949                 # Don't write out or make references to the $perl property
12950                 next if $table == $perl;
12951
12952                 if ($type != $STRING) {
12953
12954                     # There is a mapping stored of the various synonyms to the
12955                     # standardized name of the property for utf8_heavy.pl.
12956                     # Also, the pod file contains entries of the form:
12957                     # \p{alias: *}         \p{full: *}
12958                     # rather than show every possible combination of things.
12959
12960                     my @property_aliases = $property->aliases;
12961
12962                     # The full name of this property is stored by convention
12963                     # first in the alias array
12964                     my $full_property_name =
12965                                 '\p{' . $property_aliases[0]->name . ': *}';
12966                     my $standard_property_name = standardize($table->name);
12967
12968                     # For each synonym ...
12969                     for my $i (0 .. @property_aliases - 1)  {
12970                         my $alias = $property_aliases[$i];
12971                         my $alias_name = $alias->name;
12972                         my $alias_standard = standardize($alias_name);
12973
12974                         # Set the mapping for utf8_heavy of the alias to the
12975                         # property
12976                         if (exists ($loose_property_name_of{$alias_standard}))
12977                         {
12978                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
12979                         }
12980                         else {
12981                             $loose_property_name_of{$alias_standard}
12982                                                 = $standard_property_name;
12983                         }
12984
12985                         # Now for the pod entry for this alias.  Skip if not
12986                         # outputting a pod; skip the first one, which is the
12987                         # full name so won't have an entry like: '\p{full: *}
12988                         # \p{full: *}', and skip if don't want an entry for
12989                         # this one.
12990                         next if $i == 0
12991                                 || ! defined $pod_directory
12992                                 || ! $alias->make_pod_entry;
12993
12994                         my $rhs = $full_property_name;
12995                         if ($property != $perl && $table->perl_extension) {
12996                             $rhs .= ' (Perl extension)';
12997                         }
12998                         push @match_properties,
12999                             format_pod_line($indent_info_column,
13000                                         '\p{' . $alias->name . ': *}',
13001                                         $rhs,
13002                                         $alias->status);
13003                     }
13004                 } # End of non-string-like property code
13005
13006
13007                 # Don't output a mapping file if not desired.
13008                 next if ! $property->to_output_map;
13009             }
13010
13011             # Here, we know we want to write out the table, but don't do it
13012             # yet because there may be other tables that come along and will
13013             # want to share the file, and the file's comments will change to
13014             # mention them.  So save for later.
13015             push @writables, $table;
13016
13017         } # End of looping through the property and all its tables.
13018     } # End of looping through all properties.
13019
13020     # Now have all the tables that will have files written for them.  Do it.
13021     foreach my $table (@writables) {
13022         my @directory;
13023         my $filename;
13024         my $property = $table->property;
13025         my $is_property = ($table == $property);
13026         if (! $is_property) {
13027
13028             # Match tables for the property go in lib/$subdirectory, which is
13029             # the property's name.  Don't use the standard file name for this,
13030             # as may get an unfamiliar alias
13031             @directory = ($matches_directory, $property->external_name);
13032         }
13033         else {
13034
13035             @directory = $table->directory;
13036             $filename = $table->file;
13037         }
13038
13039         # Use specified filename if avaliable, or default to property's
13040         # shortest name.  We need an 8.3 safe filename (which means "an 8
13041         # safe" filename, since after the dot is only 'pl', which is < 3)
13042         # The 2nd parameter is if the filename shouldn't be changed, and
13043         # it shouldn't iff there is a hard-coded name for this table.
13044         $filename = construct_filename(
13045                                 $filename || $table->external_name,
13046                                 ! $filename,    # mutable if no filename
13047                                 \@directory);
13048
13049         register_file_for_name($table, \@directory, $filename);
13050
13051         # Only need to write one file when shared by more than one
13052         # property
13053         next if ! $is_property && $table->leader != $table;
13054
13055         # Construct a nice comment to add to the file
13056         $table->set_final_comment;
13057
13058         $table->write;
13059     }
13060
13061
13062     # Write out the pod file
13063     make_pod;
13064
13065     # And Heavy.pl
13066     make_Heavy;
13067
13068     make_property_test_script() if $make_test_script;
13069     return;
13070 }
13071
13072 my @white_space_separators = ( # This used only for making the test script.
13073                             "",
13074                             ' ',
13075                             "\t",
13076                             '   '
13077                         );
13078
13079 sub generate_separator($) {
13080     # This used only for making the test script.  It generates the colon or
13081     # equal separator between the property and property value, with random
13082     # white space surrounding the separator
13083
13084     my $lhs = shift;
13085
13086     return "" if $lhs eq "";  # No separator if there's only one (the r) side
13087
13088     # Choose space before and after randomly
13089     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
13090     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
13091
13092     # And return the whole complex, half the time using a colon, half the
13093     # equals
13094     return $spaces_before
13095             . (rand() < 0.5) ? '=' : ':'
13096             . $spaces_after;
13097 }
13098
13099 sub generate_tests($$$$$$) {
13100     # This used only for making the test script.  It generates test cases that
13101     # are expected to compile successfully in perl.  Note that the lhs and
13102     # rhs are assumed to already be as randomized as the caller wants.
13103
13104     my $file_handle = shift;   # Where to output the tests
13105     my $lhs = shift;           # The property: what's to the left of the colon
13106                                #  or equals separator
13107     my $rhs = shift;           # The property value; what's to the right
13108     my $valid_code = shift;    # A code point that's known to be in the
13109                                # table given by lhs=rhs; undef if table is
13110                                # empty
13111     my $invalid_code = shift;  # A code point known to not be in the table;
13112                                # undef if the table is all code points
13113     my $warning = shift;
13114
13115     # Get the colon or equal
13116     my $separator = generate_separator($lhs);
13117
13118     # The whole 'property=value'
13119     my $name = "$lhs$separator$rhs";
13120
13121     # Create a complete set of tests, with complements.
13122     if (defined $valid_code) {
13123         printf $file_handle
13124                     qq/Expect(1, $valid_code, '\\p{$name}', $warning);\n/;
13125         printf $file_handle
13126                     qq/Expect(0, $valid_code, '\\p{^$name}', $warning);\n/;
13127         printf $file_handle
13128                     qq/Expect(0, $valid_code, '\\P{$name}', $warning);\n/;
13129         printf $file_handle
13130                     qq/Expect(1, $valid_code, '\\P{^$name}', $warning);\n/;
13131     }
13132     if (defined $invalid_code) {
13133         printf $file_handle
13134                     qq/Expect(0, $invalid_code, '\\p{$name}', $warning);\n/;
13135         printf $file_handle
13136                     qq/Expect(1, $invalid_code, '\\p{^$name}', $warning);\n/;
13137         printf $file_handle
13138                     qq/Expect(1, $invalid_code, '\\P{$name}', $warning);\n/;
13139         printf $file_handle
13140                     qq/Expect(0, $invalid_code, '\\P{^$name}', $warning);\n/;
13141     }
13142     return;
13143 }
13144
13145 sub generate_error($$$$) {
13146     # This used only for making the test script.  It generates test cases that
13147     # are expected to not only not match, but to be syntax or similar errors
13148
13149     my $file_handle = shift;        # Where to output to.
13150     my $lhs = shift;                # The property: what's to the left of the
13151                                     # colon or equals separator
13152     my $rhs = shift;                # The property value; what's to the right
13153     my $already_in_error = shift;   # Boolean; if true it's known that the
13154                                 # unmodified lhs and rhs will cause an error.
13155                                 # This routine should not force another one
13156     # Get the colon or equal
13157     my $separator = generate_separator($lhs);
13158
13159     # Since this is an error only, don't bother to randomly decide whether to
13160     # put the error on the left or right side; and assume that the rhs is
13161     # loosely matched, again for convenience rather than rigor.
13162     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
13163
13164     my $property = $lhs . $separator . $rhs;
13165
13166     print $file_handle qq/Error('\\p{$property}');\n/;
13167     print $file_handle qq/Error('\\P{$property}');\n/;
13168     return;
13169 }
13170
13171 # These are used only for making the test script
13172 # XXX Maybe should also have a bad strict seps, which includes underscore.
13173
13174 my @good_loose_seps = (
13175             " ",
13176             "-",
13177             "\t",
13178             "",
13179             "_",
13180            );
13181 my @bad_loose_seps = (
13182            "/a/",
13183            ':=',
13184           );
13185
13186 sub randomize_stricter_name {
13187     # This used only for making the test script.  Take the input name and
13188     # return a randomized, but valid version of it under the stricter matching
13189     # rules.
13190
13191     my $name = shift;
13192     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13193
13194     # If the name looks like a number (integer, floating, or rational), do
13195     # some extra work
13196     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
13197         my $sign = $1;
13198         my $number = $2;
13199         my $separator = $3;
13200
13201         # If there isn't a sign, part of the time add a plus
13202         # Note: Not testing having any denominator having a minus sign
13203         if (! $sign) {
13204             $sign = '+' if rand() <= .3;
13205         }
13206
13207         # And add 0 or more leading zeros.
13208         $name = $sign . ('0' x int rand(10)) . $number;
13209
13210         if (defined $separator) {
13211             my $extra_zeros = '0' x int rand(10);
13212
13213             if ($separator eq '.') {
13214
13215                 # Similarly, add 0 or more trailing zeros after a decimal
13216                 # point
13217                 $name .= $extra_zeros;
13218             }
13219             else {
13220
13221                 # Or, leading zeros before the denominator
13222                 $name =~ s,/,/$extra_zeros,;
13223             }
13224         }
13225     }
13226
13227     # For legibility of the test, only change the case of whole sections at a
13228     # time.  To do this, first split into sections.  The split returns the
13229     # delimiters
13230     my @sections;
13231     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
13232         trace $section if main::DEBUG && $to_trace;
13233
13234         if (length $section > 1 && $section !~ /\D/) {
13235
13236             # If the section is a sequence of digits, about half the time
13237             # randomly add underscores between some of them.
13238             if (rand() > .5) {
13239
13240                 # Figure out how many underscores to add.  max is 1 less than
13241                 # the number of digits.  (But add 1 at the end to make sure
13242                 # result isn't 0, and compensate earlier by subtracting 2
13243                 # instead of 1)
13244                 my $num_underscores = int rand(length($section) - 2) + 1;
13245
13246                 # And add them evenly throughout, for convenience, not rigor
13247                 use integer;
13248                 my $spacing = (length($section) - 1)/ $num_underscores;
13249                 my $temp = $section;
13250                 $section = "";
13251                 for my $i (1 .. $num_underscores) {
13252                     $section .= substr($temp, 0, $spacing, "") . '_';
13253                 }
13254                 $section .= $temp;
13255             }
13256             push @sections, $section;
13257         }
13258         else {
13259
13260             # Here not a sequence of digits.  Change the case of the section
13261             # randomly
13262             my $switch = int rand(4);
13263             if ($switch == 0) {
13264                 push @sections, uc $section;
13265             }
13266             elsif ($switch == 1) {
13267                 push @sections, lc $section;
13268             }
13269             elsif ($switch == 2) {
13270                 push @sections, ucfirst $section;
13271             }
13272             else {
13273                 push @sections, $section;
13274             }
13275         }
13276     }
13277     trace "returning", join "", @sections if main::DEBUG && $to_trace;
13278     return join "", @sections;
13279 }
13280
13281 sub randomize_loose_name($;$) {
13282     # This used only for making the test script
13283
13284     my $name = shift;
13285     my $want_error = shift;  # if true, make an error
13286     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13287
13288     $name = randomize_stricter_name($name);
13289
13290     my @parts;
13291     push @parts, $good_loose_seps[rand(@good_loose_seps)];
13292     for my $part (split /[-\s_]+/, $name) {
13293         if (@parts) {
13294             if ($want_error and rand() < 0.3) {
13295                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
13296                 $want_error = 0;
13297             }
13298             else {
13299                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13300             }
13301         }
13302         push @parts, $part;
13303     }
13304     my $new = join("", @parts);
13305     trace "$name => $new" if main::DEBUG && $to_trace;
13306
13307     if ($want_error) {
13308         if (rand() >= 0.5) {
13309             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
13310         }
13311         else {
13312             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
13313         }
13314     }
13315     return $new;
13316 }
13317
13318 # Used to make sure don't generate duplicate test cases.
13319 my %test_generated;
13320
13321 sub make_property_test_script() {
13322     # This used only for making the test script
13323     # this written directly -- it's huge.
13324
13325     print "Making test script\n" if $verbosity >= $PROGRESS;
13326
13327     # This uses randomness to test different possibilities without testing all
13328     # possibilities.  To ensure repeatability, set the seed to 0.  But if
13329     # tests are added, it will perturb all later ones in the .t file
13330     srand 0;
13331
13332     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
13333
13334     force_unlink ($t_path);
13335     push @files_actually_output, $t_path;
13336     my $OUT;
13337     if (not open $OUT, "> $t_path") {
13338         Carp::my_carp("Can't open $t_path.  Skipping: $!");
13339         return;
13340     }
13341
13342     # Keep going down an order of magnitude
13343     # until find that adding this quantity to
13344     # 1 remains 1; but put an upper limit on
13345     # this so in case this algorithm doesn't
13346     # work properly on some platform, that we
13347     # won't loop forever.
13348     my $digits = 0;
13349     my $min_floating_slop = 1;
13350     while (1+ $min_floating_slop != 1
13351             && $digits++ < 50)
13352     {
13353         my $next = $min_floating_slop / 10;
13354         last if $next == 0; # If underflows,
13355                             # use previous one
13356         $min_floating_slop = $next;
13357     }
13358     print $OUT $HEADER, <DATA>;
13359
13360     foreach my $property (property_ref('*')) {
13361         foreach my $table ($property->tables) {
13362
13363             # Find code points that match, and don't match this table.
13364             my $valid = $table->get_valid_code_point;
13365             my $invalid = $table->get_invalid_code_point;
13366             my $warning = ($table->status eq $DEPRECATED)
13367                             ? "'deprecated'"
13368                             : '""';
13369
13370             # Test each possible combination of the property's aliases with
13371             # the table's.  If this gets to be too many, could do what is done
13372             # in the set_final_comment() for Tables
13373             my @table_aliases = $table->aliases;
13374             my @property_aliases = $table->property->aliases;
13375             my $max = max(scalar @table_aliases, scalar @property_aliases);
13376             for my $j (0 .. $max - 1) {
13377
13378                 # The current alias for property is the next one on the list,
13379                 # or if beyond the end, start over.  Similarly for table
13380                 my $property_name
13381                             = $property_aliases[$j % @property_aliases]->name;
13382
13383                 $property_name = "" if $table->property == $perl;
13384                 my $table_alias = $table_aliases[$j % @table_aliases];
13385                 my $table_name = $table_alias->name;
13386                 my $loose_match = $table_alias->loose_match;
13387
13388                 # If the table doesn't have a file, any test for it is
13389                 # already guaranteed to be in error
13390                 my $already_error = ! $table->file_path;
13391
13392                 # Generate error cases for this alias.
13393                 generate_error($OUT,
13394                                 $property_name,
13395                                 $table_name,
13396                                 $already_error);
13397
13398                 # If the table is guaranteed to always generate an error,
13399                 # quit now without generating success cases.
13400                 next if $already_error;
13401
13402                 # Now for the success cases.
13403                 my $random;
13404                 if ($loose_match) {
13405
13406                     # For loose matching, create an extra test case for the
13407                     # standard name.
13408                     my $standard = standardize($table_name);
13409
13410                     # $test_name should be a unique combination for each test
13411                     # case; used just to avoid duplicate tests
13412                     my $test_name = "$property_name=$standard";
13413
13414                     # Don't output duplicate test cases.
13415                     if (! exists $test_generated{$test_name}) {
13416                         $test_generated{$test_name} = 1;
13417                         generate_tests($OUT,
13418                                         $property_name,
13419                                         $standard,
13420                                         $valid,
13421                                         $invalid,
13422                                         $warning,
13423                                     );
13424                     }
13425                     $random = randomize_loose_name($table_name)
13426                 }
13427                 else { # Stricter match
13428                     $random = randomize_stricter_name($table_name);
13429                 }
13430
13431                 # Now for the main test case for this alias.
13432                 my $test_name = "$property_name=$random";
13433                 if (! exists $test_generated{$test_name}) {
13434                     $test_generated{$test_name} = 1;
13435                     generate_tests($OUT,
13436                                     $property_name,
13437                                     $random,
13438                                     $valid,
13439                                     $invalid,
13440                                     $warning,
13441                                 );
13442
13443                     # If the name is a rational number, add tests for the
13444                     # floating point equivalent.
13445                     if ($table_name =~ qr{/}) {
13446
13447                         # Calculate the float, and find just the fraction.
13448                         my $float = eval $table_name;
13449                         my ($whole, $fraction)
13450                                             = $float =~ / (.*) \. (.*) /x;
13451
13452                         # Starting with one digit after the decimal point,
13453                         # create a test for each possible precision (number of
13454                         # digits past the decimal point) until well beyond the
13455                         # native number found on this machine.  (If we started
13456                         # with 0 digits, it would be an integer, which could
13457                         # well match an unrelated table)
13458                         PLACE:
13459                         for my $i (1 .. $min_floating_slop + 3) {
13460                             my $table_name = sprintf("%.*f", $i, $float);
13461                             if ($i < $MIN_FRACTION_LENGTH) {
13462
13463                                 # If the test case has fewer digits than the
13464                                 # minimum acceptable precision, it shouldn't
13465                                 # succeed, so we expect an error for it.
13466                                 # E.g., 2/3 = .7 at one decimal point, and we
13467                                 # shouldn't say it matches .7.  We should make
13468                                 # it be .667 at least before agreeing that the
13469                                 # intent was to match 2/3.  But at the
13470                                 # less-than- acceptable level of precision, it
13471                                 # might actually match an unrelated number.
13472                                 # So don't generate a test case if this
13473                                 # conflating is possible.  In our example, we
13474                                 # don't want 2/3 matching 7/10, if there is
13475                                 # a 7/10 code point.
13476                                 for my $existing
13477                                         (keys %nv_floating_to_rational)
13478                                 {
13479                                     next PLACE
13480                                         if abs($table_name - $existing)
13481                                                 < $MAX_FLOATING_SLOP;
13482                                 }
13483                                 generate_error($OUT,
13484                                             $property_name,
13485                                             $table_name,
13486                                             1   # 1 => already an error
13487                                 );
13488                             }
13489                             else {
13490
13491                                 # Here the number of digits exceeds the
13492                                 # minimum we think is needed.  So generate a
13493                                 # success test case for it.
13494                                 generate_tests($OUT,
13495                                                 $property_name,
13496                                                 $table_name,
13497                                                 $valid,
13498                                                 $invalid,
13499                                                 $warning,
13500                                 );
13501                             }
13502                         }
13503                     }
13504                 }
13505             }
13506         }
13507     }
13508
13509     foreach my $test (@backslash_X_tests) {
13510         print $OUT "Test_X('$test');\n";
13511     }
13512
13513     print $OUT "Finished();\n";
13514     close $OUT;
13515     return;
13516 }
13517
13518 # This is a list of the input files and how to handle them.  The files are
13519 # processed in their order in this list.  Some reordering is possible if
13520 # desired, but the v0 files should be first, and the extracted before the
13521 # others except DAge.txt (as data in an extracted file can be over-ridden by
13522 # the non-extracted.  Some other files depend on data derived from an earlier
13523 # file, like UnicodeData requires data from Jamo, and the case changing and
13524 # folding requires data from Unicode.  Mostly, it safest to order by first
13525 # version releases in (except the Jamo).  DAge.txt is read before the
13526 # extracted ones because of the rarely used feature $compare_versions.  In the
13527 # unlikely event that there were ever an extracted file that contained the Age
13528 # property information, it would have to go in front of DAge.
13529 #
13530 # The version strings allow the program to know whether to expect a file or
13531 # not, but if a file exists in the directory, it will be processed, even if it
13532 # is in a version earlier than expected, so you can copy files from a later
13533 # release into an earlier release's directory.
13534 my @input_file_objects = (
13535     Input_file->new('PropertyAliases.txt', v0,
13536                     Handler => \&process_PropertyAliases,
13537                     ),
13538     Input_file->new(undef, v0,  # No file associated with this
13539                     Progress_Message => 'Finishing property setup',
13540                     Handler => \&finish_property_setup,
13541                     ),
13542     Input_file->new('PropValueAliases.txt', v0,
13543                      Handler => \&process_PropValueAliases,
13544                      Has_Missings_Defaults => $NOT_IGNORED,
13545                      ),
13546     Input_file->new('DAge.txt', v3.2.0,
13547                     Has_Missings_Defaults => $NOT_IGNORED,
13548                     Property => 'Age'
13549                     ),
13550     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
13551                     Property => 'General_Category',
13552                     ),
13553     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
13554                     Property => 'Canonical_Combining_Class',
13555                     Has_Missings_Defaults => $NOT_IGNORED,
13556                     ),
13557     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
13558                     Property => 'Numeric_Type',
13559                     Has_Missings_Defaults => $NOT_IGNORED,
13560                     ),
13561     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
13562                     Property => 'East_Asian_Width',
13563                     Has_Missings_Defaults => $NOT_IGNORED,
13564                     ),
13565     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
13566                     Property => 'Line_Break',
13567                     Has_Missings_Defaults => $NOT_IGNORED,
13568                     ),
13569     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
13570                     Property => 'Bidi_Class',
13571                     Has_Missings_Defaults => $NOT_IGNORED,
13572                     ),
13573     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
13574                     Property => 'Decomposition_Type',
13575                     Has_Missings_Defaults => $NOT_IGNORED,
13576                     ),
13577     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
13578     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
13579                     Property => 'Numeric_Value',
13580                     Each_Line_Handler => \&filter_numeric_value_line,
13581                     Has_Missings_Defaults => $NOT_IGNORED,
13582                     ),
13583     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
13584                     Property => 'Joining_Group',
13585                     Has_Missings_Defaults => $NOT_IGNORED,
13586                     ),
13587
13588     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
13589                     Property => 'Joining_Type',
13590                     Has_Missings_Defaults => $NOT_IGNORED,
13591                     ),
13592     Input_file->new('Jamo.txt', v2.0.0,
13593                     Property => 'Jamo_Short_Name',
13594                     Each_Line_Handler => \&filter_jamo_line,
13595                     ),
13596     Input_file->new('UnicodeData.txt', v1.1.5,
13597                     Pre_Handler => \&setup_UnicodeData,
13598
13599                     # We clean up this file for some early versions.
13600                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
13601                                             ? \&filter_v1_ucd
13602                                             : ($v_version eq v2.1.5)
13603                                                 ? \&filter_v2_1_5_ucd
13604                                                 : undef),
13605
13606                                             # And the main filter
13607                                             \&filter_UnicodeData_line,
13608                                          ],
13609                     EOF_Handler => \&EOF_UnicodeData,
13610                     ),
13611     Input_file->new('ArabicShaping.txt', v2.0.0,
13612                     Each_Line_Handler =>
13613                         [ ($v_version lt 4.1.0)
13614                                     ? \&filter_old_style_arabic_shaping
13615                                     : undef,
13616                         \&filter_arabic_shaping_line,
13617                         ],
13618                     Has_Missings_Defaults => $NOT_IGNORED,
13619                     ),
13620     Input_file->new('Blocks.txt', v2.0.0,
13621                     Property => 'Block',
13622                     Has_Missings_Defaults => $NOT_IGNORED,
13623                     Each_Line_Handler => \&filter_blocks_lines
13624                     ),
13625     Input_file->new('PropList.txt', v2.0.0,
13626                     Each_Line_Handler => (($v_version lt v3.1.0)
13627                                             ? \&filter_old_style_proplist
13628                                             : undef),
13629                     ),
13630     Input_file->new('Unihan.txt', v2.0.0,
13631                     Pre_Handler => \&setup_unihan,
13632                     Optional => 1,
13633                     Each_Line_Handler => \&filter_unihan_line,
13634                         ),
13635     Input_file->new('SpecialCasing.txt', v2.1.8,
13636                     Each_Line_Handler => \&filter_special_casing_line,
13637                     Pre_Handler => \&setup_special_casing,
13638                     ),
13639     Input_file->new(
13640                     'LineBreak.txt', v3.0.0,
13641                     Has_Missings_Defaults => $NOT_IGNORED,
13642                     Property => 'Line_Break',
13643                     # Early versions had problematic syntax
13644                     Each_Line_Handler => (($v_version lt v3.1.0)
13645                                         ? \&filter_early_ea_lb
13646                                         : undef),
13647                     ),
13648     Input_file->new('EastAsianWidth.txt', v3.0.0,
13649                     Property => 'East_Asian_Width',
13650                     Has_Missings_Defaults => $NOT_IGNORED,
13651                     # Early versions had problematic syntax
13652                     Each_Line_Handler => (($v_version lt v3.1.0)
13653                                         ? \&filter_early_ea_lb
13654                                         : undef),
13655                     ),
13656     Input_file->new('CompositionExclusions.txt', v3.0.0,
13657                     Property => 'Composition_Exclusion',
13658                     ),
13659     Input_file->new('BidiMirroring.txt', v3.0.1,
13660                     Property => 'Bidi_Mirroring_Glyph',
13661                     ),
13662     Input_file->new("NormalizationTest.txt", v3.0.1,
13663                     Skip => 1,
13664                     ),
13665     Input_file->new('CaseFolding.txt', v3.0.1,
13666                     Pre_Handler => \&setup_case_folding,
13667                     Each_Line_Handler =>
13668                         [ ($v_version lt v3.1.0)
13669                                  ? \&filter_old_style_case_folding
13670                                  : undef,
13671                            \&filter_case_folding_line
13672                         ],
13673                     Post_Handler => \&post_fold,
13674                     ),
13675     Input_file->new('DCoreProperties.txt', v3.1.0,
13676                     # 5.2 changed this file
13677                     Has_Missings_Defaults => (($v_version ge v5.2.0)
13678                                             ? $NOT_IGNORED
13679                                             : $NO_DEFAULTS),
13680                     ),
13681     Input_file->new('Scripts.txt', v3.1.0,
13682                     Property => 'Script',
13683                     Has_Missings_Defaults => $NOT_IGNORED,
13684                     ),
13685     Input_file->new('DNormalizationProps.txt', v3.1.0,
13686                     Has_Missings_Defaults => $NOT_IGNORED,
13687                     Each_Line_Handler => (($v_version lt v4.0.1)
13688                                       ? \&filter_old_style_normalization_lines
13689                                       : undef),
13690                     ),
13691     Input_file->new('HangulSyllableType.txt', v4.0.0,
13692                     Has_Missings_Defaults => $NOT_IGNORED,
13693                     Property => 'Hangul_Syllable_Type'),
13694     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
13695                     Property => 'Word_Break',
13696                     Has_Missings_Defaults => $NOT_IGNORED,
13697                     ),
13698     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
13699                     Property => 'Grapheme_Cluster_Break',
13700                     Has_Missings_Defaults => $NOT_IGNORED,
13701                     ),
13702     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
13703                     Handler => \&process_GCB_test,
13704                     ),
13705     Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
13706                     Skip => 1,
13707                     ),
13708     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
13709                     Skip => 1,
13710                     ),
13711     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
13712                     Skip => 1,
13713                     ),
13714     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
13715                     Property => 'Sentence_Break',
13716                     Has_Missings_Defaults => $NOT_IGNORED,
13717                     ),
13718     Input_file->new('NamedSequences.txt', v4.1.0,
13719                     Handler => \&process_NamedSequences
13720                     ),
13721     Input_file->new('NameAliases.txt', v5.0.0,
13722                     Property => 'Name_Alias',
13723                     ),
13724     Input_file->new("BidiTest.txt", v5.2.0,
13725                     Skip => 1,
13726                     ),
13727     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
13728                     Optional => 1,
13729                     Each_Line_Handler => \&filter_unihan_line,
13730                     ),
13731     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
13732                     Optional => 1,
13733                     Each_Line_Handler => \&filter_unihan_line,
13734                     ),
13735     Input_file->new('UnihanIRGSources.txt', v5.2.0,
13736                     Optional => 1,
13737                     Pre_Handler => \&setup_unihan,
13738                     Each_Line_Handler => \&filter_unihan_line,
13739                     ),
13740     Input_file->new('UnihanNumericValues.txt', v5.2.0,
13741                     Optional => 1,
13742                     Each_Line_Handler => \&filter_unihan_line,
13743                     ),
13744     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
13745                     Optional => 1,
13746                     Each_Line_Handler => \&filter_unihan_line,
13747                     ),
13748     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
13749                     Optional => 1,
13750                     Each_Line_Handler => \&filter_unihan_line,
13751                     ),
13752     Input_file->new('UnihanReadings.txt', v5.2.0,
13753                     Optional => 1,
13754                     Each_Line_Handler => \&filter_unihan_line,
13755                     ),
13756     Input_file->new('UnihanVariants.txt', v5.2.0,
13757                     Optional => 1,
13758                     Each_Line_Handler => \&filter_unihan_line,
13759                     ),
13760 );
13761
13762 # End of all the preliminaries.
13763 # Do it...
13764
13765 if ($compare_versions) {
13766     Carp::my_carp(<<END
13767 Warning.  \$compare_versions is set.  Output is not suitable for production
13768 END
13769     );
13770 }
13771
13772 # Put into %potential_files a list of all the files in the directory structure
13773 # that could be inputs to this program, excluding those that we should ignore.
13774 # Use absolute file names because it makes it easier across machine types.
13775 my @ignored_files_full_names = map { File::Spec->rel2abs(
13776                                      internal_file_to_platform($_))
13777                                 } keys %ignored_files;
13778 File::Find::find({
13779     wanted=>sub {
13780         return unless /\.txt$/i;  # Some platforms change the name's case
13781         my $full = lc(File::Spec->rel2abs($_));
13782         $potential_files{$full} = 1
13783                     if ! grep { $full eq lc($_) } @ignored_files_full_names;
13784         return;
13785     }
13786 }, File::Spec->curdir());
13787
13788 my @mktables_list_output_files;
13789
13790 if ($write_unchanged_files) {
13791     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
13792 }
13793 else {
13794     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
13795     my $file_handle;
13796     if (! open $file_handle, "<", $file_list) {
13797         Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!");
13798         $glob_list = 1;
13799     }
13800     else {
13801         my @input;
13802
13803         # Read and parse mktables.lst, placing the results from the first part
13804         # into @input, and the second part into @mktables_list_output_files
13805         for my $list ( \@input, \@mktables_list_output_files ) {
13806             while (<$file_handle>) {
13807                 s/^ \s+ | \s+ $//xg;
13808                 next if /^ \s* (?: \# .* )? $/x;
13809                 last if /^ =+ $/x;
13810                 my ( $file ) = split /\t/;
13811                 push @$list, $file;
13812             }
13813             @$list = uniques(@$list);
13814             next;
13815         }
13816
13817         # Look through all the input files
13818         foreach my $input (@input) {
13819             next if $input eq 'version'; # Already have checked this.
13820
13821             # Ignore if doesn't exist.  The checking about whether we care or
13822             # not is done via the Input_file object.
13823             next if ! file_exists($input);
13824
13825             # The paths are stored with relative names, and with '/' as the
13826             # delimiter; convert to absolute on this machine
13827             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
13828             $potential_files{$full} = 1
13829                         if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
13830         }
13831     }
13832
13833     close $file_handle;
13834 }
13835
13836 if ($glob_list) {
13837
13838     # Here wants to process all .txt files in the directory structure.
13839     # Convert them to full path names.  They are stored in the platform's
13840     # relative style
13841     my @known_files;
13842     foreach my $object (@input_file_objects) {
13843         my $file = $object->file;
13844         next unless defined $file;
13845         push @known_files, File::Spec->rel2abs($file);
13846     }
13847
13848     my @unknown_input_files;
13849     foreach my $file (keys %potential_files) {
13850         next if grep { lc($file) eq lc($_) } @known_files;
13851
13852         # Here, the file is unknown to us.  Get relative path name
13853         $file = File::Spec->abs2rel($file);
13854         push @unknown_input_files, $file;
13855
13856         # What will happen is we create a data structure for it, and add it to
13857         # the list of input files to process.  First get the subdirectories
13858         # into an array
13859         my (undef, $directories, undef) = File::Spec->splitpath($file);
13860         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
13861         my @directories = File::Spec->splitdir($directories);
13862
13863         # If the file isn't extracted (meaning none of the directories is the
13864         # extracted one), just add it to the end of the list of inputs.
13865         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
13866             push @input_file_objects, Input_file->new($file, v0);
13867         }
13868         else {
13869
13870             # Here, the file is extracted.  It needs to go ahead of most other
13871             # processing.  Search for the first input file that isn't a
13872             # special required property (that is, find one whose first_release
13873             # is non-0), and isn't extracted.  Also, the Age property file is
13874             # processed before the extracted ones, just in case
13875             # $compare_versions is set.
13876             for (my $i = 0; $i < @input_file_objects; $i++) {
13877                 if ($input_file_objects[$i]->first_released ne v0
13878                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
13879                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
13880                 {
13881                     splice @input_file_objects, $i, 0,
13882                                                 Input_file->new($file, v0);
13883                     last;
13884                 }
13885             }
13886
13887         }
13888     }
13889     if (@unknown_input_files) {
13890         print STDERR simple_fold(join_lines(<<END
13891
13892 The following files are unknown as to how to handle.  Assuming they are
13893 typical property files.  You'll know by later error messages if it worked or
13894 not:
13895 END
13896         ) . " " . join(", ", @unknown_input_files) . "\n\n");
13897     }
13898 } # End of looking through directory structure for more .txt files.
13899
13900 # Create the list of input files from the objects we have defined, plus
13901 # version
13902 my @input_files = 'version';
13903 foreach my $object (@input_file_objects) {
13904     my $file = $object->file;
13905     next if ! defined $file;    # Not all objects have files
13906     next if $object->optional && ! -e $file;
13907     push @input_files,  $file;
13908 }
13909
13910 if ( $verbosity >= $VERBOSE ) {
13911     print "Expecting ".scalar( @input_files )." input files. ",
13912          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
13913 }
13914
13915 # We set $youngest to be the most recently changed input file, including this
13916 # program itself (done much earlier in this file)
13917 foreach my $in (@input_files) {
13918     my $age = -M $in;
13919     next unless defined $age;        # Keep going even if missing a file
13920     $youngest = $age if $age < $youngest;
13921
13922     # See that the input files have distinct names, to warn someone if they
13923     # are adding a new one
13924     if ($make_list) {
13925         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
13926         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
13927         my @directories = File::Spec->splitdir($directories);
13928         my $base = $file =~ s/\.txt$//;
13929         construct_filename($file, 'mutable', \@directories);
13930     }
13931 }
13932
13933 my $ok = ! $write_unchanged_files
13934         && scalar @mktables_list_output_files;        # If none known, rebuild
13935
13936 # Now we check to see if any output files are older than youngest, if
13937 # they are, we need to continue on, otherwise we can presumably bail.
13938 if ($ok) {
13939     foreach my $out (@mktables_list_output_files) {
13940         if ( ! file_exists($out)) {
13941             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
13942             $ok = 0;
13943             last;
13944          }
13945         #local $to_trace = 1 if main::DEBUG;
13946         trace $youngest, -M $out if main::DEBUG && $to_trace;
13947         if ( -M $out > $youngest ) {
13948             #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
13949             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
13950             $ok = 0;
13951             last;
13952         }
13953     }
13954 }
13955 if ($ok) {
13956     print "Files seem to be ok, not bothering to rebuild.\n";
13957     exit(0);
13958 }
13959 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
13960
13961 # Ready to do the major processing.  First create the perl pseudo-property.
13962 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
13963
13964 # Process each input file
13965 foreach my $file (@input_file_objects) {
13966     $file->run;
13967 }
13968
13969 # Finish the table generation.
13970
13971 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
13972 finish_Unicode();
13973
13974 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
13975 compile_perl();
13976
13977 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
13978 add_perl_synonyms();
13979
13980 print "Writing tables\n" if $verbosity >= $PROGRESS;
13981 write_all_tables();
13982
13983 # Write mktables.lst
13984 if ( $file_list and $make_list ) {
13985
13986     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
13987     foreach my $file (@input_files, @files_actually_output) {
13988         my (undef, $directories, $file) = File::Spec->splitpath($file);
13989         my @directories = File::Spec->splitdir($directories);
13990         $file = join '/', @directories, $file;
13991     }
13992
13993     my $ofh;
13994     if (! open $ofh,">",$file_list) {
13995         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
13996         return
13997     }
13998     else {
13999         print $ofh <<"END";
14000 #
14001 # $file_list -- File list for $0.
14002 #
14003 #   Autogenerated on @{[scalar localtime]}
14004 #
14005 # - First section is input files
14006 #   ($0 itself is not listed but is automatically considered an input)
14007 # - Section seperator is /^=+\$/
14008 # - Second section is a list of output files.
14009 # - Lines matching /^\\s*#/ are treated as comments
14010 #   which along with blank lines are ignored.
14011 #
14012
14013 # Input files:
14014
14015 END
14016         print $ofh "$_\n" for sort(@input_files);
14017         print $ofh "\n=================================\n# Output files:\n\n";
14018         print $ofh "$_\n" for sort @files_actually_output;
14019         print $ofh "\n# ",scalar(@input_files)," input files\n",
14020                 "# ",scalar(@files_actually_output)+1," output files\n\n",
14021                 "# End list\n";
14022         close $ofh
14023             or Carp::my_carp("Failed to close $ofh: $!");
14024
14025         print "Filelist has ",scalar(@input_files)," input files and ",
14026             scalar(@files_actually_output)+1," output files\n"
14027             if $verbosity >= $VERBOSE;
14028     }
14029 }
14030
14031 # Output these warnings unless -q explicitly specified.
14032 if ($verbosity >= $NORMAL_VERBOSITY) {
14033     if (@unhandled_properties) {
14034         print "\nProperties and tables that unexpectedly have no code points\n";
14035         foreach my $property (sort @unhandled_properties) {
14036             print $property, "\n";
14037         }
14038     }
14039
14040     if (%potential_files) {
14041         print "\nInput files that are not considered:\n";
14042         foreach my $file (sort keys %potential_files) {
14043             print File::Spec->abs2rel($file), "\n";
14044         }
14045     }
14046     print "\nAll done\n" if $verbosity >= $VERBOSE;
14047 }
14048 exit(0);
14049
14050 # TRAILING CODE IS USED BY make_property_test_script()
14051 __DATA__
14052
14053 use strict;
14054 use warnings;
14055
14056 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
14057 # constructed by mktables from the tables it generates, so if mktables is
14058 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
14059 # feasible properties; a few aren't currently feasible; see
14060 # is_code_point_usable() in mktables for details.
14061
14062 # Standard test packages are not used because this manipulates SIG_WARN.  It
14063 # exits 0 if every non-skipped test succeeded; -1 if any failed.
14064
14065 my $Tests = 0;
14066 my $Fails = 0;
14067
14068 my $non_ASCII = (ord('A') != 65);
14069
14070 # The 256 8-bit characters in ASCII ordinal order, with the ones that don't
14071 # have Perl names replaced by -1
14072 my @ascii_ordered_chars = (
14073     "\0",
14074     (-1) x 6,
14075     "\a", "\b", "\t", "\n",
14076     -1,   # No Vt
14077     "\f", "\r",
14078     (-1) x 18,
14079     " ", "!", "\"", "#", '$', "%", "&", "'",
14080     "(", ")", "*", "+", ",", "-", ".", "/",
14081     "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
14082     ":", ";", "<", "=", ">", "?", "@",
14083     "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
14084     "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
14085     "[", "\\", "]", "^", "_", "`",
14086     "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
14087     "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
14088     "{", "|", "}", "~",
14089     (-1) x 129
14090 );
14091
14092 sub ASCII_ord_to_native ($) {
14093     # Converts input ordinal number to the native one, if can be done easily.
14094     # Returns -1 otherwise.
14095
14096     my $ord = shift;
14097
14098     return $ord if $ord > 255 || ! $non_ASCII;
14099     my $result = $ascii_ordered_chars[$ord];
14100     return $result if $result eq '-1';
14101     return ord($result);
14102 }
14103
14104 sub Expect($$$$) {
14105     my $expected = shift;
14106     my $ord = shift;
14107     my $regex  = shift;
14108     my $warning_type = shift;   # Type of warning message, like 'deprecated'
14109                                 # or empty if none
14110     my $line   = (caller)[2];
14111
14112     # Convert the non-ASCII code points expressible as characters to their
14113     # ASCII equivalents, and skip the others.
14114     $ord = ASCII_ord_to_native($ord);
14115     if ($ord < 0) {
14116         $Tests++;
14117         print "ok $Tests - "
14118               . sprintf("\"\\x{%04X}\"", $ord)
14119               . " =~ $regex # Skipped: non-ASCII\n";
14120         return;
14121     }
14122
14123     # Convert the code point to hex form
14124     my $string = sprintf "\"\\x{%04X}\"", $ord;
14125
14126     my @tests = "";
14127
14128     # The first time through, use all warnings.  If the input should generate
14129     # a warning, add another time through with them turned off
14130     push @tests, "no warnings '$warning_type';" if $warning_type;
14131
14132     foreach my $no_warnings (@tests) {
14133
14134         # Store any warning messages instead of outputting them
14135         local $SIG{__WARN__} = $SIG{__WARN__};
14136         my $warning_message;
14137         $SIG{__WARN__} = sub { $warning_message = $_[0] };
14138
14139         $Tests++;
14140
14141         # A string eval is needed because of the 'no warnings'.
14142         # Assumes no parens in the regular expression
14143         my $result = eval "$no_warnings
14144                             my \$RegObj = qr($regex);
14145                             $string =~ \$RegObj ? 1 : 0";
14146         if (not defined $result) {
14147             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
14148             $Fails++;
14149         }
14150         elsif ($result ^ $expected) {
14151             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
14152             $Fails++;
14153         }
14154         elsif ($warning_message) {
14155             if (! $warning_type || ($warning_type && $no_warnings)) {
14156                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
14157                 $Fails++;
14158             }
14159             else {
14160                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
14161             }
14162         }
14163         elsif ($warning_type && ! $no_warnings) {
14164             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
14165             $Fails++;
14166         }
14167         else {
14168             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
14169         }
14170     }
14171     return;
14172 }
14173
14174 sub Error($) {
14175     my $regex  = shift;
14176     $Tests++;
14177     if (eval { 'x' =~ qr/$regex/; 1 }) {
14178         $Fails++;
14179         my $line = (caller)[2];
14180         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
14181     }
14182     else {
14183         my $line = (caller)[2];
14184         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
14185     }
14186     return;
14187 }
14188
14189 # GCBTest.txt character that separates grapheme clusters
14190 my $breakable_utf8 = my $breakable = chr(0xF7);
14191 utf8::upgrade($breakable_utf8);
14192
14193 # GCBTest.txt character that indicates that the adjoining code points are part
14194 # of the same grapheme cluster
14195 my $nobreak_utf8 = my $nobreak = chr(0xD7);
14196 utf8::upgrade($nobreak_utf8);
14197
14198 sub Test_X($) {
14199     # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
14200     # Each such line is a sequence of code points given by their hex numbers,
14201     # separated by the two characters defined just before this subroutine that
14202     # indicate that either there can or cannot be a break between the adjacent
14203     # code points.  If there isn't a break, that means the sequence forms an
14204     # extended grapheme cluster, which means that \X should match the whole
14205     # thing.  If there is a break, \X should stop there.  This is all
14206     # converted by this routine into a match:
14207     #   $string =~ /(\X)/,
14208     # Each \X should match the next cluster; and that is what is checked.
14209
14210     my $template = shift;
14211
14212     my $line   = (caller)[2];
14213
14214     # The line contains characters above the ASCII range, but in Latin1.  It
14215     # may or may not be in utf8, and if it is, it may or may not know it.  So,
14216     # convert these characters to 8 bits.  If knows is in utf8, simply
14217     # downgrade.
14218     if (utf8::is_utf8($template)) {
14219         utf8::downgrade($template);
14220     } else {
14221
14222         # Otherwise, if it is in utf8, but doesn't know it, the next lines
14223         # convert the two problematic characters to their 8-bit equivalents.
14224         # If it isn't in utf8, they don't harm anything.
14225         use bytes;
14226         $template =~ s/$nobreak_utf8/$nobreak/g;
14227         $template =~ s/$breakable_utf8/$breakable/g;
14228     }
14229
14230     # Get rid of the leading and trailing breakables
14231     $template =~ s/^ \s* $breakable \s* //x;
14232     $template =~ s/ \s* $breakable \s* $ //x;
14233
14234     # And no-breaks become just a space.
14235     $template =~ s/ \s* $nobreak \s* / /xg;
14236
14237     # Split the input into segments that are breakable between them.
14238     my @segments = split /\s*$breakable\s*/, $template;
14239
14240     my $string = "";
14241     my $display_string = "";
14242     my @should_match;
14243     my @should_display;
14244
14245     # Convert the code point sequence in each segment into a Perl string of
14246     # characters
14247     foreach my $segment (@segments) {
14248         my @code_points = split /\s+/, $segment;
14249         my $this_string = "";
14250         my $this_display = "";
14251         foreach my $code_point (@code_points) {
14252             my $ord = ASCII_ord_to_native(hex $code_point);
14253             if ($ord < 0) {
14254                 $Tests++;
14255                 print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n";
14256                 return;
14257             }
14258             $this_string .= chr $ord;
14259             $this_display .= "\\x{$code_point}";
14260         }
14261
14262         # The next cluster should match the string in this segment.
14263         push @should_match, $this_string;
14264         push @should_display, $this_display;
14265         $string .= $this_string;
14266         $display_string .= $this_display;
14267     }
14268
14269     # If a string can be represented in both non-ut8 and utf8, test both cases
14270     UPGRADE:
14271     for my $to_upgrade (0 .. 1) {
14272
14273         if ($to_upgrade) {
14274
14275             # If already in utf8, would just be a repeat
14276             next UPGRADE if utf8::is_utf8($string);
14277
14278             utf8::upgrade($string);
14279         }
14280
14281         # Finally, do the \X match.
14282         my @matches = $string =~ /(\X)/g;
14283
14284         # Look through each matched cluster to verify that it matches what we
14285         # expect.
14286         my $min = (@matches < @should_match) ? @matches : @should_match;
14287         for my $i (0 .. $min - 1) {
14288             $Tests++;
14289             if ($matches[$i] eq $should_match[$i]) {
14290                 print "ok $Tests - ";
14291                 if ($i == 0) {
14292                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
14293                 } else {
14294                     print "And \\X #", $i + 1,
14295                 }
14296                 print " correctly matched $should_display[$i]; line $line\n";
14297             } else {
14298                 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
14299                                                     unpack("U*", $matches[$i]));
14300                 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
14301                     $i + 1,
14302                     " should have matched $should_display[$i]",
14303                     " but instead matched $matches[$i]",
14304                     ".  Abandoning rest of line $line\n";
14305                 next UPGRADE;
14306             }
14307         }
14308
14309         # And the number of matches should equal the number of expected matches.
14310         $Tests++;
14311         if (@matches == @should_match) {
14312             print "ok $Tests - Nothing was left over; line $line\n";
14313         } else {
14314             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
14315         }
14316     }
14317
14318     return;
14319 }
14320
14321 sub Finished() {
14322     print "1..$Tests\n";
14323     exit($Fails ? -1 : 0);
14324 }
14325
14326 Error('\p{Script=InGreek}');    # Bug #69018
14327 Test_X("1100 $nobreak 1161");  # Bug #70940
14328 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
14329 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
14330 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726