Commit | Line | Data |
---|---|---|
d73e5302 | 1 | #!/usr/bin/perl -w |
99870f4d KW |
2 | |
3 | # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!! | |
4 | # Any files created or read by this program should be listed in 'mktables.lst' | |
5 | # Use -makelist to regenerate it. | |
6 | ||
23e33b60 KW |
7 | # Needs 'no overloading' to run faster on miniperl. Code commented out at the |
8 | # subroutine objaddr can be used instead to work as far back (untested) as | |
f998e60c KW |
9 | # 5.8: needs pack "U". But almost all occurrences of objaddr have been |
10 | # removed in favor of using 'no overloading'. You also would have to go | |
11 | # through and replace occurrences like: | |
ffe43484 | 12 | # my $addr = do { no overloading; pack 'J', $self; } |
f998e60c KW |
13 | # with |
14 | # my $addr = main::objaddr $self; | |
6c68572b | 15 | # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b |
051df77b NC |
16 | # that instituted the change to main::objaddr, and subsequent commits that |
17 | # changed 0+$self to pack 'J', $self.) | |
6c68572b | 18 | |
cdcef19a | 19 | my $start_time; |
98dc9551 | 20 | BEGIN { # Get the time the script started running; do it at compilation to |
cdcef19a KW |
21 | # get it as close as possible |
22 | $start_time= time; | |
23 | } | |
24 | ||
23e33b60 | 25 | require 5.010_001; |
d73e5302 | 26 | use strict; |
99870f4d | 27 | use warnings; |
cf25bb62 | 28 | use Carp; |
bd9ebcfd | 29 | use Config; |
99870f4d KW |
30 | use File::Find; |
31 | use File::Path; | |
d07a55ed | 32 | use File::Spec; |
99870f4d | 33 | use Text::Tabs; |
6b64c11c | 34 | use re "/aa"; |
99870f4d KW |
35 | |
36 | sub DEBUG () { 0 } # Set to 0 for production; 1 for development | |
bd9ebcfd | 37 | my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; |
99870f4d | 38 | |
74cd47d0 KW |
39 | sub NON_ASCII_PLATFORM { ord("A") != 65 } |
40 | ||
99870f4d KW |
41 | ########################################################################## |
42 | # | |
43 | # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), | |
44 | # from the Unicode database files (lib/unicore/.../*.txt), It also generates | |
232ed87f | 45 | # a pod file and .t files, depending on option parameters. |
99870f4d KW |
46 | # |
47 | # The structure of this file is: | |
48 | # First these introductory comments; then | |
49 | # code needed for everywhere, such as debugging stuff; then | |
50 | # code to handle input parameters; then | |
51 | # data structures likely to be of external interest (some of which depend on | |
52 | # the input parameters, so follows them; then | |
53 | # more data structures and subroutine and package (class) definitions; then | |
54 | # the small actual loop to process the input files and finish up; then | |
55 | # a __DATA__ section, for the .t tests | |
56 | # | |
232ed87f KW |
57 | # This program works on all releases of Unicode so far. The outputs have been |
58 | # scrutinized most intently for release 5.1. The others have been checked for | |
59 | # somewhat more than just sanity. It can handle all non-provisional Unicode | |
60 | # character properties in those releases. | |
99870f4d | 61 | # |
99870f4d KW |
62 | # This program is mostly about Unicode character (or code point) properties. |
63 | # A property describes some attribute or quality of a code point, like if it | |
64 | # is lowercase or not, its name, what version of Unicode it was first defined | |
65 | # in, or what its uppercase equivalent is. Unicode deals with these disparate | |
66 | # possibilities by making all properties into mappings from each code point | |
67 | # into some corresponding value. In the case of it being lowercase or not, | |
68 | # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each | |
69 | # property maps each Unicode code point to a single value, called a "property | |
232ed87f KW |
70 | # value". (Some more recently defined properties, map a code point to a set |
71 | # of values.) | |
99870f4d KW |
72 | # |
73 | # When using a property in a regular expression, what is desired isn't the | |
74 | # mapping of the code point to its property's value, but the reverse (or the | |
75 | # mathematical "inverse relation"): starting with the property value, "Does a | |
76 | # code point map to it?" These are written in a "compound" form: | |
77 | # \p{property=value}, e.g., \p{category=punctuation}. This program generates | |
78 | # files containing the lists of code points that map to each such regular | |
79 | # expression property value, one file per list | |
80 | # | |
81 | # There is also a single form shortcut that Perl adds for many of the commonly | |
82 | # used properties. This happens for all binary properties, plus script, | |
83 | # general_category, and block properties. | |
84 | # | |
85 | # Thus the outputs of this program are files. There are map files, mostly in | |
86 | # the 'To' directory; and there are list files for use in regular expression | |
87 | # matching, all in subdirectories of the 'lib' directory, with each | |
88 | # subdirectory being named for the property that the lists in it are for. | |
89 | # Bookkeeping, test, and documentation files are also generated. | |
90 | ||
91 | my $matches_directory = 'lib'; # Where match (\p{}) files go. | |
92 | my $map_directory = 'To'; # Where map files go. | |
93 | ||
94 | # DATA STRUCTURES | |
95 | # | |
96 | # The major data structures of this program are Property, of course, but also | |
97 | # Table. There are two kinds of tables, very similar to each other. | |
98 | # "Match_Table" is the data structure giving the list of code points that have | |
99 | # a particular property value, mentioned above. There is also a "Map_Table" | |
100 | # data structure which gives the property's mapping from code point to value. | |
101 | # There are two structures because the match tables need to be combined in | |
102 | # various ways, such as constructing unions, intersections, complements, etc., | |
103 | # and the map ones don't. And there would be problems, perhaps subtle, if | |
104 | # a map table were inadvertently operated on in some of those ways. | |
105 | # The use of separate classes with operations defined on one but not the other | |
106 | # prevents accidentally confusing the two. | |
107 | # | |
108 | # At the heart of each table's data structure is a "Range_List", which is just | |
109 | # an ordered list of "Ranges", plus ancillary information, and methods to | |
110 | # operate on them. A Range is a compact way to store property information. | |
111 | # Each range has a starting code point, an ending code point, and a value that | |
112 | # is meant to apply to all the code points between the two end points, | |
113 | # inclusive. For a map table, this value is the property value for those | |
114 | # code points. Two such ranges could be written like this: | |
115 | # 0x41 .. 0x5A, 'Upper', | |
116 | # 0x61 .. 0x7A, 'Lower' | |
117 | # | |
118 | # Each range also has a type used as a convenience to classify the values. | |
119 | # Most ranges in this program will be Type 0, or normal, but there are some | |
120 | # ranges that have a non-zero type. These are used only in map tables, and | |
121 | # are for mappings that don't fit into the normal scheme of things. Mappings | |
122 | # that require a hash entry to communicate with utf8.c are one example; | |
123 | # another example is mappings for charnames.pm to use which indicate a name | |
232ed87f | 124 | # that is algorithmically determinable from its code point (and the reverse). |
99870f4d KW |
125 | # These are used to significantly compact these tables, instead of listing |
126 | # each one of the tens of thousands individually. | |
127 | # | |
128 | # In a match table, the value of a range is irrelevant (and hence the type as | |
129 | # well, which will always be 0), and arbitrarily set to the null string. | |
130 | # Using the example above, there would be two match tables for those two | |
131 | # entries, one named Upper would contain the 0x41..0x5A range, and the other | |
132 | # named Lower would contain 0x61..0x7A. | |
133 | # | |
134 | # Actually, there are two types of range lists, "Range_Map" is the one | |
135 | # associated with map tables, and "Range_List" with match tables. | |
232ed87f KW |
136 | # Again, this is so that methods can be defined on one and not the others so |
137 | # as to prevent operating on them in incorrect ways. | |
99870f4d KW |
138 | # |
139 | # Eventually, most tables are written out to files to be read by utf8_heavy.pl | |
140 | # in the perl core. All tables could in theory be written, but some are | |
141 | # suppressed because there is no current practical use for them. It is easy | |
142 | # to change which get written by changing various lists that are near the top | |
143 | # of the actual code in this file. The table data structures contain enough | |
144 | # ancillary information to allow them to be treated as separate entities for | |
145 | # writing, such as the path to each one's file. There is a heading in each | |
146 | # map table that gives the format of its entries, and what the map is for all | |
147 | # the code points missing from it. (This allows tables to be more compact.) | |
678f13d5 | 148 | # |
99870f4d KW |
149 | # The Property data structure contains one or more tables. All properties |
150 | # contain a map table (except the $perl property which is a | |
151 | # pseudo-property containing only match tables), and any properties that | |
152 | # are usable in regular expression matches also contain various matching | |
153 | # tables, one for each value the property can have. A binary property can | |
154 | # have two values, True and False (or Y and N, which are preferred by Unicode | |
155 | # terminology). Thus each of these properties will have a map table that | |
156 | # takes every code point and maps it to Y or N (but having ranges cuts the | |
157 | # number of entries in that table way down), and two match tables, one | |
158 | # which has a list of all the code points that map to Y, and one for all the | |
232ed87f | 159 | # code points that map to N. (For each binary property, a third table is also |
99870f4d | 160 | # generated for the pseudo Perl property. It contains the identical code |
232ed87f KW |
161 | # points as the Y table, but can be written in regular expressions, not in the |
162 | # compound form, but in a "single" form like \p{IsUppercase}.) Many | |
163 | # properties are binary, but some properties have several possible values, | |
164 | # some have many, and properties like Name have a different value for every | |
165 | # named code point. Those will not, unless the controlling lists are changed, | |
166 | # have their match tables written out. But all the ones which can be used in | |
167 | # regular expression \p{} and \P{} constructs will. Prior to 5.14, generally | |
168 | # a property would have either its map table or its match tables written but | |
169 | # not both. Again, what gets written is controlled by lists which can easily | |
170 | # be changed. Starting in 5.14, advantage was taken of this, and all the map | |
171 | # tables needed to reconstruct the Unicode db are now written out, while | |
172 | # suppressing the Unicode .txt files that contain the data. Our tables are | |
173 | # much more compact than the .txt files, so a significant space savings was | |
174 | # achieved. Also, tables are not written out that are trivially derivable | |
175 | # from tables that do get written. So, there typically is no file containing | |
176 | # the code points not matched by a binary property (the table for \P{} versus | |
177 | # lowercase \p{}), since you just need to invert the True table to get the | |
178 | # False table. | |
179 | ||
180 | # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on | |
181 | # how many match tables there are and the content of the maps. This 'Type' is | |
c12f2655 KW |
182 | # different than a range 'Type', so don't get confused by the two concepts |
183 | # having the same name. | |
678f13d5 | 184 | # |
99870f4d KW |
185 | # For information about the Unicode properties, see Unicode's UAX44 document: |
186 | ||
187 | my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; | |
188 | ||
189 | # As stated earlier, this program will work on any release of Unicode so far. | |
190 | # Most obvious problems in earlier data have NOT been corrected except when | |
be864b6c | 191 | # necessary to make Perl or this program work reasonably, and to keep out |
232ed87f KW |
192 | # potential security issues. For example, no folding information was given in |
193 | # early releases, so this program substitutes lower case instead, just so that | |
194 | # a regular expression with the /i option will do something that actually | |
195 | # gives the right results in many cases. There are also a couple other | |
196 | # corrections for version 1.1.5, commented at the point they are made. As an | |
197 | # example of corrections that weren't made (but could be) is this statement | |
198 | # from DerivedAge.txt: "The supplementary private use code points and the | |
199 | # non-character code points were assigned in version 2.0, but not specifically | |
200 | # listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise | |
201 | # it was 3.0.1 not 3.0.0) More information on Unicode version glitches is | |
202 | # further down in these introductory comments. | |
99870f4d | 203 | # |
232ed87f KW |
204 | # This program works on all non-provisional properties as of the current |
205 | # Unicode release, though the files for some are suppressed for various | |
206 | # reasons. You can change which are output by changing lists in this program. | |
678f13d5 | 207 | # |
dc85bd38 | 208 | # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's |
99870f4d KW |
209 | # loose matchings rules (from Unicode TR18): |
210 | # | |
211 | # The recommended names for UCD properties and property values are in | |
212 | # PropertyAliases.txt [Prop] and PropertyValueAliases.txt | |
213 | # [PropValue]. There are both abbreviated names and longer, more | |
214 | # descriptive names. It is strongly recommended that both names be | |
215 | # recognized, and that loose matching of property names be used, | |
216 | # whereby the case distinctions, whitespace, hyphens, and underbar | |
217 | # are ignored. | |
232ed87f | 218 | # |
99870f4d KW |
219 | # The program still allows Fuzzy to override its determination of if loose |
220 | # matching should be used, but it isn't currently used, as it is no longer | |
221 | # needed; the calculations it makes are good enough. | |
678f13d5 | 222 | # |
99870f4d KW |
223 | # SUMMARY OF HOW IT WORKS: |
224 | # | |
225 | # Process arguments | |
226 | # | |
227 | # A list is constructed containing each input file that is to be processed | |
228 | # | |
229 | # Each file on the list is processed in a loop, using the associated handler | |
230 | # code for each: | |
231 | # The PropertyAliases.txt and PropValueAliases.txt files are processed | |
232 | # first. These files name the properties and property values. | |
233 | # Objects are created of all the property and property value names | |
234 | # that the rest of the input should expect, including all synonyms. | |
235 | # The other input files give mappings from properties to property | |
236 | # values. That is, they list code points and say what the mapping | |
237 | # is under the given property. Some files give the mappings for | |
238 | # just one property; and some for many. This program goes through | |
232ed87f KW |
239 | # each file and populates the properties and their map tables from |
240 | # them. Some properties are listed in more than one file, and | |
241 | # Unicode has set up a precedence as to which has priority if there | |
242 | # is a conflict. Thus the order of processing matters, and this | |
243 | # program handles the conflict possibility by processing the | |
244 | # overriding input files last, so that if necessary they replace | |
245 | # earlier values. | |
99870f4d KW |
246 | # After this is all done, the program creates the property mappings not |
247 | # furnished by Unicode, but derivable from what it does give. | |
248 | # The tables of code points that match each property value in each | |
249 | # property that is accessible by regular expressions are created. | |
250 | # The Perl-defined properties are created and populated. Many of these | |
251 | # require data determined from the earlier steps | |
252 | # Any Perl-defined synonyms are created, and name clashes between Perl | |
678f13d5 | 253 | # and Unicode are reconciled and warned about. |
99870f4d KW |
254 | # All the properties are written to files |
255 | # Any other files are written, and final warnings issued. | |
678f13d5 | 256 | # |
99870f4d KW |
257 | # For clarity, a number of operators have been overloaded to work on tables: |
258 | # ~ means invert (take all characters not in the set). The more | |
259 | # conventional '!' is not used because of the possibility of confusing | |
260 | # it with the actual boolean operation. | |
261 | # + means union | |
262 | # - means subtraction | |
263 | # & means intersection | |
264 | # The precedence of these is the order listed. Parentheses should be | |
265 | # copiously used. These are not a general scheme. The operations aren't | |
266 | # defined for a number of things, deliberately, to avoid getting into trouble. | |
267 | # Operations are done on references and affect the underlying structures, so | |
268 | # that the copy constructors for them have been overloaded to not return a new | |
269 | # clone, but the input object itself. | |
678f13d5 | 270 | # |
99870f4d KW |
271 | # The bool operator is deliberately not overloaded to avoid confusion with |
272 | # "should it mean if the object merely exists, or also is non-empty?". | |
99870f4d KW |
273 | # |
274 | # WHY CERTAIN DESIGN DECISIONS WERE MADE | |
678f13d5 KW |
275 | # |
276 | # This program needs to be able to run under miniperl. Therefore, it uses a | |
277 | # minimum of other modules, and hence implements some things itself that could | |
278 | # be gotten from CPAN | |
279 | # | |
280 | # This program uses inputs published by the Unicode Consortium. These can | |
281 | # change incompatibly between releases without the Perl maintainers realizing | |
282 | # it. Therefore this program is now designed to try to flag these. It looks | |
283 | # at the directories where the inputs are, and flags any unrecognized files. | |
284 | # It keeps track of all the properties in the files it handles, and flags any | |
285 | # that it doesn't know how to handle. It also flags any input lines that | |
286 | # don't match the expected syntax, among other checks. | |
287 | # | |
288 | # It is also designed so if a new input file matches one of the known | |
289 | # templates, one hopefully just needs to add it to a list to have it | |
290 | # processed. | |
291 | # | |
292 | # As mentioned earlier, some properties are given in more than one file. In | |
293 | # particular, the files in the extracted directory are supposedly just | |
294 | # reformattings of the others. But they contain information not easily | |
295 | # derivable from the other files, including results for Unihan, which this | |
296 | # program doesn't ordinarily look at, and for unassigned code points. They | |
297 | # also have historically had errors or been incomplete. In an attempt to | |
298 | # create the best possible data, this program thus processes them first to | |
299 | # glean information missing from the other files; then processes those other | |
300 | # files to override any errors in the extracted ones. Much of the design was | |
301 | # driven by this need to store things and then possibly override them. | |
302 | # | |
303 | # It tries to keep fatal errors to a minimum, to generate something usable for | |
304 | # testing purposes. It always looks for files that could be inputs, and will | |
305 | # warn about any that it doesn't know how to handle (the -q option suppresses | |
306 | # the warning). | |
99870f4d | 307 | # |
678f13d5 KW |
308 | # Why is there more than one type of range? |
309 | # This simplified things. There are some very specialized code points that | |
310 | # have to be handled specially for output, such as Hangul syllable names. | |
311 | # By creating a range type (done late in the development process), it | |
312 | # allowed this to be stored with the range, and overridden by other input. | |
313 | # Originally these were stored in another data structure, and it became a | |
314 | # mess trying to decide if a second file that was for the same property was | |
315 | # overriding the earlier one or not. | |
316 | # | |
317 | # Why are there two kinds of tables, match and map? | |
318 | # (And there is a base class shared by the two as well.) As stated above, | |
319 | # they actually are for different things. Development proceeded much more | |
320 | # smoothly when I (khw) realized the distinction. Map tables are used to | |
321 | # give the property value for every code point (actually every code point | |
322 | # that doesn't map to a default value). Match tables are used for regular | |
323 | # expression matches, and are essentially the inverse mapping. Separating | |
324 | # the two allows more specialized methods, and error checks so that one | |
325 | # can't just take the intersection of two map tables, for example, as that | |
326 | # is nonsensical. | |
99870f4d | 327 | # |
232ed87f KW |
328 | # What about 'fate' and 'status'. The concept of a table's fate was created |
329 | # late when it became clear that something more was needed. The difference | |
330 | # between this and 'status' is unclean, and could be improved if someone | |
331 | # wanted to spend the effort. | |
332 | # | |
23e33b60 KW |
333 | # DEBUGGING |
334 | # | |
678f13d5 KW |
335 | # This program is written so it will run under miniperl. Occasionally changes |
336 | # will cause an error where the backtrace doesn't work well under miniperl. | |
337 | # To diagnose the problem, you can instead run it under regular perl, if you | |
338 | # have one compiled. | |
339 | # | |
340 | # There is a good trace facility. To enable it, first sub DEBUG must be set | |
341 | # to return true. Then a line like | |
342 | # | |
343 | # local $to_trace = 1 if main::DEBUG; | |
344 | # | |
232ed87f KW |
345 | # can be added to enable tracing in its lexical scope (plus dynamic) or until |
346 | # you insert another line: | |
678f13d5 KW |
347 | # |
348 | # local $to_trace = 0 if main::DEBUG; | |
349 | # | |
232ed87f | 350 | # To actually trace, use a line like "trace $a, @b, %c, ...; |
678f13d5 KW |
351 | # |
352 | # Some of the more complex subroutines already have trace statements in them. | |
353 | # Permanent trace statements should be like: | |
354 | # | |
355 | # trace ... if main::DEBUG && $to_trace; | |
356 | # | |
357 | # If there is just one or a few files that you're debugging, you can easily | |
358 | # cause most everything else to be skipped. Change the line | |
359 | # | |
360 | # my $debug_skip = 0; | |
361 | # | |
362 | # to 1, and every file whose object is in @input_file_objects and doesn't have | |
232ed87f KW |
363 | # a, 'non_skip => 1,' in its constructor will be skipped. However, skipping |
364 | # Jamo.txt or UnicodeData.txt will likely cause fatal errors. | |
678f13d5 | 365 | # |
b4a0206c | 366 | # To compare the output tables, it may be useful to specify the -annotate |
97a8a595 KW |
367 | # flag. (As of this writing, this can't be done on a clean workspace, due to |
368 | # requirements in Text::Tabs used in this option; so first run mktables | |
369 | # without this option.) This option adds comment lines to each table, one for | |
370 | # each non-algorithmically named character giving, currently its code point, | |
371 | # name, and graphic representation if printable (and you have a font that | |
372 | # knows about it). This makes it easier to see what the particular code | |
373 | # points are in each output table. Non-named code points are annotated with a | |
374 | # description of their status, and contiguous ones with the same description | |
375 | # will be output as a range rather than individually. Algorithmically named | |
376 | # characters are also output as ranges, except when there are just a few | |
377 | # contiguous ones. | |
c4019d52 | 378 | # |
99870f4d KW |
379 | # FUTURE ISSUES |
380 | # | |
381 | # The program would break if Unicode were to change its names so that | |
382 | # interior white space, underscores, or dashes differences were significant | |
383 | # within property and property value names. | |
384 | # | |
385 | # It might be easier to use the xml versions of the UCD if this program ever | |
386 | # would need heavy revision, and the ability to handle old versions was not | |
387 | # required. | |
388 | # | |
389 | # There is the potential for name collisions, in that Perl has chosen names | |
390 | # that Unicode could decide it also likes. There have been such collisions in | |
391 | # the past, with mostly Perl deciding to adopt the Unicode definition of the | |
392 | # name. However in the 5.2 Unicode beta testing, there were a number of such | |
393 | # collisions, which were withdrawn before the final release, because of Perl's | |
394 | # and other's protests. These all involved new properties which began with | |
395 | # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also, | |
396 | # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a | |
397 | # Unicode document, so they are unlikely to be used by Unicode for another | |
398 | # purpose. However, they might try something beginning with 'In', or use any | |
399 | # of the other Perl-defined properties. This program will warn you of name | |
400 | # collisions, and refuse to generate tables with them, but manual intervention | |
401 | # will be required in this event. One scheme that could be implemented, if | |
402 | # necessary, would be to have this program generate another file, or add a | |
403 | # field to mktables.lst that gives the date of first definition of a property. | |
404 | # Each new release of Unicode would use that file as a basis for the next | |
405 | # iteration. And the Perl synonym addition code could sort based on the age | |
406 | # of the property, so older properties get priority, and newer ones that clash | |
407 | # would be refused; hence existing code would not be impacted, and some other | |
408 | # synonym would have to be used for the new property. This is ugly, and | |
409 | # manual intervention would certainly be easier to do in the short run; lets | |
410 | # hope it never comes to this. | |
678f13d5 | 411 | # |
99870f4d KW |
412 | # A NOTE ON UNIHAN |
413 | # | |
414 | # This program can generate tables from the Unihan database. But it doesn't | |
415 | # by default, letting the CPAN module Unicode::Unihan handle them. Prior to | |
416 | # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the | |
417 | # database was split into 8 different files, all beginning with the letters | |
418 | # 'Unihan'. This program will read those file(s) if present, but it needs to | |
419 | # know which of the many properties in the file(s) should have tables created | |
420 | # for them. It will create tables for any properties listed in | |
421 | # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the | |
422 | # @cjk_properties array and the @cjk_property_values array. Thus, if a | |
423 | # property you want is not in those files of the release you are building | |
424 | # against, you must add it to those two arrays. Starting in 4.0, the | |
425 | # Unicode_Radical_Stroke was listed in those files, so if the Unihan database | |
426 | # is present in the directory, a table will be generated for that property. | |
427 | # In 5.2, several more properties were added. For your convenience, the two | |
5f7264c7 | 428 | # arrays are initialized with all the 6.0 listed properties that are also in |
99870f4d KW |
429 | # earlier releases. But these are commented out. You can just uncomment the |
430 | # ones you want, or use them as a template for adding entries for other | |
431 | # properties. | |
432 | # | |
433 | # You may need to adjust the entries to suit your purposes. setup_unihan(), | |
434 | # and filter_unihan_line() are the functions where this is done. This program | |
435 | # already does some adjusting to make the lines look more like the rest of the | |
436 | # Unicode DB; You can see what that is in filter_unihan_line() | |
437 | # | |
438 | # There is a bug in the 3.2 data file in which some values for the | |
439 | # kPrimaryNumeric property have commas and an unexpected comment. A filter | |
440 | # could be added for these; or for a particular installation, the Unihan.txt | |
441 | # file could be edited to fix them. | |
99870f4d | 442 | # |
678f13d5 KW |
443 | # HOW TO ADD A FILE TO BE PROCESSED |
444 | # | |
445 | # A new file from Unicode needs to have an object constructed for it in | |
446 | # @input_file_objects, probably at the end or at the end of the extracted | |
447 | # ones. The program should warn you if its name will clash with others on | |
448 | # restrictive file systems, like DOS. If so, figure out a better name, and | |
449 | # add lines to the README.perl file giving that. If the file is a character | |
232ed87f | 450 | # property, it should be in the format that Unicode has implicitly |
678f13d5 KW |
451 | # standardized for such files for the more recently introduced ones. |
452 | # If so, the Input_file constructor for @input_file_objects can just be the | |
453 | # file name and release it first appeared in. If not, then it should be | |
454 | # possible to construct an each_line_handler() to massage the line into the | |
455 | # standardized form. | |
456 | # | |
457 | # For non-character properties, more code will be needed. You can look at | |
458 | # the existing entries for clues. | |
459 | # | |
460 | # UNICODE VERSIONS NOTES | |
461 | # | |
462 | # The Unicode UCD has had a number of errors in it over the versions. And | |
463 | # these remain, by policy, in the standard for that version. Therefore it is | |
464 | # risky to correct them, because code may be expecting the error. So this | |
465 | # program doesn't generally make changes, unless the error breaks the Perl | |
466 | # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value | |
467 | # for U+1105, which causes real problems for the algorithms for Jamo | |
468 | # calculations, so it is changed here. | |
469 | # | |
470 | # But it isn't so clear cut as to what to do about concepts that are | |
471 | # introduced in a later release; should they extend back to earlier releases | |
472 | # where the concept just didn't exist? It was easier to do this than to not, | |
473 | # so that's what was done. For example, the default value for code points not | |
474 | # in the files for various properties was probably undefined until changed by | |
475 | # some version. No_Block for blocks is such an example. This program will | |
476 | # assign No_Block even in Unicode versions that didn't have it. This has the | |
477 | # benefit that code being written doesn't have to special case earlier | |
478 | # versions; and the detriment that it doesn't match the Standard precisely for | |
479 | # the affected versions. | |
480 | # | |
481 | # Here are some observations about some of the issues in early versions: | |
482 | # | |
232ed87f KW |
483 | # Prior to version 3.0, there were 3 character decompositions. These are not |
484 | # handled by Unicode::Normalize, nor will it compile when presented a version | |
485 | # that has them. However, you can trivially get it to compile by simply | |
486 | # ignoring those decompositions, by changing the croak to a carp. At the time | |
487 | # of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads | |
488 | # | |
489 | # croak("Weird Canonical Decomposition of U+$h"); | |
490 | # | |
28807e1d KW |
491 | # Simply comment it out. It will compile, but will not know about any three |
492 | # character decompositions. If using the .pm version, there is a similar | |
493 | # line. | |
232ed87f KW |
494 | |
495 | # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out | |
496 | # that the reason is that the CJK block starting at 4E00 was removed from | |
497 | # PropList, and was not put back in until 3.1.0. The Perl extension (the | |
498 | # single property name \p{alpha}) has the correct values. But the compound | |
499 | # form is simply not generated until 3.1, as it can be argued that prior to | |
500 | # this release, this was not an official property. The comments for | |
501 | # filter_old_style_proplist() give more details. | |
678f13d5 KW |
502 | # |
503 | # Unicode introduced the synonym Space for White_Space in 4.1. Perl has | |
504 | # always had a \p{Space}. In release 3.2 only, they are not synonymous. The | |
505 | # reason is that 3.2 introduced U+205F=medium math space, which was not | |
506 | # classed as white space, but Perl figured out that it should have been. 4.0 | |
507 | # reclassified it correctly. | |
508 | # | |
509 | # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 | |
232ed87f KW |
510 | # this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB |
511 | # became 202, and ATBL was left with no code points, as all the ones that | |
512 | # mapped to 202 stayed mapped to 202. Thus if your program used the numeric | |
513 | # name for the class, it would not have been affected, but if it used the | |
514 | # mnemonic, it would have been. | |
678f13d5 KW |
515 | # |
516 | # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code | |
517 | # points which eventually came to have this script property value, instead | |
518 | # mapped to "Unknown". But in the next release all these code points were | |
519 | # moved to \p{sc=common} instead. | |
99870f4d KW |
520 | # |
521 | # The default for missing code points for BidiClass is complicated. Starting | |
522 | # in 3.1.1, the derived file DBidiClass.txt handles this, but this program | |
523 | # tries to do the best it can for earlier releases. It is done in | |
524 | # process_PropertyAliases() | |
525 | # | |
232ed87f KW |
526 | # In version 2.1.2, the entry in UnicodeData.txt: |
527 | # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F; | |
528 | # should instead be | |
529 | # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F | |
530 | # Without this change, there are casing problems for this character. | |
531 | # | |
7803ad2d KW |
532 | # Search for $string_compare_versions to see how to compare changes to |
533 | # properties between Unicode versions | |
534 | # | |
99870f4d KW |
535 | ############################################################################## |
536 | ||
537 | my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing | |
538 | # and errors | |
539 | my $MAX_LINE_WIDTH = 78; | |
540 | ||
541 | # Debugging aid to skip most files so as to not be distracted by them when | |
542 | # concentrating on the ones being debugged. Add | |
543 | # non_skip => 1, | |
544 | # to the constructor for those files you want processed when you set this. | |
545 | # Files with a first version number of 0 are special: they are always | |
c12f2655 KW |
546 | # processed regardless of the state of this flag. Generally, Jamo.txt and |
547 | # UnicodeData.txt must not be skipped if you want this program to not die | |
548 | # before normal completion. | |
99870f4d KW |
549 | my $debug_skip = 0; |
550 | ||
e9c4b4f8 KW |
551 | |
552 | # Normally these are suppressed. | |
553 | my $write_Unicode_deprecated_tables = 0; | |
554 | ||
99870f4d KW |
555 | # Set to 1 to enable tracing. |
556 | our $to_trace = 0; | |
557 | ||
558 | { # Closure for trace: debugging aid | |
559 | my $print_caller = 1; # ? Include calling subroutine name | |
560 | my $main_with_colon = 'main::'; | |
561 | my $main_colon_length = length($main_with_colon); | |
562 | ||
563 | sub trace { | |
564 | return unless $to_trace; # Do nothing if global flag not set | |
565 | ||
566 | my @input = @_; | |
567 | ||
568 | local $DB::trace = 0; | |
569 | $DB::trace = 0; # Quiet 'used only once' message | |
570 | ||
571 | my $line_number; | |
572 | ||
573 | # Loop looking up the stack to get the first non-trace caller | |
574 | my $caller_line; | |
575 | my $caller_name; | |
576 | my $i = 0; | |
577 | do { | |
578 | $line_number = $caller_line; | |
579 | (my $pkg, my $file, $caller_line, my $caller) = caller $i++; | |
580 | $caller = $main_with_colon unless defined $caller; | |
581 | ||
582 | $caller_name = $caller; | |
583 | ||
584 | # get rid of pkg | |
585 | $caller_name =~ s/.*:://; | |
586 | if (substr($caller_name, 0, $main_colon_length) | |
587 | eq $main_with_colon) | |
588 | { | |
589 | $caller_name = substr($caller_name, $main_colon_length); | |
590 | } | |
591 | ||
592 | } until ($caller_name ne 'trace'); | |
593 | ||
594 | # If the stack was empty, we were called from the top level | |
595 | $caller_name = 'main' if ($caller_name eq "" | |
596 | || $caller_name eq 'trace'); | |
597 | ||
598 | my $output = ""; | |
9346f59a | 599 | #print STDERR __LINE__, ": ", join ", ", @input, "\n"; |
99870f4d | 600 | foreach my $string (@input) { |
99870f4d KW |
601 | if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { |
602 | $output .= simple_dumper($string); | |
603 | } | |
604 | else { | |
605 | $string = "$string" if ref $string; | |
606 | $string = $UNDEF unless defined $string; | |
607 | chomp $string; | |
608 | $string = '""' if $string eq ""; | |
609 | $output .= " " if $output ne "" | |
610 | && $string ne "" | |
611 | && substr($output, -1, 1) ne " " | |
612 | && substr($string, 0, 1) ne " "; | |
613 | $output .= $string; | |
614 | } | |
615 | } | |
616 | ||
99f78760 KW |
617 | print STDERR sprintf "%4d: ", $line_number if defined $line_number; |
618 | print STDERR "$caller_name: " if $print_caller; | |
99870f4d KW |
619 | print STDERR $output, "\n"; |
620 | return; | |
621 | } | |
622 | } | |
623 | ||
624 | # This is for a rarely used development feature that allows you to compare two | |
625 | # versions of the Unicode standard without having to deal with changes caused | |
c12f2655 KW |
626 | # by the code points introduced in the later version. Change the 0 to a |
627 | # string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only | |
628 | # code points introduced in that release and earlier will be used; later ones | |
629 | # are thrown away. You use the version number of the earliest one you want to | |
630 | # compare; then run this program on directory structures containing each | |
631 | # release, and compare the outputs. These outputs will therefore include only | |
632 | # the code points common to both releases, and you can see the changes caused | |
633 | # just by the underlying release semantic changes. For versions earlier than | |
634 | # 3.2, you must copy a version of DAge.txt into the directory. | |
635 | my $string_compare_versions = DEBUG && 0; # e.g., "2.1"; | |
99870f4d KW |
636 | my $compare_versions = DEBUG |
637 | && $string_compare_versions | |
638 | && pack "C*", split /\./, $string_compare_versions; | |
639 | ||
640 | sub uniques { | |
641 | # Returns non-duplicated input values. From "Perl Best Practices: | |
642 | # Encapsulated Cleverness". p. 455 in first edition. | |
643 | ||
644 | my %seen; | |
0e407844 NC |
645 | # Arguably this breaks encapsulation, if the goal is to permit multiple |
646 | # distinct objects to stringify to the same value, and be interchangeable. | |
647 | # However, for this program, no two objects stringify identically, and all | |
648 | # lists passed to this function are either objects or strings. So this | |
649 | # doesn't affect correctness, but it does give a couple of percent speedup. | |
650 | no overloading; | |
99870f4d KW |
651 | return grep { ! $seen{$_}++ } @_; |
652 | } | |
653 | ||
654 | $0 = File::Spec->canonpath($0); | |
655 | ||
656 | my $make_test_script = 0; # ? Should we output a test script | |
6b5ab373 | 657 | my $make_norm_test_script = 0; # ? Should we output a normalization test script |
99870f4d KW |
658 | my $write_unchanged_files = 0; # ? Should we update the output files even if |
659 | # we don't think they have changed | |
660 | my $use_directory = ""; # ? Should we chdir somewhere. | |
661 | my $pod_directory; # input directory to store the pod file. | |
662 | my $pod_file = 'perluniprops'; | |
663 | my $t_path; # Path to the .t test file | |
664 | my $file_list = 'mktables.lst'; # File to store input and output file names. | |
665 | # This is used to speed up the build, by not | |
666 | # executing the main body of the program if | |
667 | # nothing on the list has changed since the | |
668 | # previous build | |
669 | my $make_list = 1; # ? Should we write $file_list. Set to always | |
670 | # make a list so that when the pumpking is | |
671 | # preparing a release, s/he won't have to do | |
672 | # special things | |
673 | my $glob_list = 0; # ? Should we try to include unknown .txt files | |
674 | # in the input. | |
bd9ebcfd KW |
675 | my $output_range_counts = $debugging_build; # ? Should we include the number |
676 | # of code points in ranges in | |
677 | # the output | |
558712cf | 678 | my $annotate = 0; # ? Should character names be in the output |
9ef2b94f | 679 | |
99870f4d KW |
680 | # Verbosity levels; 0 is quiet |
681 | my $NORMAL_VERBOSITY = 1; | |
682 | my $PROGRESS = 2; | |
683 | my $VERBOSE = 3; | |
684 | ||
685 | my $verbosity = $NORMAL_VERBOSITY; | |
686 | ||
0458fbc1 KW |
687 | # Stored in mktables.lst so that if this program is called with different |
688 | # options, will regenerate even if the files otherwise look like they're | |
689 | # up-to-date. | |
690 | my $command_line_arguments = join " ", @ARGV; | |
691 | ||
99870f4d KW |
692 | # Process arguments |
693 | while (@ARGV) { | |
cf25bb62 JH |
694 | my $arg = shift @ARGV; |
695 | if ($arg eq '-v') { | |
99870f4d KW |
696 | $verbosity = $VERBOSE; |
697 | } | |
698 | elsif ($arg eq '-p') { | |
699 | $verbosity = $PROGRESS; | |
700 | $| = 1; # Flush buffers as we go. | |
701 | } | |
702 | elsif ($arg eq '-q') { | |
703 | $verbosity = 0; | |
704 | } | |
705 | elsif ($arg eq '-w') { | |
706 | $write_unchanged_files = 1; # update the files even if havent changed | |
707 | } | |
708 | elsif ($arg eq '-check') { | |
6ae7e459 YO |
709 | my $this = shift @ARGV; |
710 | my $ok = shift @ARGV; | |
711 | if ($this ne $ok) { | |
712 | print "Skipping as check params are not the same.\n"; | |
713 | exit(0); | |
714 | } | |
00a8df5c | 715 | } |
99870f4d KW |
716 | elsif ($arg eq '-P' && defined ($pod_directory = shift)) { |
717 | -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; | |
718 | } | |
3df51b85 KW |
719 | elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) |
720 | { | |
99870f4d | 721 | $make_test_script = 1; |
99870f4d | 722 | } |
6b5ab373 KW |
723 | elsif ($arg eq '-makenormtest') |
724 | { | |
725 | $make_norm_test_script = 1; | |
726 | } | |
99870f4d KW |
727 | elsif ($arg eq '-makelist') { |
728 | $make_list = 1; | |
729 | } | |
730 | elsif ($arg eq '-C' && defined ($use_directory = shift)) { | |
731 | -d $use_directory or croak "Unknown directory '$use_directory'"; | |
732 | } | |
733 | elsif ($arg eq '-L') { | |
734 | ||
735 | # Existence not tested until have chdir'd | |
736 | $file_list = shift; | |
737 | } | |
738 | elsif ($arg eq '-globlist') { | |
739 | $glob_list = 1; | |
740 | } | |
741 | elsif ($arg eq '-c') { | |
742 | $output_range_counts = ! $output_range_counts | |
743 | } | |
b4a0206c | 744 | elsif ($arg eq '-annotate') { |
558712cf | 745 | $annotate = 1; |
bd9ebcfd KW |
746 | $debugging_build = 1; |
747 | $output_range_counts = 1; | |
9ef2b94f | 748 | } |
99870f4d KW |
749 | else { |
750 | my $with_c = 'with'; | |
751 | $with_c .= 'out' if $output_range_counts; # Complements the state | |
752 | croak <<END; | |
753 | usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] | |
754 | [ -T test_file_path ] [-globlist] [-makelist] [-maketest] | |
755 | [-check A B ] | |
756 | -c : Output comments $with_c number of code points in ranges | |
757 | -q : Quiet Mode: Only output serious warnings. | |
758 | -p : Set verbosity level to normal plus show progress. | |
759 | -v : Set Verbosity level high: Show progress and non-serious | |
760 | warnings | |
761 | -w : Write files regardless | |
762 | -C dir : Change to this directory before proceeding. All relative paths | |
763 | except those specified by the -P and -T options will be done | |
764 | with respect to this directory. | |
765 | -P dir : Output $pod_file file to directory 'dir'. | |
3df51b85 | 766 | -T path : Create a test script as 'path'; overrides -maketest |
99870f4d KW |
767 | -L filelist : Use alternate 'filelist' instead of standard one |
768 | -globlist : Take as input all non-Test *.txt files in current and sub | |
769 | directories | |
3df51b85 KW |
770 | -maketest : Make test script 'TestProp.pl' in current (or -C directory), |
771 | overrides -T | |
99870f4d | 772 | -makelist : Rewrite the file list $file_list based on current setup |
b4a0206c | 773 | -annotate : Output an annotation for each character in the table files; |
97a8a595 KW |
774 | useful for debugging mktables, looking at diffs; but is slow |
775 | and memory intensive | |
99870f4d KW |
776 | -check A B : Executes $0 only if A and B are the same |
777 | END | |
778 | } | |
779 | } | |
780 | ||
781 | # Stores the most-recently changed file. If none have changed, can skip the | |
782 | # build | |
aeab6150 | 783 | my $most_recent = (stat $0)[9]; # Do this before the chdir! |
99870f4d KW |
784 | |
785 | # Change directories now, because need to read 'version' early. | |
786 | if ($use_directory) { | |
3df51b85 | 787 | if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { |
99870f4d KW |
788 | $pod_directory = File::Spec->rel2abs($pod_directory); |
789 | } | |
3df51b85 | 790 | if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { |
99870f4d | 791 | $t_path = File::Spec->rel2abs($t_path); |
00a8df5c | 792 | } |
99870f4d | 793 | chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; |
3df51b85 | 794 | if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { |
99870f4d | 795 | $pod_directory = File::Spec->abs2rel($pod_directory); |
02b1aeec | 796 | } |
3df51b85 | 797 | if ($t_path && File::Spec->file_name_is_absolute($t_path)) { |
99870f4d | 798 | $t_path = File::Spec->abs2rel($t_path); |
02b1aeec | 799 | } |
00a8df5c YO |
800 | } |
801 | ||
99870f4d KW |
802 | # Get Unicode version into regular and v-string. This is done now because |
803 | # various tables below get populated based on it. These tables are populated | |
804 | # here to be near the top of the file, and so easily seeable by those needing | |
805 | # to modify things. | |
806 | open my $VERSION, "<", "version" | |
807 | or croak "$0: can't open required file 'version': $!\n"; | |
808 | my $string_version = <$VERSION>; | |
809 | close $VERSION; | |
810 | chomp $string_version; | |
811 | my $v_version = pack "C*", split /\./, $string_version; # v string | |
812 | ||
813 | # The following are the complete names of properties with property values that | |
814 | # are known to not match any code points in some versions of Unicode, but that | |
815 | # may change in the future so they should be matchable, hence an empty file is | |
816 | # generated for them. | |
a9c9e371 KW |
817 | my @tables_that_may_be_empty; |
818 | push @tables_that_may_be_empty, 'Joining_Type=Left_Joining' | |
819 | if $v_version lt v6.3.0; | |
99870f4d KW |
820 | push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; |
821 | push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; | |
822 | push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' | |
823 | if $v_version ge v4.1.0; | |
82aed44a KW |
824 | push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' |
825 | if $v_version ge v6.0.0; | |
f583b44c KW |
826 | push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' |
827 | if $v_version ge v6.1.0; | |
1e958ea9 KW |
828 | push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' |
829 | if $v_version ge v6.2.0; | |
99870f4d KW |
830 | |
831 | # The lists below are hashes, so the key is the item in the list, and the | |
832 | # value is the reason why it is in the list. This makes generation of | |
833 | # documentation easier. | |
834 | ||
835 | my %why_suppressed; # No file generated for these. | |
836 | ||
837 | # Files aren't generated for empty extraneous properties. This is arguable. | |
838 | # Extraneous properties generally come about because a property is no longer | |
839 | # used in a newer version of Unicode. If we generated a file without code | |
840 | # points, programs that used to work on that property will still execute | |
841 | # without errors. It just won't ever match (or will always match, with \P{}). | |
842 | # This means that the logic is now likely wrong. I (khw) think its better to | |
843 | # find this out by getting an error message. Just move them to the table | |
844 | # above to change this behavior | |
845 | my %why_suppress_if_empty_warn_if_not = ( | |
846 | ||
847 | # It is the only property that has ever officially been removed from the | |
848 | # Standard. The database never contained any code points for it. | |
849 | 'Special_Case_Condition' => 'Obsolete', | |
850 | ||
851 | # Apparently never official, but there were code points in some versions of | |
852 | # old-style PropList.txt | |
853 | 'Non_Break' => 'Obsolete', | |
854 | ); | |
855 | ||
856 | # These would normally go in the warn table just above, but they were changed | |
857 | # a long time before this program was written, so warnings about them are | |
858 | # moot. | |
859 | if ($v_version gt v3.2.0) { | |
860 | push @tables_that_may_be_empty, | |
861 | 'Canonical_Combining_Class=Attached_Below_Left' | |
862 | } | |
863 | ||
5f7264c7 | 864 | # These are listed in the Property aliases file in 6.0, but Unihan is ignored |
99870f4d KW |
865 | # unless explicitly added. |
866 | if ($v_version ge v5.2.0) { | |
867 | my $unihan = 'Unihan; remove from list if using Unihan'; | |
ea25a9b2 | 868 | foreach my $table (qw ( |
99870f4d KW |
869 | kAccountingNumeric |
870 | kOtherNumeric | |
871 | kPrimaryNumeric | |
872 | kCompatibilityVariant | |
873 | kIICore | |
874 | kIRG_GSource | |
875 | kIRG_HSource | |
876 | kIRG_JSource | |
877 | kIRG_KPSource | |
878 | kIRG_MSource | |
879 | kIRG_KSource | |
880 | kIRG_TSource | |
881 | kIRG_USource | |
882 | kIRG_VSource | |
883 | kRSUnicode | |
ea25a9b2 | 884 | )) |
99870f4d KW |
885 | { |
886 | $why_suppress_if_empty_warn_if_not{$table} = $unihan; | |
887 | } | |
ca12659b NC |
888 | } |
889 | ||
272501f6 KW |
890 | # Enum values for to_output_map() method in the Map_Table package. |
891 | my $EXTERNAL_MAP = 1; | |
892 | my $INTERNAL_MAP = 2; | |
ce712c88 | 893 | my $OUTPUT_ADJUSTED = 3; |
272501f6 | 894 | |
fcf1973c KW |
895 | # To override computed values for writing the map tables for these properties. |
896 | # The default for enum map tables is to write them out, so that the Unicode | |
897 | # .txt files can be removed, but all the data to compute any property value | |
898 | # for any code point is available in a more compact form. | |
899 | my %global_to_output_map = ( | |
900 | # Needed by UCD.pm, but don't want to publicize that it exists, so won't | |
c12f2655 KW |
901 | # get stuck supporting it if things change. Since it is a STRING |
902 | # property, it normally would be listed in the pod, but INTERNAL_MAP | |
903 | # suppresses that. | |
fcf1973c KW |
904 | Unicode_1_Name => $INTERNAL_MAP, |
905 | ||
906 | Present_In => 0, # Suppress, as easily computed from Age | |
74cd47d0 KW |
907 | Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is |
908 | # retained, but needed for | |
909 | # non-ASCII | |
53d34b6c KW |
910 | |
911 | # Suppress, as mapping can be found instead from the | |
912 | # Perl_Decomposition_Mapping file | |
913 | Decomposition_Type => 0, | |
fcf1973c KW |
914 | ); |
915 | ||
99870f4d | 916 | # Properties that this program ignores. |
230e0c16 KW |
917 | my @unimplemented_properties; |
918 | ||
919 | # With this release, it is automatically handled if the Unihan db is | |
920 | # downloaded | |
431c1f00 | 921 | push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version lt v5.2.0; |
d73e5302 | 922 | |
99870f4d KW |
923 | # There are several types of obsolete properties defined by Unicode. These |
924 | # must be hand-edited for every new Unicode release. | |
925 | my %why_deprecated; # Generates a deprecated warning message if used. | |
926 | my %why_stabilized; # Documentation only | |
927 | my %why_obsolete; # Documentation only | |
928 | ||
929 | { # Closure | |
8364f9eb | 930 | my $simple = 'Perl uses the more complete version'; |
99870f4d KW |
931 | my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan'; |
932 | ||
933 | my $other_properties = 'other properties'; | |
934 | my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; | |
5d294d41 | 935 | my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; |
99870f4d KW |
936 | |
937 | %why_deprecated = ( | |
5f7264c7 | 938 | 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)', |
99870f4d KW |
939 | 'Jamo_Short_Name' => $contributory, |
940 | 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', | |
941 | 'Other_Alphabetic' => $contributory, | |
942 | 'Other_Default_Ignorable_Code_Point' => $contributory, | |
943 | 'Other_Grapheme_Extend' => $contributory, | |
944 | 'Other_ID_Continue' => $contributory, | |
945 | 'Other_ID_Start' => $contributory, | |
946 | 'Other_Lowercase' => $contributory, | |
947 | 'Other_Math' => $contributory, | |
948 | 'Other_Uppercase' => $contributory, | |
e22aaf5c KW |
949 | 'Expands_On_NFC' => $why_no_expand, |
950 | 'Expands_On_NFD' => $why_no_expand, | |
951 | 'Expands_On_NFKC' => $why_no_expand, | |
952 | 'Expands_On_NFKD' => $why_no_expand, | |
99870f4d KW |
953 | ); |
954 | ||
955 | %why_suppressed = ( | |
5f7264c7 | 956 | # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which |
99870f4d KW |
957 | # contains the same information, but without the algorithmically |
958 | # determinable Hangul syllables'. This file is not published, so it's | |
959 | # existence is not noted in the comment. | |
12fee290 | 960 | 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::', |
99870f4d | 961 | |
ac71d2a0 | 962 | 'Indic_Matra_Category' => "Withdrawn by Unicode while still provisional", |
3111abc0 | 963 | |
5f8d1a89 KW |
964 | # Don't suppress ISO_Comment, as otherwise special handling is needed |
965 | # to differentiate between it and gc=c, which can be written as 'isc', | |
966 | # which is the same characters as ISO_Comment's short name. | |
99870f4d | 967 | |
12fee290 | 968 | 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::", |
e0b29447 | 969 | |
12fee290 KW |
970 | 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD", |
971 | 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", | |
972 | 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", | |
973 | 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", | |
99870f4d | 974 | |
dac6f618 | 975 | FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful', |
99870f4d KW |
976 | ); |
977 | ||
1704a0ea KW |
978 | foreach my $property ( |
979 | ||
980 | # The following are suppressed because they were made contributory | |
981 | # or deprecated by Unicode before Perl ever thought about | |
982 | # supporting them. | |
983 | 'Jamo_Short_Name', | |
984 | 'Grapheme_Link', | |
985 | 'Expands_On_NFC', | |
986 | 'Expands_On_NFD', | |
987 | 'Expands_On_NFKC', | |
988 | 'Expands_On_NFKD', | |
989 | ||
990 | # The following are suppressed because they have been marked | |
991 | # as deprecated for a sufficient amount of time | |
992 | 'Other_Alphabetic', | |
993 | 'Other_Default_Ignorable_Code_Point', | |
994 | 'Other_Grapheme_Extend', | |
995 | 'Other_ID_Continue', | |
996 | 'Other_ID_Start', | |
997 | 'Other_Lowercase', | |
998 | 'Other_Math', | |
999 | 'Other_Uppercase', | |
e22aaf5c | 1000 | ) { |
99870f4d KW |
1001 | $why_suppressed{$property} = $why_deprecated{$property}; |
1002 | } | |
cf25bb62 | 1003 | |
99870f4d KW |
1004 | # Customize the message for all the 'Other_' properties |
1005 | foreach my $property (keys %why_deprecated) { | |
1006 | next if (my $main_property = $property) !~ s/^Other_//; | |
1007 | $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; | |
1008 | } | |
1009 | } | |
1010 | ||
e9c4b4f8 KW |
1011 | if ($write_Unicode_deprecated_tables) { |
1012 | foreach my $property (keys %why_suppressed) { | |
1013 | delete $why_suppressed{$property} if $property =~ | |
1014 | / ^ Other | Grapheme /x; | |
1015 | } | |
1016 | } | |
1017 | ||
99870f4d KW |
1018 | if ($v_version ge 4.0.0) { |
1019 | $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; | |
5f7264c7 KW |
1020 | if ($v_version ge 6.0.0) { |
1021 | $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14'; | |
1022 | } | |
99870f4d | 1023 | } |
5f7264c7 | 1024 | if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { |
99870f4d | 1025 | $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; |
5f7264c7 | 1026 | if ($v_version ge 6.0.0) { |
63f74647 | 1027 | $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; |
5f7264c7 | 1028 | } |
99870f4d KW |
1029 | } |
1030 | ||
1031 | # Probably obsolete forever | |
1032 | if ($v_version ge v4.1.0) { | |
82aed44a KW |
1033 | $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; |
1034 | } | |
1035 | if ($v_version ge v6.0.0) { | |
caa75395 | 1036 | $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)'; |
2b352efd | 1037 | $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; |
99870f4d KW |
1038 | } |
1039 | ||
1040 | # This program can create files for enumerated-like properties, such as | |
1041 | # 'Numeric_Type'. This file would be the same format as for a string | |
1042 | # property, with a mapping from code point to its value, so you could look up, | |
1043 | # for example, the script a code point is in. But no one so far wants this | |
1044 | # mapping, or they have found another way to get it since this is a new | |
1045 | # feature. So no file is generated except if it is in this list. | |
1046 | my @output_mapped_properties = split "\n", <<END; | |
1047 | END | |
1048 | ||
c12f2655 KW |
1049 | # If you are using the Unihan database in a Unicode version before 5.2, you |
1050 | # need to add the properties that you want to extract from it to this table. | |
1051 | # For your convenience, the properties in the 6.0 PropertyAliases.txt file are | |
1052 | # listed, commented out | |
99870f4d KW |
1053 | my @cjk_properties = split "\n", <<'END'; |
1054 | #cjkAccountingNumeric; kAccountingNumeric | |
1055 | #cjkOtherNumeric; kOtherNumeric | |
1056 | #cjkPrimaryNumeric; kPrimaryNumeric | |
1057 | #cjkCompatibilityVariant; kCompatibilityVariant | |
1058 | #cjkIICore ; kIICore | |
1059 | #cjkIRG_GSource; kIRG_GSource | |
1060 | #cjkIRG_HSource; kIRG_HSource | |
1061 | #cjkIRG_JSource; kIRG_JSource | |
1062 | #cjkIRG_KPSource; kIRG_KPSource | |
1063 | #cjkIRG_KSource; kIRG_KSource | |
1064 | #cjkIRG_TSource; kIRG_TSource | |
1065 | #cjkIRG_USource; kIRG_USource | |
1066 | #cjkIRG_VSource; kIRG_VSource | |
1067 | #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS | |
1068 | END | |
1069 | ||
1070 | # Similarly for the property values. For your convenience, the lines in the | |
5f7264c7 | 1071 | # 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both |
c12f2655 | 1072 | # '#' marks (for Unicode versions before 5.2) |
99870f4d KW |
1073 | my @cjk_property_values = split "\n", <<'END'; |
1074 | ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN | |
1075 | ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> | |
1076 | ## @missing: 0000..10FFFF; cjkIICore; <none> | |
1077 | ## @missing: 0000..10FFFF; cjkIRG_GSource; <none> | |
1078 | ## @missing: 0000..10FFFF; cjkIRG_HSource; <none> | |
1079 | ## @missing: 0000..10FFFF; cjkIRG_JSource; <none> | |
1080 | ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none> | |
1081 | ## @missing: 0000..10FFFF; cjkIRG_KSource; <none> | |
1082 | ## @missing: 0000..10FFFF; cjkIRG_TSource; <none> | |
1083 | ## @missing: 0000..10FFFF; cjkIRG_USource; <none> | |
1084 | ## @missing: 0000..10FFFF; cjkIRG_VSource; <none> | |
1085 | ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN | |
1086 | ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN | |
1087 | ## @missing: 0000..10FFFF; cjkRSUnicode; <none> | |
1088 | END | |
1089 | ||
1090 | # The input files don't list every code point. Those not listed are to be | |
1091 | # defaulted to some value. Below are hard-coded what those values are for | |
1092 | # non-binary properties as of 5.1. Starting in 5.0, there are | |
caa75395 | 1093 | # machine-parsable comment lines in the files that give the defaults; so this |
99870f4d KW |
1094 | # list shouldn't have to be extended. The claim is that all missing entries |
1095 | # for binary properties will default to 'N'. Unicode tried to change that in | |
1096 | # 5.2, but the beta period produced enough protest that they backed off. | |
1097 | # | |
1098 | # The defaults for the fields that appear in UnicodeData.txt in this hash must | |
1099 | # be in the form that it expects. The others may be synonyms. | |
1100 | my $CODE_POINT = '<code point>'; | |
1101 | my %default_mapping = ( | |
1102 | Age => "Unassigned", | |
1103 | # Bidi_Class => Complicated; set in code | |
1104 | Bidi_Mirroring_Glyph => "", | |
1105 | Block => 'No_Block', | |
1106 | Canonical_Combining_Class => 0, | |
1107 | Case_Folding => $CODE_POINT, | |
1108 | Decomposition_Mapping => $CODE_POINT, | |
1109 | Decomposition_Type => 'None', | |
1110 | East_Asian_Width => "Neutral", | |
1111 | FC_NFKC_Closure => $CODE_POINT, | |
1112 | General_Category => 'Cn', | |
1113 | Grapheme_Cluster_Break => 'Other', | |
1114 | Hangul_Syllable_Type => 'NA', | |
1115 | ISO_Comment => "", | |
1116 | Jamo_Short_Name => "", | |
1117 | Joining_Group => "No_Joining_Group", | |
1118 | # Joining_Type => Complicated; set in code | |
1119 | kIICore => 'N', # Is converted to binary | |
1120 | #Line_Break => Complicated; set in code | |
1121 | Lowercase_Mapping => $CODE_POINT, | |
1122 | Name => "", | |
1123 | Name_Alias => "", | |
1124 | NFC_QC => 'Yes', | |
1125 | NFD_QC => 'Yes', | |
1126 | NFKC_QC => 'Yes', | |
1127 | NFKD_QC => 'Yes', | |
1128 | Numeric_Type => 'None', | |
1129 | Numeric_Value => 'NaN', | |
1130 | Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown', | |
1131 | Sentence_Break => 'Other', | |
1132 | Simple_Case_Folding => $CODE_POINT, | |
1133 | Simple_Lowercase_Mapping => $CODE_POINT, | |
1134 | Simple_Titlecase_Mapping => $CODE_POINT, | |
1135 | Simple_Uppercase_Mapping => $CODE_POINT, | |
1136 | Titlecase_Mapping => $CODE_POINT, | |
1137 | Unicode_1_Name => "", | |
1138 | Unicode_Radical_Stroke => "", | |
1139 | Uppercase_Mapping => $CODE_POINT, | |
1140 | Word_Break => 'Other', | |
1141 | ); | |
1142 | ||
232ed87f KW |
1143 | # Below are files that Unicode furnishes, but this program ignores, and why. |
1144 | # NormalizationCorrections.txt requires some more explanation. It documents | |
1145 | # the cumulative fixes to erroneous normalizations in earlier Unicode | |
1146 | # versions. Its main purpose is so that someone running on an earlier version | |
1147 | # can use this file to override what got published in that earlier release. | |
1148 | # It would be easy for mktables to read and handle this file. But all the | |
1149 | # corrections in it should already be in the other files for the release it | |
1150 | # is. To get it to actually mean something useful, someone would have to be | |
1151 | # using an earlier Unicode release, and copy it to the files for that release | |
1152 | # and recomplile. So far there has been no demand to do that, so this hasn't | |
1153 | # been implemented. | |
99870f4d | 1154 | my %ignored_files = ( |
73ba1144 KW |
1155 | 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points', |
1156 | 'Index.txt' => 'Alphabetical index of Unicode characters', | |
1157 | 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl', | |
1158 | 'NamesList.txt' => 'Annotated list of characters', | |
524a8e5e | 1159 | 'NamesList.html' => 'Describes the format and contents of F<NamesList.txt>', |
73ba1144 KW |
1160 | 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base', |
1161 | 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)', | |
1162 | 'ReadMe.txt' => 'Documentation', | |
1163 | 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>', | |
524a8e5e | 1164 | 'StandardizedVariants.html' => 'Provides a visual display of the standard variant sequences derived from F<StandardizedVariants.txt>.', |
73ba1144 | 1165 | 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values', |
8ee2793f | 1166 | 'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters', |
524a8e5e | 1167 | 'USourceGlyphs.pdf' => 'Pictures of the characters in F<USourceData.txt>', |
73ba1144 KW |
1168 | 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests', |
1169 | 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests', | |
1170 | 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests', | |
1171 | 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests', | |
99870f4d KW |
1172 | ); |
1173 | ||
1fec9f60 KW |
1174 | my %skipped_files; # List of files that we skip |
1175 | ||
678f13d5 | 1176 | ### End of externally interesting definitions, except for @input_file_objects |
99870f4d KW |
1177 | |
1178 | my $HEADER=<<"EOF"; | |
1179 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! | |
3df51b85 KW |
1180 | # This file is machine-generated by $0 from the Unicode |
1181 | # database, Version $string_version. Any changes made here will be lost! | |
cf25bb62 JH |
1182 | EOF |
1183 | ||
126c3d4e | 1184 | my $INTERNAL_ONLY_HEADER = <<"EOF"; |
99870f4d KW |
1185 | |
1186 | # !!!!!!! INTERNAL PERL USE ONLY !!!!!!! | |
fac53429 KW |
1187 | # This file is for internal use by core Perl only. The format and even the |
1188 | # name or existence of this file are subject to change without notice. Don't | |
d9ae3878 KW |
1189 | # use it directly. Use Unicode::UCD to access the Unicode character data |
1190 | # base. | |
99870f4d KW |
1191 | EOF |
1192 | ||
1193 | my $DEVELOPMENT_ONLY=<<"EOF"; | |
1194 | # !!!!!!! DEVELOPMENT USE ONLY !!!!!!! | |
1195 | # This file contains information artificially constrained to code points | |
1196 | # present in Unicode release $string_compare_versions. | |
1197 | # IT CANNOT BE RELIED ON. It is for use during development only and should | |
23e33b60 | 1198 | # not be used for production. |
b6922eda KW |
1199 | |
1200 | EOF | |
1201 | ||
346a20cf KW |
1202 | my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0) |
1203 | ? "10FFFF" | |
1204 | : "FFFF"; | |
6189eadc KW |
1205 | my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; |
1206 | my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; | |
99870f4d | 1207 | |
2d88a86a KW |
1208 | # We work with above-Unicode code points, up to UV_MAX. But when you get |
1209 | # that high, above IV_MAX, some operations don't work, and you can easily get | |
1210 | # overflow. Therefore for internal use, we use a much smaller number, | |
1211 | # translating it to UV_MAX only for output. The exact number is immaterial | |
1212 | # (all Unicode code points are treated exactly the same), but the algorithm | |
1213 | # requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1; | |
1214 | my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; | |
1215 | my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; | |
1216 | my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); | |
1217 | ||
1218 | my $MAX_PLATFORM_CODEPOINT = ~0; | |
1219 | ||
99870f4d KW |
1220 | # Matches legal code point. 4-6 hex numbers, If there are 6, the first |
1221 | # two must be 10; if there are 5, the first must not be a 0. Written this way | |
92199589 KW |
1222 | # to decrease backtracking. The first regex allows the code point to be at |
1223 | # the end of a word, but to work properly, the word shouldn't end with a valid | |
1224 | # hex character. The second one won't match a code point at the end of a | |
1225 | # word, and doesn't have the run-on issue | |
8c32d378 KW |
1226 | my $run_on_code_point_re = |
1227 | qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; | |
1228 | my $code_point_re = qr/\b$run_on_code_point_re/; | |
99870f4d KW |
1229 | |
1230 | # This matches the beginning of the line in the Unicode db files that give the | |
1231 | # defaults for code points not listed (i.e., missing) in the file. The code | |
1232 | # depends on this ending with a semi-colon, so it can assume it is a valid | |
1233 | # field when the line is split() by semi-colons | |
346a20cf | 1234 | my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/; |
99870f4d KW |
1235 | |
1236 | # Property types. Unicode has more types, but these are sufficient for our | |
1237 | # purposes. | |
1238 | my $UNKNOWN = -1; # initialized to illegal value | |
1239 | my $NON_STRING = 1; # Either binary or enum | |
1240 | my $BINARY = 2; | |
06f26c45 KW |
1241 | my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal |
1242 | # tables, additional true and false tables are | |
1243 | # generated so that false is anything matching the | |
1244 | # default value, and true is everything else. | |
1245 | my $ENUM = 4; # Include catalog | |
1246 | my $STRING = 5; # Anything else: string or misc | |
99870f4d KW |
1247 | |
1248 | # Some input files have lines that give default values for code points not | |
1249 | # contained in the file. Sometimes these should be ignored. | |
1250 | my $NO_DEFAULTS = 0; # Must evaluate to false | |
1251 | my $NOT_IGNORED = 1; | |
1252 | my $IGNORED = 2; | |
1253 | ||
1254 | # Range types. Each range has a type. Most ranges are type 0, for normal, | |
1255 | # and will appear in the main body of the tables in the output files, but | |
1256 | # there are other types of ranges as well, listed below, that are specially | |
1257 | # handled. There are pseudo-types as well that will never be stored as a | |
1258 | # type, but will affect the calculation of the type. | |
1259 | ||
1260 | # 0 is for normal, non-specials | |
1261 | my $MULTI_CP = 1; # Sequence of more than code point | |
1262 | my $HANGUL_SYLLABLE = 2; | |
1263 | my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. | |
1264 | my $NULL = 4; # The map is to the null string; utf8.c can't | |
1265 | # handle these, nor is there an accepted syntax | |
1266 | # for them in \p{} constructs | |
f86864ac | 1267 | my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would |
99870f4d KW |
1268 | # otherwise be $MULTI_CP type are instead type 0 |
1269 | ||
1270 | # process_generic_property_file() can accept certain overrides in its input. | |
1271 | # Each of these must begin AND end with $CMD_DELIM. | |
1272 | my $CMD_DELIM = "\a"; | |
1273 | my $REPLACE_CMD = 'replace'; # Override the Replace | |
1274 | my $MAP_TYPE_CMD = 'map_type'; # Override the Type | |
1275 | ||
1276 | my $NO = 0; | |
1277 | my $YES = 1; | |
1278 | ||
1279 | # Values for the Replace argument to add_range. | |
1280 | # $NO # Don't replace; add only the code points not | |
1281 | # already present. | |
1282 | my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in | |
1283 | # the comments at the subroutine definition. | |
1284 | my $UNCONDITIONALLY = 2; # Replace without conditions. | |
9470941f | 1285 | my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if |
99870f4d | 1286 | # already there |
7f4b1e25 KW |
1287 | my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if |
1288 | # already there | |
1289 | my $CROAK = 6; # Die with an error if is already there | |
99870f4d KW |
1290 | |
1291 | # Flags to give property statuses. The phrases are to remind maintainers that | |
1292 | # if the flag is changed, the indefinite article referring to it in the | |
1293 | # documentation may need to be as well. | |
1294 | my $NORMAL = ""; | |
99870f4d KW |
1295 | my $DEPRECATED = 'D'; |
1296 | my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; | |
1297 | my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; | |
1298 | my $DISCOURAGED = 'X'; | |
1299 | my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; | |
1300 | my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; | |
1301 | my $STRICTER = 'T'; | |
1302 | my $a_bold_stricter = "a 'B<$STRICTER>'"; | |
1303 | my $A_bold_stricter = "A 'B<$STRICTER>'"; | |
1304 | my $STABILIZED = 'S'; | |
1305 | my $a_bold_stabilized = "an 'B<$STABILIZED>'"; | |
1306 | my $A_bold_stabilized = "An 'B<$STABILIZED>'"; | |
1307 | my $OBSOLETE = 'O'; | |
1308 | my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; | |
1309 | my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; | |
1310 | ||
1311 | my %status_past_participles = ( | |
1312 | $DISCOURAGED => 'discouraged', | |
99870f4d KW |
1313 | $STABILIZED => 'stabilized', |
1314 | $OBSOLETE => 'obsolete', | |
37e2e78e | 1315 | $DEPRECATED => 'deprecated', |
99870f4d KW |
1316 | ); |
1317 | ||
395dfc19 KW |
1318 | # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be |
1319 | # externally documented. | |
301ba948 | 1320 | my $ORDINARY = 0; # The normal fate. |
395dfc19 KW |
1321 | my $MAP_PROXIED = 1; # The map table for the property isn't written out, |
1322 | # but there is a file written that can be used to | |
1323 | # reconstruct this table | |
3cdaf629 | 1324 | my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is |
301ba948 | 1325 | # for Perl's internal use only |
277b7b16 KW |
1326 | my $LEGACY_ONLY = 3; # Like $INTERNAL_ONLY, but not actually used by Perl. |
1327 | # Is for backwards compatibility for applications that | |
1328 | # read the file directly, so it's format is | |
1329 | # unchangeable. | |
1330 | my $SUPPRESSED = 4; # The file for this table is not written out, and as a | |
3cdaf629 KW |
1331 | # result, we don't bother to do many computations on |
1332 | # it. | |
277b7b16 | 1333 | my $PLACEHOLDER = 5; # Like $SUPPRESSED, but we go through all the |
3cdaf629 KW |
1334 | # computations anyway, as the values are needed for |
1335 | # things to work. This happens when we have Perl | |
1336 | # extensions that depend on Unicode tables that | |
1337 | # wouldn't normally be in a given Unicode version. | |
301ba948 | 1338 | |
f5817e0a KW |
1339 | # The format of the values of the tables: |
1340 | my $EMPTY_FORMAT = ""; | |
99870f4d KW |
1341 | my $BINARY_FORMAT = 'b'; |
1342 | my $DECIMAL_FORMAT = 'd'; | |
1343 | my $FLOAT_FORMAT = 'f'; | |
1344 | my $INTEGER_FORMAT = 'i'; | |
1345 | my $HEX_FORMAT = 'x'; | |
1346 | my $RATIONAL_FORMAT = 'r'; | |
1347 | my $STRING_FORMAT = 's'; | |
d11155ec | 1348 | my $ADJUST_FORMAT = 'a'; |
24303724 | 1349 | my $HEX_ADJUST_FORMAT = 'ax'; |
a14f3cb1 | 1350 | my $DECOMP_STRING_FORMAT = 'c'; |
c3ff2976 | 1351 | my $STRING_WHITE_SPACE_LIST = 'sw'; |
99870f4d KW |
1352 | |
1353 | my %map_table_formats = ( | |
1354 | $BINARY_FORMAT => 'binary', | |
1355 | $DECIMAL_FORMAT => 'single decimal digit', | |
1356 | $FLOAT_FORMAT => 'floating point number', | |
1357 | $INTEGER_FORMAT => 'integer', | |
add63c13 | 1358 | $HEX_FORMAT => 'non-negative hex whole number; a code point', |
99870f4d | 1359 | $RATIONAL_FORMAT => 'rational: an integer or a fraction', |
1a9d544b | 1360 | $STRING_FORMAT => 'string', |
d11155ec | 1361 | $ADJUST_FORMAT => 'some entries need adjustment', |
24303724 | 1362 | $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment', |
92f9d56c | 1363 | $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', |
c3ff2976 | 1364 | $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' |
99870f4d KW |
1365 | ); |
1366 | ||
1367 | # Unicode didn't put such derived files in a separate directory at first. | |
1368 | my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; | |
1369 | my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; | |
1370 | my $AUXILIARY = 'auxiliary'; | |
1371 | ||
3854b4b8 KW |
1372 | # Hashes and arrays that will eventually go into Heavy.pl for the use of |
1373 | # utf8_heavy.pl and into UCD.pl for the use of UCD.pm | |
99870f4d KW |
1374 | my %loose_to_file_of; # loosely maps table names to their respective |
1375 | # files | |
1376 | my %stricter_to_file_of; # same; but for stricter mapping. | |
315bfd4e | 1377 | my %loose_property_to_file_of; # Maps a loose property name to its map file |
e1dc048d | 1378 | my %strict_property_to_file_of; # Same, but strict |
3854b4b8 KW |
1379 | my @inline_definitions = "V0"; # Each element gives a definition of a unique |
1380 | # inversion list. When a definition is inlined, | |
1381 | # its value in the hash it's in (one of the two | |
1382 | # defined just above) will include an index into | |
1383 | # this array. The 0th element is initialized to | |
1384 | # the definition for a zero length invwersion list | |
89cf10cc KW |
1385 | my %file_to_swash_name; # Maps the file name to its corresponding key name |
1386 | # in the hash %utf8::SwashInfo | |
99870f4d KW |
1387 | my %nv_floating_to_rational; # maps numeric values floating point numbers to |
1388 | # their rational equivalent | |
c12f2655 KW |
1389 | my %loose_property_name_of; # Loosely maps (non_string) property names to |
1390 | # standard form | |
e1dc048d KW |
1391 | my %strict_property_name_of; # Strictly maps (non_string) property names to |
1392 | # standard form | |
86a52d1e | 1393 | my %string_property_loose_to_name; # Same, for string properties. |
c15fda25 KW |
1394 | my %loose_defaults; # keys are of form "prop=value", where 'prop' is |
1395 | # the property name in standard loose form, and | |
1396 | # 'value' is the default value for that property, | |
1397 | # also in standard loose form. | |
9e4a1e86 KW |
1398 | my %loose_to_standard_value; # loosely maps table names to the canonical |
1399 | # alias for them | |
2df7880f KW |
1400 | my %ambiguous_names; # keys are alias names (in standard form) that |
1401 | # have more than one possible meaning. | |
5d1df013 KW |
1402 | my %prop_aliases; # Keys are standard property name; values are each |
1403 | # one's aliases | |
1e863613 KW |
1404 | my %prop_value_aliases; # Keys of top level are standard property name; |
1405 | # values are keys to another hash, Each one is | |
1406 | # one of the property's values, in standard form. | |
1407 | # The values are that prop-val's aliases. | |
2df7880f | 1408 | my %ucd_pod; # Holds entries that will go into the UCD section of the pod |
99870f4d | 1409 | |
d867ccfb KW |
1410 | # Most properties are immune to caseless matching, otherwise you would get |
1411 | # nonsensical results, as properties are a function of a code point, not | |
1412 | # everything that is caselessly equivalent to that code point. For example, | |
1413 | # Changes_When_Case_Folded('s') should be false, whereas caselessly it would | |
1414 | # be true because 's' and 'S' are equivalent caselessly. However, | |
1415 | # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we | |
1416 | # extend that concept to those very few properties that are like this. Each | |
1417 | # such property will match the full range caselessly. They are hard-coded in | |
1418 | # the program; it's not worth trying to make it general as it's extremely | |
1419 | # unlikely that they will ever change. | |
1420 | my %caseless_equivalent_to; | |
1421 | ||
99870f4d KW |
1422 | # These constants names and values were taken from the Unicode standard, |
1423 | # version 5.1, section 3.12. They are used in conjunction with Hangul | |
6e5a209b KW |
1424 | # syllables. The '_string' versions are so generated tables can retain the |
1425 | # hex format, which is the more familiar value | |
1426 | my $SBase_string = "0xAC00"; | |
1427 | my $SBase = CORE::hex $SBase_string; | |
1428 | my $LBase_string = "0x1100"; | |
1429 | my $LBase = CORE::hex $LBase_string; | |
1430 | my $VBase_string = "0x1161"; | |
1431 | my $VBase = CORE::hex $VBase_string; | |
1432 | my $TBase_string = "0x11A7"; | |
1433 | my $TBase = CORE::hex $TBase_string; | |
99870f4d KW |
1434 | my $SCount = 11172; |
1435 | my $LCount = 19; | |
1436 | my $VCount = 21; | |
1437 | my $TCount = 28; | |
1438 | my $NCount = $VCount * $TCount; | |
1439 | ||
1440 | # For Hangul syllables; These store the numbers from Jamo.txt in conjunction | |
1441 | # with the above published constants. | |
1442 | my %Jamo; | |
1443 | my %Jamo_L; # Leading consonants | |
1444 | my %Jamo_V; # Vowels | |
1445 | my %Jamo_T; # Trailing consonants | |
1446 | ||
bb1dd3da KW |
1447 | # For code points whose name contains its ordinal as a '-ABCD' suffix. |
1448 | # The key is the base name of the code point, and the value is an | |
1449 | # array giving all the ranges that use this base name. Each range | |
1450 | # is actually a hash giving the 'low' and 'high' values of it. | |
1451 | my %names_ending_in_code_point; | |
1452 | my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes | |
1453 | # removed from the names | |
1454 | # Inverse mapping. The list of ranges that have these kinds of | |
1455 | # names. Each element contains the low, high, and base names in an | |
1456 | # anonymous hash. | |
1457 | my @code_points_ending_in_code_point; | |
1458 | ||
6b5ab373 KW |
1459 | # To hold Unicode's normalization test suite |
1460 | my @normalization_tests; | |
1461 | ||
bb1dd3da KW |
1462 | # Boolean: does this Unicode version have the hangul syllables, and are we |
1463 | # writing out a table for them? | |
1464 | my $has_hangul_syllables = 0; | |
1465 | ||
1466 | # Does this Unicode version have code points whose names end in their | |
1467 | # respective code points, and are we writing out a table for them? 0 for no; | |
1468 | # otherwise points to first property that a table is needed for them, so that | |
1469 | # if multiple tables are needed, we don't create duplicates | |
1470 | my $needing_code_points_ending_in_code_point = 0; | |
1471 | ||
37e2e78e | 1472 | my @backslash_X_tests; # List of tests read in for testing \X |
06ae2722 | 1473 | my @SB_tests; # List of tests read in for testing \b{sb} |
ae3bb8ea | 1474 | my @WB_tests; # List of tests read in for testing \b{wb} |
99870f4d KW |
1475 | my @unhandled_properties; # Will contain a list of properties found in |
1476 | # the input that we didn't process. | |
f86864ac | 1477 | my @match_properties; # Properties that have match tables, to be |
99870f4d KW |
1478 | # listed in the pod |
1479 | my @map_properties; # Properties that get map files written | |
1480 | my @named_sequences; # NamedSequences.txt contents. | |
1481 | my %potential_files; # Generated list of all .txt files in the directory | |
1482 | # structure so we can warn if something is being | |
1483 | # ignored. | |
1484 | my @files_actually_output; # List of files we generated. | |
1485 | my @more_Names; # Some code point names are compound; this is used | |
1486 | # to store the extra components of them. | |
1487 | my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at | |
1488 | # the minimum before we consider it equivalent to a | |
1489 | # candidate rational | |
1490 | my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms | |
1491 | ||
1492 | # These store references to certain commonly used property objects | |
3c88a801 | 1493 | my $ccc; |
99870f4d KW |
1494 | my $gc; |
1495 | my $perl; | |
1496 | my $block; | |
3e20195b KW |
1497 | my $perl_charname; |
1498 | my $print; | |
ac7dbdb6 | 1499 | my $All; |
bc0c431d | 1500 | my $Assigned; # All assigned characters in this Unicode release |
359523e2 | 1501 | my $script; |
99870f4d KW |
1502 | |
1503 | # Are there conflicting names because of beginning with 'In_', or 'Is_' | |
1504 | my $has_In_conflicts = 0; | |
1505 | my $has_Is_conflicts = 0; | |
1506 | ||
1507 | sub internal_file_to_platform ($) { | |
1508 | # Convert our file paths which have '/' separators to those of the | |
1509 | # platform. | |
1510 | ||
1511 | my $file = shift; | |
1512 | return undef unless defined $file; | |
1513 | ||
1514 | return File::Spec->join(split '/', $file); | |
d07a55ed | 1515 | } |
5beb625e | 1516 | |
99870f4d KW |
1517 | sub file_exists ($) { # platform independent '-e'. This program internally |
1518 | # uses slash as a path separator. | |
1519 | my $file = shift; | |
1520 | return 0 if ! defined $file; | |
1521 | return -e internal_file_to_platform($file); | |
1522 | } | |
5beb625e | 1523 | |
99870f4d | 1524 | sub objaddr($) { |
23e33b60 KW |
1525 | # Returns the address of the blessed input object. |
1526 | # It doesn't check for blessedness because that would do a string eval | |
1527 | # every call, and the program is structured so that this is never called | |
1528 | # for a non-blessed object. | |
99870f4d | 1529 | |
23e33b60 | 1530 | no overloading; # If overloaded, numifying below won't work. |
99870f4d KW |
1531 | |
1532 | # Numifying a ref gives its address. | |
051df77b | 1533 | return pack 'J', $_[0]; |
99870f4d KW |
1534 | } |
1535 | ||
558712cf | 1536 | # These are used only if $annotate is true. |
c4019d52 KW |
1537 | # The entire range of Unicode characters is examined to populate these |
1538 | # after all the input has been processed. But most can be skipped, as they | |
1539 | # have the same descriptive phrases, such as being unassigned | |
1540 | my @viacode; # Contains the 1 million character names | |
1541 | my @printable; # boolean: And are those characters printable? | |
1542 | my @annotate_char_type; # Contains a type of those characters, specifically | |
1543 | # for the purposes of annotation. | |
1544 | my $annotate_ranges; # A map of ranges of code points that have the same | |
98dc9551 | 1545 | # name for the purposes of annotation. They map to the |
c4019d52 KW |
1546 | # upper edge of the range, so that the end point can |
1547 | # be immediately found. This is used to skip ahead to | |
1548 | # the end of a range, and avoid processing each | |
1549 | # individual code point in it. | |
1550 | my $unassigned_sans_noncharacters; # A Range_List of the unassigned | |
1551 | # characters, but excluding those which are | |
1552 | # also noncharacter code points | |
1553 | ||
1554 | # The annotation types are an extension of the regular range types, though | |
1555 | # some of the latter are folded into one. Make the new types negative to | |
1556 | # avoid conflicting with the regular types | |
1557 | my $SURROGATE_TYPE = -1; | |
1558 | my $UNASSIGNED_TYPE = -2; | |
1559 | my $PRIVATE_USE_TYPE = -3; | |
1560 | my $NONCHARACTER_TYPE = -4; | |
1561 | my $CONTROL_TYPE = -5; | |
2d88a86a KW |
1562 | my $ABOVE_UNICODE_TYPE = -6; |
1563 | my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program | |
c4019d52 KW |
1564 | |
1565 | sub populate_char_info ($) { | |
558712cf | 1566 | # Used only with the $annotate option. Populates the arrays with the |
c4019d52 KW |
1567 | # input code point's info that are needed for outputting more detailed |
1568 | # comments. If calling context wants a return, it is the end point of | |
1569 | # any contiguous range of characters that share essentially the same info | |
1570 | ||
1571 | my $i = shift; | |
1572 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
1573 | ||
1574 | $viacode[$i] = $perl_charname->value_of($i) || ""; | |
1575 | ||
1576 | # A character is generally printable if Unicode says it is, | |
1577 | # but below we make sure that most Unicode general category 'C' types | |
1578 | # aren't. | |
1579 | $printable[$i] = $print->contains($i); | |
1580 | ||
1581 | $annotate_char_type[$i] = $perl_charname->type_of($i) || 0; | |
1582 | ||
1583 | # Only these two regular types are treated specially for annotations | |
1584 | # purposes | |
1585 | $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME | |
1586 | && $annotate_char_type[$i] != $HANGUL_SYLLABLE; | |
1587 | ||
1588 | # Give a generic name to all code points that don't have a real name. | |
1589 | # We output ranges, if applicable, for these. Also calculate the end | |
1590 | # point of the range. | |
1591 | my $end; | |
1592 | if (! $viacode[$i]) { | |
1d025d66 | 1593 | my $nonchar; |
2d88a86a KW |
1594 | if ($i > $MAX_UNICODE_CODEPOINT) { |
1595 | $viacode[$i] = 'Above-Unicode'; | |
1596 | $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE; | |
1597 | $printable[$i] = 0; | |
1598 | $end = $MAX_WORKING_CODEPOINT; | |
1599 | } | |
1600 | elsif ($gc-> table('Private_use')->contains($i)) { | |
c4019d52 KW |
1601 | $viacode[$i] = 'Private Use'; |
1602 | $annotate_char_type[$i] = $PRIVATE_USE_TYPE; | |
1603 | $printable[$i] = 0; | |
1604 | $end = $gc->table('Private_Use')->containing_range($i)->end; | |
1605 | } | |
1d025d66 KW |
1606 | elsif ((defined ($nonchar = |
1607 | Property::property_ref('Noncharacter_Code_Point')) | |
1608 | && $nonchar->table('Y')->contains($i))) | |
c4019d52 KW |
1609 | { |
1610 | $viacode[$i] = 'Noncharacter'; | |
1611 | $annotate_char_type[$i] = $NONCHARACTER_TYPE; | |
1612 | $printable[$i] = 0; | |
1613 | $end = property_ref('Noncharacter_Code_Point')->table('Y')-> | |
1614 | containing_range($i)->end; | |
1615 | } | |
1616 | elsif ($gc-> table('Control')->contains($i)) { | |
c71dea7f | 1617 | $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control'; |
c4019d52 KW |
1618 | $annotate_char_type[$i] = $CONTROL_TYPE; |
1619 | $printable[$i] = 0; | |
c4019d52 KW |
1620 | } |
1621 | elsif ($gc-> table('Unassigned')->contains($i)) { | |
c4019d52 KW |
1622 | $annotate_char_type[$i] = $UNASSIGNED_TYPE; |
1623 | $printable[$i] = 0; | |
1d025d66 KW |
1624 | if ($v_version lt v2.0.0) { # No blocks in earliest releases |
1625 | $viacode[$i] = 'Unassigned'; | |
1626 | $end = $gc-> table('Unassigned')->containing_range($i)->end; | |
1627 | } | |
1628 | else { | |
1629 | $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i); | |
c4019d52 | 1630 | |
bf06c733 KW |
1631 | # Because we name the unassigned by the blocks they are in, it |
1632 | # can't go past the end of that block, and it also can't go | |
1633 | # past the unassigned range it is in. The special table makes | |
1634 | # sure that the non-characters, which are unassigned, are | |
1635 | # separated out. | |
1636 | $end = min($block->containing_range($i)->end, | |
1637 | $unassigned_sans_noncharacters-> | |
1638 | containing_range($i)->end); | |
1d025d66 KW |
1639 | } |
1640 | } | |
1641 | elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases | |
1642 | $viacode[$i] = $gc->value_of($i); | |
1643 | $annotate_char_type[$i] = $UNKNOWN_TYPE; | |
1644 | $printable[$i] = 0; | |
1645 | } | |
1646 | elsif ($gc-> table('Surrogate')->contains($i)) { | |
1647 | $viacode[$i] = 'Surrogate'; | |
1648 | $annotate_char_type[$i] = $SURROGATE_TYPE; | |
1649 | $printable[$i] = 0; | |
1650 | $end = $gc->table('Surrogate')->containing_range($i)->end; | |
13ca76ff KW |
1651 | } |
1652 | else { | |
1653 | Carp::my_carp_bug("Can't figure out how to annotate " | |
1654 | . sprintf("U+%04X", $i) | |
1655 | . ". Proceeding anyway."); | |
c4019d52 KW |
1656 | $viacode[$i] = 'UNKNOWN'; |
1657 | $annotate_char_type[$i] = $UNKNOWN_TYPE; | |
1658 | $printable[$i] = 0; | |
1659 | } | |
1660 | } | |
1661 | ||
1662 | # Here, has a name, but if it's one in which the code point number is | |
1663 | # appended to the name, do that. | |
1664 | elsif ($annotate_char_type[$i] == $CP_IN_NAME) { | |
1665 | $viacode[$i] .= sprintf("-%04X", $i); | |
1666 | $end = $perl_charname->containing_range($i)->end; | |
1667 | } | |
1668 | ||
1669 | # And here, has a name, but if it's a hangul syllable one, replace it with | |
1670 | # the correct name from the Unicode algorithm | |
1671 | elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) { | |
1672 | use integer; | |
1673 | my $SIndex = $i - $SBase; | |
1674 | my $L = $LBase + $SIndex / $NCount; | |
1675 | my $V = $VBase + ($SIndex % $NCount) / $TCount; | |
1676 | my $T = $TBase + $SIndex % $TCount; | |
1677 | $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}"; | |
1678 | $viacode[$i] .= $Jamo{$T} if $T != $TBase; | |
1679 | $end = $perl_charname->containing_range($i)->end; | |
1680 | } | |
1681 | ||
1682 | return if ! defined wantarray; | |
1683 | return $i if ! defined $end; # If not a range, return the input | |
1684 | ||
1685 | # Save this whole range so can find the end point quickly | |
1686 | $annotate_ranges->add_map($i, $end, $end); | |
1687 | ||
1688 | return $end; | |
1689 | } | |
1690 | ||
23e33b60 KW |
1691 | # Commented code below should work on Perl 5.8. |
1692 | ## This 'require' doesn't necessarily work in miniperl, and even if it does, | |
1693 | ## the native perl version of it (which is what would operate under miniperl) | |
1694 | ## is extremely slow, as it does a string eval every call. | |
7e017d6d | 1695 | #my $has_fast_scalar_util = $^X !~ /miniperl/ |
23e33b60 KW |
1696 | # && defined eval "require Scalar::Util"; |
1697 | # | |
1698 | #sub objaddr($) { | |
1699 | # # Returns the address of the blessed input object. Uses the XS version if | |
1700 | # # available. It doesn't check for blessedness because that would do a | |
1701 | # # string eval every call, and the program is structured so that this is | |
1702 | # # never called for a non-blessed object. | |
1703 | # | |
1704 | # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util; | |
1705 | # | |
1706 | # # Check at least that is a ref. | |
1707 | # my $pkg = ref($_[0]) or return undef; | |
1708 | # | |
1709 | # # Change to a fake package to defeat any overloaded stringify | |
1710 | # bless $_[0], 'main::Fake'; | |
1711 | # | |
1712 | # # Numifying a ref gives its address. | |
051df77b | 1713 | # my $addr = pack 'J', $_[0]; |
23e33b60 KW |
1714 | # |
1715 | # # Return to original class | |
1716 | # bless $_[0], $pkg; | |
1717 | # return $addr; | |
1718 | #} | |
1719 | ||
99870f4d KW |
1720 | sub max ($$) { |
1721 | my $a = shift; | |
1722 | my $b = shift; | |
1723 | return $a if $a >= $b; | |
1724 | return $b; | |
1725 | } | |
1726 | ||
1727 | sub min ($$) { | |
1728 | my $a = shift; | |
1729 | my $b = shift; | |
1730 | return $a if $a <= $b; | |
1731 | return $b; | |
1732 | } | |
1733 | ||
1734 | sub clarify_number ($) { | |
1735 | # This returns the input number with underscores inserted every 3 digits | |
1736 | # in large (5 digits or more) numbers. Input must be entirely digits, not | |
1737 | # checked. | |
1738 | ||
1739 | my $number = shift; | |
1740 | my $pos = length($number) - 3; | |
1741 | return $number if $pos <= 1; | |
1742 | while ($pos > 0) { | |
1743 | substr($number, $pos, 0) = '_'; | |
1744 | $pos -= 3; | |
5beb625e | 1745 | } |
99870f4d | 1746 | return $number; |
99598c8c JH |
1747 | } |
1748 | ||
731cb813 KW |
1749 | sub clarify_code_point_count ($) { |
1750 | # This is like clarify_number(), but the input is assumed to be a count of | |
1751 | # code points, rather than a generic number. | |
1752 | ||
2d88a86a KW |
1753 | my $append = ""; |
1754 | ||
1755 | my $number = shift; | |
1756 | if ($number > $MAX_UNICODE_CODEPOINTS) { | |
1757 | $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS); | |
1758 | return "All above-Unicode code points" if $number == 0; | |
1759 | $append = " + all above-Unicode code points"; | |
1760 | } | |
1761 | return clarify_number($number) . $append; | |
731cb813 | 1762 | } |
12ac2576 | 1763 | |
99870f4d | 1764 | package Carp; |
7ebf06b3 | 1765 | |
99870f4d KW |
1766 | # These routines give a uniform treatment of messages in this program. They |
1767 | # are placed in the Carp package to cause the stack trace to not include them, | |
1768 | # although an alternative would be to use another package and set @CARP_NOT | |
1769 | # for it. | |
12ac2576 | 1770 | |
99870f4d | 1771 | our $Verbose = 1 if main::DEBUG; # Useful info when debugging |
12ac2576 | 1772 | |
99f78760 KW |
1773 | # This is a work-around suggested by Nicholas Clark to fix a problem with Carp |
1774 | # and overload trying to load Scalar:Util under miniperl. See | |
1775 | # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html | |
1776 | undef $overload::VERSION; | |
1777 | ||
99870f4d KW |
1778 | sub my_carp { |
1779 | my $message = shift || ""; | |
1780 | my $nofold = shift || 0; | |
7ebf06b3 | 1781 | |
99870f4d KW |
1782 | if ($message) { |
1783 | $message = main::join_lines($message); | |
1784 | $message =~ s/^$0: *//; # Remove initial program name | |
1785 | $message =~ s/[.;,]+$//; # Remove certain ending punctuation | |
1786 | $message = "\n$0: $message;"; | |
12ac2576 | 1787 | |
99870f4d KW |
1788 | # Fold the message with program name, semi-colon end punctuation |
1789 | # (which looks good with the message that carp appends to it), and a | |
1790 | # hanging indent for continuation lines. | |
1791 | $message = main::simple_fold($message, "", 4) unless $nofold; | |
1792 | $message =~ s/\n$//; # Remove the trailing nl so what carp | |
1793 | # appends is to the same line | |
1794 | } | |
12ac2576 | 1795 | |
99870f4d | 1796 | return $message if defined wantarray; # If a caller just wants the msg |
12ac2576 | 1797 | |
99870f4d KW |
1798 | carp $message; |
1799 | return; | |
1800 | } | |
7ebf06b3 | 1801 | |
99870f4d KW |
1802 | sub my_carp_bug { |
1803 | # This is called when it is clear that the problem is caused by a bug in | |
1804 | # this program. | |
7ebf06b3 | 1805 | |
99870f4d KW |
1806 | my $message = shift; |
1807 | $message =~ s/^$0: *//; | |
1808 | $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message"); | |
1809 | carp $message; | |
1810 | return; | |
1811 | } | |
7ebf06b3 | 1812 | |
99870f4d KW |
1813 | sub carp_too_few_args { |
1814 | if (@_ != 2) { | |
1815 | my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken."); | |
1816 | return; | |
12ac2576 | 1817 | } |
7ebf06b3 | 1818 | |
99870f4d KW |
1819 | my $args_ref = shift; |
1820 | my $count = shift; | |
7ebf06b3 | 1821 | |
99870f4d KW |
1822 | my_carp_bug("Need at least $count arguments to " |
1823 | . (caller 1)[3] | |
1824 | . ". Instead got: '" | |
1825 | . join ', ', @$args_ref | |
1826 | . "'. No action taken."); | |
1827 | return; | |
12ac2576 JP |
1828 | } |
1829 | ||
99870f4d KW |
1830 | sub carp_extra_args { |
1831 | my $args_ref = shift; | |
1832 | my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_; | |
12ac2576 | 1833 | |
99870f4d KW |
1834 | unless (ref $args_ref) { |
1835 | my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments."); | |
1836 | return; | |
1837 | } | |
1838 | my ($package, $file, $line) = caller; | |
1839 | my $subroutine = (caller 1)[3]; | |
cf25bb62 | 1840 | |
99870f4d KW |
1841 | my $list; |
1842 | if (ref $args_ref eq 'HASH') { | |
1843 | foreach my $key (keys %$args_ref) { | |
1844 | $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; | |
cf25bb62 | 1845 | } |
99870f4d | 1846 | $list = join ', ', each %{$args_ref}; |
cf25bb62 | 1847 | } |
99870f4d KW |
1848 | elsif (ref $args_ref eq 'ARRAY') { |
1849 | foreach my $arg (@$args_ref) { | |
1850 | $arg = $UNDEF unless defined $arg; | |
1851 | } | |
1852 | $list = join ', ', @$args_ref; | |
1853 | } | |
1854 | else { | |
1855 | my_carp_bug("Can't cope with ref " | |
1856 | . ref($args_ref) | |
1857 | . " . argument to 'carp_extra_args'. Not checking arguments."); | |
1858 | return; | |
1859 | } | |
1860 | ||
1861 | my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped."); | |
1862 | return; | |
d73e5302 JH |
1863 | } |
1864 | ||
99870f4d KW |
1865 | package main; |
1866 | ||
1867 | { # Closure | |
1868 | ||
1869 | # This program uses the inside-out method for objects, as recommended in | |
1b6b3fa9 KW |
1870 | # "Perl Best Practices". (This is the best solution still, since this has |
1871 | # to run under miniperl.) This closure aids in generating those. There | |
99870f4d KW |
1872 | # are two routines. setup_package() is called once per package to set |
1873 | # things up, and then set_access() is called for each hash representing a | |
1874 | # field in the object. These routines arrange for the object to be | |
1875 | # properly destroyed when no longer used, and for standard accessor | |
1876 | # functions to be generated. If you need more complex accessors, just | |
1877 | # write your own and leave those accesses out of the call to set_access(). | |
1878 | # More details below. | |
1879 | ||
1880 | my %constructor_fields; # fields that are to be used in constructors; see | |
1881 | # below | |
1882 | ||
1883 | # The values of this hash will be the package names as keys to other | |
1884 | # hashes containing the name of each field in the package as keys, and | |
1885 | # references to their respective hashes as values. | |
1886 | my %package_fields; | |
1887 | ||
1888 | sub setup_package { | |
1889 | # Sets up the package, creating standard DESTROY and dump methods | |
1890 | # (unless already defined). The dump method is used in debugging by | |
1891 | # simple_dumper(). | |
1892 | # The optional parameters are: | |
1893 | # a) a reference to a hash, that gets populated by later | |
1894 | # set_access() calls with one of the accesses being | |
1895 | # 'constructor'. The caller can then refer to this, but it is | |
1896 | # not otherwise used by these two routines. | |
1897 | # b) a reference to a callback routine to call during destruction | |
1898 | # of the object, before any fields are actually destroyed | |
1899 | ||
1900 | my %args = @_; | |
1901 | my $constructor_ref = delete $args{'Constructor_Fields'}; | |
1902 | my $destroy_callback = delete $args{'Destroy_Callback'}; | |
1903 | Carp::carp_extra_args(\@_) if main::DEBUG && %args; | |
1904 | ||
1905 | my %fields; | |
1906 | my $package = (caller)[0]; | |
1907 | ||
1908 | $package_fields{$package} = \%fields; | |
1909 | $constructor_fields{$package} = $constructor_ref; | |
1910 | ||
1911 | unless ($package->can('DESTROY')) { | |
1912 | my $destroy_name = "${package}::DESTROY"; | |
1913 | no strict "refs"; | |
1914 | ||
1915 | # Use typeglob to give the anonymous subroutine the name we want | |
1916 | *$destroy_name = sub { | |
1917 | my $self = shift; | |
ffe43484 | 1918 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
1919 | |
1920 | $self->$destroy_callback if $destroy_callback; | |
1921 | foreach my $field (keys %{$package_fields{$package}}) { | |
1922 | #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n"; | |
1923 | delete $package_fields{$package}{$field}{$addr}; | |
1924 | } | |
1925 | return; | |
1926 | } | |
1927 | } | |
1928 | ||
1929 | unless ($package->can('dump')) { | |
1930 | my $dump_name = "${package}::dump"; | |
1931 | no strict "refs"; | |
1932 | *$dump_name = sub { | |
1933 | my $self = shift; | |
1934 | return dump_inside_out($self, $package_fields{$package}, @_); | |
1935 | } | |
1936 | } | |
1937 | return; | |
1938 | } | |
1939 | ||
1940 | sub set_access { | |
1941 | # Arrange for the input field to be garbage collected when no longer | |
1942 | # needed. Also, creates standard accessor functions for the field | |
1943 | # based on the optional parameters-- none if none of these parameters: | |
1944 | # 'addable' creates an 'add_NAME()' accessor function. | |
1945 | # 'readable' or 'readable_array' creates a 'NAME()' accessor | |
1946 | # function. | |
1947 | # 'settable' creates a 'set_NAME()' accessor function. | |
1948 | # 'constructor' doesn't create an accessor function, but adds the | |
1949 | # field to the hash that was previously passed to | |
1950 | # setup_package(); | |
1951 | # Any of the accesses can be abbreviated down, so that 'a', 'ad', | |
1952 | # 'add' etc. all mean 'addable'. | |
1953 | # The read accessor function will work on both array and scalar | |
1954 | # values. If another accessor in the parameter list is 'a', the read | |
1955 | # access assumes an array. You can also force it to be array access | |
1956 | # by specifying 'readable_array' instead of 'readable' | |
1957 | # | |
1958 | # A sort-of 'protected' access can be set-up by preceding the addable, | |
1959 | # readable or settable with some initial portion of 'protected_' (but, | |
1960 | # the underscore is required), like 'p_a', 'pro_set', etc. The | |
1961 | # "protection" is only by convention. All that happens is that the | |
1962 | # accessor functions' names begin with an underscore. So instead of | |
1963 | # calling set_foo, the call is _set_foo. (Real protection could be | |
c1739a4a | 1964 | # accomplished by having a new subroutine, end_package, called at the |
99870f4d KW |
1965 | # end of each package, and then storing the __LINE__ ranges and |
1966 | # checking them on every accessor. But that is way overkill.) | |
1967 | ||
1968 | # We create anonymous subroutines as the accessors and then use | |
1969 | # typeglobs to assign them to the proper package and name | |
1970 | ||
1971 | my $name = shift; # Name of the field | |
1972 | my $field = shift; # Reference to the inside-out hash containing the | |
1973 | # field | |
1974 | ||
1975 | my $package = (caller)[0]; | |
1976 | ||
1977 | if (! exists $package_fields{$package}) { | |
1978 | croak "$0: Must call 'setup_package' before 'set_access'"; | |
1979 | } | |
d73e5302 | 1980 | |
99870f4d KW |
1981 | # Stash the field so DESTROY can get it. |
1982 | $package_fields{$package}{$name} = $field; | |
cf25bb62 | 1983 | |
99870f4d KW |
1984 | # Remaining arguments are the accessors. For each... |
1985 | foreach my $access (@_) { | |
1986 | my $access = lc $access; | |
cf25bb62 | 1987 | |
99870f4d | 1988 | my $protected = ""; |
cf25bb62 | 1989 | |
99870f4d KW |
1990 | # Match the input as far as it goes. |
1991 | if ($access =~ /^(p[^_]*)_/) { | |
1992 | $protected = $1; | |
1993 | if (substr('protected_', 0, length $protected) | |
1994 | eq $protected) | |
1995 | { | |
1996 | ||
1997 | # Add 1 for the underscore not included in $protected | |
1998 | $access = substr($access, length($protected) + 1); | |
1999 | $protected = '_'; | |
2000 | } | |
2001 | else { | |
2002 | $protected = ""; | |
2003 | } | |
2004 | } | |
2005 | ||
2006 | if (substr('addable', 0, length $access) eq $access) { | |
2007 | my $subname = "${package}::${protected}add_$name"; | |
2008 | no strict "refs"; | |
2009 | ||
2010 | # add_ accessor. Don't add if already there, which we | |
2011 | # determine using 'eq' for scalars and '==' otherwise. | |
2012 | *$subname = sub { | |
2013 | use strict "refs"; | |
2014 | return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; | |
2015 | my $self = shift; | |
2016 | my $value = shift; | |
ffe43484 | 2017 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
2018 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; |
2019 | if (ref $value) { | |
f998e60c | 2020 | return if grep { $value == $_ } @{$field->{$addr}}; |
99870f4d KW |
2021 | } |
2022 | else { | |
f998e60c | 2023 | return if grep { $value eq $_ } @{$field->{$addr}}; |
99870f4d | 2024 | } |
f998e60c | 2025 | push @{$field->{$addr}}, $value; |
99870f4d KW |
2026 | return; |
2027 | } | |
2028 | } | |
2029 | elsif (substr('constructor', 0, length $access) eq $access) { | |
2030 | if ($protected) { | |
2031 | Carp::my_carp_bug("Can't set-up 'protected' constructors") | |
2032 | } | |
2033 | else { | |
2034 | $constructor_fields{$package}{$name} = $field; | |
2035 | } | |
2036 | } | |
2037 | elsif (substr('readable_array', 0, length $access) eq $access) { | |
2038 | ||
2039 | # Here has read access. If one of the other parameters for | |
2040 | # access is array, or this one specifies array (by being more | |
2041 | # than just 'readable_'), then create a subroutine that | |
2042 | # assumes the data is an array. Otherwise just a scalar | |
2043 | my $subname = "${package}::${protected}$name"; | |
2044 | if (grep { /^a/i } @_ | |
2045 | or length($access) > length('readable_')) | |
2046 | { | |
2047 | no strict "refs"; | |
2048 | *$subname = sub { | |
2049 | use strict "refs"; | |
23e33b60 | 2050 | Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; |
ffe43484 | 2051 | my $addr = do { no overloading; pack 'J', $_[0]; }; |
99870f4d KW |
2052 | if (ref $field->{$addr} ne 'ARRAY') { |
2053 | my $type = ref $field->{$addr}; | |
2054 | $type = 'scalar' unless $type; | |
2055 | Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems."); | |
2056 | return; | |
2057 | } | |
2058 | return scalar @{$field->{$addr}} unless wantarray; | |
2059 | ||
2060 | # Make a copy; had problems with caller modifying the | |
2061 | # original otherwise | |
2062 | my @return = @{$field->{$addr}}; | |
2063 | return @return; | |
2064 | } | |
2065 | } | |
2066 | else { | |
2067 | ||
2068 | # Here not an array value, a simpler function. | |
2069 | no strict "refs"; | |
2070 | *$subname = sub { | |
2071 | use strict "refs"; | |
23e33b60 | 2072 | Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; |
f998e60c | 2073 | no overloading; |
051df77b | 2074 | return $field->{pack 'J', $_[0]}; |
99870f4d KW |
2075 | } |
2076 | } | |
2077 | } | |
2078 | elsif (substr('settable', 0, length $access) eq $access) { | |
2079 | my $subname = "${package}::${protected}set_$name"; | |
2080 | no strict "refs"; | |
2081 | *$subname = sub { | |
2082 | use strict "refs"; | |
23e33b60 KW |
2083 | if (main::DEBUG) { |
2084 | return Carp::carp_too_few_args(\@_, 2) if @_ < 2; | |
2085 | Carp::carp_extra_args(\@_) if @_ > 2; | |
2086 | } | |
2087 | # $self is $_[0]; $value is $_[1] | |
f998e60c | 2088 | no overloading; |
051df77b | 2089 | $field->{pack 'J', $_[0]} = $_[1]; |
99870f4d KW |
2090 | return; |
2091 | } | |
2092 | } | |
2093 | else { | |
2094 | Carp::my_carp_bug("Unknown accessor type $access. No accessor set."); | |
2095 | } | |
cf25bb62 | 2096 | } |
99870f4d | 2097 | return; |
cf25bb62 | 2098 | } |
99870f4d KW |
2099 | } |
2100 | ||
2101 | package Input_file; | |
2102 | ||
2103 | # All input files use this object, which stores various attributes about them, | |
2104 | # and provides for convenient, uniform handling. The run method wraps the | |
2105 | # processing. It handles all the bookkeeping of opening, reading, and closing | |
2106 | # the file, returning only significant input lines. | |
2107 | # | |
2108 | # Each object gets a handler which processes the body of the file, and is | |
74cd47d0 KW |
2109 | # called by run(). All character property files must use the generic, |
2110 | # default handler, which has code scrubbed to handle things you might not | |
2111 | # expect, including automatic EBCDIC handling. For files that don't deal with | |
2112 | # mapping code points to a property value, such as test files, | |
2113 | # PropertyAliases, PropValueAliases, and named sequences, you can override the | |
2114 | # handler to be a custom one. Such a handler should basically be a | |
2115 | # while(next_line()) {...} loop. | |
99870f4d KW |
2116 | # |
2117 | # You can also set up handlers to | |
537124e4 | 2118 | # 1) call before the first line is read, for pre processing |
83b68635 KW |
2119 | # 2) call to adjust each line of the input before the main handler gets |
2120 | # them. This can be automatically generated, if appropriately simple | |
2121 | # enough, by specifiying a Properties parameter in the constructor. | |
99870f4d | 2122 | # 3) call upon EOF before the main handler exits its loop |
537124e4 | 2123 | # 4) call at the end, for post processing |
99870f4d KW |
2124 | # |
2125 | # $_ is used to store the input line, and is to be filtered by the | |
2126 | # each_line_handler()s. So, if the format of the line is not in the desired | |
2127 | # format for the main handler, these are used to do that adjusting. They can | |
2128 | # be stacked (by enclosing them in an [ anonymous array ] in the constructor, | |
2129 | # so the $_ output of one is used as the input to the next. None of the other | |
2130 | # handlers are stackable, but could easily be changed to be so. | |
2131 | # | |
2132 | # Most of the handlers can call insert_lines() or insert_adjusted_lines() | |
2133 | # which insert the parameters as lines to be processed before the next input | |
2134 | # file line is read. This allows the EOF handler to flush buffers, for | |
2135 | # example. The difference between the two routines is that the lines inserted | |
2136 | # by insert_lines() are subjected to the each_line_handler()s. (So if you | |
2137 | # called it from such a handler, you would get infinite recursion.) Lines | |
2138 | # inserted by insert_adjusted_lines() go directly to the main handler without | |
2139 | # any adjustments. If the post-processing handler calls any of these, there | |
2140 | # will be no effect. Some error checking for these conditions could be added, | |
2141 | # but it hasn't been done. | |
2142 | # | |
2143 | # carp_bad_line() should be called to warn of bad input lines, which clears $_ | |
2144 | # to prevent further processing of the line. This routine will output the | |
2145 | # message as a warning once, and then keep a count of the lines that have the | |
2146 | # same message, and output that count at the end of the file's processing. | |
2147 | # This keeps the number of messages down to a manageable amount. | |
2148 | # | |
2149 | # get_missings() should be called to retrieve any @missing input lines. | |
2150 | # Messages will be raised if this isn't done if the options aren't to ignore | |
2151 | # missings. | |
2152 | ||
2153 | sub trace { return main::trace(@_); } | |
2154 | ||
99870f4d KW |
2155 | { # Closure |
2156 | # Keep track of fields that are to be put into the constructor. | |
2157 | my %constructor_fields; | |
2158 | ||
2159 | main::setup_package(Constructor_Fields => \%constructor_fields); | |
2160 | ||
2161 | my %file; # Input file name, required | |
2162 | main::set_access('file', \%file, qw{ c r }); | |
2163 | ||
2164 | my %first_released; # Unicode version file was first released in, required | |
2165 | main::set_access('first_released', \%first_released, qw{ c r }); | |
2166 | ||
2167 | my %handler; # Subroutine to process the input file, defaults to | |
2168 | # 'process_generic_property_file' | |
2169 | main::set_access('handler', \%handler, qw{ c }); | |
2170 | ||
2171 | my %property; | |
2172 | # name of property this file is for. defaults to none, meaning not | |
2173 | # applicable, or is otherwise determinable, for example, from each line. | |
696609bf | 2174 | main::set_access('property', \%property, qw{ c r }); |
99870f4d KW |
2175 | |
2176 | my %optional; | |
2177 | # If this is true, the file is optional. If not present, no warning is | |
2178 | # output. If it is present, the string given by this parameter is | |
2179 | # evaluated, and if false the file is not processed. | |
2180 | main::set_access('optional', \%optional, 'c', 'r'); | |
2181 | ||
2182 | my %non_skip; | |
2183 | # This is used for debugging, to skip processing of all but a few input | |
2184 | # files. Add 'non_skip => 1' to the constructor for those files you want | |
2185 | # processed when you set the $debug_skip global. | |
2186 | main::set_access('non_skip', \%non_skip, 'c'); | |
2187 | ||
37e2e78e | 2188 | my %skip; |
09ca89ce KW |
2189 | # This is used to skip processing of this input file semi-permanently, |
2190 | # when it evaluates to true. The value should be the reason the file is | |
2191 | # being skipped. It is used for files that we aren't planning to process | |
2192 | # anytime soon, but want to allow to be in the directory and not raise a | |
2193 | # message that we are not handling. Mostly for test files. This is in | |
2194 | # contrast to the non_skip element, which is supposed to be used very | |
2195 | # temporarily for debugging. Sets 'optional' to 1. Also, files that we | |
2196 | # pretty much will never look at can be placed in the global | |
1fec9f60 | 2197 | # %ignored_files instead. Ones used here will be added to %skipped files |
37e2e78e KW |
2198 | main::set_access('skip', \%skip, 'c'); |
2199 | ||
99870f4d KW |
2200 | my %each_line_handler; |
2201 | # list of subroutines to look at and filter each non-comment line in the | |
2202 | # file. defaults to none. The subroutines are called in order, each is | |
2203 | # to adjust $_ for the next one, and the final one adjusts it for | |
2204 | # 'handler' | |
2205 | main::set_access('each_line_handler', \%each_line_handler, 'c'); | |
2206 | ||
83b68635 KW |
2207 | my %properties; # Optional ordered list of the properties that occur in each |
2208 | # meaningful line of the input file. If present, an appropriate | |
2209 | # each_line_handler() is automatically generated and pushed onto the stack | |
2210 | # of such handlers. This is useful when a file contains multiple | |
2211 | # proerties per line, but no other special considerations are necessary. | |
2212 | # The special value "<ignored>" means to discard the corresponding input | |
2213 | # field. | |
2214 | # Any @missing lines in the file should also match this syntax; no such | |
2215 | # files exist as of 6.3. But if it happens in a future release, the code | |
2216 | # could be expanded to properly parse them. | |
2217 | main::set_access('properties', \%properties, qw{ c r }); | |
2218 | ||
99870f4d KW |
2219 | my %has_missings_defaults; |
2220 | # ? Are there lines in the file giving default values for code points | |
2221 | # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is | |
2222 | # the norm, but IGNORED means it has such lines, but the handler doesn't | |
2223 | # use them. Having these three states allows us to catch changes to the | |
83b68635 KW |
2224 | # UCD that this program should track. XXX This could be expanded to |
2225 | # specify the syntax for such lines, like %properties above. | |
99870f4d KW |
2226 | main::set_access('has_missings_defaults', |
2227 | \%has_missings_defaults, qw{ c r }); | |
2228 | ||
2229 | my %pre_handler; | |
2230 | # Subroutine to call before doing anything else in the file. If undef, no | |
2231 | # such handler is called. | |
2232 | main::set_access('pre_handler', \%pre_handler, qw{ c }); | |
2233 | ||
2234 | my %eof_handler; | |
2235 | # Subroutine to call upon getting an EOF on the input file, but before | |
2236 | # that is returned to the main handler. This is to allow buffers to be | |
2237 | # flushed. The handler is expected to call insert_lines() or | |
2238 | # insert_adjusted() with the buffered material | |
2239 | main::set_access('eof_handler', \%eof_handler, qw{ c r }); | |
2240 | ||
2241 | my %post_handler; | |
2242 | # Subroutine to call after all the lines of the file are read in and | |
2243 | # processed. If undef, no such handler is called. | |
2244 | main::set_access('post_handler', \%post_handler, qw{ c }); | |
2245 | ||
2246 | my %progress_message; | |
2247 | # Message to print to display progress in lieu of the standard one | |
2248 | main::set_access('progress_message', \%progress_message, qw{ c }); | |
2249 | ||
2250 | my %handle; | |
2251 | # cache open file handle, internal. Is undef if file hasn't been | |
2252 | # processed at all, empty if has; | |
2253 | main::set_access('handle', \%handle); | |
2254 | ||
2255 | my %added_lines; | |
2256 | # cache of lines added virtually to the file, internal | |
2257 | main::set_access('added_lines', \%added_lines); | |
2258 | ||
74cd47d0 KW |
2259 | my %remapped_lines; |
2260 | # cache of lines added virtually to the file, internal | |
2261 | main::set_access('remapped_lines', \%remapped_lines); | |
2262 | ||
99870f4d KW |
2263 | my %errors; |
2264 | # cache of errors found, internal | |
2265 | main::set_access('errors', \%errors); | |
2266 | ||
2267 | my %missings; | |
2268 | # storage of '@missing' defaults lines | |
2269 | main::set_access('missings', \%missings); | |
2270 | ||
74cd47d0 KW |
2271 | sub _next_line; |
2272 | sub _next_line_with_remapped_range; | |
2273 | ||
99870f4d KW |
2274 | sub new { |
2275 | my $class = shift; | |
2276 | ||
2277 | my $self = bless \do{ my $anonymous_scalar }, $class; | |
ffe43484 | 2278 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
2279 | |
2280 | # Set defaults | |
2281 | $handler{$addr} = \&main::process_generic_property_file; | |
2282 | $non_skip{$addr} = 0; | |
37e2e78e | 2283 | $skip{$addr} = 0; |
99870f4d KW |
2284 | $has_missings_defaults{$addr} = $NO_DEFAULTS; |
2285 | $handle{$addr} = undef; | |
2286 | $added_lines{$addr} = [ ]; | |
74cd47d0 | 2287 | $remapped_lines{$addr} = [ ]; |
99870f4d KW |
2288 | $each_line_handler{$addr} = [ ]; |
2289 | $errors{$addr} = { }; | |
2290 | $missings{$addr} = [ ]; | |
2291 | ||
2292 | # Two positional parameters. | |
99f78760 | 2293 | return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; |
99870f4d KW |
2294 | $file{$addr} = main::internal_file_to_platform(shift); |
2295 | $first_released{$addr} = shift; | |
2296 | ||
71bd4c0b KW |
2297 | undef $file{$addr} if $first_released{$addr} gt $v_version; |
2298 | ||
99870f4d KW |
2299 | # The rest of the arguments are key => value pairs |
2300 | # %constructor_fields has been set up earlier to list all possible | |
2301 | # ones. Either set or push, depending on how the default has been set | |
2302 | # up just above. | |
2303 | my %args = @_; | |
2304 | foreach my $key (keys %args) { | |
2305 | my $argument = $args{$key}; | |
2306 | ||
2307 | # Note that the fields are the lower case of the constructor keys | |
2308 | my $hash = $constructor_fields{lc $key}; | |
2309 | if (! defined $hash) { | |
2310 | Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped"); | |
2311 | next; | |
2312 | } | |
2313 | if (ref $hash->{$addr} eq 'ARRAY') { | |
2314 | if (ref $argument eq 'ARRAY') { | |
2315 | foreach my $argument (@{$argument}) { | |
2316 | next if ! defined $argument; | |
2317 | push @{$hash->{$addr}}, $argument; | |
2318 | } | |
2319 | } | |
2320 | else { | |
2321 | push @{$hash->{$addr}}, $argument if defined $argument; | |
2322 | } | |
2323 | } | |
2324 | else { | |
2325 | $hash->{$addr} = $argument; | |
2326 | } | |
2327 | delete $args{$key}; | |
2328 | }; | |
2329 | ||
2330 | # If the file has a property for it, it means that the property is not | |
2331 | # listed in the file's entries. So add a handler to the list of line | |
2332 | # handlers to insert the property name into the lines, to provide a | |
2333 | # uniform interface to the final processing subroutine. | |
2334 | # the final code doesn't have to worry about that. | |
2335 | if ($property{$addr}) { | |
2336 | push @{$each_line_handler{$addr}}, \&_insert_property_into_line; | |
2337 | } | |
2338 | ||
2339 | if ($non_skip{$addr} && ! $debug_skip && $verbosity) { | |
2340 | print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n"; | |
a3a8c5f0 | 2341 | } |
99870f4d | 2342 | |
09ca89ce KW |
2343 | # If skipping, set to optional, and add to list of ignored files, |
2344 | # including its reason | |
2345 | if ($skip{$addr}) { | |
2346 | $optional{$addr} = 1; | |
71bd4c0b | 2347 | $skipped_files{$file{$addr}} = $skip{$addr} if $file{$addr}; |
09ca89ce | 2348 | } |
83b68635 KW |
2349 | elsif ($properties{$addr}) { |
2350 | ||
2351 | # Add a handler for each line in the input so that it creates a | |
2352 | # separate input line for each property in those input lines, thus | |
2353 | # making them suitable for process_generic_property_file(). | |
2354 | ||
2355 | push @{$each_line_handler{$addr}}, | |
2356 | sub { | |
2357 | my $file = shift; | |
2358 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
2359 | ||
2360 | my @fields = split /\s*;\s*/, $_, -1; | |
2361 | ||
2362 | if (@fields - 1 > @{$properties{$addr}}) { | |
2363 | $file->carp_bad_line('Extra fields'); | |
2364 | $_ = ""; | |
2365 | return; | |
2366 | } | |
2367 | my $range = shift @fields; # 0th element is always the | |
2368 | # range | |
2369 | ||
2370 | # The next fields in the input line correspond | |
2371 | # respectively to the stored properties. | |
2372 | for my $i (0 .. @{$properties{$addr}} - 1) { | |
2373 | my $property_name = $properties{$addr}[$i]; | |
2374 | next if $property_name eq '<ignored>'; | |
2375 | $file->insert_adjusted_lines( | |
2376 | "$range; $property_name; $fields[$i]"); | |
2377 | } | |
2378 | $_ = ""; | |
2379 | ||
2380 | return; | |
2381 | }; | |
2382 | } | |
37e2e78e | 2383 | |
74cd47d0 KW |
2384 | { # On non-ascii platforms, we use a special handler |
2385 | no strict; | |
2386 | no warnings 'once'; | |
2387 | *next_line = (main::NON_ASCII_PLATFORM) | |
2388 | ? *_next_line_with_remapped_range | |
2389 | : *_next_line; | |
2390 | } | |
2391 | ||
99870f4d | 2392 | return $self; |
d73e5302 JH |
2393 | } |
2394 | ||
cf25bb62 | 2395 | |
99870f4d KW |
2396 | use overload |
2397 | fallback => 0, | |
2398 | qw("") => "_operator_stringify", | |
2399 | "." => \&main::_operator_dot, | |
1285127e | 2400 | ".=" => \&main::_operator_dot_equal, |
99870f4d | 2401 | ; |
cf25bb62 | 2402 | |
99870f4d KW |
2403 | sub _operator_stringify { |
2404 | my $self = shift; | |
cf25bb62 | 2405 | |
99870f4d | 2406 | return __PACKAGE__ . " object for " . $self->file; |
d73e5302 | 2407 | } |
d73e5302 | 2408 | |
99870f4d KW |
2409 | # flag to make sure extracted files are processed early |
2410 | my $seen_non_extracted_non_age = 0; | |
d73e5302 | 2411 | |
99870f4d KW |
2412 | sub run { |
2413 | # Process the input object $self. This opens and closes the file and | |
2414 | # calls all the handlers for it. Currently, this can only be called | |
2415 | # once per file, as it destroy's the EOF handler | |
d73e5302 | 2416 | |
99870f4d KW |
2417 | my $self = shift; |
2418 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
b6922eda | 2419 | |
ffe43484 | 2420 | my $addr = do { no overloading; pack 'J', $self; }; |
b6922eda | 2421 | |
99870f4d | 2422 | my $file = $file{$addr}; |
d73e5302 | 2423 | |
99870f4d KW |
2424 | # Don't process if not expecting this file (because released later |
2425 | # than this Unicode version), and isn't there. This means if someone | |
2426 | # copies it into an earlier version's directory, we will go ahead and | |
2427 | # process it. | |
71bd4c0b KW |
2428 | return if $first_released{$addr} gt $v_version |
2429 | && (! defined $file || ! -e $file); | |
99870f4d KW |
2430 | |
2431 | # If in debugging mode and this file doesn't have the non-skip | |
2432 | # flag set, and isn't one of the critical files, skip it. | |
2433 | if ($debug_skip | |
2434 | && $first_released{$addr} ne v0 | |
2435 | && ! $non_skip{$addr}) | |
2436 | { | |
2437 | print "Skipping $file in debugging\n" if $verbosity; | |
2438 | return; | |
2439 | } | |
2440 | ||
2441 | # File could be optional | |
37e2e78e | 2442 | if ($optional{$addr}) { |
99870f4d KW |
2443 | return unless -e $file; |
2444 | my $result = eval $optional{$addr}; | |
2445 | if (! defined $result) { | |
2446 | Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped."); | |
2447 | return; | |
2448 | } | |
2449 | if (! $result) { | |
2450 | if ($verbosity) { | |
2451 | print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n"; | |
2452 | } | |
2453 | return; | |
2454 | } | |
2455 | } | |
2456 | ||
2457 | if (! defined $file || ! -e $file) { | |
2458 | ||
2459 | # If the file doesn't exist, see if have internal data for it | |
2460 | # (based on first_released being 0). | |
2461 | if ($first_released{$addr} eq v0) { | |
2462 | $handle{$addr} = 'pretend_is_open'; | |
2463 | } | |
2464 | else { | |
2465 | if (! $optional{$addr} # File could be optional | |
2466 | && $v_version ge $first_released{$addr}) | |
2467 | { | |
f71c7390 | 2468 | print STDERR "Skipping processing input file '$file' because not found\n"; |
99870f4d KW |
2469 | } |
2470 | return; | |
2471 | } | |
2472 | } | |
2473 | else { | |
2474 | ||
37e2e78e KW |
2475 | # Here, the file exists. Some platforms may change the case of |
2476 | # its name | |
99870f4d | 2477 | if ($seen_non_extracted_non_age) { |
517956bf | 2478 | if ($file =~ /$EXTRACTED/i) { |
1675ea0d | 2479 | Carp::my_carp_bug(main::join_lines(<<END |
99f78760 | 2480 | $file should be processed just after the 'Prop...Alias' files, and before |
99870f4d KW |
2481 | anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may |
2482 | have subtle problems | |
2483 | END | |
2484 | )); | |
2485 | } | |
2486 | } | |
2487 | elsif ($EXTRACTED_DIR | |
2488 | && $first_released{$addr} ne v0 | |
517956bf CB |
2489 | && $file !~ /$EXTRACTED/i |
2490 | && lc($file) ne 'dage.txt') | |
99870f4d KW |
2491 | { |
2492 | # We don't set this (by the 'if' above) if we have no | |
2493 | # extracted directory, so if running on an early version, | |
2494 | # this test won't work. Not worth worrying about. | |
2495 | $seen_non_extracted_non_age = 1; | |
2496 | } | |
2497 | ||
2498 | # And mark the file as having being processed, and warn if it | |
2499 | # isn't a file we are expecting. As we process the files, | |
2500 | # they are deleted from the hash, so any that remain at the | |
2501 | # end of the program are files that we didn't process. | |
517956bf | 2502 | my $fkey = File::Spec->rel2abs($file); |
faf3cf6b KW |
2503 | my $expecting = delete $potential_files{lc($fkey)}; |
2504 | ||
678f13d5 KW |
2505 | Carp::my_carp("Was not expecting '$file'.") if |
2506 | ! $expecting | |
99870f4d KW |
2507 | && ! defined $handle{$addr}; |
2508 | ||
37e2e78e KW |
2509 | # Having deleted from expected files, we can quit if not to do |
2510 | # anything. Don't print progress unless really want verbosity | |
2511 | if ($skip{$addr}) { | |
2512 | print "Skipping $file.\n" if $verbosity >= $VERBOSE; | |
2513 | return; | |
2514 | } | |
2515 | ||
99870f4d KW |
2516 | # Open the file, converting the slashes used in this program |
2517 | # into the proper form for the OS | |
2518 | my $file_handle; | |
2519 | if (not open $file_handle, "<", $file) { | |
2520 | Carp::my_carp("Can't open $file. Skipping: $!"); | |
2521 | return 0; | |
2522 | } | |
2523 | $handle{$addr} = $file_handle; # Cache the open file handle | |
9e65c3f4 | 2524 | |
96f226dc KW |
2525 | if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') { |
2526 | if ($file !~ /^Unihan/i) { | |
cafe9cf0 KW |
2527 | $_ = <$file_handle>; |
2528 | if ($_ !~ / - $string_version \. /x) { | |
2529 | chomp; | |
2530 | $_ =~ s/^#\s*//; | |
2531 | die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version"); | |
2532 | } | |
96f226dc KW |
2533 | } |
2534 | else { | |
2535 | while (<$file_handle>) { | |
2536 | if ($_ !~ /^#/) { | |
2537 | Carp::my_carp_bug("Could not find the expected version info in file '$file'"); | |
2538 | last; | |
2539 | } | |
2540 | chomp; | |
2541 | $_ =~ s/^#\s*//; | |
2542 | next if $_ !~ / version: /x; | |
2543 | last if $_ =~ /$string_version/; | |
2544 | die Carp::my_carp("File '$file' is '$_'. It should be version $string_version"); | |
2545 | } | |
2546 | } | |
9e65c3f4 | 2547 | } |
99870f4d KW |
2548 | } |
2549 | ||
2550 | if ($verbosity >= $PROGRESS) { | |
2551 | if ($progress_message{$addr}) { | |
2552 | print "$progress_message{$addr}\n"; | |
2553 | } | |
2554 | else { | |
2555 | # If using a virtual file, say so. | |
2556 | print "Processing ", (-e $file) | |
2557 | ? $file | |
2558 | : "substitute $file", | |
2559 | "\n"; | |
2560 | } | |
2561 | } | |
2562 | ||
2563 | ||
2564 | # Call any special handler for before the file. | |
2565 | &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; | |
2566 | ||
2567 | # Then the main handler | |
2568 | &{$handler{$addr}}($self); | |
2569 | ||
2570 | # Then any special post-file handler. | |
2571 | &{$post_handler{$addr}}($self) if $post_handler{$addr}; | |
2572 | ||
2573 | # If any errors have been accumulated, output the counts (as the first | |
2574 | # error message in each class was output when it was encountered). | |
2575 | if ($errors{$addr}) { | |
2576 | my $total = 0; | |
2577 | my $types = 0; | |
2578 | foreach my $error (keys %{$errors{$addr}}) { | |
2579 | $total += $errors{$addr}->{$error}; | |
2580 | delete $errors{$addr}->{$error}; | |
2581 | $types++; | |
2582 | } | |
2583 | if ($total > 1) { | |
2584 | my $message | |
2585 | = "A total of $total lines had errors in $file. "; | |
2586 | ||
2587 | $message .= ($types == 1) | |
2588 | ? '(Only the first one was displayed.)' | |
2589 | : '(Only the first of each type was displayed.)'; | |
2590 | Carp::my_carp($message); | |
2591 | } | |
2592 | } | |
2593 | ||
2594 | if (@{$missings{$addr}}) { | |
2595 | Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong"); | |
2596 | } | |
2597 | ||
2598 | # If a real file handle, close it. | |
2599 | close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if | |
2600 | ref $handle{$addr}; | |
2601 | $handle{$addr} = ""; # Uses empty to indicate that has already seen | |
2602 | # the file, as opposed to undef | |
2603 | return; | |
2604 | } | |
2605 | ||
74cd47d0 | 2606 | sub _next_line { |
99870f4d KW |
2607 | # Sets $_ to be the next logical input line, if any. Returns non-zero |
2608 | # if such a line exists. 'logical' means that any lines that have | |
2609 | # been added via insert_lines() will be returned in $_ before the file | |
2610 | # is read again. | |
2611 | ||
2612 | my $self = shift; | |
2613 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
2614 | ||
ffe43484 | 2615 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
2616 | |
2617 | # Here the file is open (or if the handle is not a ref, is an open | |
2618 | # 'virtual' file). Get the next line; any inserted lines get priority | |
2619 | # over the file itself. | |
2620 | my $adjusted; | |
2621 | ||
2622 | LINE: | |
2623 | while (1) { # Loop until find non-comment, non-empty line | |
2624 | #local $to_trace = 1 if main::DEBUG; | |
2625 | my $inserted_ref = shift @{$added_lines{$addr}}; | |
2626 | if (defined $inserted_ref) { | |
2627 | ($adjusted, $_) = @{$inserted_ref}; | |
2628 | trace $adjusted, $_ if main::DEBUG && $to_trace; | |
2629 | return 1 if $adjusted; | |
2630 | } | |
2631 | else { | |
2632 | last if ! ref $handle{$addr}; # Don't read unless is real file | |
2633 | last if ! defined ($_ = readline $handle{$addr}); | |
2634 | } | |
2635 | chomp; | |
2636 | trace $_ if main::DEBUG && $to_trace; | |
2637 | ||
2638 | # See if this line is the comment line that defines what property | |
2639 | # value that code points that are not listed in the file should | |
2640 | # have. The format or existence of these lines is not guaranteed | |
2641 | # by Unicode since they are comments, but the documentation says | |
2642 | # that this was added for machine-readability, so probably won't | |
2643 | # change. This works starting in Unicode Version 5.0. They look | |
2644 | # like: | |
2645 | # | |
2646 | # @missing: 0000..10FFFF; Not_Reordered | |
2647 | # @missing: 0000..10FFFF; Decomposition_Mapping; <code point> | |
2648 | # @missing: 0000..10FFFF; ; NaN | |
2649 | # | |
2650 | # Save the line for a later get_missings() call. | |
2651 | if (/$missing_defaults_prefix/) { | |
2652 | if ($has_missings_defaults{$addr} == $NO_DEFAULTS) { | |
2653 | $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries"); | |
2654 | } | |
2655 | elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { | |
2656 | my @defaults = split /\s* ; \s*/x, $_; | |
2657 | ||
2658 | # The first field is the @missing, which ends in a | |
2659 | # semi-colon, so can safely shift. | |
2660 | shift @defaults; | |
2661 | ||
2662 | # Some of these lines may have empty field placeholders | |
2663 | # which get in the way. An example is: | |
2664 | # @missing: 0000..10FFFF; ; NaN | |
2665 | # Remove them. Process starting from the top so the | |
2666 | # splice doesn't affect things still to be looked at. | |
2667 | for (my $i = @defaults - 1; $i >= 0; $i--) { | |
2668 | next if $defaults[$i] ne ""; | |
2669 | splice @defaults, $i, 1; | |
2670 | } | |
2671 | ||
2672 | # What's left should be just the property (maybe) and the | |
2673 | # default. Having only one element means it doesn't have | |
2674 | # the property. | |
2675 | my $default; | |
2676 | my $property; | |
2677 | if (@defaults >= 1) { | |
2678 | if (@defaults == 1) { | |
2679 | $default = $defaults[0]; | |
2680 | } | |
2681 | else { | |
2682 | $property = $defaults[0]; | |
2683 | $default = $defaults[1]; | |
2684 | } | |
2685 | } | |
2686 | ||
2687 | if (@defaults < 1 | |
2688 | || @defaults > 2 | |
2689 | || ($default =~ /^</ | |
2690 | && $default !~ /^<code *point>$/i | |
09f8d0ac KW |
2691 | && $default !~ /^<none>$/i |
2692 | && $default !~ /^<script>$/i)) | |
99870f4d KW |
2693 | { |
2694 | $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries"); | |
2695 | } | |
2696 | else { | |
2697 | ||
2698 | # If the property is missing from the line, it should | |
2699 | # be the one for the whole file | |
2700 | $property = $property{$addr} if ! defined $property; | |
2701 | ||
2702 | # Change <none> to the null string, which is what it | |
2703 | # really means. If the default is the code point | |
2704 | # itself, set it to <code point>, which is what | |
2705 | # Unicode uses (but sometimes they've forgotten the | |
2706 | # space) | |
2707 | if ($default =~ /^<none>$/i) { | |
2708 | $default = ""; | |
2709 | } | |
2710 | elsif ($default =~ /^<code *point>$/i) { | |
2711 | $default = $CODE_POINT; | |
2712 | } | |
09f8d0ac KW |
2713 | elsif ($default =~ /^<script>$/i) { |
2714 | ||
2715 | # Special case this one. Currently is from | |
2716 | # ScriptExtensions.txt, and means for all unlisted | |
2717 | # code points, use their Script property values. | |
2718 | # For the code points not listed in that file, the | |
2719 | # default value is 'Unknown'. | |
2720 | $default = "Unknown"; | |
2721 | } | |
99870f4d KW |
2722 | |
2723 | # Store them as a sub-arrays with both components. | |
2724 | push @{$missings{$addr}}, [ $default, $property ]; | |
2725 | } | |
2726 | } | |
2727 | ||
2728 | # There is nothing for the caller to process on this comment | |
2729 | # line. | |
2730 | next; | |
2731 | } | |
2732 | ||
2733 | # Remove comments and trailing space, and skip this line if the | |
2734 | # result is empty | |
2735 | s/#.*//; | |
2736 | s/\s+$//; | |
2737 | next if /^$/; | |
2738 | ||
2739 | # Call any handlers for this line, and skip further processing of | |
2740 | # the line if the handler sets the line to null. | |
2741 | foreach my $sub_ref (@{$each_line_handler{$addr}}) { | |
2742 | &{$sub_ref}($self); | |
2743 | next LINE if /^$/; | |
2744 | } | |
2745 | ||
2746 | # Here the line is ok. return success. | |
2747 | return 1; | |
2748 | } # End of looping through lines. | |
2749 | ||
2750 | # If there is an EOF handler, call it (only once) and if it generates | |
2751 | # more lines to process go back in the loop to handle them. | |
2752 | if ($eof_handler{$addr}) { | |
2753 | &{$eof_handler{$addr}}($self); | |
2754 | $eof_handler{$addr} = ""; # Currently only get one shot at it. | |
2755 | goto LINE if $added_lines{$addr}; | |
2756 | } | |
2757 | ||
2758 | # Return failure -- no more lines. | |
2759 | return 0; | |
2760 | ||
2761 | } | |
2762 | ||
74cd47d0 KW |
2763 | sub _next_line_with_remapped_range { |
2764 | my $self = shift; | |
2765 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
2766 | ||
2767 | # like _next_line(), but for use on non-ASCII platforms. It sets $_ | |
2768 | # to be the next logical input line, if any. Returns non-zero if such | |
2769 | # a line exists. 'logical' means that any lines that have been added | |
2770 | # via insert_lines() will be returned in $_ before the file is read | |
2771 | # again. | |
2772 | # | |
2773 | # The difference from _next_line() is that this remaps the Unicode | |
2774 | # code points in the input to those of the native platform. Each | |
2775 | # input line contains a single code point, or a single contiguous | |
2776 | # range of them This routine splits each range into its individual | |
2777 | # code points and caches them. It returns the cached values, | |
2778 | # translated into their native equivalents, one at a time, for each | |
2779 | # call, before reading the next line. Since native values can only be | |
2780 | # a single byte wide, no translation is needed for code points above | |
2781 | # 0xFF, and ranges that are entirely above that number are not split. | |
2782 | # If an input line contains the range 254-1000, it would be split into | |
2783 | # three elements: 254, 255, and 256-1000. (The downstream table | |
2784 | # insertion code will sort and coalesce the individual code points | |
2785 | # into appropriate ranges.) | |
2786 | ||
2787 | my $addr = do { no overloading; pack 'J', $self; }; | |
2788 | ||
2789 | while (1) { | |
2790 | ||
2791 | # Look in cache before reading the next line. Return any cached | |
2792 | # value, translated | |
2793 | my $inserted = shift @{$remapped_lines{$addr}}; | |
2794 | if (defined $inserted) { | |
2795 | trace $inserted if main::DEBUG && $to_trace; | |
2796 | $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer; | |
2797 | trace $_ if main::DEBUG && $to_trace; | |
2798 | return 1; | |
2799 | } | |
2800 | ||
2801 | # Get the next line. | |
2802 | return 0 unless _next_line($self); | |
2803 | ||
2804 | # If there is a special handler for it, return the line, | |
2805 | # untranslated. This should happen only for files that are | |
2806 | # special, not being code-point related, such as property names. | |
2807 | return 1 if $handler{$addr} | |
2808 | != \&main::process_generic_property_file; | |
2809 | ||
2810 | my ($range, $property_name, $map, @remainder) | |
2811 | = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields | |
2812 | ||
2813 | if (@remainder | |
2814 | || ! defined $property_name | |
2815 | || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) | |
2816 | { | |
2817 | Carp::my_carp_bug("Unrecognized input line '$_'. Ignored"); | |
2818 | } | |
2819 | ||
2820 | my $low = hex $1; | |
2821 | my $high = (defined $2) ? hex $2 : $low; | |
2822 | ||
2823 | # If the input maps the range to another code point, remap the | |
2824 | # target if it is between 0 and 255. | |
2825 | my $tail; | |
2826 | if (defined $map) { | |
2827 | $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe; | |
2828 | $tail = "$property_name; $map"; | |
2829 | $_ = "$range; $tail"; | |
2830 | } | |
2831 | else { | |
2832 | $tail = $property_name; | |
2833 | } | |
2834 | ||
2835 | # If entire range is above 255, just return it, unchanged (except | |
2836 | # any mapped-to code point, already changed above) | |
2837 | return 1 if $low > 255; | |
2838 | ||
2839 | # Cache an entry for every code point < 255. For those in the | |
2840 | # range above 255, return a dummy entry for just that portion of | |
2841 | # the range. Note that this will be out-of-order, but that is not | |
2842 | # a problem. | |
2843 | foreach my $code_point ($low .. $high) { | |
2844 | if ($code_point > 255) { | |
2845 | $_ = sprintf "%04X..%04X; $tail", $code_point, $high; | |
2846 | return 1; | |
2847 | } | |
2848 | push @{$remapped_lines{$addr}}, "$code_point; $tail"; | |
2849 | } | |
2850 | } # End of looping through lines. | |
2851 | ||
2852 | # NOTREACHED | |
2853 | } | |
2854 | ||
99870f4d KW |
2855 | # Not currently used, not fully tested. |
2856 | # sub peek { | |
2857 | # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank | |
2858 | # # record. Not callable from an each_line_handler(), nor does it call | |
2859 | # # an each_line_handler() on the line. | |
2860 | # | |
2861 | # my $self = shift; | |
ffe43484 | 2862 | # my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
2863 | # |
2864 | # foreach my $inserted_ref (@{$added_lines{$addr}}) { | |
2865 | # my ($adjusted, $line) = @{$inserted_ref}; | |
2866 | # next if $adjusted; | |
2867 | # | |
2868 | # # Remove comments and trailing space, and return a non-empty | |
2869 | # # resulting line | |
2870 | # $line =~ s/#.*//; | |
2871 | # $line =~ s/\s+$//; | |
2872 | # return $line if $line ne ""; | |
2873 | # } | |
2874 | # | |
2875 | # return if ! ref $handle{$addr}; # Don't read unless is real file | |
2876 | # while (1) { # Loop until find non-comment, non-empty line | |
2877 | # local $to_trace = 1 if main::DEBUG; | |
2878 | # trace $_ if main::DEBUG && $to_trace; | |
2879 | # return if ! defined (my $line = readline $handle{$addr}); | |
2880 | # chomp $line; | |
2881 | # push @{$added_lines{$addr}}, [ 0, $line ]; | |
2882 | # | |
2883 | # $line =~ s/#.*//; | |
2884 | # $line =~ s/\s+$//; | |
2885 | # return $line if $line ne ""; | |
2886 | # } | |
2887 | # | |
2888 | # return; | |
2889 | # } | |
2890 | ||
2891 | ||
2892 | sub insert_lines { | |
2893 | # Lines can be inserted so that it looks like they were in the input | |
2894 | # file at the place it was when this routine is called. See also | |
2895 | # insert_adjusted_lines(). Lines inserted via this routine go through | |
2896 | # any each_line_handler() | |
2897 | ||
2898 | my $self = shift; | |
2899 | ||
2900 | # Each inserted line is an array, with the first element being 0 to | |
2901 | # indicate that this line hasn't been adjusted, and needs to be | |
2902 | # processed. | |
f998e60c | 2903 | no overloading; |
051df77b | 2904 | push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_; |
99870f4d KW |
2905 | return; |
2906 | } | |
2907 | ||
2908 | sub insert_adjusted_lines { | |
2909 | # Lines can be inserted so that it looks like they were in the input | |
2910 | # file at the place it was when this routine is called. See also | |
2911 | # insert_lines(). Lines inserted via this routine are already fully | |
2912 | # adjusted, ready to be processed; each_line_handler()s handlers will | |
2913 | # not be called. This means this is not a completely general | |
2914 | # facility, as only the last each_line_handler on the stack should | |
2915 | # call this. It could be made more general, by passing to each of the | |
2916 | # line_handlers their position on the stack, which they would pass on | |
2917 | # to this routine, and that would replace the boolean first element in | |
2918 | # the anonymous array pushed here, so that the next_line routine could | |
2919 | # use that to call only those handlers whose index is after it on the | |
2920 | # stack. But this is overkill for what is needed now. | |
2921 | ||
2922 | my $self = shift; | |
2923 | trace $_[0] if main::DEBUG && $to_trace; | |
2924 | ||
2925 | # Each inserted line is an array, with the first element being 1 to | |
2926 | # indicate that this line has been adjusted | |
f998e60c | 2927 | no overloading; |
051df77b | 2928 | push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_; |
99870f4d KW |
2929 | return; |
2930 | } | |
2931 | ||
2932 | sub get_missings { | |
2933 | # Returns the stored up @missings lines' values, and clears the list. | |
2934 | # The values are in an array, consisting of the default in the first | |
2935 | # element, and the property in the 2nd. However, since these lines | |
2936 | # can be stacked up, the return is an array of all these arrays. | |
2937 | ||
2938 | my $self = shift; | |
2939 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
2940 | ||
ffe43484 | 2941 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
2942 | |
2943 | # If not accepting a list return, just return the first one. | |
2944 | return shift @{$missings{$addr}} unless wantarray; | |
2945 | ||
2946 | my @return = @{$missings{$addr}}; | |
2947 | undef @{$missings{$addr}}; | |
2948 | return @return; | |
2949 | } | |
2950 | ||
2951 | sub _insert_property_into_line { | |
2952 | # Add a property field to $_, if this file requires it. | |
2953 | ||
f998e60c | 2954 | my $self = shift; |
ffe43484 | 2955 | my $addr = do { no overloading; pack 'J', $self; }; |
f998e60c | 2956 | my $property = $property{$addr}; |
99870f4d KW |
2957 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; |
2958 | ||
2959 | $_ =~ s/(;|$)/; $property$1/; | |
2960 | return; | |
2961 | } | |
2962 | ||
2963 | sub carp_bad_line { | |
2964 | # Output consistent error messages, using either a generic one, or the | |
2965 | # one given by the optional parameter. To avoid gazillions of the | |
2966 | # same message in case the syntax of a file is way off, this routine | |
2967 | # only outputs the first instance of each message, incrementing a | |
2968 | # count so the totals can be output at the end of the file. | |
2969 | ||
2970 | my $self = shift; | |
2971 | my $message = shift; | |
2972 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
2973 | ||
ffe43484 | 2974 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
2975 | |
2976 | $message = 'Unexpected line' unless $message; | |
2977 | ||
2978 | # No trailing punctuation so as to fit with our addenda. | |
2979 | $message =~ s/[.:;,]$//; | |
2980 | ||
2981 | # If haven't seen this exact message before, output it now. Otherwise | |
2982 | # increment the count of how many times it has occurred | |
2983 | unless ($errors{$addr}->{$message}) { | |
2984 | Carp::my_carp("$message in '$_' in " | |
f998e60c | 2985 | . $file{$addr} |
99870f4d KW |
2986 | . " at line $.. Skipping this line;"); |
2987 | $errors{$addr}->{$message} = 1; | |
2988 | } | |
2989 | else { | |
2990 | $errors{$addr}->{$message}++; | |
2991 | } | |
2992 | ||
2993 | # Clear the line to prevent any further (meaningful) processing of it. | |
2994 | $_ = ""; | |
2995 | ||
2996 | return; | |
2997 | } | |
2998 | } # End closure | |
2999 | ||
3000 | package Multi_Default; | |
3001 | ||
3002 | # Certain properties in early versions of Unicode had more than one possible | |
3003 | # default for code points missing from the files. In these cases, one | |
3004 | # default applies to everything left over after all the others are applied, | |
3005 | # and for each of the others, there is a description of which class of code | |
3006 | # points applies to it. This object helps implement this by storing the | |
3007 | # defaults, and for all but that final default, an eval string that generates | |
3008 | # the class that it applies to. | |
3009 | ||
3010 | ||
3011 | { # Closure | |
3012 | ||
3013 | main::setup_package(); | |
3014 | ||
3015 | my %class_defaults; | |
3016 | # The defaults structure for the classes | |
3017 | main::set_access('class_defaults', \%class_defaults); | |
3018 | ||
3019 | my %other_default; | |
3020 | # The default that applies to everything left over. | |
3021 | main::set_access('other_default', \%other_default, 'r'); | |
3022 | ||
3023 | ||
3024 | sub new { | |
3025 | # The constructor is called with default => eval pairs, terminated by | |
3026 | # the left-over default. e.g. | |
3027 | # Multi_Default->new( | |
3028 | # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C | |
3029 | # - 0x200D', | |
3030 | # 'R' => 'some other expression that evaluates to code points', | |
3031 | # . | |
3032 | # . | |
3033 | # . | |
3034 | # 'U')); | |
3035 | ||
3036 | my $class = shift; | |
3037 | ||
3038 | my $self = bless \do{my $anonymous_scalar}, $class; | |
ffe43484 | 3039 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3040 | |
3041 | while (@_ > 1) { | |
3042 | my $default = shift; | |
3043 | my $eval = shift; | |
3044 | $class_defaults{$addr}->{$default} = $eval; | |
3045 | } | |
3046 | ||
3047 | $other_default{$addr} = shift; | |
3048 | ||
3049 | return $self; | |
3050 | } | |
3051 | ||
3052 | sub get_next_defaults { | |
3053 | # Iterates and returns the next class of defaults. | |
3054 | my $self = shift; | |
3055 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3056 | ||
ffe43484 | 3057 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3058 | |
3059 | return each %{$class_defaults{$addr}}; | |
3060 | } | |
3061 | } | |
3062 | ||
3063 | package Alias; | |
3064 | ||
3065 | # An alias is one of the names that a table goes by. This class defines them | |
3066 | # including some attributes. Everything is currently setup in the | |
3067 | # constructor. | |
3068 | ||
3069 | ||
3070 | { # Closure | |
3071 | ||
3072 | main::setup_package(); | |
3073 | ||
3074 | my %name; | |
3075 | main::set_access('name', \%name, 'r'); | |
3076 | ||
3077 | my %loose_match; | |
c12f2655 | 3078 | # Should this name match loosely or not. |
99870f4d KW |
3079 | main::set_access('loose_match', \%loose_match, 'r'); |
3080 | ||
33e96e72 KW |
3081 | my %make_re_pod_entry; |
3082 | # Some aliases should not get their own entries in the re section of the | |
3083 | # pod, because they are covered by a wild-card, and some we want to | |
3084 | # discourage use of. Binary | |
f82fe4ba | 3085 | main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's'); |
99870f4d | 3086 | |
fd1e3e84 KW |
3087 | my %ucd; |
3088 | # Is this documented to be accessible via Unicode::UCD | |
3089 | main::set_access('ucd', \%ucd, 'r', 's'); | |
3090 | ||
99870f4d KW |
3091 | my %status; |
3092 | # Aliases have a status, like deprecated, or even suppressed (which means | |
3093 | # they don't appear in documentation). Enum | |
3094 | main::set_access('status', \%status, 'r'); | |
3095 | ||
0eac1e20 | 3096 | my %ok_as_filename; |
99870f4d KW |
3097 | # Similarly, some aliases should not be considered as usable ones for |
3098 | # external use, such as file names, or we don't want documentation to | |
3099 | # recommend them. Boolean | |
0eac1e20 | 3100 | main::set_access('ok_as_filename', \%ok_as_filename, 'r'); |
99870f4d KW |
3101 | |
3102 | sub new { | |
3103 | my $class = shift; | |
3104 | ||
3105 | my $self = bless \do { my $anonymous_scalar }, $class; | |
ffe43484 | 3106 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3107 | |
3108 | $name{$addr} = shift; | |
3109 | $loose_match{$addr} = shift; | |
33e96e72 | 3110 | $make_re_pod_entry{$addr} = shift; |
0eac1e20 | 3111 | $ok_as_filename{$addr} = shift; |
99870f4d | 3112 | $status{$addr} = shift; |
fd1e3e84 | 3113 | $ucd{$addr} = shift; |
99870f4d KW |
3114 | |
3115 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3116 | ||
3117 | # Null names are never ok externally | |
0eac1e20 | 3118 | $ok_as_filename{$addr} = 0 if $name{$addr} eq ""; |
99870f4d KW |
3119 | |
3120 | return $self; | |
3121 | } | |
3122 | } | |
3123 | ||
3124 | package Range; | |
3125 | ||
3126 | # A range is the basic unit for storing code points, and is described in the | |
3127 | # comments at the beginning of the program. Each range has a starting code | |
3128 | # point; an ending code point (not less than the starting one); a value | |
3129 | # that applies to every code point in between the two end-points, inclusive; | |
3130 | # and an enum type that applies to the value. The type is for the user's | |
3131 | # convenience, and has no meaning here, except that a non-zero type is | |
3132 | # considered to not obey the normal Unicode rules for having standard forms. | |
3133 | # | |
3134 | # The same structure is used for both map and match tables, even though in the | |
3135 | # latter, the value (and hence type) is irrelevant and could be used as a | |
3136 | # comment. In map tables, the value is what all the code points in the range | |
3137 | # map to. Type 0 values have the standardized version of the value stored as | |
3138 | # well, so as to not have to recalculate it a lot. | |
3139 | ||
3140 | sub trace { return main::trace(@_); } | |
3141 | ||
3142 | { # Closure | |
3143 | ||
3144 | main::setup_package(); | |
3145 | ||
3146 | my %start; | |
3147 | main::set_access('start', \%start, 'r', 's'); | |
3148 | ||
3149 | my %end; | |
3150 | main::set_access('end', \%end, 'r', 's'); | |
3151 | ||
3152 | my %value; | |
3153 | main::set_access('value', \%value, 'r'); | |
3154 | ||
3155 | my %type; | |
3156 | main::set_access('type', \%type, 'r'); | |
3157 | ||
3158 | my %standard_form; | |
3159 | # The value in internal standard form. Defined only if the type is 0. | |
3160 | main::set_access('standard_form', \%standard_form); | |
3161 | ||
3162 | # Note that if these fields change, the dump() method should as well | |
3163 | ||
3164 | sub new { | |
3165 | return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; | |
3166 | my $class = shift; | |
3167 | ||
3168 | my $self = bless \do { my $anonymous_scalar }, $class; | |
ffe43484 | 3169 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3170 | |
3171 | $start{$addr} = shift; | |
3172 | $end{$addr} = shift; | |
3173 | ||
3174 | my %args = @_; | |
3175 | ||
3176 | my $value = delete $args{'Value'}; # Can be 0 | |
3177 | $value = "" unless defined $value; | |
3178 | $value{$addr} = $value; | |
3179 | ||
3180 | $type{$addr} = delete $args{'Type'} || 0; | |
3181 | ||
3182 | Carp::carp_extra_args(\%args) if main::DEBUG && %args; | |
3183 | ||
99870f4d KW |
3184 | return $self; |
3185 | } | |
3186 | ||
3187 | use overload | |
3188 | fallback => 0, | |
3189 | qw("") => "_operator_stringify", | |
3190 | "." => \&main::_operator_dot, | |
1285127e | 3191 | ".=" => \&main::_operator_dot_equal, |
99870f4d KW |
3192 | ; |
3193 | ||
3194 | sub _operator_stringify { | |
3195 | my $self = shift; | |
ffe43484 | 3196 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3197 | |
3198 | # Output it like '0041..0065 (value)' | |
3199 | my $return = sprintf("%04X", $start{$addr}) | |
3200 | . '..' | |
3201 | . sprintf("%04X", $end{$addr}); | |
3202 | my $value = $value{$addr}; | |
3203 | my $type = $type{$addr}; | |
3204 | $return .= ' ('; | |
3205 | $return .= "$value"; | |
3206 | $return .= ", Type=$type" if $type != 0; | |
3207 | $return .= ')'; | |
3208 | ||
3209 | return $return; | |
3210 | } | |
3211 | ||
3212 | sub standard_form { | |
c292d35a NC |
3213 | # Calculate the standard form only if needed, and cache the result. |
3214 | # The standard form is the value itself if the type is special. | |
3215 | # This represents a considerable CPU and memory saving - at the time | |
3216 | # of writing there are 368676 non-special objects, but the standard | |
3217 | # form is only requested for 22047 of them - ie about 6%. | |
99870f4d KW |
3218 | |
3219 | my $self = shift; | |
3220 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3221 | ||
ffe43484 | 3222 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3223 | |
3224 | return $standard_form{$addr} if defined $standard_form{$addr}; | |
c292d35a NC |
3225 | |
3226 | my $value = $value{$addr}; | |
3227 | return $value if $type{$addr}; | |
3228 | return $standard_form{$addr} = main::standardize($value); | |
99870f4d KW |
3229 | } |
3230 | ||
3231 | sub dump { | |
3232 | # Human, not machine readable. For machine readable, comment out this | |
3233 | # entire routine and let the standard one take effect. | |
3234 | my $self = shift; | |
3235 | my $indent = shift; | |
3236 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3237 | ||
ffe43484 | 3238 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3239 | |
3240 | my $return = $indent | |
3241 | . sprintf("%04X", $start{$addr}) | |
3242 | . '..' | |
3243 | . sprintf("%04X", $end{$addr}) | |
3244 | . " '$value{$addr}';"; | |
3245 | if (! defined $standard_form{$addr}) { | |
3246 | $return .= "(type=$type{$addr})"; | |
3247 | } | |
3248 | elsif ($standard_form{$addr} ne $value{$addr}) { | |
3249 | $return .= "(standard '$standard_form{$addr}')"; | |
3250 | } | |
3251 | return $return; | |
3252 | } | |
3253 | } # End closure | |
3254 | ||
3255 | package _Range_List_Base; | |
3256 | ||
3257 | # Base class for range lists. A range list is simply an ordered list of | |
3258 | # ranges, so that the ranges with the lowest starting numbers are first in it. | |
3259 | # | |
3260 | # When a new range is added that is adjacent to an existing range that has the | |
3261 | # same value and type, it merges with it to form a larger range. | |
3262 | # | |
3263 | # Ranges generally do not overlap, except that there can be multiple entries | |
3264 | # of single code point ranges. This is because of NameAliases.txt. | |
3265 | # | |
3266 | # In this program, there is a standard value such that if two different | |
3267 | # values, have the same standard value, they are considered equivalent. This | |
3268 | # value was chosen so that it gives correct results on Unicode data | |
3269 | ||
3270 | # There are a number of methods to manipulate range lists, and some operators | |
3271 | # are overloaded to handle them. | |
3272 | ||
99870f4d KW |
3273 | sub trace { return main::trace(@_); } |
3274 | ||
3275 | { # Closure | |
3276 | ||
3277 | our $addr; | |
3278 | ||
5b348b71 KW |
3279 | # Max is initialized to a negative value that isn't adjacent to 0, for |
3280 | # simpler tests | |
3281 | my $max_init = -2; | |
3282 | ||
99870f4d KW |
3283 | main::setup_package(); |
3284 | ||
3285 | my %ranges; | |
3286 | # The list of ranges | |
3287 | main::set_access('ranges', \%ranges, 'readable_array'); | |
3288 | ||
3289 | my %max; | |
3290 | # The highest code point in the list. This was originally a method, but | |
3291 | # actual measurements said it was used a lot. | |
3292 | main::set_access('max', \%max, 'r'); | |
3293 | ||
3294 | my %each_range_iterator; | |
3295 | # Iterator position for each_range() | |
3296 | main::set_access('each_range_iterator', \%each_range_iterator); | |
3297 | ||
3298 | my %owner_name_of; | |
3299 | # Name of parent this is attached to, if any. Solely for better error | |
3300 | # messages. | |
3301 | main::set_access('owner_name_of', \%owner_name_of, 'p_r'); | |
3302 | ||
3303 | my %_search_ranges_cache; | |
3304 | # A cache of the previous result from _search_ranges(), for better | |
3305 | # performance | |
3306 | main::set_access('_search_ranges_cache', \%_search_ranges_cache); | |
3307 | ||
3308 | sub new { | |
3309 | my $class = shift; | |
3310 | my %args = @_; | |
3311 | ||
3312 | # Optional initialization data for the range list. | |
3313 | my $initialize = delete $args{'Initialize'}; | |
3314 | ||
3315 | my $self; | |
3316 | ||
3317 | # Use _union() to initialize. _union() returns an object of this | |
3318 | # class, which means that it will call this constructor recursively. | |
3319 | # But it won't have this $initialize parameter so that it won't | |
3320 | # infinitely loop on this. | |
3321 | return _union($class, $initialize, %args) if defined $initialize; | |
3322 | ||
3323 | $self = bless \do { my $anonymous_scalar }, $class; | |
ffe43484 | 3324 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3325 | |
3326 | # Optional parent object, only for debug info. | |
3327 | $owner_name_of{$addr} = delete $args{'Owner'}; | |
3328 | $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr}; | |
3329 | ||
3330 | # Stringify, in case it is an object. | |
3331 | $owner_name_of{$addr} = "$owner_name_of{$addr}"; | |
3332 | ||
3333 | # This is used only for error messages, and so a colon is added | |
3334 | $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne ""; | |
3335 | ||
3336 | Carp::carp_extra_args(\%args) if main::DEBUG && %args; | |
3337 | ||
5b348b71 | 3338 | $max{$addr} = $max_init; |
99870f4d KW |
3339 | |
3340 | $_search_ranges_cache{$addr} = 0; | |
3341 | $ranges{$addr} = []; | |
3342 | ||
3343 | return $self; | |
3344 | } | |
3345 | ||
3346 | use overload | |
3347 | fallback => 0, | |
3348 | qw("") => "_operator_stringify", | |
3349 | "." => \&main::_operator_dot, | |
1285127e | 3350 | ".=" => \&main::_operator_dot_equal, |
99870f4d KW |
3351 | ; |
3352 | ||
3353 | sub _operator_stringify { | |
3354 | my $self = shift; | |
ffe43484 | 3355 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3356 | |
3357 | return "Range_List attached to '$owner_name_of{$addr}'" | |
3358 | if $owner_name_of{$addr}; | |
3359 | return "anonymous Range_List " . \$self; | |
3360 | } | |
3361 | ||
3362 | sub _union { | |
3363 | # Returns the union of the input code points. It can be called as | |
3364 | # either a constructor or a method. If called as a method, the result | |
3365 | # will be a new() instance of the calling object, containing the union | |
3366 | # of that object with the other parameter's code points; if called as | |
d59563d0 | 3367 | # a constructor, the first parameter gives the class that the new object |
99870f4d KW |
3368 | # should be, and the second parameter gives the code points to go into |
3369 | # it. | |
3370 | # In either case, there are two parameters looked at by this routine; | |
3371 | # any additional parameters are passed to the new() constructor. | |
3372 | # | |
3373 | # The code points can come in the form of some object that contains | |
3374 | # ranges, and has a conventionally named method to access them; or | |
3375 | # they can be an array of individual code points (as integers); or | |
3376 | # just a single code point. | |
3377 | # | |
3378 | # If they are ranges, this routine doesn't make any effort to preserve | |
3198cc57 KW |
3379 | # the range values and types of one input over the other. Therefore |
3380 | # this base class should not allow _union to be called from other than | |
99870f4d KW |
3381 | # initialization code, so as to prevent two tables from being added |
3382 | # together where the range values matter. The general form of this | |
3383 | # routine therefore belongs in a derived class, but it was moved here | |
3384 | # to avoid duplication of code. The failure to overload this in this | |
3385 | # class keeps it safe. | |
3198cc57 KW |
3386 | # |
3387 | # It does make the effort during initialization to accept tables with | |
3388 | # multiple values for the same code point, and to preserve the order | |
3389 | # of these. If there is only one input range or range set, it doesn't | |
3390 | # sort (as it should already be sorted to the desired order), and will | |
3391 | # accept multiple values per code point. Otherwise it will merge | |
3392 | # multiple values into a single one. | |
99870f4d KW |
3393 | |
3394 | my $self; | |
3395 | my @args; # Arguments to pass to the constructor | |
3396 | ||
3397 | my $class = shift; | |
3398 | ||
3399 | # If a method call, will start the union with the object itself, and | |
3400 | # the class of the new object will be the same as self. | |
3401 | if (ref $class) { | |
3402 | $self = $class; | |
3403 | $class = ref $self; | |
3404 | push @args, $self; | |
3405 | } | |
3406 | ||
3407 | # Add the other required parameter. | |
3408 | push @args, shift; | |
3409 | # Rest of parameters are passed on to the constructor | |
3410 | ||
3411 | # Accumulate all records from both lists. | |
3412 | my @records; | |
3198cc57 | 3413 | my $input_count = 0; |
99870f4d KW |
3414 | for my $arg (@args) { |
3415 | #local $to_trace = 0 if main::DEBUG; | |
3416 | trace "argument = $arg" if main::DEBUG && $to_trace; | |
3417 | if (! defined $arg) { | |
3418 | my $message = ""; | |
3419 | if (defined $self) { | |
f998e60c | 3420 | no overloading; |
051df77b | 3421 | $message .= $owner_name_of{pack 'J', $self}; |
99870f4d | 3422 | } |
ada6088e | 3423 | Carp::my_carp_bug($message . "Undefined argument to _union. No union done."); |
99870f4d KW |
3424 | return; |
3425 | } | |
3198cc57 | 3426 | |
99870f4d KW |
3427 | $arg = [ $arg ] if ! ref $arg; |
3428 | my $type = ref $arg; | |
3429 | if ($type eq 'ARRAY') { | |
3430 | foreach my $element (@$arg) { | |
3431 | push @records, Range->new($element, $element); | |
3198cc57 | 3432 | $input_count++; |
99870f4d KW |
3433 | } |
3434 | } | |
3435 | elsif ($arg->isa('Range')) { | |
3436 | push @records, $arg; | |
3198cc57 | 3437 | $input_count++; |
99870f4d KW |
3438 | } |
3439 | elsif ($arg->can('ranges')) { | |
3440 | push @records, $arg->ranges; | |
3198cc57 | 3441 | $input_count++; |
99870f4d KW |
3442 | } |
3443 | else { | |
3444 | my $message = ""; | |
3445 | if (defined $self) { | |
f998e60c | 3446 | no overloading; |
051df77b | 3447 | $message .= $owner_name_of{pack 'J', $self}; |
99870f4d KW |
3448 | } |
3449 | Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); | |
3450 | return; | |
3451 | } | |
3452 | } | |
3453 | ||
3454 | # Sort with the range containing the lowest ordinal first, but if | |
3455 | # two ranges start at the same code point, sort with the bigger range | |
3456 | # of the two first, because it takes fewer cycles. | |
3198cc57 KW |
3457 | if ($input_count > 1) { |
3458 | @records = sort { ($a->start <=> $b->start) | |
99870f4d KW |
3459 | or |
3460 | # if b is shorter than a, b->end will be | |
3461 | # less than a->end, and we want to select | |
3462 | # a, so want to return -1 | |
3463 | ($b->end <=> $a->end) | |
3464 | } @records; | |
3198cc57 | 3465 | } |
99870f4d KW |
3466 | |
3467 | my $new = $class->new(@_); | |
3468 | ||
3469 | # Fold in records so long as they add new information. | |
3470 | for my $set (@records) { | |
3471 | my $start = $set->start; | |
3472 | my $end = $set->end; | |
d59563d0 | 3473 | my $value = $set->value; |
3198cc57 | 3474 | my $type = $set->type; |
99870f4d | 3475 | if ($start > $new->max) { |
3198cc57 | 3476 | $new->_add_delete('+', $start, $end, $value, Type => $type); |
99870f4d KW |
3477 | } |
3478 | elsif ($end > $new->max) { | |
3198cc57 KW |
3479 | $new->_add_delete('+', $new->max +1, $end, $value, |
3480 | Type => $type); | |
3481 | } | |
3482 | elsif ($input_count == 1) { | |
3483 | # Here, overlaps existing range, but is from a single input, | |
3484 | # so preserve the multiple values from that input. | |
3485 | $new->_add_delete('+', $start, $end, $value, Type => $type, | |
3486 | Replace => $MULTIPLE_AFTER); | |
99870f4d KW |
3487 | } |
3488 | } | |
3489 | ||
3490 | return $new; | |
3491 | } | |
3492 | ||
3493 | sub range_count { # Return the number of ranges in the range list | |
3494 | my $self = shift; | |
3495 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3496 | ||
f998e60c | 3497 | no overloading; |
051df77b | 3498 | return scalar @{$ranges{pack 'J', $self}}; |
99870f4d KW |
3499 | } |
3500 | ||
3501 | sub min { | |
3502 | # Returns the minimum code point currently in the range list, or if | |
3503 | # the range list is empty, 2 beyond the max possible. This is a | |
3504 | # method because used so rarely, that not worth saving between calls, | |
3505 | # and having to worry about changing it as ranges are added and | |
3506 | # deleted. | |
3507 | ||
3508 | my $self = shift; | |
3509 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3510 | ||
ffe43484 | 3511 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3512 | |
3513 | # If the range list is empty, return a large value that isn't adjacent | |
3514 | # to any that could be in the range list, for simpler tests | |
2d88a86a | 3515 | return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; |
99870f4d KW |
3516 | return $ranges{$addr}->[0]->start; |
3517 | } | |
3518 | ||
3519 | sub contains { | |
3520 | # Boolean: Is argument in the range list? If so returns $i such that: | |
3521 | # range[$i]->end < $codepoint <= range[$i+1]->end | |
3522 | # which is one beyond what you want; this is so that the 0th range | |
3523 | # doesn't return false | |
3524 | my $self = shift; | |
3525 | my $codepoint = shift; | |
3526 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3527 | ||
99870f4d KW |
3528 | my $i = $self->_search_ranges($codepoint); |
3529 | return 0 unless defined $i; | |
3530 | ||
3531 | # The search returns $i, such that | |
3532 | # range[$i-1]->end < $codepoint <= range[$i]->end | |
3533 | # So is in the table if and only iff it is at least the start position | |
3534 | # of range $i. | |
f998e60c | 3535 | no overloading; |
051df77b | 3536 | return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint; |
99870f4d KW |
3537 | return $i + 1; |
3538 | } | |
3539 | ||
2f7a8815 KW |
3540 | sub containing_range { |
3541 | # Returns the range object that contains the code point, undef if none | |
3542 | ||
3543 | my $self = shift; | |
3544 | my $codepoint = shift; | |
3545 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3546 | ||
3547 | my $i = $self->contains($codepoint); | |
3548 | return unless $i; | |
3549 | ||
3550 | # contains() returns 1 beyond where we should look | |
3551 | no overloading; | |
3552 | return $ranges{pack 'J', $self}->[$i-1]; | |
3553 | } | |
3554 | ||
99870f4d KW |
3555 | sub value_of { |
3556 | # Returns the value associated with the code point, undef if none | |
3557 | ||
3558 | my $self = shift; | |
3559 | my $codepoint = shift; | |
3560 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3561 | ||
d69c231b KW |
3562 | my $range = $self->containing_range($codepoint); |
3563 | return unless defined $range; | |
99870f4d | 3564 | |
d69c231b | 3565 | return $range->value; |
99870f4d KW |
3566 | } |
3567 | ||
0a9dbafc KW |
3568 | sub type_of { |
3569 | # Returns the type of the range containing the code point, undef if | |
3570 | # the code point is not in the table | |
3571 | ||
3572 | my $self = shift; | |
3573 | my $codepoint = shift; | |
3574 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3575 | ||
3576 | my $range = $self->containing_range($codepoint); | |
3577 | return unless defined $range; | |
3578 | ||
3579 | return $range->type; | |
3580 | } | |
3581 | ||
99870f4d KW |
3582 | sub _search_ranges { |
3583 | # Find the range in the list which contains a code point, or where it | |
3584 | # should go if were to add it. That is, it returns $i, such that: | |
3585 | # range[$i-1]->end < $codepoint <= range[$i]->end | |
3586 | # Returns undef if no such $i is possible (e.g. at end of table), or | |
3587 | # if there is an error. | |
3588 | ||
3589 | my $self = shift; | |
3590 | my $code_point = shift; | |
3591 | Carp::carp_extra_args(\@_) if main::DEBUG && @_; | |
3592 | ||
ffe43484 | 3593 | my $addr = do { no overloading; pack 'J', $self; }; |
99870f4d KW |
3594 | |
3595 | return if $code_point > $max{$addr}; | |
3596 | my $r = $ranges{$addr}; # The current list of ranges | |
3597 | my $range_list_size = scalar @$r; | |
3598 | my $i; | |
3599 | ||
3600 | use integer; # want integer division | |
3601 | ||
3602 | # Use the cached result as the starting guess for this one, because, | |
3603 | # an experiment on 5.1 showed that 90% of the time the cache was the | |
3604 | # same as the result on the next call (and 7% it was one less). | |
3605 | $i = $_search_ranges_cache{$addr}; | |
3606 | $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob. | |
3607 | # from an intervening deletion | |
3608 | #local $to_trace = 1 if main::DEBUG; | |
3609 | trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point); | |
3610 | return $i if $code_point <= $r->[$i]->end | |
3611 | && ($i == 0 || $r->[$i-1]->end < $code_point); | |
3612 | ||
3613 | # Here the cache doesn't yield the correct $i. Try adding 1. | |
3614 | if ($i < $range_list_size - 1 | |
3615 | && $r->[$i]->end < $code_point && | |
3616 | $code_point <= $r->[$i+1]->end) | |
3617 | { | |
3618 | $i++; | |
3619 | trace "next \$i is correct: $i" if main::DEBUG && $to_trace; | |
3620 | $_search_ranges_cache{$addr} = $i; | |
3621 | return $i; | |
3622 | } | |
3623 | ||
3624 | # Here, adding 1 also didn't work. We do a binary search to | |
3625 | # find the correct position, starting with current $i | |
3626 | my $lower = 0; | |
3627 | my $upper = $range_list_size - 1; | |
3628 | while (1) { | |
3629 | trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace; | |
3630 | ||
3631 | if ($code_point <= $r->[$i]->end) { | |
3632 | ||
3633 | # Here we have met the upper constraint. We can quit if we | |
3634 | # also meet the lower one. | |
3635 | last if $i == 0 || $r->[$i-1]->end < $code_point; | |
3636 | ||
3637 | $upper = $i; # Still too high. | |
3638 | ||
3639 | } | |
3640 | else { | |
3641 | ||
3642 | # Here, $r[$i]->end < $code_point, so look higher up. | |
3643 | $lower = $i; | |
3644 | } | |
3645 | ||
3646 | # Split search domain in half to try again. | |
3647 | my $temp = ($upper + $lower) / 2; | |
3648 | ||
3649 | # No point in continuing unless $i changes for next time | |
3650 | # in the loop. | |
3651 | if ($temp == $i) { | |
3652 | ||
3653 | # We can't reach the highest element because of the averaging. | |
3654 | # So if one below the upper edge, force it there and try one | |
3655 | # more time. | |
3656 | if ($i == $range_list_size - 2) { | |
3657 | ||
3658 | trace "Forcing to upper edge" if main::DEBUG && $to_trace; | |
3659 | $i = $range_list_size - 1; | |
3660 | ||
3661 | # Change $lower as well so if fails next time through, | |
3662 | # taking the average will yield the same $i, and we will | |
3663 | # quit with the error message just below. | |
3664 | $lower = $i; | |
3665 | next; | |
3666 | } | |
3667 | Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken."); | |
3668 | return; | |
3669 | } | |
3670 | $i = $temp; | |
3671 | } # End of while loop | |
3672 | ||
3673 | if (main::DEBUG && $to_trace) { | |
3674 | trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i; | |
3675 | trace "i= [ $i ]", $r->[$i]; | |
3676 | trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1; | |
3677 | } | |
3678 | ||
3679 | # Here we have found the offset. Cache it as a starting point for the | |
3680 | # next call. | |
3681 | $_search_ranges_cache{$addr} = $i; | |
3682 | return $i; | |
3683 | } | |
3684 | ||
3685 | sub _add_delete { | |
3686 | # Add, replace or delete ranges to or from a list. The $type | |
3687 | # parameter gives which: | |
3688 | # '+' => insert or replace a range, returning a list of any changed | |
3689 | # ranges. | |
3690 | # '-' => delete a range, returning a list of any deleted ranges. | |
3691 | # | |
3692 | # The next three parameters give respectively the start, end, and | |
3693 | # value associated with the range. 'value' should be null unless the | |
3694 | # operation is '+'; | |
3695 | # | |
3696 | # The range list is kept sorted so that the range with the lowest | |
3697 | # starting position is first in the list, and generally, adjacent | |
c1739a4a | 3698 | # ranges with the same values are merged into a single larger one (see |
99870f4d KW |
3699 | # exceptions below). |
3700 | # | |
c1739a4a | 3701 | # There are more parameters; all are key => value pairs: |
99870f4d KW |
3702 | # Type gives the type of the value. It is only valid for '+'. |
3703 | # All ranges have types; if this parameter is omitted, 0 is | |
3704 | # assumed. Ranges with type 0 are assumed to obey the | |
3705 | # Unicode rules for casing, etc; ranges with other types are | |
3706 | # not. Otherwise, the type is arbitrary, for the caller's | |
3707 | # convenience, and looked at only by this routine to keep | |
3708 | # adjacent ranges of different types from being merged into | |
3709 | # a single larger range, and when Replace => | |
3710 | # $IF_NOT_EQUIVALENT is specified (see just below). | |
3711 | # Replace determines what to do if the range list already contains | |
3712 | # ranges which coincide with all or portions of the input | |
3713 | # range. It is only valid for '+': | |
3714 | # => $NO means that the new value is not to replace | |
3715 | # any existing ones, but any empty gaps of the | |
3716 | # range list coinciding with the input range | |
3717 | # will be filled in with the new value. | |
3718 | # => $UNCONDITIONALLY means to replace the existing values with | |
3719 | # this one unconditionally. However, if the | |
3720 | # new and old values are identical, the | |
3721 | # replacement is skipped to save cycles | |
3722 | # => $IF_NOT_EQUIVALENT means to replace the existing values | |
d59563d0 | 3723 | # (the default) with this one if they are not equivalent. |
99870f4d | 3724 | # Ranges are equivalent if their types are the |
c1739a4a | 3725 | # same, and they are the same string; or if |
99870f4d KW |
3726 | # both are type 0 ranges, if their Unicode |
3727 | # standard forms are identical. In this last | |
3728 | # case, the routine chooses the more "modern" | |
3729 | # one to use. This is because some of the | |
3730 | # older files are formatted with values that | |
3731 | # are, for example, ALL CAPs, whereas the | |
3732 | # derived files have a more modern style, | |
3733 | # which looks better. By looking for this | |
3734 | # style when the pre-existing and replacement | |
3735 | # standard forms are the same, we can move to | |
3736 | # the modern style | |
9470941f | 3737 | # => $MULTIPLE_BEFORE means that if this range duplicates an |
99870f4d KW |
3738 | # existing one, but has a different value, |
3739 | # don't replace the existing one, but insert | |
3740 | # this, one so that the same range can occur | |
53d84487 KW |
3741 | # multiple times. They are stored LIFO, so |
3742 | # that the final one inserted is the first one | |
3743 | # returned in an ordered search of the table. | |