Commit | Line | Data |
---|---|---|
9d9177be KW |
1 | #!perl -w |
2 | use 5.015; | |
3 | use strict; | |
4 | use warnings; | |
99f21fb9 KW |
5 | use Unicode::UCD qw(prop_aliases |
6 | prop_values | |
7 | prop_value_aliases | |
8 | prop_invlist | |
9 | prop_invmap search_invlist | |
10 | ); | |
9d9177be | 11 | require 'regen/regen_lib.pl'; |
0c4ecf42 | 12 | require 'regen/charset_translations.pl'; |
9d9177be KW |
13 | |
14 | # This program outputs charclass_invlists.h, which contains various inversion | |
15 | # lists in the form of C arrays that are to be used as-is for inversion lists. | |
16 | # Thus, the lists it contains are essentially pre-compiled, and need only a | |
17 | # light-weight fast wrapper to make them usable at run-time. | |
18 | ||
19 | # As such, this code knows about the internal structure of these lists, and | |
20 | # any change made to that has to be done here as well. A random number stored | |
21 | # in the headers is used to minimize the possibility of things getting | |
22 | # out-of-sync, or the wrong data structure being passed. Currently that | |
23 | # random number is: | |
99f21fb9 KW |
24 | |
25 | # charclass_invlists.h now also has a partial implementation of inversion | |
26 | # maps; enough to generate tables for the line break properties, such as GCB | |
27 | ||
# The number stored in the header of every generated list.  It is used to
# minimize the possibility of this program and the C code that reads its
# output getting out-of-sync, or of the wrong data structure being passed.
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;

# Matches an integer or a float: an optional minus sign, one or more digits,
# and an optional fractional part.  The fractional part is a non-capturing
# group, '(?: ... )'.  (Spelled '(:? ... )' it would instead be a capturing
# group that accepts a stray colon, so that "1:.5" would look numeric.)
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;

# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36 | ||
9d9177be KW |
# Handle on the header file being generated.  open_new() comes from
# regen/regen_lib.pl.
my $out_fh = open_new(
    'charclass_invlists.h',
    '>',
    {
        style => '*',
        by    => $0,
        from  => "Unicode::UCD",
    },
);

# False when no '#if' is currently open in the output; otherwise the text of
# the condition of the '#if' we are inside.
my $in_file_pound_if = 0;

# In headings, how wide a name is allowed?
my $max_hdr_len = 3;

print {$out_fh} "/* See the generating file for comments */\n\n";
46 | ||
bffc0129 KW |
# Each symbol generated by this program is currently defined in only a single
# dot c file.  The code knows where most of them go; this hash gives the
# overrides for the exceptions to the typical place, mapping the symbol's name
# to the preprocessor symbol that guards its definition.
my %exceptions_to_where_to_define = (
    ( map { ( $_ => 'PERL_IN_REGCOMP_C' ) } qw(
            NonL1_Perl_Non_Final_Folds
            AboveLatin1
            Latin1
            UpperLatin1
            _Perl_Any_Folds
            _Perl_Folds_To_Multi_Char
        )
    ),
    ( map { ( $_ => 'PERL_IN_UTF8_C' ) } qw(
            _Perl_IDCont
            _Perl_IDStart
        )
    ),
);
015bb97c | 60 | |
f79a09fc | 61 | # This hash contains the properties with enums that have hard-coded references |
289ce9cc | 62 | # to them in C code. It is needed to make sure that if perl is compiled
f79a09fc KW |
63 | # with an older Unicode data set, that all the enum values the code is |
64 | # expecting will still be in the enum typedef. Thus the code doesn't have to | |
289ce9cc KW |
65 | # change. The Unicode version won't have any code points that have the enum |
66 | # values not in that version, so the code that handles them will not get | |
67 | # exercised. This is far better than having to #ifdef things. The names here | |
68 | # should be the long names of the respective property values. The reason for | |
69 | # this is because regexec.c uses them as case labels, and the long name is | |
70 | # generally more understandable than the short. | |
f79a09fc KW |
# Keyed by the lowercased short name of each property; each value is the list
# of long property-value names that C code refers to by name.
my %hard_coded_enums = (
    gcb => [ qw(
        Control
        CR
        Extend
        L
        LF
        LV
        LVT
        Other
        Prepend
        Regional_Indicator
        SpacingMark
        T
        V
    ) ],
    lb => [ qw(
        Alphabetic
        Break_After
        Break_Before
        Break_Both
        Break_Symbols
        Carriage_Return
        Close_Parenthesis
        Close_Punctuation
        Combining_Mark
        Contingent_Break
        Exclamation
        Glue
        H2
        H3
        Hebrew_Letter
        Hyphen
        Ideographic
        Infix_Numeric
        Inseparable
        JL
        JT
        JV
        Line_Feed
        Mandatory_Break
        Next_Line
        Nonstarter
        Numeric
        Open_Punctuation
        Postfix_Numeric
        Prefix_Numeric
        Quotation
        Regional_Indicator
        Space
        Word_Joiner
        ZWSpace
    ) ],
    sb => [ qw(
        ATerm
        Close
        CR
        Extend
        Format
        LF
        Lower
        Numeric
        OLetter
        Other
        SContinue
        Sep
        Sp
        STerm
        Upper
    ) ],
    wb => [ qw(
        ALetter
        CR
        Double_Quote
        Extend
        ExtendNumLet
        Format
        Hebrew_Letter
        Katakana
        LF
        MidLetter
        MidNum
        MidNumLet
        Newline
        Numeric
        Other
        Perl_Tailored_HSpace
        Regional_Indicator
        Single_Quote
    ) ],
);
162 | ||
973a28ed KW |
# For each of the GCB, LB and WB properties: the mapping from property value
# name to enum value, the short names indexed by enum value, and the mapping
# from each abbreviated short name to the name it stands for.
my (%gcb_enums, @gcb_short_enums, %gcb_abbreviations);
my (%lb_enums,  @lb_short_enums,  %lb_abbreviations);
my (%wb_enums,  @wb_short_enums,  %wb_abbreviations);

# Indexed by code point; a2n() returns the entry as the native equivalent
my @a2n;
174 | ||
sub uniques {
    # Returns the input values with duplicates dropped; the first occurrence
    # of each value is kept, and the input order is preserved.

    my %already_seen;
    my @distinct;
    foreach my $value (@_) {
        next if $already_seen{$value}++;
        push @distinct, $value;
    }
    return @distinct;
}
182 | ||
sub a2n($) {
    # Returns the input Unicode code point translated to native.  Anything
    # that doesn't look like a number, or is above 255, is returned unchanged;
    # the rest are looked up in @a2n.

    my ($code_point) = @_;

    my $is_translatable = $code_point =~ $numeric_re && $code_point <= 255;
    return $is_translatable ? $a2n[$code_point] : $code_point;
}
191 | ||
bffc0129 KW |
sub end_file_pound_if {
    # Closes the '#if' currently open in the output file, if there is one,
    # and records that none is open any longer.

    return unless $in_file_pound_if;

    print {$out_fh} "\n#endif\t/* $in_file_pound_if */\n";
    $in_file_pound_if = 0;
}
198 | ||
sub switch_pound_if ($$) {
    # Makes sure the output is inside the '#if defined(...)' appropriate for
    # the static about to be output.
    #
    # The 1st argument is the static's name, used to look up any override in
    # %exceptions_to_where_to_define.  The 2nd argument is the preprocessor
    # symbol to use when there is no override.

    my ($name, $new_pound_if) = @_;

    if (exists $exceptions_to_where_to_define{$name}) {
        $new_pound_if = $exceptions_to_where_to_define{$name};
    }

    my $wanted_condition = "defined($new_pound_if)";

    # Exit current #if if the new one is different from the old.  The whole
    # condition is compared for equality, so that a symbol which merely
    # contains another one as a substring, or which contains characters
    # special in a regular expression, can't be mistaken for it.
    if ($in_file_pound_if && $in_file_pound_if ne $wanted_condition) {
        end_file_pound_if();
    }

    # Enter new #if, if not already in it.
    if (! $in_file_pound_if) {
        $in_file_pound_if = $wanted_condition;
        print $out_fh "\n#if $in_file_pound_if\n";
    }
}
224 | ||
sub output_invlist ($$;$) {
    # Outputs the inversion list $invlist as the C array "${name}_invlist",
    # in the exact internal form for inversion lists.
    #
    #   $name       name to use for the list
    #   $invlist    reference to the inversion list array.  Note that a 0 is
    #               unshifted onto the caller's array when it is non-empty
    #               and doesn't already start with 0.
    #   $charset    optional name of the character set, for a comment
    #
    # Dies if $invlist is not an array reference.

    my ($name, $invlist, $charset) = @_;
    $charset //= "";

    die "No inversion list for $name"
                        unless defined $invlist && ref $invlist eq 'ARRAY';

    # The last element of the header says whether a 0 had to be prepended
    # (1), or the list already started at 0 (0).
    my $zero_or_one = 0;
    if (@$invlist && $invlist->[0] != 0) {
        unshift @$invlist, 0;
        $zero_or_one = 1;
    }
    my $count = @$invlist;

    switch_pound_if ($name, 'PERL_IN_PERL_C');

    # The declaration line, with the optional character set comment
    my $declaration = "\nstatic const UV ${name}_invlist[] = {";
    $declaration .= " /* for $charset */" if $charset;
    print {$out_fh} $declaration, "\n";

    # The three header elements
    print {$out_fh}
            "\t$count,\t/* Number of elements */\n",
            "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n",
            "\t$zero_or_one,\t/* 0 if the list starts at 0;",
            "\n\t\t 1 if it starts at the element beyond 0 */\n";

    # The main body are the UVs passed in to this routine, one per line,
    # with a comma after each but the final one
    if (@$invlist) {
        my @body_lines = map { sprintf "\t0x%X", $_ } @$invlist;
        print {$out_fh} join(",\n", @body_lines), "\n";
    }

    print {$out_fh} "};\n";
}
266 | ||
99f21fb9 KW |
sub output_invmap ($$$$$$$) {
    # Output the inversion map $invmap for property $prop_name, but use $name
    # as the actual data structure's name.  An enum typedef for the
    # property's values, and a #define giving their count, are output first.
    # For _Perl_GCB, _Perl_LB and _Perl_WB, the corresponding file-level
    # %..._enums, @..._short_enums and %..._abbreviations are also populated
    # (once), for use in generating those properties' tables.
    #
    # Dies if the map is missing or empty, if the format is other than 's',
    # or if the property has no enum values.

    my $name = shift;
    my $invmap = shift;         # Reference to inversion map array
    my $prop_name = shift;
    my $input_format = shift;   # The inversion map's format
    my $default = shift;        # The property value for code points that
                                # otherwise don't have a value specified.
    my $extra_enums = shift;    # comma-separated list of our additions to the
                                # property's standard possible values
    my $charset = shift // "";  # name of character set for comment

    # Check the map before anything dereferences it, so that a missing one
    # gets this message rather than a generic 'not an ARRAY reference' one,
    # and before anything has been written to the output.
    die "No inversion map for $prop_name" unless defined $invmap
                                             && ref $invmap eq 'ARRAY'
                                             && @$invmap;

    my $count = @$invmap;

    my $declaration_type;
    my %enums;
    my $name_prefix;

    if ($input_format eq 's') {
        my $orig_prop_name = $prop_name;
        $prop_name = (prop_aliases($prop_name))[1]
                                // $prop_name =~ s/^_Perl_//r; # Get full name
        my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
        my @enums;
        if ($orig_prop_name eq $prop_name) {
            @enums = prop_values($prop_name);
        }
        else {
            @enums = uniques(@$invmap);
        }

        if (! @enums) {
            die "Only enum properties are currently handled; '$prop_name' isn't one";
        }
        else {

            # A property with no hard-coded list simply has no expected
            # enums.  (Dereferencing a non-existent entry directly would die
            # under 'strict refs'.)
            my @expected_enums
                        = @{ $hard_coded_enums{lc $short_name} // [] };
            my @canonical_input_enums;
            if (@expected_enums) {
                if (@expected_enums < @enums) {
                    die 'You need to update %hard_coded_enums to reflect new'
                      . " entries in this Unicode version\n"
                      . "Expected: " . join(", ", sort @expected_enums) . "\n"
                      . " Got: " . join(", ", sort @enums);
                }

                if (! defined prop_aliases($prop_name)) {

                    # Convert the input enums into canonical form and
                    # save for use below
                    @canonical_input_enums = map { lc ($_ =~ s/_//gr) }
                                                                     @enums;
                }
                @enums = sort @expected_enums;
            }

            # The internal enums come last, and in the order specified
            my @extras;
            if ($extra_enums ne "") {
                @extras = split /,/, $extra_enums;
                push @enums, @extras;
            }

            # Assign a value to each element of the enum.  The default
            # value always gets 0; the others are arbitrarily assigned.
            my $enum_val = 0;
            my $canonical_default = prop_value_aliases($prop_name, $default);
            $default = $canonical_default if defined $canonical_default;
            $enums{$default} = $enum_val++;
            for my $enum (@enums) {
                $enums{$enum} = $enum_val++ unless exists $enums{$enum};
            }

            # Calculate the enum values for certain properties like
            # _Perl_GCB and _Perl_LB, because we output special tables for
            # them.
            if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {

                # We use string evals to allow the same code to work on
                # all tables we're doing.
                my $type = lc $prop_name;

                # We use lowercase single letter names for any property
                # values not in the release of Unicode being compiled now.
                my $placeholder = "a";

                # Skip if we've already done this code, which populated
                # this hash
                if (eval "! \%${type}_enums") {

                    # For each enum ...
                    foreach my $enum (sort keys %enums) {
                        my $value = $enums{$enum};
                        my $short;
                        my $abbreviated_from;

                        # Special case this wb property value to make the
                        # name more clear
                        if ($enum eq 'Perl_Tailored_HSpace') {
                            $short = 'hs';
                            $abbreviated_from = $enum;
                        }
                        elsif (grep { $_ eq $enum } @extras) {

                            # The 'short' name for one of the property
                            # values added by this file is just the
                            # lowercase of it
                            $short = lc $enum;
                        }
                        elsif (grep {$_ eq lc ( $enum =~ s/_//gr) }
                                                    @canonical_input_enums)
                        {   # On Unicode versions that predate the
                            # official property, we have set up this array
                            # to be the canonical form of each enum in the
                            # substitute property.  If the enum we're
                            # looking at is canonically the same as one of
                            # these, use its name instead of generating a
                            # placeholder one in the next clause (which
                            # will happen because prop_value_aliases()
                            # will fail because it only works on official
                            # properties)
                            $short = $enum;
                        }
                        else {
                            # Use the official short name for the other
                            # property values, which should all be
                            # official ones.
                            ($short) = prop_value_aliases($type, $enum);

                            # But create a placeholder for ones not in
                            # this Unicode version.
                            $short = $placeholder++ unless defined $short;
                        }

                        # If our short name is too long, or we already
                        # know that the name is an abbreviation, truncate
                        # to make sure it's short enough, and remember
                        # that we did this so we can later place in a
                        # comment in the generated file
                        if (   $abbreviated_from
                            || length $short > $max_hdr_len)
                        {
                            $short = substr($short, 0, $max_hdr_len);
                            $abbreviated_from = $enum
                                                unless $abbreviated_from;

                            # If the name we are to display conflicts, try
                            # another.  The eval's error is checked on
                            # every iteration, including the one that ends
                            # the loop, so a failure can't be mistaken for
                            # 'no conflict'.
                            while (1) {
                                my $is_taken = eval
                                    "exists \$${type}_abbreviations{$short}";
                                die $@ if $@;
                                last unless $is_taken;
                                $short++;
                            }

                            eval "\$${type}_abbreviations{$short} = '$enum'";
                            die $@ if $@;
                        }

                        # Remember the mapping from the property value
                        # (enum) name to its value.
                        eval "\$${type}_enums{$enum} = $value";
                        die $@ if $@;

                        # Remember the inverse mapping to the short name
                        # so that we can properly label the generated
                        # table's rows and columns
                        eval "\$${type}_short_enums[$value] = '$short'";
                        die $@ if $@;
                    }
                }
            }
        }

        # Inversion map stuff is currently used only by regexec
        switch_pound_if($name, 'PERL_IN_REGEXEC_C');

        # The short names tend to be two lower case letters, but it looks
        # better for those if they are upper. XXX
        $short_name = uc($short_name) if length($short_name) < 3
                            || substr($short_name, 0, 1) =~ /[[:lower:]]/;
        $name_prefix = "${short_name}_";
        my $enum_count = keys %enums;
        print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";

        # Output the typedef, with the enums in order of their values
        print $out_fh "\ntypedef enum {\n";
        my @enum_list;
        foreach my $enum (keys %enums) {
            $enum_list[$enums{$enum}] = $enum;
        }
        foreach my $i (0 .. @enum_list - 1) {
            my $name = $enum_list[$i];
            print $out_fh "\t${name_prefix}$name = $i";
            print $out_fh "," if $i < $enum_count - 1;
            print $out_fh "\n";
        }
        $declaration_type = "${name_prefix}enum";
        print $out_fh "} $declaration_type;\n";
    }
    else {
        die "'$input_format' invmap() format for '$prop_name' unimplemented";
    }

    print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    # The main body are the scalars passed in to this routine.
    for my $i (0 .. $count - 1) {
        my $element = $invmap->[$i];
        my $full_element_name = prop_value_aliases($prop_name, $element);
        $element = $full_element_name if defined $full_element_name;
        $element = $name_prefix . $element;
        print $out_fh "\t$element";
        print $out_fh "," if $i < $count - 1;
        print $out_fh "\n";
    }
    print $out_fh "};\n";
}
494 | ||
sub mk_invlist_from_sorted_cp_list {

    # Returns an inversion list constructed from the sorted input array of
    # code points.  The single argument is a reference to that array, which
    # must be in ascending numeric order; repeated code points are tolerated
    # and count once.  Returns the empty list if the array is empty.

    my $list_ref = shift;

    return unless @$list_ref;

    # Initialize to just the first element
    my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);

    # For each succeeding element, if it extends the previous range, adjust
    # up, otherwise add it.
    for my $i (1 .. @$list_ref - 1) {
        my $cp = $list_ref->[$i];

        # A repeat of the previous code point is already in the list.  Adding
        # it again would create a range that overlaps the previous one,
        # making the result not be a valid inversion list.
        next if $cp == $list_ref->[$i - 1];

        if ($invlist[-1] == $cp) {
            $invlist[-1]++;
        }
        else {
            push @invlist, $cp, $cp + 1;
        }
    }
    return @invlist;
}
519 | ||
# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                    unless $format eq 'al' || $format eq 'a';

# The code points at which a multi-character fold starts in the inversion map
my @has_multi_char_fold;

# Each code point that appears in other than the final position of some
# multi-character fold; listed once, in order of first appearance
my @is_non_final_fold;

{
    my %is_known_non_final;     # Code points already in @is_non_final_fold

    FOLD:
    foreach my $index (0 .. $#{$folds_ref}) {
        my $fold = $folds_ref->[$index];
        next FOLD unless ref $fold;     # Skip single-char folds

        push @has_multi_char_fold, $cp_ref->[$index];

        # Add to the non-finals list each code point that is in a non-final
        # position, i.e., all but the last element of this fold
        foreach my $position (0 .. $#{$fold} - 1) {
            my $non_final = $fold->[$position];
            next if $is_known_non_final{$non_final}++;
            push @is_non_final_fold, $non_final;
        }
    }
}
541 | ||
a02047bf KW |
sub _Perl_Non_Final_Folds {
    # Returns the inversion list of the code points in @is_non_final_fold.
    # As a side effect, that array is left sorted into ascending numeric
    # order.

    my @in_numeric_order = sort { $a <=> $b } @is_non_final_fold;
    @is_non_final_fold = @in_numeric_order;

    return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
}
546 | ||
99f21fb9 KW |
sub prop_name_for_cmp ($) { # Sort helper
    my ($name) = @_;

    # Returns the key to compare by: the input with everything from the first
    # comma on chopped off, then every non-alphabetic character removed, and
    # the result lowercased.

    return lc( $name =~ s/,.*//r =~ s/[[:^alpha:]]//gr );
}
557 | ||
sub UpperLatin1 {
    # Returns the inversion list for the code points 128 through 255.

    my @upper_half = (0x80 .. 0xFF);
    return mk_invlist_from_sorted_cp_list(\@upper_half);
}
561 | ||
289ce9cc KW |
sub output_table_common {

    # Common subroutine to actually output the generated rules table.
    #
    #   $property                   used to name the C array,
    #                               "${property}_table"
    #   $table_value_defines_ref    undef, or a reference to a hash mapping
    #                               names to numeric values; a #define is
    #                               output for each
    #   $table_ref                  reference to the table, a square array
    #                               of arrays of numbers
    #   $names_ref                  reference to an array giving the name to
    #                               use in the row and column heading for
    #                               each index of the table
    #   $abbreviations_ref          reference to a hash mapping each
    #                               abbreviated name to what it stands for,
    #                               used to annotate the table
    #
    # Dies if any table element is too wide to fit in a column.

    my ($property,
        $table_value_defines_ref,
        $table_ref,
        $names_ref,
        $abbreviations_ref) = @_;
    my $size = @$table_ref;

    # Output the #define list, sorted by numeric value
    if ($table_value_defines_ref) {
        my $max_name_length = 0;
        my @defines;

        # Put in order, and at the same time find the longest name
        while (my ($enum, $value) = each %$table_value_defines_ref) {
            $defines[$value] = $enum;

            my $length = length $enum;
            $max_name_length = $length if $length > $max_name_length;
        }

        print $out_fh "\n";

        # Output, so that the values are vertically aligned in a column after
        # the longest name
        foreach my $i (0 .. @defines - 1) {
            next unless defined $defines[$i];
            printf $out_fh "#define %-*s %2d\n",
                                      $max_name_length,
                                      $defines[$i],
                                      $i;
        }
    }

    my $column_width = 2;   # We currently allow 2 digits for the number

    # If the maximum value in the table is 1, it can be a bool.  (Being above
    # a U8 is not currently handled.)
    my $max_element = 0;
    for my $i (0 .. $size - 1) {
        for my $j (0 .. $size - 1) {
            next if $max_element >= $table_ref->[$i][$j];
            $max_element = $table_ref->[$i][$j];
        }
    }
    die "Need wider table column width given '$max_element'"
                                    if length $max_element > $column_width;

    my $table_type = ($max_element == 1)
                     ? 'bool'
                     : 'U8';

    # If a name is longer than the width set aside for a column, its column
    # needs to have increased spacing so that the name doesn't get truncated
    # nor run into an adjacent column
    my @spacers;

    # If we are being compiled on a Unicode version earlier than that which
    # this file was designed for, it may be that some of the property values
    # aren't in the current release, and so would be undefined if we didn't
    # define them ourselves.  Earlier code has done this, making them
    # lowercase characters of length one.  We look to see if any exist, so
    # that we can add an annotation to the output table
    my $has_placeholder = 0;

    for my $i (0 .. $size - 1) {

        # A name that fits within the column yields a non-positive repeat
        # count below, which gives an empty spacer
        no warnings 'numeric';
        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
        $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
    }

    print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";

    # Calculate the column heading line
    my $header_line = "/* "
                    . (" " x $max_hdr_len)  # We let the row heading meld to
                                            # the '*/' for those that are at
                                            # the max
                    . " " x 3;              # Space for '*/ '
    # Now each column
    for my $i (0 .. $size - 1) {
        $header_line .= sprintf "%s%*s",
                                $spacers[$i],
                                $column_width + 1,  # 1 for the ','
                                $names_ref->[$i];
    }
    $header_line .= " */\n";

    # If we have annotations, output it now.
    if ($has_placeholder || scalar %$abbreviations_ref) {
        my $text = "";
        foreach my $abbr (sort keys %$abbreviations_ref) {
            $text .= "; " if $text;
            $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
        }
        if ($has_placeholder) {
            $text .= "; other " if $text;
            $text .= "lowercase names are placeholders for"
                  .  " property values not defined until a later Unicode"
                  .  " release, so are irrelevant in this one, as they are"
                  .  " not assigned to any code points";
        }

        my $indent = " " x 3;
        $text = $indent . "/* $text */";

        # Wrap the text so that it is no wider than the table, which the
        # header line gives.
        my $output_width = length $header_line;

        # Every line starts with the indent followed by a 3 character
        # comment leader ('/* ' or ' * ').  Breaking at a blank within that
        # leaves the rest of the text unchanged, so only blanks beyond it
        # are usable; otherwise this loop would never terminate.
        my $min_break = length($indent) + 3;

        while (length $text > $output_width) {

            # Find the first blank back from the right end to wrap at.
            my $break = rindex($text, " ", $output_width - 1);
            if ($break < $min_break) {

                # The first word doesn't fit.  Let the line overflow, to the
                # next blank; and if there isn't one, what is left is output
                # as-is, below.
                $break = index($text, " ", $min_break);
                last if $break < 0;
            }

            print $out_fh substr($text, 0, $break), "\n";

            # Set so will look at just the remaining tail (which will
            # be indented and have a '*' after the indent)
            $text = $indent . " * " . substr($text, $break + 1);
        }

        # And any remaining
        print $out_fh $text, "\n" if $text;
    }

    # We calculated the header line earlier just to get its width so that we
    # could make sure the annotations fit into that.
    print $out_fh $header_line;

    # Now output the bulk of the table.
    for my $i (0 .. $size - 1) {

        # First the row heading.
        printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
        print $out_fh "{";  # Then the brace for this row

        # Then each column
        for my $j (0 .. $size -1) {
            print $out_fh $spacers[$j];
            printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
            print $out_fh "," if $j < $size - 1;
        }
        print $out_fh " }";
        print $out_fh "," if $i < $size - 1;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
718 | ||
973a28ed KW |
sub output_GCB_table() {

    # Create and output the pair table for use in determining Grapheme Cluster
    # Breaks, given in http://www.unicode.org/reports/tr29/.
    #
    # An element is 1 if a break is allowed between the class that indexes
    # its row and the class that indexes its column, in that order; 0 if not.

    # The rules are applied in reverse order, so that the lower-numbered,
    # higher priority ones overwrite what the later ones set, as the
    # algorithm stops at the earliest matching rule.

    my @gcb_table;
    my $dimension = @gcb_short_enums;

    # Forbids a break between the two named classes, in the order given
    my $forbid_break = sub {
        my ($before, $after) = @_;
        $gcb_table[$gcb_enums{$before}][$gcb_enums{$after}] = 0;
    };

    # Otherwise, break everywhere.
    # GB10  Any ÷ Any
    foreach my $row (0 .. $dimension - 1) {
        $gcb_table[$row][$_] = 1 for 0 .. $dimension - 1;
    }

    # Do not break before extending characters.
    # Do not break before SpacingMarks, or after Prepend characters.
    # GB9   × Extend
    # GB9a  × SpacingMark
    # GB9b  Prepend ×
    foreach my $other (0 .. $#gcb_table) {
        $gcb_table[$other][$gcb_enums{$_}] = 0 for qw(Extend SpacingMark);
        $gcb_table[$gcb_enums{'Prepend'}][$other] = 0;
    }

    # The remaining 'do not break' rules each involve specific pairs:
    my @no_break_pairs = (

        # Do not break between regional indicator symbols.
        # GB8a  Regional_Indicator × Regional_Indicator
        [ 'Regional_Indicator', 'Regional_Indicator' ],

        # Do not break Hangul syllable sequences.
        # GB8  ( LVT | T) × T
        [ 'LVT', 'T' ],
        [ 'T',   'T' ],

        # GB7  ( LV | V ) × ( V | T )
        [ 'LV', 'V' ],
        [ 'LV', 'T' ],
        [ 'V',  'V' ],
        [ 'V',  'T' ],

        # GB6  L × ( L | V | LV | LVT )
        [ 'L', 'L'   ],
        [ 'L', 'V'   ],
        [ 'L', 'LV'  ],
        [ 'L', 'LVT' ],
    );
    $forbid_break->(@$_) foreach @no_break_pairs;

    # Do not break between a CR and LF.  Otherwise, break before and after
    # controls.
    # GB5   ÷ ( Control | CR | LF )
    # GB4  ( Control | CR | LF ) ÷
    foreach my $other (0 .. $#gcb_table) {
        foreach my $control (qw(Control CR LF)) {
            $gcb_table[$other][$gcb_enums{$control}] = 1;
            $gcb_table[$gcb_enums{$control}][$other] = 1;
        }
    }

    # GB3  CR × LF
    $forbid_break->('CR', 'LF');

    # Break at the start and end of text.
    # GB1  sot ÷
    # GB2  ÷ eot
    foreach my $other (0 .. $#gcb_table) {
        $gcb_table[$other][$gcb_enums{'EDGE'}] = 1;
        $gcb_table[$gcb_enums{'EDGE'}][$other] = 1;
    }

    # But, unspecified by Unicode, we shouldn't break on an empty string.
    $forbid_break->('EDGE', 'EDGE');

    output_table_common('GCB', undef,
                        \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
}
802 | ||
6b659339 KW |
sub output_LB_table() {

    # Build the pair table that drives Line Break determination, and pass it
    # (plus the action values below) to output_table_common().  The rules are
    # those of the default algorithm in http://www.unicode.org/reports/tr14/,
    # with the tailoring of example 7 on that page applied, because the
    # Unicode-furnished tests assume that tailoring.
    #
    # A cell is addressed as [class before the spot][class after the spot],
    # using the numbers in %lb_enums.
    #
    # The answer is fundamentally just "break" or "don't break", but tr14 also
    # has rules of the form X SP* x Y, which are given the value 2.  (Only
    # rules 7 through 18, the ones where spaces matter, can yield a 2.)  The
    # remaining actions are synthetic: they say that context must be examined
    # in code.  Where a synthetic action could apply on top of an underlying
    # 0, 1, or 2, it is added to the cell rather than stored over it, so that
    # the underlying value stays recoverable.  In principle two synthetic
    # actions could be added into one cell; that does not occur with the
    # Unicode 8.0 rules, and the names of the middle group show it cannot
    # occur among them, as each begins with mutually exclusive classes.  That
    # is not evident for the final action, so it gets a power of 2 above
    # anything the others can reach, which means an addition would lose no
    # information.  (Debugging builds would hit an assert(0) if it happened.)
    my %lb_actions = (
        LB_NOBREAK                      => 0,
        LB_BREAKABLE                    => 1,
        LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,

        LB_CM_foo                       => 3,        # Rule 9
        LB_SP_foo                       => 6,        # Rule 18
        LB_PR_or_PO_then_OP_or_HY       => 9,        # Rule 25
        LB_SY_or_IS_then_various        => 11,       # Rule 25
        LB_HY_or_BA_then_foo            => 13,       # Rule 21a

        LB_various_then_PO_or_PR        => (1<<4),   # Rule 25
    );

    # tr14 is written for a sequential scan of the text, whereas Perl has to
    # decide about one arbitrary spot without having processed everything in
    # front of it, so the rules are adapted accordingly.
    #
    # The rules are applied from the highest-numbered down to the lowest, so
    # that the lower-numbered (higher priority) ones overwrite the others,
    # mirroring the algorithm's "first matching rule wins".

    my @lb_table;
    my $class_count = @lb_short_enums;

    # LB31. Break everywhere else
    $lb_table[$_] = [ ($lb_actions{'LB_BREAKABLE'}) x $class_count ]
                                                for 0 .. $class_count - 1;

    # ----- Helpers; each closes over @lb_table ---------------------------

    # Marker standing for "every class currently in the table"
    my $ANY = '*';

    # Turn a class specification into a list of table indices.  A
    # specification is $ANY, a single class name, a reference to an array of
    # class names, or { except => [ names ] } meaning every class but those.
    my $resolve = sub {
        my ($spec) = @_;
        my $kind = ref $spec;

        if ($kind eq 'HASH') {
            my @skipped = map { $lb_enums{$_} } @{ $spec->{'except'} };
            return grep {
                        my $class = $_;
                        ! grep { $class == $_ } @skipped
                   } 0 .. $#lb_table;
        }

        return map { $lb_enums{$_} } @$spec if $kind eq 'ARRAY';
        return 0 .. $#lb_table              if $spec eq $ANY;
        return $lb_enums{$spec};
    };

    # Store the named action in every (before, after) cell
    my $put = sub {
        my ($action, $before, $after) = @_;
        my @rows = $resolve->($before);
        my @cols = $resolve->($after);

        for my $row (@rows) {
            for my $col (@cols) {
                $lb_table[$row][$col] = $lb_actions{$action};
            }
        }
        return;
    };

    my $forbid    = sub { return $put->('LB_NOBREAK', @_) };
    my $allow     = sub { return $put->('LB_BREAKABLE', @_) };
    my $forbid_sp = sub {
        return $put->('LB_NOBREAK_EVEN_WITH_SP_BETWEEN', @_);
    };

    # Add the named (synthetic) action on top of what every (before, after)
    # cell already holds
    my $add = sub {
        my ($action, $before, $after) = @_;
        my @rows = $resolve->($before);
        my @cols = $resolve->($after);

        for my $row (@rows) {
            for my $col (@cols) {
                $lb_table[$row][$col] += $lb_actions{$action};
            }
        }
        return;
    };

    # Like $forbid, except that a cell which already says not to break even
    # with spaces in between is left alone.
    my $forbid_unless_sp = sub {
        my ($before, $after) = @_;
        my @rows = $resolve->($before);
        my @cols = $resolve->($after);

        for my $row (@rows) {
            for my $col (@cols) {
                next if $lb_table[$row][$col]
                        == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
                $lb_table[$row][$col] = $lb_actions{'LB_NOBREAK'};
            }
        }
        return;
    };

    # Class groupings that several rules share
    my @letters  = qw(Alphabetic Hebrew_Letter);            # AL | HL
    my @hangul   = qw(JL JV JT H2 H3);
    my @pr_po    = qw(Prefix_Numeric Postfix_Numeric);
    my @op_hy    = qw(Open_Punctuation Hyphen);
    my @num_tail = qw(Numeric Break_Symbols Infix_Numeric
                      Close_Punctuation Close_Parenthesis);

    # ----- The rules, lowest priority first ------------------------------

    # LB30a. Don't break between Regional Indicators
    $forbid->('Regional_Indicator', 'Regional_Indicator');

    # LB30 Do not break between letters, numbers, or ordinary symbols and
    # opening or closing parentheses.
    # (AL | HL | NU) × OP
    $forbid->([ @letters, 'Numeric' ], 'Open_Punctuation');

    # CP × (AL | HL | NU)
    $forbid->('Close_Parenthesis', [ @letters, 'Numeric' ]);

    # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
    # IS × (AL | HL)
    $forbid->('Infix_Numeric', \@letters);

    # LB28 Do not break between alphabetics (“at”).
    # (AL | HL) × (AL | HL)
    $forbid->(\@letters, \@letters);

    # LB27 Treat a Korean Syllable Block the same as ID.
    # (JL | JV | JT | H2 | H3) × IN
    # (JL | JV | JT | H2 | H3) × PO
    $forbid->(\@hangul, [qw(Inseparable Postfix_Numeric)]);

    # PR × (JL | JV | JT | H2 | H3)
    $forbid->('Prefix_Numeric', \@hangul);

    # LB26 Do not break a Korean syllable.
    # JL × (JL | JV | H2 | H3)
    $forbid->('JL', [qw(JL JV H2 H3)]);

    # (JV | H2) × (JV | JT)
    $forbid->([qw(JV H2)], [qw(JV JT)]);

    # (JT | H3) × JT
    $forbid->([qw(JT H3)], 'JT');

    # LB25 Do not break between the following pairs of classes relevant to
    # numbers, as tailored by example 7 in
    # http://www.unicode.org/reports/tr14/#Examples
    # That tailoring is followed because Unicode's test cases expect it
    # (PR | PO) × ( OP | HY )? NU
    $forbid->(\@pr_po, 'Numeric');

    # The ( OP | HY )? part is optional, so it has to be checked for in code.
    # The action is added rather than stored, so that the code can get back
    # to the underlying break value.
    $add->('LB_PR_or_PO_then_OP_or_HY', \@pr_po, \@op_hy);

    # ( OP | HY ) × NU
    $forbid->(\@op_hy, 'Numeric');

    # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
    # which can be rewritten as:
    # NU (SY | IS)* × (NU | SY | IS | CL | CP )
    $forbid->('Numeric', \@num_tail);

    # As above, this needs checking in code, so the action is added in order
    # to keep the underlying values recoverable.  The same is done in rules
    # further down.  The code assumes that no cell has had 2 actions added.
    # Should a later Unicode release break that assumption, tests should
    # start failing.
    $add->('LB_SY_or_IS_then_various',
           [qw(Break_Symbols Infix_Numeric)], \@num_tail);

    # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
    # which can be rewritten as:
    # NU (SY | IS)* (CL | CP)? × (PO | PR)
    $forbid->('Numeric', \@pr_po);

    $add->('LB_various_then_PO_or_PR',
           [qw(Close_Parenthesis Close_Punctuation
               Infix_Numeric Break_Symbols)],
           \@pr_po);

    # LB24 Do not break between prefix and letters or ideographs.
    # PR × ID
    # PR × (AL | HL)
    $forbid->('Prefix_Numeric', [ 'Ideographic', @letters ]);

    # PO × (AL | HL)
    $forbid->('Postfix_Numeric', \@letters);

    # LB23 Do not break within ‘a9’, ‘3a’, or ‘H%’.
    # ID × PO
    $forbid->('Ideographic', 'Postfix_Numeric');

    # (AL | HL) × NU
    $forbid->(\@letters, 'Numeric');

    # NU × (AL | HL)
    $forbid->('Numeric', \@letters);

    # LB22 Do not break between two ellipses, or between letters, numbers or
    # exclamations and ellipsis.
    # (AL | HL) × IN
    # Exclamation × IN
    # ID × IN
    # IN × IN
    # NU × IN
    $forbid->([ @letters, qw(Exclamation Ideographic Inseparable Numeric) ],
              'Inseparable');

    # LB21b Don’t break between Solidus and Hebrew letters.
    # SY × HL
    $forbid->('Break_Symbols', 'Hebrew_Letter');

    # LB21a Don't break after Hebrew + Hyphen.
    # HL (HY | BA) ×
    $add->('LB_HY_or_BA_then_foo', [qw(Hyphen Break_After)], $ANY);

    # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
    # spaces, small kana, and other non-starters, or after acute accents.
    # × BA
    # × HY
    # × NS
    # BB ×
    $forbid->($ANY, [qw(Break_After Hyphen Nonstarter)]);
    $forbid->('Break_Before', $ANY);

    # LB20 Break before and after unresolved CB.
    # ÷ CB
    # CB ÷
    # Conditional breaks should be resolved external to the line breaking
    # rules.  However, the default action is to treat unresolved CB as
    # breaking before and after.
    $allow->($ANY, 'Contingent_Break');
    $allow->('Contingent_Break', $ANY);

    # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
    # × QU
    # QU ×
    $forbid->($ANY, 'Quotation');
    $forbid->('Quotation', $ANY);

    # LB18 Break after spaces
    # SP ÷
    $allow->('Space', $ANY);

    # LB17 Do not break within ‘——’, even with intervening spaces.
    # B2 SP* × B2
    $forbid_sp->('Break_Both', 'Break_Both');

    # LB16 Do not break between closing punctuation and a nonstarter even
    # with intervening spaces.
    # (CL | CP) SP* × NS
    $forbid_sp->([qw(Close_Punctuation Close_Parenthesis)], 'Nonstarter');

    # LB15 Do not break within ‘”[’, even with intervening spaces.
    # QU SP* × OP
    $forbid_sp->('Quotation', 'Open_Punctuation');

    # LB14 Do not break after ‘[’, even after spaces.
    # OP SP* ×
    $forbid_sp->('Open_Punctuation', $ANY);

    # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
    # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
    # [^NU] × CL
    # [^NU] × CP
    #       × EX
    # [^NU] × IS
    # [^NU] × SY
    $forbid_sp->($ANY, 'Exclamation');
    $forbid_sp->({ except => ['Numeric'] },
                 [qw(Close_Punctuation Close_Parenthesis
                     Infix_Numeric Break_Symbols)]);

    # LB12a Do not break before NBSP and related characters, except after
    # spaces and hyphens.
    # [^SP BA HY] × GL
    #
    # Here and in the next two rules, a cell that an earlier-applied rule set
    # to "don't break even with space between" keeps that value.
    $forbid_unless_sp->({ except => [qw(Space Break_After Hyphen)] }, 'Glue');

    # LB12 Do not break after NBSP and related characters.
    # GL ×
    $forbid_unless_sp->('Glue', $ANY);

    # LB11 Do not break before or after Word joiner and related characters.
    # × WJ
    # WJ ×
    $forbid_unless_sp->($ANY, 'Word_Joiner');
    $forbid_unless_sp->('Word_Joiner', $ANY);

    # Giving SP WJ the same value as the other no-break pairs that have a SP
    # in front of them spares the code a special-case test.
    $forbid_sp->('Space', 'Word_Joiner');

    # LB9 and LB10 are handled together in one loop
    #
    # LB9 Do not break a combining character sequence; treat it as if it has
    # the line breaking class of the base character in all of the
    # higher-numbered rules.
    # Treat X CM* as if it were X.
    # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
    #
    # LB10 Treat any remaining combining mark as AL.  This catches the case
    # where a CM is the first character on the line or follows SP, BK, CR,
    # LF, NL, or ZW.
    my @cm_does_not_attach = qw(Mandatory_Break EDGE Carriage_Return
                                Line_Feed Next_Line Space ZWSpace);
    for my $class (0 .. $#lb_table) {

        # With the CM first in the pair, whether it inherits from an earlier
        # character can't be known without looking behind, so that is left
        # for the code to work out.
        $lb_table[$lb_enums{'Combining_Mark'}][$class]
                                                = $lb_actions{'LB_CM_foo'};

        if (grep { $class == $lb_enums{$_} } @cm_does_not_attach) {

            # A CM after one of these doesn't combine; it behaves as
            # 'Alphabetic' would in the same position.
            $lb_table[$class][$lb_enums{'Combining_Mark'}]
                            = $lb_table[$class][$lb_enums{'Alphabetic'}];
        }
        elsif ($lb_table[$class][$lb_enums{'Combining_Mark'}]
               != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
        {
            # Otherwise the CM combines, hence no break; the kind of
            # no-break already in the cell is kept.
            $lb_table[$class][$lb_enums{'Combining_Mark'}]
                                                = $lb_actions{'LB_NOBREAK'};
        }
    }

    # LB8 Break before any character following a zero-width space, even if
    # one or more spaces intervene.
    # ZW SP* ÷
    $allow->('ZWSpace', $ANY);

    # LB8-10 mean that "SP x" needs its context looked at, which has to
    # happen in the code.  A constant is added to the existing entries to
    # signal that; adding instead of replacing leaves the original rule
    # recoverable.
    $add->('LB_SP_foo', 'Space', $ANY);

    # LB7 Do not break before spaces or zero width space.
    # × SP
    # × ZW
    $forbid->($ANY, [qw(Space ZWSpace)]);

    # LB6 Do not break before hard line breaks.
    # × ( BK | CR | LF | NL )
    $forbid->($ANY,
              [qw(Mandatory_Break Carriage_Return Line_Feed Next_Line)]);

    # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line
    # breaks.
    # CR × LF
    # CR !
    # LF !
    # NL !
    $allow->([qw(Carriage_Return Line_Feed Next_Line)], $ANY);
    $forbid->('Carriage_Return', 'Line_Feed');

    # LB4 Always break after hard line breaks.
    # BK !
    $allow->('Mandatory_Break', $ANY);

    # LB2 Never break at the start of text.
    # sot ×
    # LB3 Always break at the end of text.
    # ! eot
    # LB3 is applied before LB2 here, so that the EDGE-EDGE cell ends up as
    # no-break, i.e., there is no break when there is no text.
    $allow->($ANY, 'EDGE');
    $forbid->('EDGE', $ANY);

    # LB1 Assign a line breaking class to each code point of the input.
    # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
    # depending on criteria outside the scope of this algorithm.
    #
    # In the absence of such criteria all characters with a specific
    # combination of original class and General_Category property value are
    # resolved as follows:
    # Original    Resolved    General_Category
    # AI, SG, XX  AL          Any
    # SA          CM          Only Mn or Mc
    # SA          AL          Any except Mn and Mc
    # CJ          NS          Any
    #
    # mktables does this, so none of the remapped-from classes show up here.

    return output_table_common('LB', \%lb_actions,
                        \@lb_table, \@lb_short_enums, \%lb_abbreviations);
}
1382 | ||
7e54b87f KW |
1383 | sub output_WB_table() { |
1384 | ||
1385 | # Create and output the enums, #defines, and pair table for use in | |
1386 | # determining Word Breaks, given in http://www.unicode.org/reports/tr29/. | |
1387 | ||
1388 | # This uses the same mechanism in the other bounds tables generated by | |
1389 | # this file. The actions that could override a 0 or 1 are added to those | |
1390 | # numbers; the actions that clearly don't depend on the underlying rule | |
1391 | # simply overwrite | |
1392 | my %wb_actions = ( | |
1393 | WB_NOBREAK => 0, | |
1394 | WB_BREAKABLE => 1, | |
1395 | WB_hs_then_hs => 2, | |
1396 | WB_Ex_or_FO_then_foo => 3, | |
1397 | WB_DQ_then_HL => 4, | |
1398 | WB_HL_then_DQ => 6, | |
1399 | WB_LE_or_HL_then_MB_or_ML_or_SQ => 8, | |
1400 | WB_MB_or_ML_or_SQ_then_LE_or_HL => 10, | |
1401 | WB_MB_or_MN_or_SQ_then_NU => 12, | |
1402 | WB_NU_then_MB_or_MN_or_SQ => 14, | |
1403 | ); | |
1404 | ||
7e54b87f KW |
1405 | # Construct the WB pair table. |
1406 | # The table is constructed in reverse order of the rules, to make the | |
1407 | # lower-numbered, higher priority ones override the later ones, as the | |
1408 | # algorithm stops at the earliest matching rule | |
1409 | ||
1410 | my @wb_table; | |
1411 | my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN | |
1412 | ||
1413 | # Otherwise, break everywhere (including around ideographs). | |
1414 | # WB14 Any ÷ Any | |
1415 | for my $i (0 .. $table_size - 1) { | |
1416 | for my $j (0 .. $table_size - 1) { | |
1417 | $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'}; | |
1418 | } | |
1419 | } | |
1420 | ||
1421 | # Do not break between regional indicator symbols. | |
1422 | # WB13c Regional_Indicator × Regional_Indicator | |
289ce9cc KW |
1423 | $wb_table[$wb_enums{'Regional_Indicator'}] |
1424 | [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1425 | |
1426 | # Do not break from extenders. | |
1427 | # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) | |
289ce9cc KW |
1428 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}] |
1429 | = $wb_actions{'WB_NOBREAK'}; | |
1430 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}] | |
1431 | = $wb_actions{'WB_NOBREAK'}; | |
1432 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}] | |
1433 | = $wb_actions{'WB_NOBREAK'}; | |
1434 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}] | |
1435 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1436 | |
1437 | # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) | |
1438 | # × # ExtendNumLet | |
289ce9cc KW |
1439 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}] |
1440 | = $wb_actions{'WB_NOBREAK'}; | |
1441 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}] | |
1442 | = $wb_actions{'WB_NOBREAK'}; | |
1443 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}] | |
1444 | = $wb_actions{'WB_NOBREAK'}; | |
1445 | $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}] | |
1446 | = $wb_actions{'WB_NOBREAK'}; | |
1447 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}] | |
1448 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1449 | |
1450 | # Do not break between Katakana. | |
1451 | # WB13 Katakana × Katakana | |
289ce9cc KW |
1452 | $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}] |
1453 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1454 | |
1455 | # Do not break within sequences, such as “3.2” or “3,456.789”. | |
1456 | # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric | |
289ce9cc | 1457 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}] |
7e54b87f | 1458 | += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; |
289ce9cc | 1459 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}] |
7e54b87f | 1460 | += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; |
289ce9cc | 1461 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}] |
7e54b87f KW |
1462 | += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; |
1463 | ||
1464 | # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric | |
289ce9cc | 1465 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}] |
7e54b87f | 1466 | += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; |
289ce9cc | 1467 | $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}] |
7e54b87f | 1468 | += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; |
289ce9cc | 1469 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}] |
7e54b87f KW |
1470 | += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; |
1471 | ||
1472 | # Do not break within sequences of digits, or digits adjacent to letters | |
1473 | # (“3a”, or “A3”). | |
1474 | # WB10 Numeric × (ALetter | Hebrew_Letter) | |
289ce9cc KW |
1475 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}] |
1476 | = $wb_actions{'WB_NOBREAK'}; | |
1477 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}] | |
1478 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1479 | |
1480 | # WB9 (ALetter | Hebrew_Letter) × Numeric | |
289ce9cc KW |
1481 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}] |
1482 | = $wb_actions{'WB_NOBREAK'}; | |
1483 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}] | |
1484 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1485 | |
1486 | # WB8 Numeric × Numeric | |
289ce9cc KW |
1487 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}] |
1488 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1489 | |
1490 | # Do not break letters across certain punctuation. | |
1491 | # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | |
289ce9cc KW |
1492 | $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}] |
1493 | += $wb_actions{'WB_DQ_then_HL'}; | |
7e54b87f KW |
1494 | |
1495 | # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | |
289ce9cc KW |
1496 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}] |
1497 | += $wb_actions{'WB_HL_then_DQ'}; | |
7e54b87f KW |
1498 | |
1499 | # WB7a Hebrew_Letter × Single_Quote | |
289ce9cc KW |
1500 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] |
1501 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1502 | |
1503 | # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) | |
1504 | # × (ALetter | Hebrew_Letter) | |
289ce9cc | 1505 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}] |
7e54b87f | 1506 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1507 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}] |
7e54b87f | 1508 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1509 | $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}] |
7e54b87f | 1510 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1511 | $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}] |
7e54b87f | 1512 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1513 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}] |
7e54b87f | 1514 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1515 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}] |
7e54b87f KW |
1516 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
1517 | ||
1518 | # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | |
1519 | # | Single_Quote) (ALetter | Hebrew_Letter) | |
289ce9cc | 1520 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}] |
7e54b87f | 1521 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1522 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}] |
7e54b87f | 1523 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1524 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}] |
7e54b87f | 1525 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1526 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}] |
7e54b87f | 1527 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1528 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}] |
7e54b87f | 1529 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1530 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] |
7e54b87f KW |
1531 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
1532 | ||
1533 | # Do not break between most letters. | |
1534 | # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) | |
289ce9cc KW |
1535 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}] |
1536 | = $wb_actions{'WB_NOBREAK'}; | |
1537 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}] | |
1538 | = $wb_actions{'WB_NOBREAK'}; | |
1539 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}] | |
1540 | = $wb_actions{'WB_NOBREAK'}; | |
1541 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}] | |
1542 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1543 | |
1544 | # Ignore Format and Extend characters, except when they appear at the | |
1545 | # beginning of a region of text. | |
1546 | # WB4 X (Extend | Format)* → X | |
1547 | for my $i (0 .. @wb_table - 1) { | |
289ce9cc KW |
1548 | $wb_table[$wb_enums{'Extend'}][$i] |
1549 | = $wb_actions{'WB_Ex_or_FO_then_foo'}; | |
1550 | $wb_table[$wb_enums{'Format'}][$i] | |
1551 | = $wb_actions{'WB_Ex_or_FO_then_foo'}; | |
7e54b87f KW |
1552 | } |
1553 | ||
1554 | # Implied is that these attach to the character before them, except for | |
1555 | # the characters that mark the end of a region of text. The rules below | |
1556 | # override the ones set up here, for all the characters that need | |
1557 | # overriding. | |
1558 | for my $i (0 .. @wb_table - 1) { | |
289ce9cc KW |
1559 | $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'}; |
1560 | $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1561 | } |
1562 | ||
1563 | # Break before and after white space | |
1564 | # WB3b ÷ (Newline | CR | LF) | |
1565 | # WB3a (Newline | CR | LF) ÷ | |
1566 | # et al. | |
289ce9cc | 1567 | for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { |
7e54b87f KW |
1568 | for my $j (0 .. @wb_table - 1) { |
1569 | $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'}; | |
1570 | $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'}; | |
1571 | } | |
1572 | } | |
1573 | ||
1574 | # But do not break within white space. | |
1575 | # WB3 CR × LF | |
1576 | # et al. | |
289ce9cc KW |
1577 | for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { |
1578 | for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { | |
7e54b87f KW |
1579 | $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'}; |
1580 | } | |
1581 | } | |
1582 | ||
1583 | # And do not break horizontal space followed by Extend or Format | |
289ce9cc KW |
1584 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}] |
1585 | = $wb_actions{'WB_NOBREAK'}; | |
1586 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}] | |
1587 | = $wb_actions{'WB_NOBREAK'}; | |
1588 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}] | |
1589 | [$wb_enums{'Perl_Tailored_HSpace'}] | |
1590 | = $wb_actions{'WB_hs_then_hs'}; | |
7e54b87f KW |
1591 | |
1592 | # Break at the start and end of text. | |
1593 | # WB2 ÷ eot | |
1594 | # WB1 sot ÷ | |
1595 | for my $i (0 .. @wb_table - 1) { | |
289ce9cc KW |
1596 | $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'}; |
1597 | $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'}; | |
7e54b87f KW |
1598 | } |
1599 | ||
1600 | # But, unspecified by Unicode, we shouldn't break on an empty string. | |
289ce9cc | 1601 | $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0; |
7e54b87f | 1602 | |
289ce9cc KW |
1603 | output_table_common('WB', \%wb_actions, |
1604 | \@wb_table, \@wb_short_enums, \%wb_abbreviations); | |
7e54b87f KW |
1605 | } |
1606 | ||
9d9177be KW |
# Emit the two fixed inversion lists that split the code point space at the
# Latin1 boundary: the first matches 0..255, the second matches 256 and up.
for my $fixed_list ([ "Latin1", [ 0, 256 ] ], [ "AboveLatin1", [ 256 ] ]) {
    my ($list_name, $list_ref) = @$fixed_list;
    output_invlist($list_name, $list_ref);
}

end_file_pound_if;
43b443dd | 1611 | |
3f427fd9 KW |
1612 | # We construct lists for all the POSIX and backslash sequence character |
1613 | # classes in two forms: | |
1614 | # 1) ones which match only in the ASCII range | |
1615 | # 2) ones which match either in the Latin1 range, or the entire Unicode range | |
1616 | # | |
1617 | # These get compiled in, and hence affect the memory footprint of every Perl | |
1618 | # program, even those not using Unicode. To minimize the size, currently | |
1619 | # the Latin1 version is generated for the beyond ASCII range except for those | |
1620 | # lists that are quite small for the entire range, such as for \s, which is 22 | |
1621 | # UVs long plus 4 UVs (currently) for the header. | |
1622 | # | |
1623 | # To save even more memory, the ASCII versions could be derived from the | |
1624 | # larger ones at runtime, saving some memory (minus the expense of the machine | |
1625 | # instructions to do so), but these are all small anyway, so their total is | |
1626 | # about 100 UVs. | |
1627 | # | |
1628 | # In the list of properties below that get generated, the L1 prefix is a fake | |
1629 | # property that means just the Latin1 range of the full property (whose name | |
1630 | # has an X prefix instead of L1). | |
a02047bf KW |
1631 | # |
1632 | # An initial & means to use the subroutine from this file instead of an | |
1633 | # official inversion list. | |
3f427fd9 | 1634 | |
0c4ecf42 KW |
1635 | for my $charset (get_supported_code_pages()) { # Emit every table once per supported code page | |
1636 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
1637 | |
99f21fb9 KW |
1638 | @a2n = @{get_a2n($charset)}; |
1639 | no warnings 'qw'; | |
1640 | # Ignore non-alpha in sort | |
1641 | for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( | |
1c8c3428 KW |
1642 | ASCII |
1643 | Cased | |
1644 | VertSpace | |
1645 | XPerlSpace | |
1646 | XPosixAlnum | |
1647 | XPosixAlpha | |
1648 | XPosixBlank | |
1649 | XPosixCntrl | |
1650 | XPosixDigit | |
1651 | XPosixGraph | |
1652 | XPosixLower | |
1653 | XPosixPrint | |
1654 | XPosixPunct | |
1655 | XPosixSpace | |
1656 | XPosixUpper | |
1657 | XPosixWord | |
1658 | XPosixXDigit | |
1659 | _Perl_Any_Folds | |
1660 | &NonL1_Perl_Non_Final_Folds | |
1661 | _Perl_Folds_To_Multi_Char | |
1662 | &UpperLatin1 | |
1663 | _Perl_IDStart | |
1664 | _Perl_IDCont | |
02f811dd | 1665 | _Perl_GCB,EDGE |
ca8226cf | 1666 | _Perl_LB,EDGE |
bf4268fa | 1667 | _Perl_SB,EDGE |
190d69bb | 1668 | _Perl_WB,EDGE,UNKNOWN |
1c8c3428 | 1669 | ) |
0f5e3c71 KW |
1670 | ) { |
1671 | |
1672 | # For the Latin1 properties, we change to use the eXtended version of the | |
1673 | # base property, then go through the result and get rid of everything not | |
1674 | # in Latin1 (above 255). Actually, we retain the element for the range | |
1675 | # that crosses the 255/256 boundary if it is one that matches the | |
1676 | # property. For example, in the Word property, there is a range of code | |
1677 | # points that starts at U+00F8 and goes through U+02C1. Instead of | |
1678 | # artificially cutting that off at 256 because 256 is the first code point | |
1679 | # above Latin1, we let the range go to its natural ending. That gives us | |
1680 | # extra information with no added space taken. But if the range that | |
1681 | # crosses the boundary is one that doesn't match the property, we don't | |
1682 | # start a new range above 255, as that could be construed as going to | |
1683 | # infinity. For example, the Upper property doesn't include the character | |
1684 | # at 255, but does include the one at 256. We don't include the 256 one. | |
1685 | my $prop_name = $prop; | |
1686 | my $is_local_sub = $prop_name =~ s/^&//; | |
99f21fb9 KW |
1687 | my $extra_enums = ""; |
1688 | $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x; | |
0f5e3c71 KW |
1689 | my $lookup_prop = $prop_name; |
1690 | my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ | |
1691 | or $lookup_prop =~ s/^L1//); | |
1692 | my $nonl1_only = 0; | |
1693 | $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only; | |
99f21fb9 | 1694 | ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x; |
0f5e3c71 KW |
1695 | |
1696 | my @invlist; | |
99f21fb9 KW |
1697 | my @invmap; |
1698 | my $map_format; | |
1699 | my $map_default; | |
1700 | my $maps_to_code_point; | |
1701 | my $to_adjust; | |
0f5e3c71 KW |
1702 | if ($is_local_sub) { |
1703 | @invlist = eval $lookup_prop; | |
289ce9cc | 1704 | die $@ if $@; |
0f5e3c71 KW |
1705 | } |
1706 | else { | |
1707 | @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); | |
99f21fb9 | 1708 | if (! @invlist) { |
99f21fb9 | 1709 | |
ad85f59a KW |
1710 | # If couldn't find a non-empty inversion list, see if it is |
1711 | # instead an inversion map | |
1712 | my ($list_ref, $map_ref, $format, $default) | |
99f21fb9 | 1713 | = prop_invmap($lookup_prop, '_perl_core_internal_ok'); |
ad85f59a KW |
1714 | if (! $list_ref) { |
1715 | # An empty return here could mean an unknown property, or | |
1716 | # merely that the original inversion list is empty. Call | |
1717 | # in scalar context to differentiate | |
1718 | my $count = prop_invlist($lookup_prop, | |
1719 | '_perl_core_internal_ok'); | |
1720 | die "Could not find inversion list for '$lookup_prop'" | |
1721 | unless defined $count; | |
1722 | } | |
1723 | else { | |
18b852b3 KW |
1724 | @invlist = @$list_ref; |
1725 | @invmap = @$map_ref; | |
1726 | $map_format = $format; | |
1727 | $map_default = $default; | |
1728 | $maps_to_code_point = $map_format =~ /x/; | |
1729 | $to_adjust = $map_format =~ /a/; | |
ad85f59a | 1730 | } |
99f21fb9 | 1731 | } |
0f5e3c71 | 1732 | } |
ad85f59a KW |
1733 | |
1734 | |
1735 | # Short-circuit an empty inversion list. | |
1736 | if (! @invlist) { | |
1737 | output_invlist($prop_name, \@invlist, $charset); | |
1738 | next; | |
1739 | } | |
ceb1de32 | 1740 | |
99f21fb9 KW |
1741 | # Re-order the Unicode code points to native ones for this platform. |
1742 | # This is only needed for code points below 256, because native code | |
1743 | # points are only in that range. For inversion maps of properties | |
1744 | # where the mappings are adjusted (format =~ /a/), this reordering | |
1745 | # could mess up the adjustment pattern that was in the input, so that | |
1746 | # has to be dealt with. | |
1747 | # | |
1748 | # And inversion maps that map to code points need to eventually have | |
1749 | # all those code points remapped to native, and it's better to do that | |
1750 | # here, going through the whole list not just those below 256. This | |
1751 | # is because some inversion maps have adjustments (format =~ /a/) | |
1752 | # which may be affected by the reordering. This code needs to be done | |
1753 | # both for when we are translating the inversion lists for < 256, and | |
1754 | # for the inversion maps for everything. By doing both in this loop, | |
1755 | # we can share that code. | |
1756 | # | |
1757 | # So, we go through everything for an inversion map to code points; | |
1758 | # otherwise, we can skip any remapping at all if we are going to | |
1759 | # output only the above-Latin1 values, or if the range spans the whole | |
1760 | # of 0..256, as the remap will also include all of 0..256 (256 not | |
1761 | # 255 because a re-ordering could cause 256 to need to be in the same | |
1762 | # range as 255.) | |
1763 | if ((@invmap && $maps_to_code_point) | |
1764 | || (! $nonl1_only || ($invlist[0] < 256 | |
1765 | && ! ($invlist[0] == 0 && $invlist[1] > 256)))) | |
ceb1de32 | 1766 | { |
fb4554ea | 1767 | |
99f21fb9 | 1768 | if (! @invmap) { # Straight inversion list |
fb4554ea KW |
1769 | # Look at all the ranges that start before 257. |
1770 | my @latin1_list; | |
1771 | while (@invlist) { | |
1772 | last if $invlist[0] > 256; | |
1773 | my $upper = @invlist > 1 | |
1774 | ? $invlist[1] - 1 # In range | |
8a6c81cf KW |
1775 | |
1776 | # To infinity. You may want to stop much much | |
1777 | # earlier; going this high may expose perl | |
1778 | # deficiencies with very large numbers. | |
1779 | : $Unicode::UCD::MAX_CP; | |
fb4554ea | 1780 | for my $j ($invlist[0] .. $upper) { |
99f21fb9 | 1781 | push @latin1_list, a2n($j); |
0f5e3c71 | 1782 | } |
fb4554ea KW |
1783 | |
1784 | shift @invlist; # Shift off the range that's in the list | |
1785 | shift @invlist; # Shift off the range not in the list | |
0c4ecf42 | 1786 | } |
fb4554ea KW |
1787 | |
1788 | # Here @invlist contains all the ranges in the original that start | |
1789 | # at code points above 256, and @latin1_list contains all the | |
1790 | # native code points for ranges that start with a Unicode code | |
1791 | # point below 257. We sort the latter and convert it to inversion | |
1792 | # list format. Then simply prepend it to the list of the higher | |
1793 | # code points. | |
1794 | @latin1_list = sort { $a <=> $b } @latin1_list; | |
5a7e5385 | 1795 | @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list); |
fb4554ea | 1796 | unshift @invlist, @latin1_list; |
99f21fb9 KW |
1797 | } |
1798 | else { # Is an inversion map | |
1799 | |
1800 | # This is a similar procedure as plain inversion list, but has | |
1801 | # multiple buckets. A plain inversion list just has two | |
1802 | # buckets, 1) 'in' the list; and 2) 'not' in the list, and we | |
1803 | # pretty much can ignore the 2nd bucket, as it is completely | |
1804 | # defined by the 1st. But here, what we do is create buckets | |
1805 | # which contain the code points that map to each, translated | |
1806 | # to native and turned into an inversion list. Thus each | |
1807 | # bucket is an inversion list of native code points that map | |
1808 | # to it or don't map to it. We use these to create an | |
1809 | # inversion map for the whole property. | |
1810 | |
1811 | # As mentioned earlier, we use this procedure to not just | |
1812 | # remap the inversion list to native values, but also the maps | |
1813 | # of code points to native ones. In the latter case we have | |
1814 | # to look at the whole of the inversion map (or at least to | |
1815 | # above Unicode; as the maps of code points above that should | |
1816 | # all be to the default). | |
1817 | my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256; | |
1818 | |
1819 | my %mapped_lists; # A hash whose keys are the buckets. | |
1820 | while (@invlist) { | |
1821 | last if $invlist[0] > $upper_limit; | |
1822 | |
1823 | # This shouldn't actually happen, as prop_invmap() returns | |
1824 | # an extra element at the end that is beyond $upper_limit | |
1825 | die "inversion map that extends to infinity is unimplemented" unless @invlist > 1; | |
1826 | |
1827 | my $bucket; | |
1828 | |
1829 | # A hash key can't be a ref (we are only expecting arrays | |
1830 | # of scalars here), so convert any such to a string that | |
1831 | # will be converted back later (using a vertical tab as | |
1832 | # the separator). Even if the mapping is to code points, | |
1833 | # we don't translate to native here because the code | |
1834 | # output_map() calls to output these arrays assumes the | |
1835 | # input is Unicode, not native. | |
1836 | if (ref $invmap[0]) { | |
1837 | $bucket = join "\cK", @{$invmap[0]}; | |
1838 | } | |
1839 | elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) { | |
1840 | |
1841 | # Do convert to native for maps to single code points. | |
1842 | # There are some properties that have a few outlier | |
1843 | # maps that aren't code points, so the above test | |
1844 | # skips those. | |
1845 | $bucket = a2n($invmap[0]); | |
1846 | } else { | |
1847 | $bucket = $invmap[0]; | |
1848 | } | |
1849 | |
1850 | # We now have the bucket that all code points in the range | |
1851 | # map to, though possibly they need to be adjusted. Go | |
1852 | # through the range and put each translated code point in | |
1853 | # it into its bucket. | |
1854 | my $base_map = $invmap[0]; | |
1855 | for my $j ($invlist[0] .. $invlist[1] - 1) { | |
1856 | if ($to_adjust | |
1857 | # The 1st code point doesn't need adjusting | |
1858 | && $j > $invlist[0] | |
1859 | |
1860 | # Skip any non-numeric maps: these are outliers | |
1861 | # that aren't code points. | |
1862 | && $base_map =~ $numeric_re | |
1863 | |
1864 | # 'ne' because the default can be a string | |
1865 | && $base_map ne $map_default) | |
1866 | { | |
1867 | # We adjust by incrementing both the map and the | |
1868 | # bucket. For code point maps, the bucket is the | |
1869 | # map translated to native | |
1870 | $base_map++; | |
1871 | $bucket = ($maps_to_code_point) | |
1872 | ? a2n($base_map) | |
1873 | : $base_map; | |
1874 | } | |
1875 | |
1876 | # Add the native code point to the bucket for the | |
1877 | # current map | |
1878 | push @{$mapped_lists{$bucket}}, a2n($j); | |
1879 | } # End of loop through all code points in the range | |
1880 | |
1881 | # Get ready for the next range | |
1882 | shift @invlist; | |
1883 | shift @invmap; | |
1884 | } # End of loop through all ranges in the map. | |
1885 | |
1886 | # Here, @invlist and @invmap retain all the ranges from the | |
1887 | # originals that start with code points above $upper_limit. | |
1888 | # Each bucket in %mapped_lists contains all the code points | |
1889 | # that map to that bucket. If the bucket is for a map to a | |
1890 | # single code point, the bucket has been converted to | |
1891 | # native. If it is for something else (including | |
1892 | # multiple code points), no conversion is done. | |
1893 | # | |
1894 | # Now we recreate the inversion map into %xlated, but this | |
1895 | # time for the native character set. | |
1896 | my %xlated; | |
1897 | foreach my $bucket (keys %mapped_lists) { | |
1898 | |
1899 | # Sort and convert this bucket to an inversion list. The | |
1900 | # result will be that ranges that start with even-numbered | |
1901 | # indexes will be for code points that map to this bucket; | |
1902 | # odd ones map to some other bucket, and are discarded | |
1903 | # below. | |
1904 | @{$mapped_lists{$bucket}} | |
1905 | = sort{ $a <=> $b} @{$mapped_lists{$bucket}}; | |
1906 | @{$mapped_lists{$bucket}} | |
1907 | = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}}); | |
1908 | |
1909 | # Add each even-numbered range in the bucket to %xlated; | |
1910 | # so that the keys of %xlated become the range start code | |
1911 | # points, and the values are their corresponding maps. | |
1912 | while (@{$mapped_lists{$bucket}}) { | |
1913 | my $range_start = $mapped_lists{$bucket}->[0]; | |
1914 | if ($bucket =~ /\cK/) { | |
1915 | @{$xlated{$range_start}} = split /\cK/, $bucket; | |
1916 | } | |
1917 | else { | |
1918 | $xlated{$range_start} = $bucket; | |
1919 | } | |
1920 | shift @{$mapped_lists{$bucket}}; # Discard odd ranges | |
1921 | shift @{$mapped_lists{$bucket}}; # Get ready for next | |
1922 | # iteration | |
1923 | } | |
1924 | } # End of loop through all the buckets. | |
1925 | |
1926 | # Here %xlated's keys are the range starts of all the code | |
1927 | # points in the inversion map. Construct an inversion list | |
1928 | # from them. | |
1929 | my @new_invlist = sort { $a <=> $b } keys %xlated; | |
1930 | |
1931 | # If the list is adjusted, we want to munge this list so that | |
1932 | # we only have one entry for where consecutive code points map | |
1933 | # to consecutive values. We just skip the subsequent entries | |
1934 | # where this is the case. | |
1935 | if ($to_adjust) { | |
1936 | my @temp; | |
1937 | for my $i (0 .. @new_invlist - 1) { | |
1938 | next if $i > 0 | |
1939 | && $new_invlist[$i-1] + 1 == $new_invlist[$i] | |
1940 | && $xlated{$new_invlist[$i-1]} =~ $numeric_re | |
1941 | && $xlated{$new_invlist[$i]} =~ $numeric_re | |
1942 | && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]}; | |
1943 | push @temp, $new_invlist[$i]; | |
1944 | } | |
1945 | @new_invlist = @temp; | |
1946 | } | |
1947 | |
1948 | # The inversion map comes from %xlated's values. We can | |
1949 | # unshift each onto the front of the untouched portion, in | |
1950 | # reverse order of the portion we did process. | |
1951 | foreach my $start (reverse @new_invlist) { | |
1952 | unshift @invmap, $xlated{$start}; | |
1953 | } | |
1954 | |
1955 | # Finally prepend the inversion list we have just constructed to the | |
1956 | # one that contains anything we didn't process. | |
1957 | unshift @invlist, @new_invlist; | |
1958 | } | |
1959 | } | |
1960 | |
1961 | # prop_invmap() returns an extra final entry, which we can now | |
1962 | # discard. | |
1963 | if (@invmap) { | |
1964 | pop @invlist; | |
1965 | pop @invmap; | |
ceb1de32 | 1966 | } |
0f5e3c71 KW |
1967 | |
1968 | if ($l1_only) { | |
99f21fb9 | 1969 | die "Unimplemented to do a Latin-1 only inversion map" if @invmap; |
0f5e3c71 KW |
1970 | for my $i (0 .. @invlist - 1 - 1) { |
1971 | if ($invlist[$i] > 255) { | |
1972 | |
1973 | # In an inversion list, even-numbered elements give the code | |
1974 | # points that begin ranges that match the property; | |
1975 | # odd-numbered give ones that begin ranges that don't match. | |
1976 | # If $i is odd, we are at the first code point above 255 that | |
1977 | # doesn't match, which means the range it is ending does | |
1978 | # match, and crosses the 255/256 boundary. We want to include | |
1979 | # this ending point, so increment $i, so the splice below | |
1980 | # includes it. Conversely, if $i is even, it is the first | |
1981 | # code point above 255 that matches, which means there was no | |
1982 | # matching range that crossed the boundary, and we don't want | |
1983 | # to include this code point, so splice before it. | |
1984 | $i++ if $i % 2 != 0; | |
1985 | |
1986 | # Remove everything past this. | |
1987 | splice @invlist, $i; | |
99f21fb9 | 1988 | splice @invmap, $i if @invmap; |
0f5e3c71 KW |
1989 | last; |
1990 | } | |
0c4ecf42 KW |
1991 | } |
1992 | } | |
0f5e3c71 KW |
1993 | elsif ($nonl1_only) { |
1994 | my $found_nonl1 = 0; | |
1995 | for my $i (0 .. @invlist - 1 - 1) { | |
1996 | next if $invlist[$i] < 256; | |
1997 | |
1998 | # Here, we have the first element in the array that indicates an | |
1999 | # element above Latin1. Get rid of all previous ones. | |
2000 | splice @invlist, 0, $i; | |
99f21fb9 | 2001 | splice @invmap, 0, $i if @invmap; |
0f5e3c71 KW |
2002 | |
2003 | # If this one's index is not divisible by 2, it means that this | |
2004 | # element is inverting away from being in the list, which means | |
99f21fb9 KW |
2005 | # all code points from 256 to this one are in this list (or |
2006 | # map to the default for inversion maps) | |
2007 | if ($i % 2 != 0) { | |
2008 | unshift @invlist, 256; | |
2009 | unshift @invmap, $map_default if @invmap; | |
2010 | } | |
0f5e3c71 | 2011 | $found_nonl1 = 1; |
3f427fd9 KW |
2012 | last; |
2013 | } | |
0f5e3c71 | 2014 | die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1; |
3f427fd9 | 2015 | } |
3f427fd9 | 2016 | |
0f5e3c71 | 2017 | output_invlist($prop_name, \@invlist, $charset); |
99f21fb9 | 2018 | output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap; |
0f5e3c71 | 2019 | } |
bffc0129 | 2020 | end_file_pound_if; |
0c4ecf42 | 2021 | print $out_fh "\n" . get_conditional_compile_line_end(); |
9d9177be KW |
2022 | } |
2023 | ||
973a28ed KW |
# Emit the GCB, LB and WB pair tables, bracketed by switch_pound_if() and
# end_file_pound_if.  NOTE(review): those helpers presumably wrap the tables
# in a C preprocessor conditional for PERL_IN_REGEXEC_C -- confirm against
# their definitions.
switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');

for my $emit_table (\&output_GCB_table,
                    \&output_LB_table,
                    \&output_WB_table)
{
    $emit_table->();
}

end_file_pound_if;
2031 | ||
# Build the list of files that the generated header depends on, then hand it,
# together with the output handle, to read_only_bottom_close_and_rename().
my $sources_list = "lib/unicore/mktables.lst";
my @sources = ($0, qw(lib/unicore/mktables
                      lib/Unicode/UCD.pm
                      regen/charset_translations.pl
                 ));
{
    # Depend on mktables' own sources.  It's a shorter list of files than
    # those that Unicode::UCD uses.
    #
    # Use the three-argument form of open with an explicit read mode, so that
    # the file name can never be interpreted as containing a mode or a pipe.
    if (! open my $mktables_list, '<', $sources_list) {

        # This should force a rebuild once $sources_list exists
        push @sources, $sources_list;
    }
    else {

        # Read into a lexical rather than the global $_, so nothing outside
        # this block is clobbered.
        while (my $line = <$mktables_list>) {

            # Everything from the '===' separator line onwards is ignored.
            last if $line =~ /===/;
            chomp $line;

            # Skip empty lines and lines that begin with '#'.
            push @sources, "lib/unicore/$line" if $line =~ /^[^#]/;
        }
        close $mktables_list;
    }
}

read_only_bottom_close_and_rename($out_fh, \@sources);