Commit | Line | Data |
---|---|---|
9d9177be KW |
1 | #!perl -w |
2 | use 5.015; | |
3 | use strict; | |
4 | use warnings; | |
99f21fb9 KW |
5 | use Unicode::UCD qw(prop_aliases |
6 | prop_values | |
7 | prop_value_aliases | |
8 | prop_invlist | |
9 | prop_invmap search_invlist | |
10 | ); | |
3d7c117d MB |
11 | require './regen/regen_lib.pl'; |
12 | require './regen/charset_translations.pl'; | |
9d9177be KW |
13 | |
14 | # This program outputs charclass_invlists.h, which contains various inversion | |
15 | # lists in the form of C arrays that are to be used as-is for inversion lists. | |
16 | # Thus, the lists it contains are essentially pre-compiled, and need only a | |
17 | # light-weight fast wrapper to make them usable at run-time. | |
18 | ||
19 | # As such, this code knows about the internal structure of these lists, and | |
20 | # any change made to that has to be done here as well. A random number stored | |
21 | # in the headers is used to minimize the possibility of things getting | |
22 | # out-of-sync, or the wrong data structure being passed. Currently that | |
23 | # random number is: | |
99f21fb9 KW |
24 | |
25 | # charclass_invlists.h now also has a partial implementation of inversion | |
26 | # maps; enough to generate tables for the line break properties, such as GCB | |
27 | ||
0a07b44b | 28 | my $VERSION_DATA_STRUCTURE_TYPE = 148565664; |
9d9177be | 29 | |
99f21fb9 KW |
# Matches a decimal integer or float: an optional leading minus sign, one or
# more ASCII digits, then an optional fractional part.  The fractional part
# is a non-capturing group, '(?:'.  (Spelling it '(:?' would instead create a
# capturing group that accepts an optional literal colon before the decimal
# point, so that something like "1:.5" would wrongly be taken as numeric.)
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;

# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36 | ||
9d9177be KW |
# Handle on the header file being generated.  open_new() is not defined in
# this file; presumably it comes from regen/regen_lib.pl, which is pulled in
# by 'require' above.
my $out_fh = open_new(
    'charclass_invlists.h',
    '>',
    {
        style => '*',
        by    => $0,
        from  => "Unicode::UCD",
    },
);

# The text of the '#if' condition currently open in the output file, or 0 when
# there is none.  Maintained by switch_pound_if() and end_file_pound_if().
my $in_file_pound_if = 0;

# In headings, how wide a name is allowed?
my $max_hdr_len = 3;

print $out_fh "/* See the generating file for comments */\n\n";
46 | ||
bffc0129 KW |
# Each symbol generated by this program is currently defined in just one .c
# file.  The code knows the usual destination for most of them; this hash
# lists the symbols that go somewhere else, mapping each symbol's name to the
# preprocessor symbol that guards its definition.
my %exceptions_to_where_to_define = (
    ( map { $_ => 'PERL_IN_REGCOMP_C' } qw(
            NonL1_Perl_Non_Final_Folds
            AboveLatin1
            Latin1
            UpperLatin1
            _Perl_Any_Folds
            _Perl_Folds_To_Multi_Char
      )
    ),
    ( map { $_ => 'PERL_IN_UTF8_C' } qw(
            _Perl_IDCont
            _Perl_IDStart
      )
    ),
);
015bb97c | 60 | |
# The properties whose enum values are referred to by name in hand-written C
# code.  This hash is needed so that when perl is compiled with an older
# Unicode data set, every enum value the C code expects is still present in
# the generated enum typedef, and hence the C code doesn't have to change.
# An older Unicode version has no code points with the values it lacks, so the
# code handling those values simply never gets exercised; that is far better
# than having to #ifdef things.  The names here should be the long names of
# the respective property values, because regexec.c uses them as case labels,
# and the long name is generally more understandable than the short.
#
# The keys are the lowercased short names of the properties.
my %hard_coded_enums = (
    gcb => [ qw(
                Control
                CR
                E_Base
                E_Base_GAZ
                E_Modifier
                Extend
                Glue_After_Zwj
                L
                LF
                LV
                LVT
                Other
                Prepend
                Regional_Indicator
                SpacingMark
                T
                V
                ZWJ
           ) ],
    lb  => [ qw(
                Alphabetic
                Break_After
                Break_Before
                Break_Both
                Break_Symbols
                Carriage_Return
                Close_Parenthesis
                Close_Punctuation
                Combining_Mark
                Contingent_Break
                E_Base
                E_Modifier
                Exclamation
                Glue
                H2
                H3
                Hebrew_Letter
                Hyphen
                Ideographic
                Infix_Numeric
                Inseparable
                JL
                JT
                JV
                Line_Feed
                Mandatory_Break
                Next_Line
                Nonstarter
                Numeric
                Open_Punctuation
                Postfix_Numeric
                Prefix_Numeric
                Quotation
                Regional_Indicator
                Space
                Word_Joiner
                ZWJ
                ZWSpace
           ) ],
    sb  => [ qw(
                ATerm
                Close
                CR
                Extend
                Format
                LF
                Lower
                Numeric
                OLetter
                Other
                SContinue
                Sep
                Sp
                STerm
                Upper
           ) ],
    wb  => [ qw(
                ALetter
                CR
                Double_Quote
                E_Base
                E_Base_GAZ
                E_Modifier
                Extend
                ExtendNumLet
                Format
                Glue_After_Zwj
                Hebrew_Letter
                Katakana
                LF
                MidLetter
                MidNum
                MidNumLet
                Newline
                Numeric
                Other
                Perl_Tailored_HSpace
                Regional_Indicator
                Single_Quote
                ZWJ
           ) ],
);
175 | ||
973a28ed KW |
# Per-property state, one trio for each of GCB, LB, and WB.  output_invmap()
# fills these in by name, using string evals:
#   %xx_enums           property value (enum) name => its numeric value
#   @xx_short_enums     numeric value => short name used to label the rows
#                       and columns of the generated table
#   %xx_abbreviations   truncated short name => the name it stands for
my (%gcb_enums, @gcb_short_enums, %gcb_abbreviations);
my (%lb_enums,  @lb_short_enums,  %lb_abbreviations);
my (%wb_enums,  @wb_short_enums,  %wb_abbreviations);

# Table indexed by code points 0..255, consulted by a2n().  It is not
# populated in this part of the file.
my @a2n;
187 | ||
sub uniques {
    # Returns the input values with duplicates removed; the first occurrence
    # of each value is kept, and the original relative order is preserved.

    my %already_seen;
    my @distinct;
    for my $value (@_) {
        next if $already_seen{$value}++;
        push @distinct, $value;
    }
    return @distinct;
}
195 | ||
sub a2n($) {
    # Returns the input Unicode code point translated to native, by looking
    # it up in @a2n.  Anything that doesn't look numeric according to
    # $numeric_re, or is above 255, is returned unchanged.

    my $code_point = shift;

    my $is_translatable =    $code_point =~ $numeric_re
                          && $code_point <= 255;
    return $is_translatable ? $a2n[$code_point] : $code_point;
}
204 | ||
bffc0129 KW |
sub end_file_pound_if {
    # Closes the '#if' currently open in the output file, if any, and records
    # that none is open any more.

    return unless $in_file_pound_if;

    print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
    $in_file_pound_if = 0;
}
211 | ||
sub switch_pound_if ($$) {
    my $name = shift;
    my $new_pound_if = shift;

    # Switch to new #if given by the 2nd argument.  If there is an override
    # for this, it instead switches to that.  The 1st argument is the
    # static's name, used to look up the overrides in
    # %exceptions_to_where_to_define.
    #
    # Nothing is output if the file is already inside the wanted #if.

    if (exists $exceptions_to_where_to_define{$name}) {
        $new_pound_if = $exceptions_to_where_to_define{$name};
    }

    # This is the exact text that gets stored in $in_file_pound_if when the
    # #if is entered, so a plain string comparison tells us whether we are
    # already in it.  (A pattern match on the bare symbol would also accept
    # any symbol that merely contains it, and would treat any regular
    # expression metacharacters in it as special.)
    my $new_condition = "defined($new_pound_if)";

    # Exit current #if if the new one is different from the old
    if ($in_file_pound_if && $in_file_pound_if ne $new_condition) {
        end_file_pound_if();
    }

    # Enter new #if, if not already in it.
    if (! $in_file_pound_if) {
        $in_file_pound_if = $new_condition;
        print $out_fh "\n#if $in_file_pound_if\n";
    }
}
237 | ||
sub output_invlist ($$;$) {
    my $name = shift;
    my $invlist = shift;        # Reference to inversion list array
    my $charset = shift // "";  # name of character set for comment

    # Writes the inversion list $invlist to the output file as the C array
    # ${name}_invlist, in the exact internal form used for inversion lists:
    # three header words (element count, version/data structure type, and
    # the 0-or-1 offset), followed by the elements in hex.
    #
    # Note that when the list doesn't begin with 0, a 0 is prepended to the
    # caller's array itself.

    die "No inversion list for $name"
                        unless defined $invlist && ref $invlist eq 'ARRAY';

    # The last header element is 1 if we have to add a leading 0; else 0
    my $zero_or_one = (@$invlist && $invlist->[0] != 0) ? 1 : 0;
    unshift @$invlist, 0 if $zero_or_one;

    # Counted after any 0 has been prepended
    my $element_count = scalar @$invlist;

    switch_pound_if ($name, 'PERL_IN_PERL_C');

    my $charset_comment = ($charset) ? " /* for $charset */" : "";
    print $out_fh "\nstatic const UV ${name}_invlist[] = {"
                . $charset_comment
                . "\n";

    # The header
    print $out_fh "\t$element_count,\t/* Number of elements */\n";
    print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
    print $out_fh "\t", $zero_or_one,
                  ",\t/* 0 if the list starts at 0;",
                  "\n\t\t 1 if it starts at the element beyond 0 */\n";

    # The main body is the UVs passed in to this routine, one per line, with
    # a comma after each but the last.
    if (@$invlist) {
        print $out_fh join(",\n", map { sprintf "\t0x%X", $_ } @$invlist),
                      "\n";
    }

    print $out_fh "};\n";
}
279 | ||
99f21fb9 KW |
sub output_invmap ($$$$$$$) {
    my $name = shift;
    my $invmap = shift;         # Reference to inversion map array
    my $prop_name = shift;
    my $input_format = shift;   # The inversion map's format
    my $default = shift;        # The property value for code points who
                                # otherwise don't have a value specified.
    my $extra_enums = shift;    # comma-separated list of our additions to the
                                # property's standard possible values
    my $charset = shift // "";  # name of character set for comment

    # Output the inversion map $invmap for property $prop_name, but use $name
    # as the actual data structure's name.
    #
    # Only the 's' (string/enum) input format is handled.  For it, this
    # outputs a '#define <prefix>ENUM_COUNT', a 'typedef enum' giving a
    # number to each property value, and then the map itself as a C array of
    # that enum type.  As a side effect, when $name is one of _Perl_GCB,
    # _Perl_LB, or _Perl_WB, the file-level %xx_enums, @xx_short_enums, and
    # %xx_abbreviations for that property are populated, if they haven't been
    # already.
    #
    # Dies if $invmap isn't a non-empty array reference, if the format is
    # unimplemented, if the property has no enum values, or if
    # %hard_coded_enums is out of date for this Unicode version.

    # Check this before the first dereference, so that a bad argument gets
    # our message rather than a generic "not an ARRAY reference" one.
    die "No inversion map for $prop_name" unless defined $invmap
                                              && ref $invmap eq 'ARRAY';

    my $count = @$invmap;

    my $declaration_type;
    my %enums;
    my $name_prefix;

    if ($input_format eq 's') {
        my $orig_prop_name = $prop_name;
        $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
        my $short_name = (prop_aliases($prop_name))[0] // $prop_name;

        # If the name came back unchanged, get the possible values from
        # Unicode::UCD; otherwise use the values actually in the map.
        my @enums;
        if ($orig_prop_name eq $prop_name) {
            @enums = prop_values($prop_name);
        }
        else {
            @enums = uniques(@$invmap);
        }

        if (! @enums) {
            die "Only enum properties are currently handled; '$prop_name' isn't one";
        }
        else {

            # A property with no entry in %hard_coded_enums simply has no
            # expected values; the '// []' keeps the dereference from dying
            # under 'use strict' in that case.
            my @expected_enums
                            = @{ $hard_coded_enums{lc $short_name} // [] };
            my @canonical_input_enums;
            if (@expected_enums) {
                if (@expected_enums < @enums) {
                    die 'You need to update %hard_coded_enums to reflect new'
                     . " entries in this Unicode version\n"
                     . "Expected: " . join(", ", sort @expected_enums) . "\n"
                     . " Got: " . join(", ", sort @enums);
                }

                if (! defined prop_aliases($prop_name)) {

                    # Convert the input enums into canonical form and
                    # save for use below
                    @canonical_input_enums = map { lc ($_ =~ s/_//gr) }
                                                                     @enums;
                }
                @enums = sort @expected_enums;
            }

            # The internal enums come last, and in the order specified
            my @extras;
            if ($extra_enums ne "") {
                @extras = split /,/, $extra_enums;
                push @enums, @extras;
            }

            # Assign a value to each element of the enum.  The default
            # value always gets 0; the others are arbitrarily assigned.
            my $enum_val = 0;
            my $canonical_default = prop_value_aliases($prop_name, $default);
            $default = $canonical_default if defined $canonical_default;
            $enums{$default} = $enum_val++;
            for my $enum (@enums) {
                $enums{$enum} = $enum_val++ unless exists $enums{$enum};
            }

            # Calculate the enum values for certain properties like
            # _Perl_GCB and _Perl_LB, because we output special tables for
            # them.
            if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {

                # We use string evals to allow the same code to work on
                # all tables we're doing.  $type selects which of the
                # file-level %xx_enums, @xx_short_enums, and
                # %xx_abbreviations are accessed.
                my $type = lc $prop_name;

                # We use lowercase single letter names for any property
                # values not in the release of Unicode being compiled now.
                my $placeholder = "a";

                # Skip if we've already done this code, which populated
                # this hash
                if (eval "! \%${type}_enums") {

                    # For each enum ...
                    foreach my $enum (sort keys %enums) {
                        my $value = $enums{$enum};
                        my $short;
                        my $abbreviated_from;

                        # Special case this wb property value to make the
                        # name more clear
                        if ($enum eq 'Perl_Tailored_HSpace') {
                            $short = 'hs';
                            $abbreviated_from = $enum;
                        }
                        elsif (grep { $_ eq $enum } @extras) {

                            # The 'short' name for one of the property
                            # values added by this file is just the
                            # lowercase of it
                            $short = lc $enum;
                        }
                        elsif (grep {$_ eq lc ( $enum =~ s/_//gr) }
                                                        @canonical_input_enums)
                        {   # On Unicode versions that predate the
                            # official property, we have set up this array
                            # to be the canonical form of each enum in the
                            # substitute property.  If the enum we're
                            # looking at is canonically the same as one of
                            # these, use its name instead of generating a
                            # placeholder one in the next clause (which
                            # will happen because prop_value_aliases()
                            # will fail because it only works on official
                            # properties)
                            $short = $enum;
                        }
                        else {
                            # Use the official short name for the other
                            # property values, which should all be
                            # official ones.
                            ($short) = prop_value_aliases($type, $enum);

                            # But create a placeholder for ones not in
                            # this Unicode version.
                            $short = $placeholder++ unless defined $short;
                        }

                        # If our short name is too long, or we already
                        # know that the name is an abbreviation, truncate
                        # to make sure it's short enough, and remember
                        # that we did this so we can later place in a
                        # comment in the generated file
                        if (   $abbreviated_from
                            || length $short > $max_hdr_len)
                        {
                            $short = substr($short, 0, $max_hdr_len);
                            $abbreviated_from = $enum
                                                unless $abbreviated_from;

                            # If the name we are to display conflicts, try
                            # another.
                            while (eval "exists
                                         \$${type}_abbreviations{$short}")
                            {
                                die $@ if $@;
                                $short++;
                            }

                            eval "\$${type}_abbreviations{$short} = '$enum'";
                            die $@ if $@;
                        }

                        # Remember the mapping from the property value
                        # (enum) name to its value.
                        eval "\$${type}_enums{$enum} = $value";
                        die $@ if $@;

                        # Remember the inverse mapping to the short name
                        # so that we can properly label the generated
                        # table's rows and columns
                        eval "\$${type}_short_enums[$value] = '$short'";
                        die $@ if $@;
                    }
                }
            }
        }

        # Inversion map stuff is currently used only by regexec
        switch_pound_if($name, 'PERL_IN_REGEXEC_C');
        {

            # The short names tend to be two lower case letters, but it looks
            # better for those if they are upper. XXX
            $short_name = uc($short_name) if length($short_name) < 3
                                || substr($short_name, 0, 1) =~ /[[:lower:]]/;
            $name_prefix = "${short_name}_";
            my $enum_count = keys %enums;
            print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", $enum_count, "\n";

            # Output the enum members in the order of their values
            print $out_fh "\ntypedef enum {\n";
            my @enum_list;
            foreach my $enum (keys %enums) {
                $enum_list[$enums{$enum}] = $enum;
            }
            foreach my $i (0 .. @enum_list - 1) {
                my $enum_name = $enum_list[$i];
                print $out_fh "\t${name_prefix}$enum_name = $i";
                print $out_fh "," if $i < $enum_count - 1;
                print $out_fh "\n";
            }
            $declaration_type = "${name_prefix}enum";
            print $out_fh "} $declaration_type;\n";
        }
    }
    else {
        die "'$input_format' invmap() format for '$prop_name' unimplemented";
    }

    # That it is an array reference was checked at the top of this routine
    die "No inversion map for $prop_name" unless $count;

    print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    # The main body are the scalars passed in to this routine.  Each is
    # output as the name prop_value_aliases() gives for it, if any, else
    # unchanged; in either case prefixed to form the enum member's name.
    for my $i (0 .. $count - 1) {
        my $element = $invmap->[$i];
        my $full_element_name = prop_value_aliases($prop_name, $element);
        $element = $full_element_name if defined $full_element_name;
        $element = $name_prefix . $element;
        print $out_fh "\t$element";
        print $out_fh "," if $i < $count - 1;
        print $out_fh "\n";
    }
    print $out_fh "};\n";
}
507 | ||
sub mk_invlist_from_sorted_cp_list {

    # Returns an inversion list constructed from the sorted input array of
    # code points.  The single parameter is a reference to that array, which
    # must be in ascending order; a code point that is repeated in it counts
    # just once.  An empty input array yields an empty list.

    my $list_ref = shift;

    return unless @$list_ref;

    # Initialize to just the first element
    my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);

    # For each succeeding element, if it extends the previous range, adjust
    # up, otherwise add it.
    for my $i (1 .. @$list_ref - 1) {
        my $cp = $list_ref->[$i];

        # A repeat of the previous code point is already in the current
        # range.  Starting a new range for it would make the list
        # non-monotonic.
        next if $cp == $list_ref->[$i - 1];

        if ($invlist[-1] == $cp) {
            $invlist[-1]++;
        }
        else {
            push @invlist, $cp, $cp + 1;
        }
    }
    return @invlist;
}
532 | ||
# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al'
                                                           || $format eq 'a';

# The code points in $cp_ref whose fold is to multiple characters
my @has_multi_char_fold;

# Each distinct code point that appears in a multi-character fold somewhere
# other than at its end, in order of first appearance
my @is_non_final_fold;

{
    # So that each code point is added just once, without having to rescan
    # the whole list every time
    my %seen_non_final;

    for my $i (0 .. @$folds_ref - 1) {
        my $fold = $folds_ref->[$i];
        next unless ref $fold;  # Skip single-char folds
        push @has_multi_char_fold, $cp_ref->[$i];

        # Add to the non-finals list each code point that is in a non-final
        # position
        for my $j (0 .. @$fold - 2) {
            my $cp = $fold->[$j];
            push @is_non_final_fold, $cp unless $seen_non_final{$cp}++;
        }
    }
}
554 | ||
a02047bf KW |
sub _Perl_Non_Final_Folds {

    # Returns the inversion list of the code points in @is_non_final_fold.
    # That array is left sorted into ascending numeric order.

    my @ascending = sort { $a <=> $b } @is_non_final_fold;
    @is_non_final_fold = @ascending;

    return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
}
559 | ||
99f21fb9 KW |
sub prop_name_for_cmp ($) { # Sort helper

    # Returns the key to compare the input property name by: everything from
    # the first comma onwards is dropped, then every non-alphabetic character
    # is removed, and what remains is lowercased.

    my ($raw_name) = @_;

    my $before_comma = $raw_name =~ s/,.*//r;
    my $alphas_only = $before_comma =~ s/[[:^alpha:]]//gr;

    return lc $alphas_only;
}
570 | ||
sub UpperLatin1 {

    # Returns the inversion list for the code points 128 through 255

    my @upper_half = (128 .. 255);
    return mk_invlist_from_sorted_cp_list(\@upper_half);
}
574 | ||
289ce9cc KW |
sub output_table_common {

    # Common subroutine to actually output the generated rules table.  It
    # writes to the output file, in order:
    #   1) a '#define' for each entry in %$table_value_defines_ref (skipped
    #      if that argument is false);
    #   2) the declaration of the square C array ${property}_table;
    #   3) an explanatory comment, if any row/column names are abbreviations
    #      or placeholders;
    #   4) a comment line of column headings; and
    #   5) the table itself, each row labelled with its name.
    #
    # The parameters are:
    #   $property                   prefix for the C array's name
    #   $table_value_defines_ref    ref to hash of #define name => value
    #   $table_ref                  ref to the square array of arrays to
    #                               output
    #   $names_ref                  ref to array giving the short name of
    #                               each row/column
    #   $abbreviations_ref          ref to hash of abbreviation => what it
    #                               stands for

    my ($property,
        $table_value_defines_ref,
        $table_ref,
        $names_ref,
        $abbreviations_ref) = @_;
    my $size = @$table_ref;

    # Output the #define list, sorted by numeric value
    if ($table_value_defines_ref) {
        my $max_name_length = 0;
        my @defines;

        # Put in order, and at the same time find the longest name
        while (my ($enum, $value) = each %$table_value_defines_ref) {
            $defines[$value] = $enum;

            my $length = length $enum;
            $max_name_length = $length if $length > $max_name_length;
        }

        print $out_fh "\n";

        # Output, so that the values are vertically aligned in a column after
        # the longest name
        foreach my $i (0 .. @defines - 1) {
            next unless defined $defines[$i];
            printf $out_fh "#define %-*s %2d\n",
                                      $max_name_length,
                                       $defines[$i],
                                          $i;
        }
    }

    my $column_width = 2;   # We currently allow 2 digits for the number

    # If the maximum value in the table is 1, it can be a bool.  (Being above
    # a U8 is not currently handled.)
    my $max_element = 0;
    for my $i (0 .. $size - 1) {
        for my $j (0 .. $size - 1) {
            next if $max_element >= $table_ref->[$i][$j];
            $max_element = $table_ref->[$i][$j];
        }
    }
    die "Need wider table column width given '$max_element'"
                                    if length $max_element > $column_width;

    my $table_type = ($max_element == 1)
                     ? 'bool'
                     : 'U8';

    # If a name is longer than the width set aside for a column, its column
    # needs to have increased spacing so that the name doesn't get truncated
    # nor run into an adjacent column
    my @spacers;

    # If we are being compiled on a Unicode version earlier than that which
    # this file was designed for, it may be that some of the property values
    # aren't in the current release, and so would be undefined if we didn't
    # define them ourselves.  Earlier code has done this, making them
    # lowercase characters of length one.  We look to see if any exist, so
    # that we can add an annotation to the output table
    my $has_placeholder = 0;

    for my $i (0 .. $size - 1) {

        # The repeat count below is negative for names narrower than a
        # column.  That yields an empty spacer, which is what we want.
        no warnings 'numeric';
        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
        $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
    }

    print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";

    # Calculate the column heading line
    my $header_line = "/* "
                    . (" " x $max_hdr_len)  # We let the row heading meld to
                                            # the '*/' for those that are at
                                            # the max
                    . " " x 3;    # Space for '*/ '
    # Now each column
    for my $i (0 .. $size - 1) {
        $header_line .= sprintf "%s%*s",
                                $spacers[$i],
                                    $column_width + 1, # 1 for the ','
                                     $names_ref->[$i];
    }
    $header_line .= " */\n";

    # If we have annotations, output it now.
    if ($has_placeholder || scalar %$abbreviations_ref) {
        my $text = "";
        foreach my $abbr (sort keys %$abbreviations_ref) {
            $text .= "; " if $text;
            $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
        }
        if ($has_placeholder) {
            $text .= "; other " if $text;
            $text .= "lowercase names are placeholders for"
                  .  " property values not defined until a later Unicode"
                  .  " release, so are irrelevant in this one, as they are"
                  .  " not assigned to any code points";
        }

        my $indent = " " x 3;
        $text = $indent . "/* $text */";

        # Every line of the annotation begins with the indent followed by a
        # three character comment leader ('/* ' or ' * ').  A line must never
        # be broken inside that, as the remainder wouldn't get any shorter,
        # and we would loop forever.
        my $prefix_length = length($indent) + 3;

        # Wrap the text so that it is no wider than the table, which the
        # header line gives.
        my $output_width = length $header_line;
        while (length $text > $output_width) {

            # Find the first blank back from the right end to wrap at.
            my $break_at;
            for (my $i = $output_width - 1; $i >= $prefix_length; $i--) {
                if (substr($text, $i, 1) eq " ") {
                    $break_at = $i;
                    last;
                }
            }

            # If a single word is too long to fit, let its line be overlong
            # by breaking at the first blank after it instead.  If there
            # isn't even one of those, what's left can't be split at all.
            if (! defined $break_at) {
                $break_at = index($text, " ", $output_width);
                last if $break_at < 0;
            }

            print $out_fh substr($text, 0, $break_at), "\n";

            # Set so will look at just the remaining tail (which will
            # be indented and have a '*' after the indent
            $text = $indent . " * " . substr($text, $break_at + 1);
        }

        # And any remaining
        print $out_fh $text, "\n" if $text;
    }

    # We calculated the header line earlier just to get its width so that we
    # could make sure the annotations fit into that.
    print $out_fh $header_line;

    # Now output the bulk of the table.
    for my $i (0 .. $size - 1) {

        # First the row heading.
        printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
        print $out_fh "{";  # Then the brace for this row

        # Then each column
        for my $j (0 .. $size -1) {
            print $out_fh $spacers[$j];
            printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
            print $out_fh "," if $j < $size - 1;
        }
        print $out_fh " }";
        print $out_fh "," if $i < $size - 1;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
731 | ||
973a28ed KW |
sub output_GCB_table() {

    # Create and output the pair table for use in determining Grapheme Cluster
    # Breaks, given in http://www.unicode.org/reports/tr29/.
    #
    # The table is indexed first by the GCB value of the character before the
    # potential break position, then by that of the character after it.
    # Each entry is one of the following actions:
    my %gcb_actions = (
        GCB_NOBREAK     => 0,
        GCB_BREAKABLE   => 1,
        GCB_RI_then_RI  => 2,   # Rules 12 and 13
        GCB_EX_then_EM  => 3,   # Rule 10
    );

    # The rules are applied in the reverse of their order in the
    # specification.  The algorithm stops at the earliest matching rule, so
    # the lower-numbered rules have the higher priority, and doing them later
    # here lets them overwrite the results of the higher-numbered ones.

    my @gcb_table;
    my $table_size = @gcb_short_enums;

    # Sets the action for the pair consisting of the named 'before' value
    # followed by the named 'after' value.
    my $set_pair = sub {
        my ($before, $after, $action) = @_;
        $gcb_table[$gcb_enums{$before}][$gcb_enums{$after}] = $action;
    };

    # Otherwise, break everywhere.
    # GB99 Any ÷ Any
    for my $before (0 .. $table_size - 1) {
        $gcb_table[$before][$_] = 1 for 0 .. $table_size - 1;
    }

    # Do not break within emoji flag sequences.  That is, do not break between
    # regional indicator (RI) symbols if there is an odd number of RI
    # characters before the break point.  Must be resolved in runtime code.
    #
    # GB12 ^ (RI RI)* RI × RI
    # GB13 [^RI] (RI RI)* RI × RI
    $set_pair->('Regional_Indicator', 'Regional_Indicator',
                $gcb_actions{GCB_RI_then_RI});

    # Do not break within emoji modifier sequences or emoji zwj sequences.
    # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
    $set_pair->('ZWJ', $_, 0) for qw(Glue_After_Zwj E_Base_GAZ);

    # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier
    $set_pair->('Extend', 'E_Modifier', $gcb_actions{GCB_EX_then_EM});
    $set_pair->($_, 'E_Modifier', 0) for qw(E_Base E_Base_GAZ);

    # Do not break before extending characters or ZWJ.
    # Do not break before SpacingMarks, or after Prepend characters.
    # GB9b Prepend ×
    # GB9a × SpacingMark
    # GB9 × ( Extend | ZWJ )
    for my $other (0 .. @gcb_table - 1) {
        $gcb_table[$gcb_enums{'Prepend'}][$other] = 0;
        for my $after (qw(SpacingMark Extend ZWJ)) {
            $gcb_table[$other][$gcb_enums{$after}] = 0;
        }
    }

    # Do not break Hangul syllable sequences.
    # GB8 ( LVT | T) × T
    $set_pair->($_, 'T', 0) for qw(LVT T);

    # GB7 ( LV | V ) × ( V | T )
    for my $before (qw(LV V)) {
        $set_pair->($before, $_, 0) for qw(V T);
    }

    # GB6 L × ( L | V | LV | LVT )
    $set_pair->('L', $_, 0) for qw(L V LV LVT);

    # Do not break between a CR and LF.  Otherwise, break before and after
    # controls.
    # GB5 ÷ ( Control | CR | LF )
    # GB4 ( Control | CR | LF ) ÷
    for my $other (0 .. @gcb_table - 1) {
        for my $after (qw(Control CR LF)) {
            $gcb_table[$other][$gcb_enums{$after}] = 1;
        }
        for my $before (qw(Control CR LF)) {
            $gcb_table[$gcb_enums{$before}][$other] = 1;
        }
    }

    # GB3 CR × LF
    $set_pair->('CR', 'LF', 0);

    # Break at the start and end of text, unless the text is empty
    # GB1 sot ÷
    # GB2 ÷ eot
    for my $other (0 .. @gcb_table - 1) {
        $gcb_table[$other][$gcb_enums{'EDGE'}] = 1;
        $gcb_table[$gcb_enums{'EDGE'}][$other] = 1;
    }
    $set_pair->('EDGE', 'EDGE', 0);

    output_table_common('GCB', \%gcb_actions,
                        \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
}
835 | ||
6b659339 KW |
sub output_LB_table() {

    # Builds the Line Break pair table and passes it, along with the action
    # values, the short enum names and the abbreviations, to
    # output_table_common() under the name 'LB'.
    #
    # The rules are the default line break algorithm from
    # http://www.unicode.org/reports/tr14/, tailored by example 7 on that
    # page, because the Unicode-furnished tests assume that tailoring.
    #
    # $lb_table[BEFORE][AFTER] holds the action for a position that has a
    # character of class BEFORE on its left and one of class AFTER on its
    # right.  Takes no arguments; the return value is whatever
    # output_table_common() returns.

    # Action values.  0 and 1 are the plain answers; 2 is tr14's "X SP* X"
    # form of not breaking.  The remaining ones are synthetic and mean that
    # context must be examined at run time.  Most of them are added (+=) to
    # the underlying 0, 1 or 2 so that the underlying value stays
    # recoverable; LB_RI_then_RI and LB_CM_ZWJ_foo are stored outright.  The
    # middle group is named after mutually exclusive leading classes, so two
    # of them cannot be summed into one cell.  That is not evident for the
    # last one, hence it gets a power of 2 above anything the others can
    # reach, so a sum would still preserve all the data.
    my %lb_actions = (
        LB_NOBREAK                      => 0,
        LB_BREAKABLE                    => 1,
        LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,

        LB_CM_ZWJ_foo                   => 3,       # Rule 9
        LB_SP_foo                       => 6,       # Rule 18
        LB_PR_or_PO_then_OP_or_HY       => 9,       # Rule 25
        LB_SY_or_IS_then_various        => 11,      # Rule 25
        LB_HY_or_BA_then_foo            => 13,      # Rule 21
        LB_RI_then_RI                   => 15,      # Rule 30a

        LB_various_then_PO_or_PR        => (1<<5),  # Rule 25
    );

    # Short names for the three basic answers
    my $NOBREAK    = $lb_actions{'LB_NOBREAK'};
    my $BREAKABLE  = $lb_actions{'LB_BREAKABLE'};
    my $NOBREAK_SP = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};

    # Class groups that recur in several rules
    my @letters    = qw(Alphabetic Hebrew_Letter);
    my @korean     = qw(JL JV JT H2 H3);
    my @ideographs = qw(Ideographic E_Base E_Modifier);

    # tr14 describes a sequential scan of the text, whereas Perl has to
    # decide about an arbitrary position without having processed what
    # precedes it, so some rules are reshaped below.  The rules are applied
    # from the highest numbered to the lowest, so that the lower-numbered,
    # higher-priority ones overwrite the later ones; the algorithm stops at
    # the earliest rule that matches.

    # LB31. Break everywhere else
    my $table_size = @lb_short_enums;
    my @lb_table = map { [ ($BREAKABLE) x $table_size ] } 1 .. $table_size;

    # Store VALUE into every cell of the cross product BEFORES × AFTERS
    my $set_pairs = sub {
        my ($value, $befores, $afters) = @_;
        for my $before (@$befores) {
            for my $after (@$afters) {
                $lb_table[$lb_enums{$before}][$lb_enums{$after}] = $value;
            }
        }
        return;
    };

    # Add VALUE to every cell of the cross product BEFORES × AFTERS, leaving
    # the underlying action recoverable
    my $add_pairs = sub {
        my ($value, $befores, $afters) = @_;
        for my $before (@$befores) {
            for my $after (@$afters) {
                $lb_table[$lb_enums{$before}][$lb_enums{$after}] += $value;
            }
        }
        return;
    };

    # Store VALUE into the whole row of each named class ("X ? anything")
    my $set_rows = sub {
        my ($value, @befores) = @_;
        for my $after (0 .. $#lb_table) {
            $lb_table[$lb_enums{$_}][$after] = $value for @befores;
        }
        return;
    };

    # Store VALUE into the whole column of each named class ("anything ? X")
    my $set_columns = sub {
        my ($value, @afters) = @_;
        for my $before (0 .. $#lb_table) {
            $lb_table[$before][$lb_enums{$_}] = $value for @afters;
        }
        return;
    };

    # LB30b Do not break between an emoji base and an emoji modifier.
    # EB × EM
    $set_pairs->($NOBREAK, ['E_Base'], ['E_Modifier']);

    # LB30a Break between two regional indicator symbols if and only if there
    # are an even number of regional indicators preceding the position of the
    # break.
    # sot (RI RI)* RI × RI
    # [^RI] (RI RI)* RI × RI
    $set_pairs->($lb_actions{'LB_RI_then_RI'},
                 ['Regional_Indicator'], ['Regional_Indicator']);

    # LB30 Do not break between letters, numbers, or ordinary symbols and
    # opening or closing parentheses.
    # (AL | HL | NU) × OP
    # CP × (AL | HL | NU)
    $set_pairs->($NOBREAK, [@letters, 'Numeric'], ['Open_Punctuation']);
    $set_pairs->($NOBREAK, ['Close_Parenthesis'], [@letters, 'Numeric']);

    # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
    # IS × (AL | HL)
    $set_pairs->($NOBREAK, ['Infix_Numeric'], \@letters);

    # LB28 Do not break between alphabetics (“at”).
    # (AL | HL) × (AL | HL)
    $set_pairs->($NOBREAK, \@letters, \@letters);

    # LB27 Treat a Korean Syllable Block the same as ID.
    # (JL | JV | JT | H2 | H3) × IN
    # (JL | JV | JT | H2 | H3) × PO
    # PR × (JL | JV | JT | H2 | H3)
    $set_pairs->($NOBREAK, \@korean, ['Inseparable']);
    $set_pairs->($NOBREAK, \@korean, ['Postfix_Numeric']);
    $set_pairs->($NOBREAK, ['Prefix_Numeric'], \@korean);

    # LB26 Do not break a Korean syllable.
    # JL × (JL | JV | H2 | H3)
    # (JV | H2) × (JV | JT)
    # (JT | H3) × JT
    $set_pairs->($NOBREAK, ['JL'], [qw(JL JV H2 H3)]);
    $set_pairs->($NOBREAK, [qw(JV H2)], [qw(JV JT)]);
    $set_pairs->($NOBREAK, [qw(JT H3)], ['JT']);

    # LB25 Do not break between the following pairs of classes relevant to
    # numbers, as tailored by example 7 in
    # http://www.unicode.org/reports/tr14/#Examples
    # That tailoring is followed because Unicode's test cases expect it.
    # (PR | PO) × ( OP | HY )? NU
    $set_pairs->($NOBREAK, [qw(Prefix_Numeric Postfix_Numeric)], ['Numeric']);

    # The ( OP | HY )? part is optional, so it has to be tested for at run
    # time.  The action is added rather than stored so that the underlying
    # break value can be recovered.
    $add_pairs->($lb_actions{'LB_PR_or_PO_then_OP_or_HY'},
                 [qw(Prefix_Numeric Postfix_Numeric)],
                 [qw(Open_Punctuation Hyphen)]);

    # ( OP | HY ) × NU
    $set_pairs->($NOBREAK, [qw(Open_Punctuation Hyphen)], ['Numeric']);

    # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
    # which can be rewritten as:
    # NU (SY | IS)* × (NU | SY | IS | CL | CP )
    my @number_continuers = qw(Numeric
                               Break_Symbols
                               Infix_Numeric
                               Close_Punctuation
                               Close_Parenthesis);
    $set_pairs->($NOBREAK, ['Numeric'], \@number_continuers);

    # Again something that must be checked at run time, so the action is
    # added to keep the underlying value; the rules further down do likewise.
    # The original comments state that the consuming code assumes two
    # actions are never added to the same cell, and that tests should start
    # failing if a later Unicode release breaks that assumption.
    $add_pairs->($lb_actions{'LB_SY_or_IS_then_various'},
                 [qw(Break_Symbols Infix_Numeric)], \@number_continuers);

    # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
    # which can be rewritten as:
    # NU (SY | IS)* (CL | CP)? × (PO | PR)
    $set_pairs->($NOBREAK, ['Numeric'], [qw(Postfix_Numeric Prefix_Numeric)]);
    $add_pairs->($lb_actions{'LB_various_then_PO_or_PR'},
                 [qw(Close_Parenthesis
                     Close_Punctuation
                     Infix_Numeric
                     Break_Symbols)],
                 [qw(Postfix_Numeric Prefix_Numeric)]);

    # LB24 Do not break between numeric prefix/postfix and letters, or between
    # letters and prefix/postfix.
    # (PR | PO) × (AL | HL)
    # (AL | HL) × (PR | PO)
    $set_pairs->($NOBREAK, [qw(Prefix_Numeric Postfix_Numeric)], \@letters);
    $set_pairs->($NOBREAK, \@letters, [qw(Prefix_Numeric Postfix_Numeric)]);

    # LB23a Do not break between numeric prefixes and ideographs, or between
    # ideographs and numeric postfixes.
    # PR × (ID | EB | EM)
    # (ID | EB | EM) × PO
    $set_pairs->($NOBREAK, ['Prefix_Numeric'], \@ideographs);
    $set_pairs->($NOBREAK, \@ideographs, ['Postfix_Numeric']);

    # LB23 Do not break between digits and letters
    # (AL | HL) × NU
    # NU × (AL | HL)
    $set_pairs->($NOBREAK, \@letters, ['Numeric']);
    $set_pairs->($NOBREAK, ['Numeric'], \@letters);

    # LB22 Do not break between two ellipses, or between letters, numbers or
    # exclamations and ellipsis.
    # (AL | HL) × IN
    # Exclamation × IN
    # (ID | EB | EM) × IN
    # IN × IN
    # NU × IN
    $set_pairs->($NOBREAK,
                 [@letters, 'Exclamation', @ideographs,
                  'Inseparable', 'Numeric'],
                 ['Inseparable']);

    # LB21b Don’t break between Solidus and Hebrew letters.
    # SY × HL
    $set_pairs->($NOBREAK, ['Break_Symbols'], ['Hebrew_Letter']);

    # LB21a Don't break after Hebrew + Hyphen.
    # HL (HY | BA) ×
    # What precedes the HY or BA is unknown here, so flag the whole row.
    for my $after (0 .. $#lb_table) {
        for my $before (qw(Hyphen Break_After)) {
            $lb_table[$lb_enums{$before}][$after]
                                    += $lb_actions{'LB_HY_or_BA_then_foo'};
        }
    }

    # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
    # spaces, small kana, and other non-starters, or after acute accents.
    # × BA
    # × HY
    # × NS
    # BB ×
    $set_columns->($NOBREAK, qw(Break_After Hyphen Nonstarter));
    $set_rows->($NOBREAK, 'Break_Before');

    # LB20 Break before and after unresolved CB.
    # ÷ CB
    # CB ÷
    # Conditional breaks should be resolved external to the line breaking
    # rules.  However, the default action is to treat unresolved CB as
    # breaking before and after.
    $set_columns->($BREAKABLE, 'Contingent_Break');
    $set_rows->($BREAKABLE, 'Contingent_Break');

    # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
    # × QU
    # QU ×
    $set_columns->($NOBREAK, 'Quotation');
    $set_rows->($NOBREAK, 'Quotation');

    # LB18 Break after spaces
    # SP ÷
    $set_rows->($BREAKABLE, 'Space');

    # LB17 Do not break within ‘——’, even with intervening spaces.
    # B2 SP* × B2
    $set_pairs->($NOBREAK_SP, ['Break_Both'], ['Break_Both']);

    # LB16 Do not break between closing punctuation and a nonstarter even with
    # intervening spaces.
    # (CL | CP) SP* × NS
    $set_pairs->($NOBREAK_SP,
                 [qw(Close_Punctuation Close_Parenthesis)], ['Nonstarter']);

    # LB15 Do not break within ‘”[’, even with intervening spaces.
    # QU SP* × OP
    $set_pairs->($NOBREAK_SP, ['Quotation'], ['Open_Punctuation']);

    # LB14 Do not break after ‘[’, even after spaces.
    # OP SP* ×
    $set_rows->($NOBREAK_SP, 'Open_Punctuation');

    # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
    # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
    # [^NU] × CL
    # [^NU] × CP
    #       × EX
    # [^NU] × IS
    # [^NU] × SY
    for my $before (0 .. $#lb_table) {
        my @afters = ('Exclamation');

        # Only EX applies when the preceding class is NU
        push @afters, qw(Close_Punctuation
                         Close_Parenthesis
                         Infix_Numeric
                         Break_Symbols)
                                    unless $before == $lb_enums{'Numeric'};

        $lb_table[$before][$lb_enums{$_}] = $NOBREAK_SP for @afters;
    }

    # LB12a Do not break before NBSP and related characters, except after
    # spaces and hyphens.
    # [^SP BA HY] × GL
    my $glue = $lb_enums{'Glue'};
    for my $before (0 .. $#lb_table) {
        next if grep { $before == $lb_enums{$_} }
                                            qw(Space Break_After Hyphen);

        # A cell that an earlier (higher-numbered) rule marked as "no break
        # even with space between" keeps that marking; the same holds in the
        # next few rules.
        $lb_table[$before][$glue] = $NOBREAK
                        unless $lb_table[$before][$glue] == $NOBREAK_SP;
    }

    # LB12 Do not break after NBSP and related characters.
    # GL ×
    for my $after (0 .. $#lb_table) {
        $lb_table[$glue][$after] = $NOBREAK
                        unless $lb_table[$glue][$after] == $NOBREAK_SP;
    }

    # LB11 Do not break before or after Word joiner and related characters.
    # × WJ
    # WJ ×
    my $word_joiner = $lb_enums{'Word_Joiner'};
    for my $other (0 .. $#lb_table) {
        $lb_table[$other][$word_joiner] = $NOBREAK
                    unless $lb_table[$other][$word_joiner] == $NOBREAK_SP;
        $lb_table[$word_joiner][$other] = $NOBREAK
                    unless $lb_table[$word_joiner][$other] == $NOBREAK_SP;
    }

    # Giving SP × WJ the same value as the other "don't break even with SP
    # in front" cases spares the run-time code a special case and an extra
    # test.
    $set_pairs->($NOBREAK_SP, ['Space'], ['Word_Joiner']);

    # LB9 and LB10 are done in the same loop
    #
    # LB9 Do not break a combining character sequence; treat it as if it has
    # the line breaking class of the base character in all of the
    # higher-numbered rules.  Treat ZWJ as if it were CM
    # Treat X (CM|ZWJ)* as if it were X.
    # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
    #
    # LB10 Treat any remaining combining mark or ZWJ as AL.  This catches the
    # case where a CM or ZWJ is the first character on the line or follows SP,
    # BK, CR, LF, NL, or ZW.
    my ($combining, $zwj, $alphabetic)
                        = @lb_enums{qw(Combining_Mark ZWJ Alphabetic)};

    # Classes after which a CM or ZWJ does not attach
    my @no_attach = qw(Mandatory_Break
                       EDGE
                       Carriage_Return
                       Line_Feed
                       Next_Line
                       Space
                       ZWSpace);
    for my $class (0 .. $#lb_table) {

        # With the CM or ZWJ first in the pair, whether it attaches to an
        # earlier character can only be found out by looking behind, which
        # the run-time code has to do.
        $lb_table[$combining][$class] = $lb_actions{'LB_CM_ZWJ_foo'};
        $lb_table[$zwj][$class]       = $lb_actions{'LB_CM_ZWJ_foo'};

        if (grep { $class == $lb_enums{$_} } @no_attach) {

            # The following CM or ZWJ doesn't combine, so it does whatever
            # 'Alphabetic' would do.
            $lb_table[$class][$combining] = $lb_table[$class][$alphabetic];
            $lb_table[$class][$zwj]       = $lb_table[$class][$alphabetic];
        }
        else {

            # The CM or ZWJ combines, so there is no break; an existing
            # "even with space between" marking is retained.
            for my $mark ($combining, $zwj) {
                $lb_table[$class][$mark] = $NOBREAK
                        unless $lb_table[$class][$mark] == $NOBREAK_SP;
            }
        }
    }

    # LB8a Do not break between a zero width joiner and an ideograph, emoji
    # base or emoji modifier.  This rule prevents breaks within emoji joiner
    # sequences.
    # ZWJ × (ID | EB | EM)
    $set_pairs->($NOBREAK, ['ZWJ'], \@ideographs);

    # LB8 Break before any character following a zero-width space, even if one
    # or more spaces intervene.
    # ZW SP* ÷
    $set_rows->($BREAKABLE, 'ZWSpace');

    # Because of LB8-10, "SP x" needs its context examined at run time.
    # Adding a constant to each cell of the SP row flags that while keeping
    # the original rule retrievable.
    for my $after (0 .. $#lb_table) {
        $lb_table[$lb_enums{'Space'}][$after] += $lb_actions{'LB_SP_foo'};
    }

    # LB7 Do not break before spaces or zero width space.
    # × SP
    # × ZW
    $set_columns->($NOBREAK, qw(Space ZWSpace));

    # LB6 Do not break before hard line breaks.
    # × ( BK | CR | LF | NL )
    $set_columns->($NOBREAK,
                   qw(Mandatory_Break Carriage_Return Line_Feed Next_Line));

    # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line
    # breaks.
    # CR × LF
    # CR !
    # LF !
    # NL !
    $set_rows->($BREAKABLE, qw(Carriage_Return Line_Feed Next_Line));
    $set_pairs->($NOBREAK, ['Carriage_Return'], ['Line_Feed']);

    # LB4 Always break after hard line breaks.
    # BK !
    $set_rows->($BREAKABLE, 'Mandatory_Break');

    # LB3 Always break at the end of text.
    # ! eot
    # LB2 Never break at the start of text.
    # sot ×
    # The row is written second, so EDGE × EDGE ends up as "no break".
    $set_columns->($BREAKABLE, 'EDGE');
    $set_rows->($NOBREAK, 'EDGE');

    # LB1 Assign a line breaking class to each code point of the input.
    # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
    # depending on criteria outside the scope of this algorithm.
    #
    # In the absence of such criteria all characters with a specific
    # combination of original class and General_Category property value are
    # resolved as follows:
    # Original    Resolved  General_Category
    # AI, SG, XX  AL        Any
    # SA          CM        Only Mn or Mc
    # SA          AL        Any except Mn and Mc
    # CJ          NS        Any
    #
    # This is done in mktables, so none of the remapped-from classes is ever
    # seen here.

    output_table_common('LB', \%lb_actions,
                        \@lb_table, \@lb_short_enums, \%lb_abbreviations);
}
1468 | ||
7e54b87f KW |
sub output_WB_table() {

    # Create and output the enums, #defines, and pair table for use in
    # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
    #
    # Takes no arguments.  It reads the file-level %wb_enums,
    # @wb_short_enums and %wb_abbreviations, builds the pair table, and hands
    # everything to output_table_common() for printing.
    #
    # $wb_table[$before][$after] holds the action to take when a character of
    # class $before is immediately followed by one of class $after.

    # This uses the same mechanism in the other bounds tables generated by
    # this file.  The actions that could override a 0 or 1 are added to those
    # numbers; the actions that clearly don't depend on the underlying rule
    # simply overwrite
    my %wb_actions = (
        WB_NOBREAK                      => 0,
        WB_BREAKABLE                    => 1,
        WB_hs_then_hs                   => 2,
        WB_Ex_or_FO_or_ZWJ_then_foo     => 3,
        WB_DQ_then_HL                   => 4,
        WB_HL_then_DQ                   => 6,
        WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
        WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
        WB_MB_or_MN_or_SQ_then_NU       => 12,
        WB_NU_then_MB_or_MN_or_SQ       => 14,
        WB_RI_then_RI                   => 16,
    );

    # The two plain outcomes are used so often that they get short names
    my $nobreak   = $wb_actions{'WB_NOBREAK'};
    my $breakable = $wb_actions{'WB_BREAKABLE'};

    # Groups of classes that several rules treat alike
    my @letters     = qw(ALetter Hebrew_Letter);
    my @ignorables  = qw(Extend Format ZWJ);
    my @white_space = qw(CR LF Newline Perl_Tailored_HSpace);

    # Construct the WB pair table.
    # The table is constructed in reverse order of the rules, to make the
    # lower-numbered, higher priority ones override the later ones, as the
    # algorithm stops at the earliest matching rule.  Because some rules add
    # to a cell and others overwrite it, the order of the sections below
    # must not be changed.

    my @wb_table;
    my $table_size = @wb_short_enums - 1;   # -1 because we don't use UNKNOWN

    # Otherwise, break everywhere (including around ideographs).
    # WB99  Any  ÷  Any
    for my $i (0 .. $table_size - 1) {
        for my $j (0 .. $table_size - 1) {
            $wb_table[$i][$j] = $breakable;
        }
    }

    # Do not break within emoji flag sequences.  That is, do not break between
    # regional indicator (RI) symbols if there is an odd number of RI
    # characters before the break point.
    # WB16  [^RI] (RI RI)* RI × RI
    # WB15     ^ (RI RI)* RI × RI
    $wb_table[$wb_enums{'Regional_Indicator'}]
             [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};

    # Do not break within emoji modifier sequences.
    # WB14  ( E_Base | EBG ) × E_Modifier
    for my $before (qw(E_Base E_Base_GAZ)) {
        $wb_table[$wb_enums{$before}][$wb_enums{'E_Modifier'}] = $nobreak;
    }

    # Do not break from extenders.
    # WB13b  ExtendNumLet  ×  (ALetter | Hebrew_Letter | Numeric | Katakana)
    for my $after (@letters, qw(Numeric Katakana)) {
        $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{$after}] = $nobreak;
    }

    # WB13a  (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
    #        × ExtendNumLet
    for my $before (@letters, qw(Numeric Katakana ExtendNumLet)) {
        $wb_table[$wb_enums{$before}][$wb_enums{'ExtendNumLet'}] = $nobreak;
    }

    # Do not break between Katakana.
    # WB13  Katakana  ×  Katakana
    $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}] = $nobreak;

    # Do not break within sequences, such as “3.2” or “3,456.789”.
    # WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric
    for my $after (qw(MidNumLet MidNum Single_Quote)) {
        $wb_table[$wb_enums{'Numeric'}][$wb_enums{$after}]
                                    += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
    }

    # WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
    for my $before (qw(MidNumLet MidNum Single_Quote)) {
        $wb_table[$wb_enums{$before}][$wb_enums{'Numeric'}]
                                    += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
    }

    # Do not break within sequences of digits, or digits adjacent to letters
    # (“3a”, or “A3”).
    # WB10  Numeric  ×  (ALetter | Hebrew_Letter)
    for my $after (@letters) {
        $wb_table[$wb_enums{'Numeric'}][$wb_enums{$after}] = $nobreak;
    }

    # WB9  (ALetter | Hebrew_Letter)  ×  Numeric
    for my $before (@letters) {
        $wb_table[$wb_enums{$before}][$wb_enums{'Numeric'}] = $nobreak;
    }

    # WB8  Numeric  ×  Numeric
    $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}] = $nobreak;

    # Do not break letters across certain punctuation.
    # WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter
    $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
                                                += $wb_actions{'WB_DQ_then_HL'};

    # WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter
    $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
                                                += $wb_actions{'WB_HL_then_DQ'};

    # WB7a  Hebrew_Letter  ×  Single_Quote
    # (WB6 below then adds its action on top of this cell.)
    $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] = $nobreak;

    # WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
    #       × (ALetter | Hebrew_Letter)
    for my $before (qw(MidNumLet MidLetter Single_Quote)) {
        for my $after (@letters) {
            $wb_table[$wb_enums{$before}][$wb_enums{$after}]
                            += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
        }
    }

    # WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
    #       | Single_Quote) (ALetter | Hebrew_Letter)
    for my $after (qw(MidNumLet MidLetter Single_Quote)) {
        for my $before (@letters) {
            $wb_table[$wb_enums{$before}][$wb_enums{$after}]
                            += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
        }
    }

    # Do not break between most letters.
    # WB5  (ALetter | Hebrew_Letter)  ×  (ALetter | Hebrew_Letter)
    for my $before (@letters) {
        for my $after (@letters) {
            $wb_table[$wb_enums{$before}][$wb_enums{$after}] = $nobreak;
        }
    }

    # Ignore Format and Extend characters, except after sot, CR, LF, and
    # Newline.  This also has the effect of: Any × (Format | Extend | ZWJ)
    # WB4  X (Extend | Format | ZWJ)* → X
    for my $i (0 .. $#wb_table) {
        for my $before (@ignorables) {
            $wb_table[$wb_enums{$before}][$i]
                                = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
        }
    }

    # Implied is that these attach to the character before them, except for
    # the characters that mark the end of a region of text.  The rules below
    # override the ones set up here, for all the characters that need
    # overriding.  (This runs after the loop above so that the cells where
    # both classes are ignorable end up as NOBREAK.)
    for my $i (0 .. $#wb_table) {
        for my $after (@ignorables) {
            $wb_table[$i][$wb_enums{$after}] = $nobreak;
        }
    }

    # Do not break within emoji zwj sequences.
    # WB3c ZWJ × ( Glue_After_Zwj | EBG )
    for my $after (qw(Glue_After_Zwj E_Base_GAZ)) {
        $wb_table[$wb_enums{'ZWJ'}][$wb_enums{$after}] = $nobreak;
    }

    # Break before and after white space
    # WB3b     ÷  (Newline | CR | LF)
    # WB3a  (Newline | CR | LF)  ÷
    # et al.
    for my $space (@white_space) {
        for my $j (0 .. $#wb_table) {
            $wb_table[$j][$wb_enums{$space}] = $breakable;
            $wb_table[$wb_enums{$space}][$j] = $breakable;
        }
    }

    # But do not break within white space.
    # WB3  CR  ×  LF
    # et al.
    for my $before (@white_space) {
        for my $after (@white_space) {
            $wb_table[$wb_enums{$before}][$wb_enums{$after}] = $nobreak;
        }
    }

    # And do not break horizontal space followed by Extend or Format or ZWJ
    for my $after (@ignorables) {
        $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{$after}]
                                                                    = $nobreak;
    }
    $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
             [$wb_enums{'Perl_Tailored_HSpace'}]
                                                = $wb_actions{'WB_hs_then_hs'};

    # Break at the start and end of text, unless the text is empty
    # WB2  Any  ÷  eot
    # WB1  sot  ÷  Any
    for my $i (0 .. $#wb_table) {
        $wb_table[$i][$wb_enums{'EDGE'}] = $breakable;
        $wb_table[$wb_enums{'EDGE'}][$i] = $breakable;
    }

    # Empty text: start-of-text is immediately followed by end-of-text
    $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = $nobreak;

    output_table_common('WB', \%wb_actions,
                        \@wb_table, \@wb_short_enums, \%wb_abbreviations);
}
1717 | ||
9d9177be KW |
# Output the two fixed inversion lists that split the code points at 256:
# "Latin1" is the single range starting at 0 and ending just before 256;
# "AboveLatin1" is the range starting at 256 with no end.
foreach my $fixed_list ( [ "Latin1",      [ 0, 256 ] ],
                         [ "AboveLatin1", [ 256 ]    ] )
{
    my ($list_name, $list_ref) = @$fixed_list;

    # Pass the arguments individually rather than as a flattened array, in
    # case output_invlist() is declared with a prototype.
    output_invlist($list_name, $list_ref);
}

# NOTE(review): presumably closes any '#if' still open in the output file;
# confirm against the definition of end_file_pound_if.
end_file_pound_if;
43b443dd | 1722 | |
3f427fd9 KW |
1723 | # We construct lists for all the POSIX and backslash sequence character |
1724 | # classes in two forms: | |
1725 | # 1) ones which match only in the ASCII range | |
1726 | # 2) ones which match either in the Latin1 range, or the entire Unicode range | |
1727 | # | |
1728 | # These get compiled in, and hence affect the memory footprint of every Perl | |
1729 | # program, even those not using Unicode. To minimize the size, currently | |
1730 | # the Latin1 version is generated for the beyond ASCII range except for those | |
1731 | # lists that are quite small for the entire range, such as for \s, which is 22 | |
1732 | # UVs long plus 4 UVs (currently) for the header. | |
1733 | # | |
1734 | # To save even more memory, the ASCII versions could be derived from the | |
1735 | # larger ones at runtime, saving some memory (minus the expense of the machine | |
1736 | # instructions to do so), but these are all small anyway, so their total is | |
1737 | # about 100 UVs. | |
1738 | # | |
1739 | # In the list of properties below that get generated, the L1 prefix is a fake | |
1740 | # property that means just the Latin1 range of the full property (whose name | |
1741 | # has an X prefix instead of L1). | |
a02047bf KW |
1742 | # |
1743 | # An initial & means to use the subroutine from this file instead of an | |
1744 | # official inversion list. | |
3f427fd9 | 1745 | |
0c4ecf42 KW |
# For each supported character set, output every property in the list below
# as an inversion list (and also as an inversion map when the property turns
# out to be one), with the code points translated to that character set's
# native values.  Each character set's output is wrapped in its own
# conditional-compilation section.
for my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    # a2n() below translates using this table for the current character set
    @a2n = @{get_a2n($charset)};

    # The list below deliberately contains commas, which qw() warns about
    no warnings 'qw';
                         # Ignore non-alpha in sort
    for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
                             Assigned
                             ASCII
                             Cased
                             VertSpace
                             XPerlSpace
                             XPosixAlnum
                             XPosixAlpha
                             XPosixBlank
                             XPosixCntrl
                             XPosixDigit
                             XPosixGraph
                             XPosixLower
                             XPosixPrint
                             XPosixPunct
                             XPosixSpace
                             XPosixUpper
                             XPosixWord
                             XPosixXDigit
                             _Perl_Any_Folds
                             &NonL1_Perl_Non_Final_Folds
                             _Perl_Folds_To_Multi_Char
                             &UpperLatin1
                             _Perl_IDStart
                             _Perl_IDCont
                             _Perl_GCB,EDGE
                             _Perl_LB,EDGE
                             _Perl_SB,EDGE
                             _Perl_WB,EDGE,UNKNOWN
                           )
    ) {

        # For the Latin1 properties, we change to use the eXtended version of
        # the base property, then go through the result and get rid of
        # everything not in Latin1 (above 255).  Actually, we retain the
        # element for the range that crosses the 255/256 boundary if it is
        # one that matches the property.  For example, in the Word property,
        # there is a range of code points that start at U+00F8 and goes
        # through U+02C1.  Instead of artificially cutting that off at 256
        # because 256 is the first code point above Latin1, we let the range
        # go to its natural ending.  That gives us extra information with no
        # added space taken.  But if the range that crosses the boundary is
        # one that doesn't match the property, we don't start a new range
        # above 255, as that could be construed as going to infinity.  For
        # example, the Upper property doesn't include the character at 255,
        # but does include the one at 256.  We don't include the 256 one.
        my $prop_name = $prop;

        # A leading '&' means the list comes from a subroutine in this file
        my $is_local_sub = $prop_name =~ s/^&//;

        # Anything after the first comma is a list of extra enum names to be
        # passed to output_invmap(); strip it from the name.
        my $extra_enums = "";
        $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;

        # The suffix has already been removed above, so the name to look up
        # needs only its L1/NonL1 prefix handled.
        my $lookup_prop = $prop_name;
        my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
                       or $lookup_prop =~ s/^L1//);
        my $nonl1_only = 0;
        $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;

        my @invlist;
        my @invmap;
        my $map_format;
        my $map_default;
        my $maps_to_code_point;
        my $to_adjust;
        if ($is_local_sub) {

            # Look the generator subroutine up by name instead of
            # string-eval'ing the name; any exception it throws propagates.
            my $generator = __PACKAGE__->can($lookup_prop)
                or die "No subroutine '$lookup_prop' in this file"
                     . " to generate '$prop'";
            @invlist = $generator->();
        }
        else {
            @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
            if (! @invlist) {

                # If couldn't find a non-empty inversion list, see if it is
                # instead an inversion map
                my ($list_ref, $map_ref, $format, $default)
                          = prop_invmap($lookup_prop, '_perl_core_internal_ok');
                if (! $list_ref) {
                    # An empty return here could mean an unknown property, or
                    # merely that the original inversion list is empty.  Call
                    # in scalar context to differentiate
                    my $count = prop_invlist($lookup_prop,
                                             '_perl_core_internal_ok');
                    die "Could not find inversion list for '$lookup_prop'"
                                                          unless defined $count;
                }
                else {
                    @invlist = @$list_ref;
                    @invmap = @$map_ref;
                    $map_format = $format;
                    $map_default = $default;
                    $maps_to_code_point = $map_format =~ /x/;
                    $to_adjust = $map_format =~ /a/;
                }
            }
        }

        # Short-circuit an empty inversion list.
        if (! @invlist) {
            output_invlist($prop_name, \@invlist, $charset);
            next;
        }

        # Re-order the Unicode code points to native ones for this platform.
        # This is only needed for code points below 256, because native code
        # points are only in that range.  For inversion maps of properties
        # where the mappings are adjusted (format =~ /a/), this reordering
        # could mess up the adjustment pattern that was in the input, so that
        # has to be dealt with.
        #
        # And inversion maps that map to code points need to eventually have
        # all those code points remapped to native, and it's better to do that
        # here, going through the whole list not just those below 256.  This
        # is because some inversion maps have adjustments (format =~ /a/)
        # which may be affected by the reordering.  This code needs to be done
        # both for when we are translating the inversion lists for < 256, and
        # for the inversion maps for everything.  By doing both in this loop,
        # we can share that code.
        #
        # So, we go through everything for an inversion map to code points;
        # otherwise, we can skip any remapping at all if we are going to
        # output only the above-Latin1 values, or if the range spans the whole
        # of 0..256, as the remap will also include all of 0..256  (256 not
        # 255 because a re-ordering could cause 256 to need to be in the same
        # range as 255.)
        if ((@invmap && $maps_to_code_point)
            || (! $nonl1_only || ($invlist[0] < 256
                                  && ! ($invlist[0] == 0 && $invlist[1] > 256))))
        {

            if (! @invmap) {    # Straight inversion list
                # Look at all the ranges that start before 257.
                my @latin1_list;
                while (@invlist) {
                    last if $invlist[0] > 256;
                    my $upper = @invlist > 1
                                ? $invlist[1] - 1      # In range

                                  # To infinity.  You may want to stop much
                                  # much earlier; going this high may expose
                                  # perl deficiencies with very large numbers.
                                : $Unicode::UCD::MAX_CP;
                    for my $j ($invlist[0] .. $upper) {
                        push @latin1_list, a2n($j);
                    }

                    shift @invlist; # Shift off the range that's in the list
                    shift @invlist; # Shift off the range not in the list
                }

                # Here @invlist contains all the ranges in the original that
                # start at code points above 256, and @latin1_list contains
                # all the native code points for ranges that start with a
                # Unicode code point below 257.  We sort the latter and
                # convert it to inversion list format.  Then simply prepend it
                # to the list of the higher code points.
                @latin1_list = sort { $a <=> $b } @latin1_list;
                @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
                unshift @invlist, @latin1_list;
            }
            else {  # Is an inversion map

                # This is a similar procedure as plain inversion list, but has
                # multiple buckets.  A plain inversion list just has two
                # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
                # pretty much can ignore the 2nd bucket, as it is completely
                # defined by the 1st.  But here, what we do is create buckets
                # which contain the code points that map to each, translated
                # to native and turned into an inversion list.  Thus each
                # bucket is an inversion list of native code points that map
                # to it or don't map to it.  We use these to create an
                # inversion map for the whole property.

                # As mentioned earlier, we use this procedure to not just
                # remap the inversion list to native values, but also the maps
                # of code points to native ones.  In the latter case we have
                # to look at the whole of the inversion map (or at least to
                # above Unicode; as the maps of code points above that should
                # all be to the default).
                my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;

                my %mapped_lists;   # A hash whose keys are the buckets.
                while (@invlist) {
                    last if $invlist[0] > $upper_limit;

                    # This shouldn't actually happen, as prop_invmap() returns
                    # an extra element at the end that is beyond $upper_limit
                    die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;

                    my $bucket;

                    # A hash key can't be a ref (we are only expecting arrays
                    # of scalars here), so convert any such to a string that
                    # will be converted back later (using a vertical tab as
                    # the separator).  Even if the mapping is to code points,
                    # we don't translate to native here because the code
                    # output_map() calls to output these arrays assumes the
                    # input is Unicode, not native.
                    if (ref $invmap[0]) {
                        $bucket = join "\cK", @{$invmap[0]};
                    }
                    elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {

                        # Do convert to native for maps to single code points.
                        # There are some properties that have a few outlier
                        # maps that aren't code points, so the above test
                        # skips those.
                        $bucket = a2n($invmap[0]);
                    }
                    else {
                        $bucket = $invmap[0];
                    }

                    # We now have the bucket that all code points in the range
                    # map to, though possibly they need to be adjusted.  Go
                    # through the range and put each translated code point in
                    # it into its bucket.
                    my $base_map = $invmap[0];
                    for my $j ($invlist[0] .. $invlist[1] - 1) {
                        if ($to_adjust
                               # The 1st code point doesn't need adjusting
                            && $j > $invlist[0]

                               # Skip any non-numeric maps: these are outliers
                               # that aren't code points.
                            && $base_map =~ $numeric_re

                               #  'ne' because the default can be a string
                            && $base_map ne $map_default)
                        {
                            # We adjust, by incrementing each the bucket and
                            # the map.  For code point maps, translate to
                            # native
                            $base_map++;
                            $bucket = ($maps_to_code_point)
                                      ? a2n($base_map)
                                      : $base_map;
                        }

                        # Add the native code point to the bucket for the
                        # current map
                        push @{$mapped_lists{$bucket}}, a2n($j);
                    } # End of loop through all code points in the range

                    # Get ready for the next range
                    shift @invlist;
                    shift @invmap;
                } # End of loop through all ranges in the map.

                # Here, @invlist and @invmap retain all the ranges from the
                # originals that start with code points above $upper_limit.
                # Each bucket in %mapped_lists contains all the code points
                # that map to that bucket.  If the bucket is for a map to a
                # single code point, the bucket has been converted to native.
                # If something else (including multiple code points), no
                # conversion is done.
                #
                # Now we recreate the inversion map into %xlated, but this
                # time for the native character set.
                my %xlated;
                foreach my $bucket (keys %mapped_lists) {

                    # Sort and convert this bucket to an inversion list.  The
                    # result will be that ranges that start with even-numbered
                    # indexes will be for code points that map to this bucket;
                    # odd ones map to some other bucket, and are discarded
                    # below.
                    @{$mapped_lists{$bucket}}
                                    = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
                    @{$mapped_lists{$bucket}}
                     = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});

                    # Add each even-numbered range in the bucket to %xlated;
                    # so that the keys of %xlated become the range start code
                    # points, and the values are their corresponding maps.
                    while (@{$mapped_lists{$bucket}}) {
                        my $range_start = $mapped_lists{$bucket}->[0];
                        if ($bucket =~ /\cK/) {
                            @{$xlated{$range_start}} = split /\cK/, $bucket;
                        }
                        else {
                            $xlated{$range_start} = $bucket;
                        }
                        shift @{$mapped_lists{$bucket}}; # Discard odd ranges
                        shift @{$mapped_lists{$bucket}}; # Get ready for next
                                                         # iteration
                    }
                } # End of loop through all the buckets.

                # Here %xlated's keys are the range starts of all the code
                # points in the inversion map.  Construct an inversion list
                # from them.
                my @new_invlist = sort { $a <=> $b } keys %xlated;

                # If the list is adjusted, we want to munge this list so that
                # we only have one entry for where consecutive code points map
                # to consecutive values.  We just skip the subsequent entries
                # where this is the case.
                if ($to_adjust) {
                    my @temp;
                    for my $i (0 .. @new_invlist - 1) {
                        next if $i > 0
                                && $new_invlist[$i-1] + 1 == $new_invlist[$i]
                                && $xlated{$new_invlist[$i-1]} =~ $numeric_re
                                && $xlated{$new_invlist[$i]} =~ $numeric_re
                                && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
                        push @temp, $new_invlist[$i];
                    }
                    @new_invlist = @temp;
                }

                # The inversion map comes from %xlated's values.  We can
                # unshift each onto the front of the untouched portion, in
                # reverse order of the portion we did process.
                foreach my $start (reverse @new_invlist) {
                    unshift @invmap, $xlated{$start};
                }

                # Finally prepend the inversion list we have just constructed
                # to the one that contains anything we didn't process.
                unshift @invlist, @new_invlist;
            }
        }

        # prop_invmap() returns an extra final entry, which we can now
        # discard.
        if (@invmap) {
            pop @invlist;
            pop @invmap;
        }

        if ($l1_only) {
            die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
            for my $i (0 .. @invlist - 1 - 1) {
                if ($invlist[$i] > 255) {

                    # In an inversion list, even-numbered elements give the
                    # code points that begin ranges that match the property;
                    # odd-numbered give ones that begin ranges that don't
                    # match.  If $i is odd, we are at the first code point
                    # above 255 that doesn't match, which means the range it
                    # is ending does match, and crosses the 255/256 boundary.
                    # We want to include this ending point, so increment $i,
                    # so the splice below includes it.  Conversely, if $i is
                    # even, it is the first code point above 255 that matches,
                    # which means there was no matching range that crossed the
                    # boundary, and we don't want to include this code point,
                    # so splice before it.
                    $i++ if $i % 2 != 0;

                    # Remove everything past this.
                    splice @invlist, $i;
                    splice @invmap, $i if @invmap;
                    last;
                }
            }
        }
        elsif ($nonl1_only) {
            my $found_nonl1 = 0;
            for my $i (0 .. @invlist - 1 - 1) {
                next if $invlist[$i] < 256;

                # Here, we have the first element in the array that indicates
                # an element above Latin1.  Get rid of all previous ones.
                splice @invlist, 0, $i;
                splice @invmap, 0, $i if @invmap;

                # If this one's index is not divisible by 2, it means that
                # this element is inverting away from being in the list, which
                # means all code points from 256 to this one are in this list
                # (or map to the default for inversion maps)
                if ($i % 2 != 0) {
                    unshift @invlist, 256;
                    unshift @invmap, $map_default if @invmap;
                }
                $found_nonl1 = 1;
                last;
            }
            die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
        }

        output_invlist($prop_name, \@invlist, $charset);
        output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
    }
    end_file_pound_if;
    print $out_fh "\n" . get_conditional_compile_line_end();
}
2135 | ||
973a28ed KW |
# Emit the boundary pair tables (GCB, LB, WB) inside their own conditional
# section of the generated header; the section is opened with the
# 'PERL_IN_REGEXEC_C' argument and closed again right after the tables.
switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');

output_GCB_table();
output_LB_table();
output_WB_table();

end_file_pound_if;

# Build the list of files that the generated header is derived from.  It
# always starts with this program itself and a fixed set of inputs.
my $sources_list = "lib/unicore/mktables.lst";
my @sources = (
    $0,
    'lib/unicore/mktables',
    'lib/Unicode/UCD.pm',
    'regen/charset_translations.pl',
);
{
    # Use the files named in mktables' own list as further dependencies; that
    # list is shorter than the set of files Unicode::UCD would read.
    if (open my $mktables_list, '<', $sources_list) {

        # Each line that is non-empty and does not begin with '#' names a
        # file relative to lib/unicore.  Reading stops at the first line
        # containing '===' (presumably the divider before the part of the
        # list we don't care about -- confirm against mktables.lst).
        while (<$mktables_list>) {
            last if /===/;
            chomp;
            next unless /^[^#]/;
            push @sources, "lib/unicore/$_";
        }
    }
    else {

        # The list can't be opened (most likely it doesn't exist yet).
        # Depend on the list itself, which should force a rebuild once it
        # does exist.
        push @sources, $sources_list;
    }
}

# Finish off the output file, passing along everything it was built from.
read_only_bottom_close_and_rename($out_fh, \@sources);