This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add support for bison 3.0
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
9d9177be
KW
43print $out_fh "/* See the generating file for comments */\n\n";
44
bffc0129
KW
45# The symbols generated by this program are all currently defined only in a
46# single dot c each. The code knows where most of them go, but this hash
47# gives overrides for the exceptions to the typical place
48my %exceptions_to_where_to_define =
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
50 AboveLatin1 => 'PERL_IN_REGCOMP_C',
51 Latin1 => 'PERL_IN_REGCOMP_C',
52 UpperLatin1 => 'PERL_IN_REGCOMP_C',
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
55 _Perl_IDCont => 'PERL_IN_UTF8_C',
56 _Perl_IDStart => 'PERL_IN_UTF8_C',
57 );
015bb97c 58
f79a09fc
KW
59# This hash contains the properties with enums that have hard-coded references
60# to them in C code. Its only use is to make sure that if perl is compiled
61# with an older Unicode data set, that all the enum values the code is
62# expecting will still be in the enum typedef. Thus the code doesn't have to
63# change. The Unicode version won't have any code points that have these enum
64# values, so the code that handles them will not get exercised. This is far
65# better than having to #ifdef things.
66my %hard_coded_enums =
67 ( gcb => [
68 'Control',
69 'CR',
70 'Extend',
71 'L',
72 'LF',
73 'LV',
74 'LVT',
75 'Other',
76 'Prepend',
77 'Regional_Indicator',
78 'SpacingMark',
79 'T',
80 'V',
81 ],
ca8226cf
KW
82 lb => [
83 'Alphabetic',
84 'Break_After',
85 'Break_Before',
86 'Break_Both',
87 'Break_Symbols',
88 'Carriage_Return',
89 'Close_Parenthesis',
90 'Close_Punctuation',
91 'Combining_Mark',
92 'Contingent_Break',
93 'Exclamation',
94 'Glue',
95 'H2',
96 'H3',
97 'Hebrew_Letter',
98 'Hyphen',
99 'Ideographic',
100 'Infix_Numeric',
101 'Inseparable',
102 'JL',
103 'JT',
104 'JV',
105 'Line_Feed',
106 'Mandatory_Break',
107 'Next_Line',
108 'Nonstarter',
109 'Numeric',
110 'Open_Punctuation',
111 'Postfix_Numeric',
112 'Prefix_Numeric',
113 'Quotation',
114 'Regional_Indicator',
115 'Space',
116 'Word_Joiner',
117 'ZWSpace',
118 ],
f79a09fc
KW
119 sb => [
120 'ATerm',
121 'Close',
122 'CR',
123 'Extend',
124 'Format',
125 'LF',
126 'Lower',
127 'Numeric',
128 'OLetter',
129 'Other',
130 'SContinue',
131 'Sep',
132 'Sp',
133 'STerm',
134 'Upper',
135 ],
136 wb => [
137 'ALetter',
138 'CR',
139 'Double_Quote',
140 'Extend',
141 'ExtendNumLet',
142 'Format',
143 'Hebrew_Letter',
144 'Katakana',
145 'LF',
146 'MidLetter',
147 'MidNum',
148 'MidNumLet',
149 'Newline',
150 'Numeric',
151 'Other',
f1f6961f 152 'Perl_Tailored_HSpace',
f79a09fc
KW
153 'Regional_Indicator',
154 'Single_Quote',
f79a09fc
KW
155 ],
156);
157
973a28ed
KW
158my %gcb_enums;
159my @gcb_short_enums;
6b659339
KW
160my %lb_enums;
161my @lb_short_enums;
162
99f21fb9
KW
163my @a2n;
164
165sub uniques {
166 # Returns non-duplicated input values. From "Perl Best Practices:
167 # Encapsulated Cleverness". p. 455 in first edition.
168
169 my %seen;
170 return grep { ! $seen{$_}++ } @_;
171}
172
173sub a2n($) {
174 my $cp = shift;
175
176 # Returns the input Unicode code point translated to native.
177
178 return $cp if $cp !~ $numeric_re || $cp > 255;
179 return $a2n[$cp];
180}
181
bffc0129
KW
182sub end_file_pound_if {
183 if ($in_file_pound_if) {
184 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
185 $in_file_pound_if = 0;
186 }
187}
188
189sub switch_pound_if ($$) {
190 my $name = shift;
191 my $new_pound_if = shift;
192
193 # Switch to new #if given by the 2nd argument. If there is an override
194 # for this, it instead switches to that. The 1st argument is the
195 # static's name, used to look up the overrides
196
197 if (exists $exceptions_to_where_to_define{$name}) {
198 $new_pound_if = $exceptions_to_where_to_define{$name};
199 }
200
201 # Exit current #if if the new one is different from the old
202 if ($in_file_pound_if
203 && $in_file_pound_if !~ /$new_pound_if/)
204 {
205 end_file_pound_if;
206 }
207
208 # Enter new #if, if not already in it.
209 if (! $in_file_pound_if) {
210 $in_file_pound_if = "defined($new_pound_if)";
211 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
212 }
213}
214
0c4ecf42 215sub output_invlist ($$;$) {
9d9177be
KW
216 my $name = shift;
217 my $invlist = shift; # Reference to inversion list array
0c4ecf42 218 my $charset = shift // ""; # name of character set for comment
9d9177be 219
76d3994c 220 die "No inversion list for $name" unless defined $invlist
ad85f59a 221 && ref $invlist eq 'ARRAY';
76d3994c 222
9d9177be
KW
223 # Output the inversion list $invlist using the name $name for it.
224 # It is output in the exact internal form for inversion lists.
225
a0316a6c
KW
226 # Is the last element of the header 0, or 1 ?
227 my $zero_or_one = 0;
ad85f59a 228 if (@$invlist && $invlist->[0] != 0) {
a0316a6c 229 unshift @$invlist, 0;
9d9177be
KW
230 $zero_or_one = 1;
231 }
0a07b44b 232 my $count = @$invlist;
9d9177be 233
bffc0129 234 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 235
0c4ecf42
KW
236 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
237 print $out_fh " /* for $charset */" if $charset;
238 print $out_fh "\n";
9d9177be 239
a0316a6c 240 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
241 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
242 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
243 ",\t/* 0 if the list starts at 0;",
244 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
245
246 # The main body are the UVs passed in to this routine. Do the final
247 # element separately
47d53124
KW
248 for my $i (0 .. @$invlist - 1) {
249 printf $out_fh "\t0x%X", $invlist->[$i];
250 print $out_fh "," if $i < @$invlist - 1;
251 print $out_fh "\n";
9d9177be
KW
252 }
253
9d9177be
KW
254 print $out_fh "};\n";
255}
256
99f21fb9
KW
257sub output_invmap ($$$$$$$) {
258 my $name = shift;
259 my $invmap = shift; # Reference to inversion map array
260 my $prop_name = shift;
261 my $input_format = shift; # The inversion map's format
262 my $default = shift; # The property value for code points who
263 # otherwise don't have a value specified.
264 my $extra_enums = shift; # comma-separated list of our additions to the
265 # property's standard possible values
266 my $charset = shift // ""; # name of character set for comment
267
268 # Output the inversion map $invmap for property $prop_name, but use $name
269 # as the actual data structure's name.
270
271 my $count = @$invmap;
272
273 my $output_format;
274 my $declaration_type;
275 my %enums;
276 my $name_prefix;
277
278 if ($input_format eq 's') {
b83e6484 279 my $orig_prop_name = $prop_name;
02f811dd
KW
280 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
281 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
b83e6484
KW
282 my @enums;
283 if ($orig_prop_name eq $prop_name) {
284 @enums = prop_values($prop_name);
285 }
286 else {
287 @enums = uniques(@$invmap);
288 }
99f21fb9
KW
289 if (! @enums) {
290 die "Only enum properties are currently handled; '$prop_name' isn't one";
291 }
292 else {
293
f79a09fc 294 # Convert short names to long
99f21fb9 295 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
f79a09fc
KW
296
297 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
298 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version'
299 if @expected_enums < @enums;
300
301 # Remove the enums found in the input from the ones we expect
302 for (my $i = @expected_enums - 1; $i >= 0; $i--) {
303 splice(@expected_enums, $i, 1)
304 if grep { $expected_enums[$i] eq $_ } @enums;
305 }
306
307 # The ones remaining must be because we're using an older
308 # Unicode version. Add them to the list.
309 push @enums, @expected_enums;
310
311 # Add in the extra values coded into this program, and sort.
99f21fb9
KW
312 @enums = sort @enums;
313
6dc80864
KW
314 # The internal enums comes last.
315 push @enums, split /,/, $extra_enums if $extra_enums ne "";
316
99f21fb9
KW
317 # Assign a value to each element of the enum. The default
318 # value always gets 0; the others are arbitrarily assigned.
319 my $enum_val = 0;
02f811dd
KW
320 my $canonical_default = prop_value_aliases($prop_name, $default);
321 $default = $canonical_default if defined $canonical_default;
99f21fb9
KW
322 $enums{$default} = $enum_val++;
323 for my $enum (@enums) {
324 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
325 }
6b659339 326
973a28ed
KW
327 # Calculate the enum values for properties _Perl_GCB and
328 # _Perl_LB because we output special tables for them
329 if ($name eq '_Perl_GCB' && ! %gcb_enums) {
330 while (my ($enum, $value) = each %enums) {
331 my ($short) = prop_value_aliases('GCB', $enum);
332 $short = lc $enum unless defined $short;
333 $gcb_enums{$short} = $value;
334 @gcb_short_enums[$value] = $short;
335 }
336 }
337 elsif ($name eq '_Perl_LB' && ! %lb_enums) {
6b659339
KW
338 while (my ($enum, $value) = each %enums) {
339 my ($short) = prop_value_aliases('LB', $enum);
340 $short = substr(lc $enum, 0, 2) unless defined $short;
341 $lb_enums{$short} = $value;
342 @lb_short_enums[$value] = $short;
343 }
344 }
99f21fb9
KW
345 }
346
bffc0129
KW
347 # Inversion map stuff is currently used only by regexec
348 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
99f21fb9
KW
349 {
350
99f21fb9
KW
351 # The short names tend to be two lower case letters, but it looks
352 # better for those if they are upper. XXX
353 $short_name = uc($short_name) if length($short_name) < 3
354 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
85e5f08b 355 $name_prefix = "${short_name}_";
99f21fb9
KW
356 my $enum_count = keys %enums;
357 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
358
359 print $out_fh "\ntypedef enum {\n";
6dc80864
KW
360 my @enum_list;
361 foreach my $enum (keys %enums) {
362 $enum_list[$enums{$enum}] = $enum;
363 }
364 foreach my $i (0 .. @enum_list - 1) {
365 my $name = $enum_list[$i];
366 print $out_fh "\t${name_prefix}$name = $i";
367 print $out_fh "," if $i < $enum_count - 1;
368 print $out_fh "\n";
99f21fb9
KW
369 }
370 $declaration_type = "${name_prefix}enum";
371 print $out_fh "} $declaration_type;\n";
372
373 $output_format = "${name_prefix}%s";
374 }
375 }
376 else {
377 die "'$input_format' invmap() format for '$prop_name' unimplemented";
378 }
379
380 die "No inversion map for $prop_name" unless defined $invmap
381 && ref $invmap eq 'ARRAY'
382 && $count;
383
384 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
385 print $out_fh " /* for $charset */" if $charset;
386 print $out_fh "\n";
387
388 # The main body are the scalars passed in to this routine.
389 for my $i (0 .. $count - 1) {
390 my $element = $invmap->[$i];
02f811dd
KW
391 my $full_element_name = prop_value_aliases($prop_name, $element);
392 $element = $full_element_name if defined $full_element_name;
393 $element = $name_prefix . $element;
99f21fb9
KW
394 print $out_fh "\t$element";
395 print $out_fh "," if $i < $count - 1;
396 print $out_fh "\n";
397 }
398 print $out_fh "};\n";
99f21fb9
KW
399}
400
5a7e5385 401sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
402
403 # Returns an inversion list constructed from the sorted input array of
404 # code points
405
406 my $list_ref = shift;
407
99f21fb9
KW
408 return unless @$list_ref;
409
a02047bf
KW
410 # Initialize to just the first element
411 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
412
413 # For each succeeding element, if it extends the previous range, adjust
414 # up, otherwise add it.
415 for my $i (1 .. @$list_ref - 1) {
416 if ($invlist[-1] == $list_ref->[$i]) {
417 $invlist[-1]++;
418 }
419 else {
420 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
421 }
422 }
423 return @invlist;
424}
425
426# Read in the Case Folding rules, and construct arrays of code points for the
427# properties we need.
428my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
429die "Could not find inversion map for Case_Folding" unless defined $format;
430die "Incorrect format '$format' for Case_Folding inversion map"
347b9066
KW
431 unless $format eq 'al'
432 || $format eq 'a';
a02047bf
KW
433my @has_multi_char_fold;
434my @is_non_final_fold;
435
436for my $i (0 .. @$folds_ref - 1) {
437 next unless ref $folds_ref->[$i]; # Skip single-char folds
438 push @has_multi_char_fold, $cp_ref->[$i];
439
b6a6e956 440 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
441 # position
442 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
443 push @is_non_final_fold, $folds_ref->[$i][$j]
444 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
445 }
446}
447
a02047bf
KW
448sub _Perl_Non_Final_Folds {
449 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 450 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
451}
452
99f21fb9
KW
453sub prop_name_for_cmp ($) { # Sort helper
454 my $name = shift;
455
456 # Returns the input lowercased, with non-alphas removed, as well as
457 # everything starting with a comma
458
459 $name =~ s/,.*//;
460 $name =~ s/[[:^alpha:]]//g;
461 return lc $name;
462}
463
892d8259 464sub UpperLatin1 {
5a7e5385 465 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
466}
467
973a28ed
KW
468sub output_GCB_table() {
469
470 # Create and output the pair table for use in determining Grapheme Cluster
471 # Breaks, given in http://www.unicode.org/reports/tr29/.
472
473 # The table is constructed in reverse order of the rules, to make the
474 # lower-numbered, higher priority ones override the later ones, as the
475 # algorithm stops at the earliest matching rule
476
477 my @gcb_table;
478 my $table_size = @gcb_short_enums;
479
480 # Otherwise, break everywhere.
481 # GB10 Any ÷ Any
482 for my $i (0 .. $table_size - 1) {
483 for my $j (0 .. $table_size - 1) {
484 $gcb_table[$i][$j] = 1;
485 }
486 }
487
488 # Do not break before extending characters.
489 # Do not break before SpacingMarks, or after Prepend characters.
490 # GB9 × Extend
491 # GB9a × SpacingMark
492 # GB9b Prepend ×
493 for my $i (0 .. @gcb_table - 1) {
494 $gcb_table[$i][$gcb_enums{'EX'}] = 0;
495 $gcb_table[$i][$gcb_enums{'SM'}] = 0;
496 $gcb_table[$gcb_enums{'PP'}][$i] = 0;
497 }
498
499 # Do not break between regional indicator symbols.
500 # GB8a Regional_Indicator × Regional_Indicator
501 $gcb_table[$gcb_enums{'RI'}][$gcb_enums{'RI'}] = 0;
502
503 # Do not break Hangul syllable sequences.
504 # GB8 ( LVT | T) × T
505 $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
506 $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
507
508 # GB7 ( LV | V ) × ( V | T )
509 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
510 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
511 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
512 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
513
514 # GB6 L × ( L | V | LV | LVT )
515 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
516 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
517 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
518 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
519
520 # Do not break between a CR and LF. Otherwise, break before and after controls.
521 # GB5 ÷ ( Control | CR | LF )
522 # GB4 ( Control | CR | LF ) ÷
523 for my $i (0 .. @gcb_table - 1) {
524 $gcb_table[$i][$gcb_enums{'CN'}] = 1;
525 $gcb_table[$i][$gcb_enums{'CR'}] = 1;
526 $gcb_table[$i][$gcb_enums{'LF'}] = 1;
527 $gcb_table[$gcb_enums{'CN'}][$i] = 1;
528 $gcb_table[$gcb_enums{'CR'}][$i] = 1;
529 $gcb_table[$gcb_enums{'LF'}][$i] = 1;
530 }
531
532 # GB3 CR × LF
533 $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
534
535 # Break at the start and end of text.
536 # GB1 sot ÷
537 # GB2 ÷ eot
538 for my $i (0 .. @gcb_table - 1) {
539 $gcb_table[$i][$gcb_enums{'edge'}] = 1;
540 $gcb_table[$gcb_enums{'edge'}][$i] = 1;
541 }
542
543 # But, unspecified by Unicode, we shouldn't break on an empty string.
544 $gcb_table[$gcb_enums{'edge'}][$gcb_enums{'edge'}] = 0;
545
546 print $out_fh "\nstatic const bool GCB_table[$table_size][$table_size] = {\n";
78e3ac85 547 print $out_fh "/* ";
973a28ed
KW
548 for my $i (0 .. @gcb_table - 1) {
549 printf $out_fh "%5s", $gcb_short_enums[$i];
550 }
78e3ac85 551 print $out_fh "*/\n";
973a28ed
KW
552
553 for my $i (0 .. @gcb_table - 1) {
78e3ac85
AC
554 printf $out_fh "/*%4s*/ ", $gcb_short_enums[$i];
555 print $out_fh "{";
556 print $out_fh join ", ", map sprintf("%3d", $_), @{ $gcb_table[$i] };
557 print $out_fh "}";
558 print $out_fh "," if $i < @gcb_table - 1;
973a28ed
KW
559 print $out_fh "\n";
560 }
561
562 print $out_fh "};\n";
563}
564
6b659339
KW
565sub output_LB_table() {
566
567 # Create and output the enums, #defines, and pair table for use in
568 # determining Line Breaks. This uses the default line break algorithm,
569 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
570 # in that page, as the Unicode-furnished tests assume that tailoring.
571
6b659339
KW
572 # The result is really just true or false. But we follow along with tr14,
573 # creating a rule which is false for something like X SP* X. That gets
574 # encoding 2. The rest of the actions are synthetic ones that indicate
575 # some context handling is required. These each are added to the
576 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
577 # value can be retrieved. Actually only rules from 7 through 18 (which
578 # are the ones where space matter) are possible to have 2 added to them.
579 # The others below add just 0 or 1. It might be possible for one
580 # synthetic rule to be added to another, yielding a larger value. This
581 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
582 # names of the middle grouping below, it is impossible for that to occur
583 # for them because they all start with mutually exclusive classes. That
584 # the final rule can't be added to any of the others isn't obvious from
585 # its name, so it is assigned a power of 2 higher than the others can get
586 # to so any addition would preserve all data. (And the code will reach an
587 # assert(0) on debugging builds should this happen.)
588 my %lb_actions = (
589 LB_NOBREAK => 0,
590 LB_BREAKABLE => 1,
591 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
592
593 LB_CM_foo => 3, # Rule 9
594 LB_SP_foo => 6, # Rule 18
595 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
596 LB_SY_or_IS_then_various => 11, # Rule 25
597 LB_HY_or_BA_then_foo => 13, # Rule 21
598
599 LB_various_then_PO_or_PR => (1<<4), # Rule 25
600 );
601
602 # Output the #define list, sorted by numeric value
603 my @defines;
604 while (my ($enum, $value) = each %lb_actions) {
605 $defines[$value] = $enum;
606 }
607
608 print $out_fh "\n";
609
610 foreach my $i (0 .. @defines - 1) {
611 next unless defined $defines[$i];
612 print $out_fh "#define $defines[$i]\t$i\n";
613 }
614
615 # Construct the LB pair table. This is based on the rules in
616 # http://www.unicode.org/reports/tr14/, but modified as those rules are
617 # designed for someone taking a string of text and sequentially going
618 # through it to find the break opportunities, whereas, Perl requires
619 # determining if a given random spot is a break opportunity, without
620 # knowing all the entire string before it.
621 #
622 # The table is constructed in reverse order of the rules, to make the
623 # lower-numbered, higher priority ones override the later ones, as the
624 # algorithm stops at the earliest matching rule
625
626 my @lb_table;
627 my $table_size = @lb_short_enums;
628
629 # LB31. Break everywhere else
630 for my $i (0 .. $table_size - 1) {
631 for my $j (0 .. $table_size - 1) {
632 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
633 }
634 }
635
636 # LB30a. Don't break between Regional Indicators
637 $lb_table[$lb_enums{'RI'}][$lb_enums{'RI'}] = $lb_actions{'LB_NOBREAK'};
638
639 # LB30 Do not break between letters, numbers, or ordinary symbols and
640 # opening or closing parentheses.
641 # (AL | HL | NU) × OP
642 $lb_table[$lb_enums{'AL'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'};
643 $lb_table[$lb_enums{'HL'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'};
644 $lb_table[$lb_enums{'NU'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'};
645
646 # CP × (AL | HL | NU)
647 $lb_table[$lb_enums{'CP'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
648 $lb_table[$lb_enums{'CP'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
649 $lb_table[$lb_enums{'CP'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
650
651 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
652 # IS × (AL | HL)
653 $lb_table[$lb_enums{'IS'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
654 $lb_table[$lb_enums{'IS'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
655
656 # LB28 Do not break between alphabetics (“at”).
657 # (AL | HL) × (AL | HL)
658 $lb_table[$lb_enums{'AL'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
659 $lb_table[$lb_enums{'HL'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
660 $lb_table[$lb_enums{'AL'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
661 $lb_table[$lb_enums{'HL'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
662
663 # LB27 Treat a Korean Syllable Block the same as ID.
664 # (JL | JV | JT | H2 | H3) × IN
665 $lb_table[$lb_enums{'JL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
666 $lb_table[$lb_enums{'JV'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
667 $lb_table[$lb_enums{'JT'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
668 $lb_table[$lb_enums{'H2'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
669 $lb_table[$lb_enums{'H3'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
670
671 # (JL | JV | JT | H2 | H3) × PO
672 $lb_table[$lb_enums{'JL'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
673 $lb_table[$lb_enums{'JV'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
674 $lb_table[$lb_enums{'JT'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
675 $lb_table[$lb_enums{'H2'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
676 $lb_table[$lb_enums{'H3'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
677
678 # PR × (JL | JV | JT | H2 | H3)
679 $lb_table[$lb_enums{'PR'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
680 $lb_table[$lb_enums{'PR'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
681 $lb_table[$lb_enums{'PR'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
682 $lb_table[$lb_enums{'PR'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
683 $lb_table[$lb_enums{'PR'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
684
685 # LB26 Do not break a Korean syllable.
686 # JL × (JL | JV | H2 | H3)
687 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
688 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
689 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
690 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
691
692 # (JV | H2) × (JV | JT)
693 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
694 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
695 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
696 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
697
698 # (JT | H3) × JT
699 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
700 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
701
702 # LB25 Do not break between the following pairs of classes relevant to
703 # numbers, as tailored by example 7 in
704 # http://www.unicode.org/reports/tr14/#Examples
705 # We follow that tailoring because Unicode's test cases expect it
706 # (PR | PO) × ( OP | HY )? NU
707 $lb_table[$lb_enums{'PR'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
708 $lb_table[$lb_enums{'PO'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
709
710 # Given that (OP | HY )? is optional, we have to test for it in code.
711 # We add in the action (instead of overriding) for this, so that in
712 # the code we can recover the underlying break value.
713 $lb_table[$lb_enums{'PR'}][$lb_enums{'OP'}]
714 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
715 $lb_table[$lb_enums{'PO'}][$lb_enums{'OP'}]
716 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
717 $lb_table[$lb_enums{'PR'}][$lb_enums{'HY'}]
718 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
719 $lb_table[$lb_enums{'PO'}][$lb_enums{'HY'}]
720 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
721
722 # ( OP | HY ) × NU
723 $lb_table[$lb_enums{'OP'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
724 $lb_table[$lb_enums{'HY'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
725
726 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
727 # which can be rewritten as:
728 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
729 $lb_table[$lb_enums{'NU'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
730 $lb_table[$lb_enums{'NU'}][$lb_enums{'SY'}] = $lb_actions{'LB_NOBREAK'};
731 $lb_table[$lb_enums{'NU'}][$lb_enums{'IS'}] = $lb_actions{'LB_NOBREAK'};
732 $lb_table[$lb_enums{'NU'}][$lb_enums{'CL'}] = $lb_actions{'LB_NOBREAK'};
733 $lb_table[$lb_enums{'NU'}][$lb_enums{'CP'}] = $lb_actions{'LB_NOBREAK'};
734
735 # Like earlier where we have to test in code, we add in the action so
736 # that we can recover the underlying values. This is done in rules
737 # below, as well. The code assumes that we haven't added 2 actions.
738 # Shoul a later Unicode release break that assumption, then tests
739 # should start failing.
740 $lb_table[$lb_enums{'SY'}][$lb_enums{'NU'}]
741 += $lb_actions{'LB_SY_or_IS_then_various'};
742 $lb_table[$lb_enums{'SY'}][$lb_enums{'SY'}]
743 += $lb_actions{'LB_SY_or_IS_then_various'};
744 $lb_table[$lb_enums{'SY'}][$lb_enums{'IS'}]
745 += $lb_actions{'LB_SY_or_IS_then_various'};
746 $lb_table[$lb_enums{'SY'}][$lb_enums{'CL'}]
747 += $lb_actions{'LB_SY_or_IS_then_various'};
748 $lb_table[$lb_enums{'SY'}][$lb_enums{'CP'}]
749 += $lb_actions{'LB_SY_or_IS_then_various'};
750 $lb_table[$lb_enums{'IS'}][$lb_enums{'NU'}]
751 += $lb_actions{'LB_SY_or_IS_then_various'};
752 $lb_table[$lb_enums{'IS'}][$lb_enums{'SY'}]
753 += $lb_actions{'LB_SY_or_IS_then_various'};
754 $lb_table[$lb_enums{'IS'}][$lb_enums{'IS'}]
755 += $lb_actions{'LB_SY_or_IS_then_various'};
756 $lb_table[$lb_enums{'IS'}][$lb_enums{'CL'}]
757 += $lb_actions{'LB_SY_or_IS_then_various'};
758 $lb_table[$lb_enums{'IS'}][$lb_enums{'CP'}]
759 += $lb_actions{'LB_SY_or_IS_then_various'};
760
761 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
762 # which can be rewritten as:
763 # NU (SY | IS)* (CL | CP)? × (PO | PR)
764 $lb_table[$lb_enums{'NU'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
765 $lb_table[$lb_enums{'NU'}][$lb_enums{'PR'}] = $lb_actions{'LB_NOBREAK'};
766
767 $lb_table[$lb_enums{'CP'}][$lb_enums{'PO'}]
768 += $lb_actions{'LB_various_then_PO_or_PR'};
769 $lb_table[$lb_enums{'CL'}][$lb_enums{'PO'}]
770 += $lb_actions{'LB_various_then_PO_or_PR'};
771 $lb_table[$lb_enums{'IS'}][$lb_enums{'PO'}]
772 += $lb_actions{'LB_various_then_PO_or_PR'};
773 $lb_table[$lb_enums{'SY'}][$lb_enums{'PO'}]
774 += $lb_actions{'LB_various_then_PO_or_PR'};
775
776 $lb_table[$lb_enums{'CP'}][$lb_enums{'PR'}]
777 += $lb_actions{'LB_various_then_PO_or_PR'};
778 $lb_table[$lb_enums{'CL'}][$lb_enums{'PR'}]
779 += $lb_actions{'LB_various_then_PO_or_PR'};
780 $lb_table[$lb_enums{'IS'}][$lb_enums{'PR'}]
781 += $lb_actions{'LB_various_then_PO_or_PR'};
782 $lb_table[$lb_enums{'SY'}][$lb_enums{'PR'}]
783 += $lb_actions{'LB_various_then_PO_or_PR'};
784
785 # LB24 Do not break between prefix and letters or ideographs.
786 # PR × ID
787 $lb_table[$lb_enums{'PR'}][$lb_enums{'ID'}] = $lb_actions{'LB_NOBREAK'};
788
789 # PR × (AL | HL)
790 $lb_table[$lb_enums{'PR'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
791 $lb_table[$lb_enums{'PR'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
792
793 # PO × (AL | HL)
794 $lb_table[$lb_enums{'PO'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
795 $lb_table[$lb_enums{'PO'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
796
797 # LB23 Do not break within ‘a9’, ‘3a’, or ‘H%’.
798 # ID × PO
799 $lb_table[$lb_enums{'ID'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
800
801 # (AL | HL) × NU
802 $lb_table[$lb_enums{'AL'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
803 $lb_table[$lb_enums{'HL'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
804
805 # NU × (AL | HL)
806 $lb_table[$lb_enums{'NU'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
807 $lb_table[$lb_enums{'NU'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
808
809 # LB22 Do not break between two ellipses, or between letters, numbers or
810 # exclamations and ellipsis.
811 # (AL | HL) × IN
812 $lb_table[$lb_enums{'AL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
813 $lb_table[$lb_enums{'HL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
814
815 # EX × IN
816 $lb_table[$lb_enums{'EX'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
817
818 # ID × IN
819 $lb_table[$lb_enums{'ID'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
820
821 # IN × IN
822 $lb_table[$lb_enums{'IN'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
823
824 # NU × IN
825 $lb_table[$lb_enums{'NU'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
826
827 # LB21b Don’t break between Solidus and Hebrew letters.
828 # SY × HL
829 $lb_table[$lb_enums{'SY'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
830
831 # LB21a Don't break after Hebrew + Hyphen.
832 # HL (HY | BA) ×
833 for my $i (0 .. @lb_table - 1) {
834 $lb_table[$lb_enums{'HY'}][$i] += $lb_actions{'LB_HY_or_BA_then_foo'};
835 $lb_table[$lb_enums{'BA'}][$i] += $lb_actions{'LB_HY_or_BA_then_foo'};
836 }
837
838 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
839 # spaces, small kana, and other non-starters, or after acute accents.
840 # × BA
841 # × HY
842 # × NS
843 # BB ×
844 for my $i (0 .. @lb_table - 1) {
845 $lb_table[$i][$lb_enums{'BA'}] = $lb_actions{'LB_NOBREAK'};
846 $lb_table[$i][$lb_enums{'HY'}] = $lb_actions{'LB_NOBREAK'};
847 $lb_table[$i][$lb_enums{'NS'}] = $lb_actions{'LB_NOBREAK'};
848 $lb_table[$lb_enums{'BB'}][$i] = $lb_actions{'LB_NOBREAK'};
849 }
850
851 # LB20 Break before and after unresolved CB.
852 # ÷ CB
853 # CB ÷
854 # Conditional breaks should be resolved external to the line breaking
855 # rules. However, the default action is to treat unresolved CB as breaking
856 # before and after.
857 for my $i (0 .. @lb_table - 1) {
858 $lb_table[$i][$lb_enums{'CB'}] = $lb_actions{'LB_BREAKABLE'};
859 $lb_table[$lb_enums{'CB'}][$i] = $lb_actions{'LB_BREAKABLE'};
860 }
861
862 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
863 # × QU
864 # QU ×
865 for my $i (0 .. @lb_table - 1) {
866 $lb_table[$i][$lb_enums{'QU'}] = $lb_actions{'LB_NOBREAK'};
867 $lb_table[$lb_enums{'QU'}][$i] = $lb_actions{'LB_NOBREAK'};
868 }
869
870 # LB18 Break after spaces
871 # SP ÷
872 for my $i (0 .. @lb_table - 1) {
873 $lb_table[$lb_enums{'SP'}][$i] = $lb_actions{'LB_BREAKABLE'};
874 }
875
876 # LB17 Do not break within ‘——’, even with intervening spaces.
877 # B2 SP* × B2
878 $lb_table[$lb_enums{'B2'}][$lb_enums{'B2'}]
879 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
880
881 # LB16 Do not break between closing punctuation and a nonstarter even with
882 # intervening spaces.
883 # (CL | CP) SP* × NS
884 $lb_table[$lb_enums{'CL'}][$lb_enums{'NS'}]
885 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
886 $lb_table[$lb_enums{'CP'}][$lb_enums{'NS'}]
887 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
888
889
890 # LB15 Do not break within ‘”[’, even with intervening spaces.
891 # QU SP* × OP
892 $lb_table[$lb_enums{'QU'}][$lb_enums{'OP'}]
893 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
894
895 # LB14 Do not break after ‘[’, even after spaces.
896 # OP SP* ×
897 for my $i (0 .. @lb_table - 1) {
898 $lb_table[$lb_enums{'OP'}][$i]
899 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
900 }
901
902 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
903 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
904 # [^NU] × CL
905 # [^NU] × CP
906 # × EX
907 # [^NU] × IS
908 # [^NU] × SY
909 for my $i (0 .. @lb_table - 1) {
910 $lb_table[$i][$lb_enums{'EX'}]
911 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
912
913 next if $i == $lb_enums{'NU'};
914
915 $lb_table[$i][$lb_enums{'CL'}]
916 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
917 $lb_table[$i][$lb_enums{'CP'}]
918 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
919 $lb_table[$i][$lb_enums{'IS'}]
920 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
921 $lb_table[$i][$lb_enums{'SY'}]
922 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
923 }
924
925 # LB12a Do not break before NBSP and related characters, except after
926 # spaces and hyphens.
927 # [^SP BA HY] × GL
928 for my $i (0 .. @lb_table - 1) {
929 next if $i == $lb_enums{'SP'}
930 || $i == $lb_enums{'BA'}
931 || $i == $lb_enums{'HY'};
932
933 # We don't break, but if a property above has said don't break even
934 # with space between, don't override that (also in the next few rules)
935 next if $lb_table[$i][$lb_enums{'GL'}]
936 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
937 $lb_table[$i][$lb_enums{'GL'}] = $lb_actions{'LB_NOBREAK'};
938 }
939
940 # LB12 Do not break after NBSP and related characters.
941 # GL ×
942 for my $i (0 .. @lb_table - 1) {
943 next if $lb_table[$lb_enums{'GL'}][$i]
944 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
945 $lb_table[$lb_enums{'GL'}][$i] = $lb_actions{'LB_NOBREAK'};
946 }
947
948 # LB11 Do not break before or after Word joiner and related characters.
949 # × WJ
950 # WJ ×
951 for my $i (0 .. @lb_table - 1) {
952 if ($lb_table[$i][$lb_enums{'WJ'}]
953 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
954 {
955 $lb_table[$i][$lb_enums{'WJ'}] = $lb_actions{'LB_NOBREAK'};
956 }
957 if ($lb_table[$lb_enums{'WJ'}][$i]
958 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
959 {
960 $lb_table[$lb_enums{'WJ'}][$i] = $lb_actions{'LB_NOBREAK'};
961 }
962 }
963
964 # Special case this here to avoid having to do a special case in the code,
965 # by making this the same as other things with a SP in front of them that
966 # don't break, we avoid an extra test
967 $lb_table[$lb_enums{'SP'}][$lb_enums{'WJ'}]
968 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
969
970 # LB9 and LB10 are done in the same loop
971 #
972 # LB9 Do not break a combining character sequence; treat it as if it has
973 # the line breaking class of the base character in all of the
974 # higher-numbered rules.
975 # Treat X CM* as if it were X.
976 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
977
978 # LB10 Treat any remaining combining mark as AL. This catches the case
979 # where a CM is the first character on the line or follows SP, BK, CR, LF,
980 # NL, or ZW.
981 for my $i (0 .. @lb_table - 1) {
982
983 # When the CM is the first in the pair, we don't know without looking
984 # behind whether the CM is going to inherit from an earlier character,
985 # or not. So have to figure this out in the code
986 $lb_table[$lb_enums{'CM'}][$i] = $lb_actions{'LB_CM_foo'};
987
988 if ( $i == $lb_enums{'BK'}
989 || $i == $lb_enums{'ed'}
990 || $i == $lb_enums{'CR'}
991 || $i == $lb_enums{'LF'}
992 || $i == $lb_enums{'NL'}
993 || $i == $lb_enums{'SP'}
994 || $i == $lb_enums{'ZW'})
995 {
996 # For these classes, a following CM doesn't combine, and should do
997 # whatever 'AL' would do.
998 $lb_table[$i][$lb_enums{'CM'}] = $lb_table[$i][$lb_enums{'AL'}];
999 }
1000 else {
1001 # For these classes, the CM combines, so doesn't break, inheriting
1002 # the type of nobreak from the master character.
1003 if ($lb_table[$i][$lb_enums{'CM'}]
1004 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1005 {
1006 $lb_table[$i][$lb_enums{'CM'}] = $lb_actions{'LB_NOBREAK'};
1007 }
1008 }
1009 }
1010
1011 # LB8 Break before any character following a zero-width space, even if one
1012 # or more spaces intervene.
1013 # ZW SP* ÷
1014 for my $i (0 .. @lb_table - 1) {
1015 $lb_table[$lb_enums{'ZW'}][$i] = $lb_actions{'LB_BREAKABLE'};
1016 }
1017
1018 # Because of LB8-10, we need to look at context for "SP x", and this must
1019 # be done in the code. So override the existing rules for that, by adding
1020 # a constant to get new rules that tell the code it needs to look at
1021 # context. By adding this action instead of replacing the existing one,
1022 # we can get back to the original rule if necessary.
1023 for my $i (0 .. @lb_table - 1) {
1024 $lb_table[$lb_enums{'SP'}][$i] += $lb_actions{'LB_SP_foo'};
1025 }
1026
1027 # LB7 Do not break before spaces or zero width space.
1028 # × SP
1029 # × ZW
1030 for my $i (0 .. @lb_table - 1) {
1031 $lb_table[$i][$lb_enums{'SP'}] = $lb_actions{'LB_NOBREAK'};
1032 $lb_table[$i][$lb_enums{'ZW'}] = $lb_actions{'LB_NOBREAK'};
1033 }
1034
1035 # LB6 Do not break before hard line breaks.
1036 # × ( BK | CR | LF | NL )
1037 for my $i (0 .. @lb_table - 1) {
1038 $lb_table[$i][$lb_enums{'BK'}] = $lb_actions{'LB_NOBREAK'};
1039 $lb_table[$i][$lb_enums{'CR'}] = $lb_actions{'LB_NOBREAK'};
1040 $lb_table[$i][$lb_enums{'LF'}] = $lb_actions{'LB_NOBREAK'};
1041 $lb_table[$i][$lb_enums{'NL'}] = $lb_actions{'LB_NOBREAK'};
1042 }
1043
1044 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1045 # CR × LF
1046 # CR !
1047 # LF !
1048 # NL !
1049 for my $i (0 .. @lb_table - 1) {
1050 $lb_table[$lb_enums{'CR'}][$i] = $lb_actions{'LB_BREAKABLE'};
1051 $lb_table[$lb_enums{'LF'}][$i] = $lb_actions{'LB_BREAKABLE'};
1052 $lb_table[$lb_enums{'NL'}][$i] = $lb_actions{'LB_BREAKABLE'};
1053 }
1054 $lb_table[$lb_enums{'CR'}][$lb_enums{'LF'}] = $lb_actions{'LB_NOBREAK'};
1055
1056 # LB4 Always break after hard line breaks.
1057 # BK !
1058 for my $i (0 .. @lb_table - 1) {
1059 $lb_table[$lb_enums{'BK'}][$i] = $lb_actions{'LB_BREAKABLE'};
1060 }
1061
1062 # LB2 Never break at the start of text.
1063 # sot ×
1064 # LB3 Always break at the end of text.
1065 # ! eot
1066 # but these are reversed in the loop below, so that won't break if there
1067 # is no text
1068 for my $i (0 .. @lb_table - 1) {
1069 $lb_table[$i][$lb_enums{'ed'}] = $lb_actions{'LB_BREAKABLE'};
1070 $lb_table[$lb_enums{'ed'}][$i] = $lb_actions{'LB_NOBREAK'};
1071 }
1072
1073 # LB1 Assign a line breaking class to each code point of the input.
1074 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1075 # depending on criteria outside the scope of this algorithm.
1076 #
1077 # In the absence of such criteria all characters with a specific
1078 # combination of original class and General_Category property value are
1079 # resolved as follows:
1080 # Original Resolved General_Category
1081 # AI, SG, XX AL Any
1082 # SA CM Only Mn or Mc
1083 # SA AL Any except Mn and Mc
1084 # CJ NS Any
1085 #
1086 # This is done in mktables, so we never see any of the remapped-from
1087 # classes.
1088
1089 print $out_fh "\nstatic const U8 LB_table[$table_size][$table_size] = {\n";
1090 print $out_fh "\n/* 'ed' stands for 'edge' */\n";
78e3ac85 1091 print $out_fh "/* ";
6b659339
KW
1092 for my $i (0 .. @lb_table - 1) {
1093 print $out_fh " $lb_short_enums[$i]";
1094 }
1095 print $out_fh " */\n";
1096
1097 for my $i (0 .. @lb_table - 1) {
1098 print $out_fh "/* $lb_short_enums[$i] */ ";
78e3ac85
AC
1099 print $out_fh "{ ";
1100 print $out_fh join ", ", map sprintf("%2d", $_), @{ $lb_table[$i] };
1101 print $out_fh " }";
1102 print $out_fh "," if $i < @lb_table - 1;
6b659339
KW
1103 print $out_fh "\n";
1104 }
1105
1106 print $out_fh "};\n";
6b659339
KW
1107}
1108
9d9177be
KW
1109output_invlist("Latin1", [ 0, 256 ]);
1110output_invlist("AboveLatin1", [ 256 ]);
1111
bffc0129 1112end_file_pound_if;
43b443dd 1113
3f427fd9
KW
1114# We construct lists for all the POSIX and backslash sequence character
1115# classes in two forms:
1116# 1) ones which match only in the ASCII range
1117# 2) ones which match either in the Latin1 range, or the entire Unicode range
1118#
1119# These get compiled in, and hence affect the memory footprint of every Perl
1120# program, even those not using Unicode. To minimize the size, currently
1121# the Latin1 version is generated for the beyond ASCII range except for those
1122# lists that are quite small for the entire range, such as for \s, which is 22
1123# UVs long plus 4 UVs (currently) for the header.
1124#
1125# To save even more memory, the ASCII versions could be derived from the
1126# larger ones at runtime, saving some memory (minus the expense of the machine
1127# instructions to do so), but these are all small anyway, so their total is
1128# about 100 UVs.
1129#
1130# In the list of properties below that get generated, the L1 prefix is a fake
1131# property that means just the Latin1 range of the full property (whose name
1132# has an X prefix instead of L1).
a02047bf
KW
1133#
1134# An initial & means to use the subroutine from this file instead of an
1135# official inversion list.
3f427fd9 1136
0c4ecf42
KW
1137for my $charset (get_supported_code_pages()) {
1138 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1139
99f21fb9
KW
1140 @a2n = @{get_a2n($charset)};
1141 no warnings 'qw';
1142 # Ignore non-alpha in sort
1143 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1c8c3428
KW
1144 ASCII
1145 Cased
1146 VertSpace
1147 XPerlSpace
1148 XPosixAlnum
1149 XPosixAlpha
1150 XPosixBlank
1151 XPosixCntrl
1152 XPosixDigit
1153 XPosixGraph
1154 XPosixLower
1155 XPosixPrint
1156 XPosixPunct
1157 XPosixSpace
1158 XPosixUpper
1159 XPosixWord
1160 XPosixXDigit
1161 _Perl_Any_Folds
1162 &NonL1_Perl_Non_Final_Folds
1163 _Perl_Folds_To_Multi_Char
1164 &UpperLatin1
1165 _Perl_IDStart
1166 _Perl_IDCont
02f811dd 1167 _Perl_GCB,EDGE
ca8226cf 1168 _Perl_LB,EDGE
bf4268fa 1169 _Perl_SB,EDGE
190d69bb 1170 _Perl_WB,EDGE,UNKNOWN
1c8c3428 1171 )
0f5e3c71
KW
1172 ) {
1173
1174 # For the Latin1 properties, we change to use the eXtended version of the
1175 # base property, then go through the result and get rid of everything not
1176 # in Latin1 (above 255). Actually, we retain the element for the range
1177 # that crosses the 255/256 boundary if it is one that matches the
1178 # property. For example, in the Word property, there is a range of code
1179 # points that start at U+00F8 and goes through U+02C1. Instead of
1180 # artificially cutting that off at 256 because 256 is the first code point
1181 # above Latin1, we let the range go to its natural ending. That gives us
1182 # extra information with no added space taken. But if the range that
1183 # crosses the boundary is one that doesn't match the property, we don't
1184 # start a new range above 255, as that could be construed as going to
1185 # infinity. For example, the Upper property doesn't include the character
1186 # at 255, but does include the one at 256. We don't include the 256 one.
1187 my $prop_name = $prop;
1188 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
1189 my $extra_enums = "";
1190 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
1191 my $lookup_prop = $prop_name;
1192 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1193 or $lookup_prop =~ s/^L1//);
1194 my $nonl1_only = 0;
1195 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 1196 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
1197
1198 my @invlist;
99f21fb9
KW
1199 my @invmap;
1200 my $map_format;
1201 my $map_default;
1202 my $maps_to_code_point;
1203 my $to_adjust;
0f5e3c71
KW
1204 if ($is_local_sub) {
1205 @invlist = eval $lookup_prop;
1206 }
1207 else {
1208 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 1209 if (! @invlist) {
99f21fb9 1210
ad85f59a
KW
1211 # If couldn't find a non-empty inversion list, see if it is
1212 # instead an inversion map
1213 my ($list_ref, $map_ref, $format, $default)
99f21fb9 1214 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
1215 if (! $list_ref) {
1216 # An empty return here could mean an unknown property, or
1217 # merely that the original inversion list is empty. Call
1218 # in scalar context to differentiate
1219 my $count = prop_invlist($lookup_prop,
1220 '_perl_core_internal_ok');
1221 die "Could not find inversion list for '$lookup_prop'"
1222 unless defined $count;
1223 }
1224 else {
18b852b3
KW
1225 @invlist = @$list_ref;
1226 @invmap = @$map_ref;
1227 $map_format = $format;
1228 $map_default = $default;
1229 $maps_to_code_point = $map_format =~ /x/;
1230 $to_adjust = $map_format =~ /a/;
ad85f59a 1231 }
99f21fb9 1232 }
0f5e3c71 1233 }
ad85f59a
KW
1234
1235
1236 # Short-circuit an empty inversion list.
1237 if (! @invlist) {
1238 output_invlist($prop_name, \@invlist, $charset);
1239 next;
1240 }
ceb1de32 1241
99f21fb9
KW
1242 # Re-order the Unicode code points to native ones for this platform.
1243 # This is only needed for code points below 256, because native code
1244 # points are only in that range. For inversion maps of properties
1245 # where the mappings are adjusted (format =~ /a/), this reordering
1246 # could mess up the adjustment pattern that was in the input, so that
1247 # has to be dealt with.
1248 #
1249 # And inversion maps that map to code points need to eventually have
1250 # all those code points remapped to native, and it's better to do that
1251 # here, going through the whole list not just those below 256. This
1252 # is because some inversion maps have adjustments (format =~ /a/)
1253 # which may be affected by the reordering. This code needs to be done
1254 # both for when we are translating the inversion lists for < 256, and
1255 # for the inversion maps for everything. By doing both in this loop,
1256 # we can share that code.
1257 #
1258 # So, we go through everything for an inversion map to code points;
1259 # otherwise, we can skip any remapping at all if we are going to
1260 # output only the above-Latin1 values, or if the range spans the whole
1261 # of 0..256, as the remap will also include all of 0..256 (256 not
1262 # 255 because a re-ordering could cause 256 to need to be in the same
1263 # range as 255.)
1264 if ((@invmap && $maps_to_code_point)
1265 || (! $nonl1_only || ($invlist[0] < 256
1266 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 1267 {
fb4554ea 1268
99f21fb9 1269 if (! @invmap) { # Straight inversion list
fb4554ea
KW
1270 # Look at all the ranges that start before 257.
1271 my @latin1_list;
1272 while (@invlist) {
1273 last if $invlist[0] > 256;
1274 my $upper = @invlist > 1
1275 ? $invlist[1] - 1 # In range
8a6c81cf
KW
1276
1277 # To infinity. You may want to stop much much
1278 # earlier; going this high may expose perl
1279 # deficiencies with very large numbers.
1280 : $Unicode::UCD::MAX_CP;
fb4554ea 1281 for my $j ($invlist[0] .. $upper) {
99f21fb9 1282 push @latin1_list, a2n($j);
0f5e3c71 1283 }
fb4554ea
KW
1284
1285 shift @invlist; # Shift off the range that's in the list
1286 shift @invlist; # Shift off the range not in the list
0c4ecf42 1287 }
fb4554ea
KW
1288
1289 # Here @invlist contains all the ranges in the original that start
1290 # at code points above 256, and @latin1_list contains all the
1291 # native code points for ranges that start with a Unicode code
1292 # point below 257. We sort the latter and convert it to inversion
1293 # list format. Then simply prepend it to the list of the higher
1294 # code points.
1295 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 1296 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 1297 unshift @invlist, @latin1_list;
99f21fb9
KW
1298 }
1299 else { # Is an inversion map
1300
1301 # This is a similar procedure as plain inversion list, but has
1302 # multiple buckets. A plain inversion list just has two
1303 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1304 # pretty much can ignore the 2nd bucket, as it is completely
1305 # defined by the 1st. But here, what we do is create buckets
1306 # which contain the code points that map to each, translated
1307 # to native and turned into an inversion list. Thus each
1308 # bucket is an inversion list of native code points that map
1309 # to it or don't map to it. We use these to create an
1310 # inversion map for the whole property.
1311
1312 # As mentioned earlier, we use this procedure to not just
1313 # remap the inversion list to native values, but also the maps
1314 # of code points to native ones. In the latter case we have
1315 # to look at the whole of the inversion map (or at least to
1316 # above Unicode; as the maps of code points above that should
1317 # all be to the default).
1318 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1319
1320 my %mapped_lists; # A hash whose keys are the buckets.
1321 while (@invlist) {
1322 last if $invlist[0] > $upper_limit;
1323
1324 # This shouldn't actually happen, as prop_invmap() returns
1325 # an extra element at the end that is beyond $upper_limit
1326 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1327
1328 my $bucket;
1329
1330 # A hash key can't be a ref (we are only expecting arrays
1331 # of scalars here), so convert any such to a string that
1332 # will be converted back later (using a vertical tab as
1333 # the separator). Even if the mapping is to code points,
1334 # we don't translate to native here because the code
1335 # output_map() calls to output these arrays assumes the
1336 # input is Unicode, not native.
1337 if (ref $invmap[0]) {
1338 $bucket = join "\cK", @{$invmap[0]};
1339 }
1340 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1341
1342 # Do convert to native for maps to single code points.
1343 # There are some properties that have a few outlier
1344 # maps that aren't code points, so the above test
1345 # skips those.
1346 $bucket = a2n($invmap[0]);
1347 } else {
1348 $bucket = $invmap[0];
1349 }
1350
1351 # We now have the bucket that all code points in the range
1352 # map to, though possibly they need to be adjusted. Go
1353 # through the range and put each translated code point in
1354 # it into its bucket.
1355 my $base_map = $invmap[0];
1356 for my $j ($invlist[0] .. $invlist[1] - 1) {
1357 if ($to_adjust
1358 # The 1st code point doesn't need adjusting
1359 && $j > $invlist[0]
1360
1361 # Skip any non-numeric maps: these are outliers
1362 # that aren't code points.
1363 && $base_map =~ $numeric_re
1364
1365 # 'ne' because the default can be a string
1366 && $base_map ne $map_default)
1367 {
1368 # We adjust, by incrementing each the bucket and
1369 # the map. For code point maps, translate to
1370 # native
1371 $base_map++;
1372 $bucket = ($maps_to_code_point)
1373 ? a2n($base_map)
1374 : $base_map;
1375 }
1376
1377 # Add the native code point to the bucket for the
1378 # current map
1379 push @{$mapped_lists{$bucket}}, a2n($j);
1380 } # End of loop through all code points in the range
1381
1382 # Get ready for the next range
1383 shift @invlist;
1384 shift @invmap;
1385 } # End of loop through all ranges in the map.
1386
1387 # Here, @invlist and @invmap retain all the ranges from the
1388 # originals that start with code points above $upper_limit.
1389 # Each bucket in %mapped_lists contains all the code points
1390 # that map to that bucket. If the bucket is for a map to a
1391 # single code point is a single code point, the bucket has
1392 # been converted to native. If something else (including
1393 # multiple code points), no conversion is done.
1394 #
1395 # Now we recreate the inversion map into %xlated, but this
1396 # time for the native character set.
1397 my %xlated;
1398 foreach my $bucket (keys %mapped_lists) {
1399
1400 # Sort and convert this bucket to an inversion list. The
1401 # result will be that ranges that start with even-numbered
1402 # indexes will be for code points that map to this bucket;
1403 # odd ones map to some other bucket, and are discarded
1404 # below.
1405 @{$mapped_lists{$bucket}}
1406 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
1407 @{$mapped_lists{$bucket}}
1408 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
1409
1410 # Add each even-numbered range in the bucket to %xlated;
1411 # so that the keys of %xlated become the range start code
1412 # points, and the values are their corresponding maps.
1413 while (@{$mapped_lists{$bucket}}) {
1414 my $range_start = $mapped_lists{$bucket}->[0];
1415 if ($bucket =~ /\cK/) {
1416 @{$xlated{$range_start}} = split /\cK/, $bucket;
1417 }
1418 else {
1419 $xlated{$range_start} = $bucket;
1420 }
1421 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
1422 shift @{$mapped_lists{$bucket}}; # Get ready for next
1423 # iteration
1424 }
1425 } # End of loop through all the buckets.
1426
1427 # Here %xlated's keys are the range starts of all the code
1428 # points in the inversion map. Construct an inversion list
1429 # from them.
1430 my @new_invlist = sort { $a <=> $b } keys %xlated;
1431
1432 # If the list is adjusted, we want to munge this list so that
1433 # we only have one entry for where consecutive code points map
1434 # to consecutive values. We just skip the subsequent entries
1435 # where this is the case.
1436 if ($to_adjust) {
1437 my @temp;
1438 for my $i (0 .. @new_invlist - 1) {
1439 next if $i > 0
1440 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
1441 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
1442 && $xlated{$new_invlist[$i]} =~ $numeric_re
1443 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
1444 push @temp, $new_invlist[$i];
1445 }
1446 @new_invlist = @temp;
1447 }
1448
1449 # The inversion map comes from %xlated's values. We can
1450 # unshift each onto the front of the untouched portion, in
1451 # reverse order of the portion we did process.
1452 foreach my $start (reverse @new_invlist) {
1453 unshift @invmap, $xlated{$start};
1454 }
1455
1456 # Finally prepend the inversion list we have just constructed to the
1457 # one that contains anything we didn't process.
1458 unshift @invlist, @new_invlist;
1459 }
1460 }
1461
1462 # prop_invmap() returns an extra final entry, which we can now
1463 # discard.
1464 if (@invmap) {
1465 pop @invlist;
1466 pop @invmap;
ceb1de32 1467 }
0f5e3c71
KW
1468
1469 if ($l1_only) {
99f21fb9 1470 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
1471 for my $i (0 .. @invlist - 1 - 1) {
1472 if ($invlist[$i] > 255) {
1473
1474 # In an inversion list, even-numbered elements give the code
1475 # points that begin ranges that match the property;
1476 # odd-numbered give ones that begin ranges that don't match.
1477 # If $i is odd, we are at the first code point above 255 that
1478 # doesn't match, which means the range it is ending does
1479 # match, and crosses the 255/256 boundary. We want to include
1480 # this ending point, so increment $i, so the splice below
1481 # includes it. Conversely, if $i is even, it is the first
1482 # code point above 255 that matches, which means there was no
1483 # matching range that crossed the boundary, and we don't want
1484 # to include this code point, so splice before it.
1485 $i++ if $i % 2 != 0;
1486
1487 # Remove everything past this.
1488 splice @invlist, $i;
99f21fb9 1489 splice @invmap, $i if @invmap;
0f5e3c71
KW
1490 last;
1491 }
0c4ecf42
KW
1492 }
1493 }
0f5e3c71
KW
1494 elsif ($nonl1_only) {
1495 my $found_nonl1 = 0;
1496 for my $i (0 .. @invlist - 1 - 1) {
1497 next if $invlist[$i] < 256;
1498
1499 # Here, we have the first element in the array that indicates an
1500 # element above Latin1. Get rid of all previous ones.
1501 splice @invlist, 0, $i;
99f21fb9 1502 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
1503
1504 # If this one's index is not divisible by 2, it means that this
1505 # element is inverting away from being in the list, which means
99f21fb9
KW
1506 # all code points from 256 to this one are in this list (or
1507 # map to the default for inversion maps)
1508 if ($i % 2 != 0) {
1509 unshift @invlist, 256;
1510 unshift @invmap, $map_default if @invmap;
1511 }
0f5e3c71 1512 $found_nonl1 = 1;
3f427fd9
KW
1513 last;
1514 }
0f5e3c71 1515 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 1516 }
3f427fd9 1517
0f5e3c71 1518 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 1519 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 1520 }
bffc0129 1521 end_file_pound_if;
0c4ecf42 1522 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
1523}
1524
973a28ed
KW
1525switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
1526
1527output_GCB_table();
6b659339
KW
1528output_LB_table();
1529
973a28ed
KW
1530end_file_pound_if;
1531
2308ab83 1532my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
1533my @sources = ($0, qw(lib/unicore/mktables
1534 lib/Unicode/UCD.pm
1535 regen/charset_translations.pl
1536 ));
9a3da3ad
FC
1537{
1538 # Depend on mktables’ own sources. It’s a shorter list of files than
1539 # those that Unicode::UCD uses.
2308ab83
KW
1540 if (! open my $mktables_list, $sources_list) {
1541
1542 # This should force a rebuild once $sources_list exists
1543 push @sources, $sources_list;
1544 }
1545 else {
1546 while(<$mktables_list>) {
1547 last if /===/;
1548 chomp;
1549 push @sources, "lib/unicore/$_" if /^[^#]/;
1550 }
9a3da3ad
FC
1551 }
1552}
6b659339
KW
1553
1554read_only_bottom_close_and_rename($out_fh, \@sources);