This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Tell mktables what Unicode version mk_invlist.pl handles
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
# Magic number written into the header of every generated inversion list (see
# output_invlist()).
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;

# Matches an integer or a float: an optional minus sign, digits, and an
# optional fractional part.  The fractional part is a non-capturing group;
# the earlier spelling '(:?' was a typo for '(?:' that let a stray colon
# through (e.g. "1:.5") and created a needless capture group.
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;

# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
my $enum_name_re = qr/ ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
# In headings, how wide a name is allowed?
my $max_hdr_len = 3;

# The text of the #if condition the output is currently nested in, or 0 when
# it isn't inside one.  Maintained by switch_pound_if() and
# end_file_pound_if().
my $in_file_pound_if = 0;

# The generated header; everything below prints to this handle.
my $out_fh = open_new(
    'charclass_invlists.h',
    '>',
    {
        style => '*',
        by    => $0,
        from  => "Unicode::UCD",
    },
);

print {$out_fh} "/* See the generating file for comments */\n\n";
46
bffc0129
KW
47# The symbols generated by this program are all currently defined only in a
48# single dot c each. The code knows where most of them go, but this hash
49# gives overrides for the exceptions to the typical place
my %exceptions_to_where_to_define = (

    # Symbols that are to be guarded by PERL_IN_REGCOMP_C
    ( map { $_ => 'PERL_IN_REGCOMP_C' } qw(
            NonL1_Perl_Non_Final_Folds
            AboveLatin1
            Latin1
            UpperLatin1
            _Perl_Any_Folds
            _Perl_Folds_To_Multi_Char
      ) ),

    # Symbols that are to be guarded by PERL_IN_UTF8_C
    ( map { $_ => 'PERL_IN_UTF8_C' } qw(
            _Perl_IDCont
            _Perl_IDStart
      ) ),
);
015bb97c 60
f79a09fc 61# This hash contains the properties with enums that have hard-coded references
# to them in C code.  It is needed to make sure that if perl is compiled
f79a09fc
KW
63# with an older Unicode data set, that all the enum values the code is
64# expecting will still be in the enum typedef. Thus the code doesn't have to
289ce9cc
KW
65# change. The Unicode version won't have any code points that have the enum
66# values not in that version, so the code that handles them will not get
67# exercised. This is far better than having to #ifdef things. The names here
68# should be the long names of the respective property values. The reason for
69# this is because regexec.c uses them as case labels, and the long name is
70# generally more understandable than the short.
f79a09fc
KW
# Keyed by the lowercased short name of the property; each value lists the
# long names of the property values that must always be present in the
# generated enum.
my %hard_coded_enums = (
    gcb => [ qw(
                Control
                CR
                Extend
                L
                LF
                LV
                LVT
                Other
                Prepend
                Regional_Indicator
                SpacingMark
                T
                V
           ) ],
    lb  => [ qw(
                Alphabetic
                Break_After
                Break_Before
                Break_Both
                Break_Symbols
                Carriage_Return
                Close_Parenthesis
                Close_Punctuation
                Combining_Mark
                Contingent_Break
                Exclamation
                Glue
                H2
                H3
                Hebrew_Letter
                Hyphen
                Ideographic
                Infix_Numeric
                Inseparable
                JL
                JT
                JV
                Line_Feed
                Mandatory_Break
                Next_Line
                Nonstarter
                Numeric
                Open_Punctuation
                Postfix_Numeric
                Prefix_Numeric
                Quotation
                Regional_Indicator
                Space
                Word_Joiner
                ZWSpace
           ) ],
    sb  => [ qw(
                ATerm
                Close
                CR
                Extend
                Format
                LF
                Lower
                Numeric
                OLetter
                Other
                SContinue
                Sep
                Sp
                STerm
                Upper
           ) ],
    wb  => [ qw(
                ALetter
                CR
                Double_Quote
                Extend
                ExtendNumLet
                Format
                Hebrew_Letter
                Katakana
                LF
                MidLetter
                MidNum
                MidNumLet
                Newline
                Numeric
                Other
                Perl_Tailored_HSpace
                Regional_Indicator
                Single_Quote
           ) ],
);
162
973a28ed
KW
# Per-property bookkeeping for the GCB, LB and WB pair tables.  For each
# property there is a map from value name to enum number, a list of short
# names indexed by enum number, and a map from an abbreviated heading back to
# the name it stands for.  output_invmap() fills these in through string
# evals, so the variable names must keep the form <property>_enums,
# <property>_short_enums and <property>_abbreviations.
my (%gcb_enums, @gcb_short_enums, %gcb_abbreviations);
my (%lb_enums,  @lb_short_enums,  %lb_abbreviations);
my (%wb_enums,  @wb_short_enums,  %wb_abbreviations);

# Table that a2n() indexes by code point
my @a2n;
174
sub uniques {
    # Returns the arguments with repeats dropped.  The first occurrence of
    # each value is the one kept, so the relative order of the survivors is
    # the same as in the input.

    my %already_seen;
    my @distinct;
    for my $value (@_) {
        next if $already_seen{$value}++;
        push @distinct, $value;
    }
    return @distinct;
}
182
sub a2n($) {
    my ($code_point) = @_;

    # Translates the input through the @a2n table.  Only numbers in the range
    # the table is consulted for (at most 255) are looked up; anything that
    # doesn't look like a number, or that is larger, is handed back as-is.

    if ($code_point =~ $numeric_re && $code_point <= 255) {
        return $a2n[$code_point];
    }

    return $code_point;
}
191
bffc0129
KW
sub end_file_pound_if {
    # If the output is currently inside a '#if', emit the matching '#endif'
    # (annotated with the condition it closes) and note that we are no longer
    # inside one.  Does nothing otherwise.

    if ($in_file_pound_if) {
        my $closing_line = "\n#endif\t/* $in_file_pound_if */\n";
        print {$out_fh} $closing_line;
        $in_file_pound_if = 0;
    }
}
198
sub switch_pound_if ($$) {
    my $name = shift;
    my $new_pound_if = shift;

    # Make sure the output is inside '#if defined(SYMBOL)', where SYMBOL is
    # the 2nd argument, unless %exceptions_to_where_to_define has an override
    # for the static named by the 1st argument, in which case the override
    # is used instead.  Any different '#if' that is currently open is closed
    # first; if the right one is already open, nothing is output.

    if (exists $exceptions_to_where_to_define{$name}) {
        $new_pound_if = $exceptions_to_where_to_define{$name};
    }

    # Exit current #if if the new one is different from the old.  The symbol
    # is looked for as a literal substring of the current condition, rather
    # than being interpolated into a pattern, so that any regex
    # metacharacters in it can't be misinterpreted.
    if (   $in_file_pound_if
        && index($in_file_pound_if, $new_pound_if) < 0)
    {
        end_file_pound_if();
    }

    # Enter new #if, if not already in it.
    if (! $in_file_pound_if) {
        $in_file_pound_if = "defined($new_pound_if)";
        print $out_fh "\n#if $in_file_pound_if\n";
    }
}
224
sub output_invlist ($$;$) {
    my ($name, $invlist, $charset) = @_;
    $charset //= "";    # name of character set for comment

    # Output the inversion list referred to by $invlist as the C array
    # ${name}_invlist, in the exact internal form for inversion lists: a
    # three-element header (element count, the version/data-structure magic
    # number, and the offset flag) followed by the elements themselves in
    # hex.
    #
    # Note that a list that doesn't start at 0 has a 0 prepended to it, and
    # this is done to the caller's array, not to a copy.

    if (! defined $invlist || ref $invlist ne 'ARRAY') {
        die "No inversion list for $name";
    }

    # Is the last element of the header 0, or 1 ?
    my $offset_flag = 0;
    if (@$invlist && $invlist->[0] != 0) {
        unshift @$invlist, 0;
        $offset_flag = 1;
    }
    my $element_count = @$invlist;

    switch_pound_if ($name, 'PERL_IN_PERL_C');

    # The declaration, annotated with the character set if there is one
    print {$out_fh} "\nstatic const UV ${name}_invlist[] = {";
    print {$out_fh} " /* for $charset */" if $charset;
    print {$out_fh} "\n";

    # The header
    print {$out_fh} "\t$element_count,\t/* Number of elements */\n";
    print {$out_fh} "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
    print {$out_fh} "\t", $offset_flag,
                    ",\t/* 0 if the list starts at 0;",
                    "\n\t\t 1 if it starts at the element beyond 0 */\n";

    # The main body are the UVs passed in to this routine, one per line,
    # with a comma after each but the last.
    if (@$invlist) {
        print {$out_fh} join(",\n", map { sprintf "\t0x%X", $_ } @$invlist),
                        "\n";
    }

    print {$out_fh} "};\n";
}
266
99f21fb9
KW
sub output_invmap ($$$$$$$) {
    my $name = shift;
    my $invmap = shift;     # Reference to inversion map array
    my $prop_name = shift;
    my $input_format = shift;   # The inversion map's format
    my $default = shift;        # The property value for code points who
                                # otherwise don't have a value specified.
    my $extra_enums = shift;    # comma-separated list of our additions to the
                                # property's standard possible values
    my $charset = shift // "";  # name of character set for comment

    # Output the inversion map $invmap for property $prop_name, but use $name
    # as the actual data structure's name.
    #
    # Two things are output: a C enum typedef (plus an ENUM_COUNT #define)
    # naming every value the property can have, and then the array
    # ${name}_invmap of those enum constants.  Only the 's' input format is
    # handled; anything else is fatal.
    #
    # As a side effect, the first time this is called for each of _Perl_GCB,
    # _Perl_LB and _Perl_WB, the corresponding file-level %..._enums,
    # @..._short_enums and %..._abbreviations are populated.

    # Validate before the first dereference, so that a missing map gets this
    # message rather than a generic 'not an ARRAY reference' one, and so that
    # nothing is output for a map that turns out to be unusable.
    die "No inversion map for $prop_name" unless defined $invmap
                                              && ref $invmap eq 'ARRAY'
                                              && @$invmap;
    my $count = @$invmap;

    my $declaration_type;
    my %enums;
    my $name_prefix;

    if ($input_format eq 's') {
        my $orig_prop_name = $prop_name;
        $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
        my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
        my @enums;
        if ($orig_prop_name eq $prop_name) {
            @enums = prop_values($prop_name);
        }
        else {
            @enums = uniques(@$invmap);
        }

        if (! @enums) {
            die "Only enum properties are currently handled; '$prop_name' isn't one";
        }
        else {

            # A property with no entry in %hard_coded_enums simply has no
            # expected values.  (Without the '// []', dereferencing the
            # missing entry is fatal under 'use strict', so the test just
            # below could never see an empty list.)
            my @expected_enums = @{$hard_coded_enums{lc $short_name} // []};
            my @canonical_input_enums;
            if (@expected_enums) {
                if (@expected_enums < @enums) {
                    die 'You need to update %hard_coded_enums to reflect new'
                      . " entries in this Unicode version\n"
                      . "Expected: " . join(", ", sort @expected_enums) . "\n"
                      . " Got: " . join(", ", sort @enums);
                }

                if (! defined prop_aliases($prop_name)) {

                    # Convert the input enums into canonical form and
                    # save for use below
                    @canonical_input_enums = map { lc ($_ =~ s/_//gr) }
                                                                     @enums;
                }
                @enums = sort @expected_enums;
            }

            # The internal enums come last, and in the order specified
            my @extras;
            if ($extra_enums ne "") {
                @extras = split /,/, $extra_enums;
                push @enums, @extras;
            }

            # Assign a value to each element of the enum.  The default
            # value always gets 0; the others are arbitrarily assigned.
            my $enum_val = 0;
            my $canonical_default = prop_value_aliases($prop_name, $default);
            $default = $canonical_default if defined $canonical_default;
            $enums{$default} = $enum_val++;
            for my $enum (@enums) {
                $enums{$enum} = $enum_val++ unless exists $enums{$enum};
            }

            # Calculate the enum values for certain properties like
            # _Perl_GCB and _Perl_LB, because we output special tables for
            # them.
            if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {

                # We use string evals to allow the same code to work on
                # all tables we're doing.
                my $type = lc $prop_name;

                # We use lowercase single letter names for any property
                # values not in the release of Unicode being compiled now.
                my $placeholder = "a";

                # Skip if we've already done this code, which populated
                # this hash
                if (eval "! \%${type}_enums") {

                    # For each enum ...
                    foreach my $enum (sort keys %enums) {
                        my $value = $enums{$enum};
                        my $short;
                        my $abbreviated_from;

                        # Special case this wb property value to make the
                        # name more clear
                        if ($enum eq 'Perl_Tailored_HSpace') {
                            $short = 'hs';
                            $abbreviated_from = $enum;
                        }
                        elsif (grep { $_ eq $enum } @extras) {

                            # The 'short' name for one of the property
                            # values added by this file is just the
                            # lowercase of it
                            $short = lc $enum;
                        }
                        elsif (grep {$_ eq lc ( $enum =~ s/_//gr) }
                                                        @canonical_input_enums)
                        {   # On Unicode versions that predate the
                            # official property, we have set up this array
                            # to be the canonical form of each enum in the
                            # substitute property.  If the enum we're
                            # looking at is canonically the same as one of
                            # these, use its name instead of generating a
                            # placeholder one in the next clause (which
                            # will happen because prop_value_aliases()
                            # will fail because it only works on official
                            # properties)
                            $short = $enum;
                        }
                        else {
                            # Use the official short name for the other
                            # property values, which should all be
                            # official ones.
                            ($short) = prop_value_aliases($type, $enum);

                            # But create a placeholder for ones not in
                            # this Unicode version.
                            $short = $placeholder++ unless defined $short;
                        }

                        # If our short name is too long, or we already
                        # know that the name is an abbreviation, truncate
                        # to make sure it's short enough, and remember
                        # that we did this so we can later place in a
                        # comment in the generated file
                        if (   $abbreviated_from
                            || length $short > $max_hdr_len)
                        {
                            $short = substr($short, 0, $max_hdr_len);
                            $abbreviated_from = $enum
                                                unless $abbreviated_from;

                            # If the name we are to display conflicts, try
                            # another.
                            while (eval "exists \$${type}_abbreviations{$short}")
                            {
                                $short++;
                            }

                            # A failed eval returns false, which ends the
                            # loop above, so that is only detectable here,
                            # afterwards.
                            die $@ if $@;

                            eval "\$${type}_abbreviations{$short} = '$enum'";
                            die $@ if $@;
                        }

                        # Remember the mapping from the property value
                        # (enum) name to its value.
                        eval "\$${type}_enums{$enum} = $value";
                        die $@ if $@;

                        # Remember the inverse mapping to the short name
                        # so that we can properly label the generated
                        # table's rows and columns
                        eval "\$${type}_short_enums[$value] = '$short'";
                        die $@ if $@;
                    }
                }
            }
        }

        # Inversion map stuff is currently used only by regexec
        switch_pound_if($name, 'PERL_IN_REGEXEC_C');
        {

            # The short names tend to be two lower case letters, but it looks
            # better for those if they are upper. XXX
            $short_name = uc($short_name) if length($short_name) < 3
                                || substr($short_name, 0, 1) =~ /[[:lower:]]/;
            $name_prefix = "${short_name}_";
            my $enum_count = keys %enums;
            print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", $enum_count, "\n";

            # Output the enum's members in order of their values
            print $out_fh "\ntypedef enum {\n";
            my @enum_list;
            foreach my $enum (keys %enums) {
                $enum_list[$enums{$enum}] = $enum;
            }
            foreach my $i (0 .. @enum_list - 1) {
                my $enum_name = $enum_list[$i];
                print $out_fh "\t${name_prefix}$enum_name = $i";
                print $out_fh "," if $i < $enum_count - 1;
                print $out_fh "\n";
            }
            $declaration_type = "${name_prefix}enum";
            print $out_fh "} $declaration_type;\n";
        }
    }
    else {
        die "'$input_format' invmap() format for '$prop_name' unimplemented";
    }

    print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    # The main body are the scalars passed in to this routine.  Each is
    # output as its full property value name (when there is one), prefixed
    # so as to match the enum constants output above.
    for my $i (0 .. $count - 1) {
        my $element = $invmap->[$i];
        my $full_element_name = prop_value_aliases($prop_name, $element);
        $element = $full_element_name if defined $full_element_name;
        $element = $name_prefix . $element;
        print $out_fh "\t$element";
        print $out_fh "," if $i < $count - 1;
        print $out_fh "\n";
    }
    print $out_fh "};\n";
}
494
sub mk_invlist_from_sorted_cp_list {

    # Returns an inversion list constructed from the sorted input array of
    # code points.  The single parameter is a reference to that array, which
    # must be in ascending numeric order.  Nothing (the empty list) is
    # returned for an empty input.
    #
    # The result alternates between the first code point of a range that is
    # in the list, and the first code point past the end of that range.

    my $list_ref = shift;

    return unless @$list_ref;

    # Initialize to just the first element
    my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);

    # For each succeeding element, if it extends the previous range, adjust
    # up, otherwise add it.
    for my $i (1 .. @$list_ref - 1) {
        my $cp = $list_ref->[$i];

        # A repeat of a code point is already covered by the current range.
        # Adding it again would yield a list that isn't monotonically
        # increasing, and hence not a valid inversion list.
        next if $cp < $invlist[-1];

        if ($invlist[-1] == $cp) {
            $invlist[-1]++;
        }
        else {
            push @invlist, $cp, $cp + 1;
        }
    }
    return @invlist;
}
519
520# Read in the Case Folding rules, and construct arrays of code points for the
521# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al'
                                                           || $format eq 'a';

# The code points that begin a range whose fold is to multiple characters
my @has_multi_char_fold;

# Every code point that occurs anywhere but last in some multi-character
# fold, each listed once, in the order first encountered
my @is_non_final_fold;

{
    # Tracks what is already in @is_non_final_fold, so that finding
    # duplicates doesn't require rescanning that array for every candidate
    my %seen_non_final;

    for my $i (0 .. @$folds_ref - 1) {
        next unless ref $folds_ref->[$i];  # Skip single-char folds
        push @has_multi_char_fold, $cp_ref->[$i];

        # Add to the non-finals list each code point that is in a non-final
        # position
        my @non_finals = @{$folds_ref->[$i]};
        pop @non_finals;
        push @is_non_final_fold, grep { ! $seen_non_final{$_}++ } @non_finals;
    }
}
541
a02047bf
KW
sub _Perl_Non_Final_Folds {
    # Returns the inversion list of the code points in @is_non_final_fold.
    # That array is left sorted into ascending numeric order.

    my @ordered = sort { $a <=> $b } @is_non_final_fold;
    @is_non_final_fold = @ordered;

    return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
}
546
99f21fb9
KW
sub prop_name_for_cmp ($) { # Sort helper
    my ($name) = @_;

    # Returns a key for comparing property names: the input with the first
    # comma and what follows it chopped off, then every non-alphabetic
    # character removed, and finally lowercased.

    return lc( $name =~ s/,.*//r =~ s/[[:^alpha:]]//gr );
}
557
sub UpperLatin1 {
    # Returns the inversion list for the code points 128 through 255

    my @upper_half = (128 .. 255);
    return mk_invlist_from_sorted_cp_list(\@upper_half);
}
561
289ce9cc
KW
sub output_table_common {

    # Common subroutine to actually output the generated rules table.
    #
    # Parameters:
    #   $property                 name used for the C array, which is called
    #                             ${property}_table
    #   $table_value_defines_ref  undef, or a reference to a hash of names
    #                             to numeric values, each to be output as a
    #                             #define
    #   $table_ref                reference to the square, two-dimensional
    #                             table to output
    #   $names_ref                reference to the array of row/column
    #                             headings, indexed like the table is
    #   $abbreviations_ref        reference to a hash mapping each heading
    #                             that is an abbreviation to what it stands
    #                             for

    my ($property,
        $table_value_defines_ref,
        $table_ref,
        $names_ref,
        $abbreviations_ref) = @_;
    my $size = @$table_ref;

    # Output the #define list, sorted by numeric value
    if ($table_value_defines_ref) {
        my $max_name_length = 0;
        my @defines;

        # Put in order, and at the same time find the longest name
        while (my ($enum, $value) = each %$table_value_defines_ref) {
            $defines[$value] = $enum;

            my $length = length $enum;
            $max_name_length = $length if $length > $max_name_length;
        }

        print $out_fh "\n";

        # Output, so that the values are vertically aligned in a column after
        # the longest name
        foreach my $i (0 .. @defines - 1) {
            next unless defined $defines[$i];
            printf $out_fh "#define %-*s %2d\n",
                                      $max_name_length,
                                      $defines[$i],
                                      $i;
        }
    }

    my $column_width = 2;   # We currently allow 2 digits for the number

    # If the maximum value in the table is 1, it can be a bool.  (Being above
    # a U8 is not currently handled.)
    my $max_element = 0;
    for my $i (0 .. $size - 1) {
        for my $j (0 .. $size - 1) {
            next if $max_element >= $table_ref->[$i][$j];
            $max_element = $table_ref->[$i][$j];
        }
    }
    die "Need wider table column width given '$max_element'"
                                if length($max_element) > $column_width;

    my $table_type = ($max_element == 1)
                     ? 'bool'
                     : 'U8';

    # If a name is longer than the width set aside for a column, its column
    # needs to have increased spacing so that the name doesn't get truncated
    # nor run into an adjacent column
    my @spacers;

    # If we are being compiled on a Unicode version earlier than that which
    # this file was designed for, it may be that some of the property values
    # aren't in the current release, and so would be undefined if we didn't
    # define them ourselves.  Earlier code has done this, making them
    # lowercase characters of length one.  We look to see if any exist, so
    # that we can add an annotation to the output table
    my $has_placeholder = 0;

    for my $i (0 .. $size - 1) {

        # A name narrower than the column yields a negative repeat count
        # below; that gives the empty string, which is what is wanted.
        no warnings 'numeric';
        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
        $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
    }

    print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";

    # Calculate the column heading line
    my $header_line = "/* "
                    . (" " x $max_hdr_len)  # We let the row heading meld to
                                            # the '*/' for those that are at
                                            # the max
                    . " " x 3;    # Space for '*/ '
    # Now each column
    for my $i (0 .. $size - 1) {
        $header_line .= sprintf "%s%*s",
                                $spacers[$i],
                                $column_width + 1, # 1 for the ','
                                $names_ref->[$i];
    }
    $header_line .= " */\n";

    # If we have annotations, output it now.
    if ($has_placeholder || scalar %$abbreviations_ref) {
        my $text = "";
        foreach my $abbr (sort keys %$abbreviations_ref) {
            $text .= "; " if $text;
            $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
        }
        if ($has_placeholder) {
            $text .= "; other " if $text;
            $text .= "lowercase names are placeholders for"
                  .  " property values not defined until a later Unicode"
                  .  " release, so are irrelevant in this one, as they are"
                  .  " not assigned to any code points";
        }

        my $indent = " " x 3;
        $text = $indent . "/* $text */";

        # Both the first line's "/* " and the continuation lines' " * "
        # lead-ins end with a blank at this offset.  Wrapping at it, or
        # before, would output a line with no text and leave just as much
        # still to do.
        my $last_prefix_blank = length($indent) + 2;

        # Wrap the text so that it is no wider than the table, which the
        # header line gives.
        my $output_width = length $header_line;
        while (length($text) > $output_width) {

            # Find the first blank back from the right end to wrap at.
            my $wrap_at = rindex($text, " ", $output_width - 1);

            # A word too long to fit is output overlong, below, instead of
            # looping here forever looking for a place to split it.
            last if $wrap_at <= $last_prefix_blank;

            print $out_fh substr($text, 0, $wrap_at), "\n";

            # Set so will look at just the remaining tail (which will
            # be indented and have a '*' after the indent
            $text = $indent . " * " . substr($text, $wrap_at + 1);
        }

        # And any remaining
        print $out_fh $text, "\n" if $text;
    }

    # We calculated the header line earlier just to get its width so that we
    # could make sure the annotations fit into that.
    print $out_fh $header_line;

    # Now output the bulk of the table.
    for my $i (0 .. $size - 1) {

        # First the row heading.
        printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
        print $out_fh "{";  # Then the brace for this row

        # Then each column
        for my $j (0 .. $size -1) {
            print $out_fh $spacers[$j];
            printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
            print $out_fh "," if $j < $size - 1;
        }
        print $out_fh " }";
        print $out_fh "," if $i < $size - 1;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
718
973a28ed
KW
sub output_GCB_table() {

    # Create and output the pair table for use in determining Grapheme Cluster
    # Breaks, given in http://www.unicode.org/reports/tr29/.
    #
    # Each cell is indexed [class before][class after], and is 1 if there is
    # a break between the two, 0 if not.  The enum numbers come from
    # %gcb_enums, and the table has one row and column per entry in
    # @gcb_short_enums.

    # The rules are applied starting with the last and working up to the
    # first, so that a lower-numbered, higher priority one overwrites
    # whatever a later one put in a cell, as the algorithm stops at the
    # earliest matching rule

    my @gcb_table;
    my $table_size = @gcb_short_enums;

    # Otherwise, break everywhere.
    # GB10     Any ÷  Any
    for my $row (0 .. $table_size - 1) {
        $gcb_table[$row] = [ (1) x $table_size ];
    }

    # Do not break before extending characters.
    # Do not break before SpacingMarks, or after Prepend characters.
    # GB9       ×  Extend
    # GB9a      ×  SpacingMark
    # GB9b     Prepend  ×
    for my $i (0 .. $#gcb_table) {
        $gcb_table[$i][$gcb_enums{'Extend'}] = 0;
        $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0;
        $gcb_table[$gcb_enums{'Prepend'}][$i] = 0;
    }

    # Next come the rules that forbid a break between two specific classes,
    # each given as [ before, after ]
    my @no_break_pairs = (

        # Do not break between regional indicator symbols.
        # GB8a     Regional_Indicator  ×  Regional_Indicator
        [ 'Regional_Indicator', 'Regional_Indicator' ],

        # Do not break Hangul syllable sequences.
        # GB8      ( LVT | T)  ×  T
        [ 'LVT', 'T' ],
        [ 'T',   'T' ],

        # GB7      ( LV | V )  ×  ( V | T )
        [ 'LV', 'V' ],
        [ 'LV', 'T' ],
        [ 'V',  'V' ],
        [ 'V',  'T' ],

        # GB6      L  ×  ( L | V | LV | LVT )
        [ 'L', 'L'   ],
        [ 'L', 'V'   ],
        [ 'L', 'LV'  ],
        [ 'L', 'LVT' ],
    );
    for my $pair (@no_break_pairs) {
        my ($before, $after) = @$pair;
        $gcb_table[$gcb_enums{$before}][$gcb_enums{$after}] = 0;
    }

    # Do not break between a CR and LF.  Otherwise, break before and after
    # controls.
    # GB5       ÷  ( Control | CR | LF )
    # GB4      ( Control | CR | LF )  ÷
    my @controls = ('Control', 'CR', 'LF');
    for my $i (0 .. $#gcb_table) {
        for my $control (@controls) {
            $gcb_table[$i][$gcb_enums{$control}] = 1;
        }
        for my $control (@controls) {
            $gcb_table[$gcb_enums{$control}][$i] = 1;
        }
    }

    # GB3      CR  ×  LF
    $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;

    # Break at the start and end of text.
    # GB1      sot  ÷
    # GB2       ÷  eot
    for my $i (0 .. $#gcb_table) {
        $gcb_table[$i][$gcb_enums{'EDGE'}] = 1;
        $gcb_table[$gcb_enums{'EDGE'}][$i] = 1;
    }

    # But, unspecified by Unicode, we shouldn't break on an empty string.
    $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0;

    output_table_common('GCB', undef,
                        \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
}
802
6b659339
KW
803sub output_LB_table() {
804
805 # Create and output the enums, #defines, and pair table for use in
806 # determining Line Breaks. This uses the default line break algorithm,
807 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
808 # in that page, as the Unicode-furnished tests assume that tailoring.
809
6b659339
KW
810 # The result is really just true or false. But we follow along with tr14,
811 # creating a rule which is false for something like X SP* X. That gets
812 # encoding 2. The rest of the actions are synthetic ones that indicate
813 # some context handling is required. These each are added to the
814 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
815 # value can be retrieved. Actually only rules from 7 through 18 (which
816 # are the ones where space matter) are possible to have 2 added to them.
817 # The others below add just 0 or 1. It might be possible for one
818 # synthetic rule to be added to another, yielding a larger value. This
819 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
820 # names of the middle grouping below, it is impossible for that to occur
821 # for them because they all start with mutually exclusive classes. That
822 # the final rule can't be added to any of the others isn't obvious from
823 # its name, so it is assigned a power of 2 higher than the others can get
824 # to so any addition would preserve all data. (And the code will reach an
825 # assert(0) on debugging builds should this happen.)
826 my %lb_actions = (
827 LB_NOBREAK => 0,
828 LB_BREAKABLE => 1,
829 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
830
831 LB_CM_foo => 3, # Rule 9
832 LB_SP_foo => 6, # Rule 18
833 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
834 LB_SY_or_IS_then_various => 11, # Rule 25
835 LB_HY_or_BA_then_foo => 13, # Rule 21
836
837 LB_various_then_PO_or_PR => (1<<4), # Rule 25
838 );
839
6b659339
KW
840 # Construct the LB pair table. This is based on the rules in
841 # http://www.unicode.org/reports/tr14/, but modified as those rules are
842 # designed for someone taking a string of text and sequentially going
843 # through it to find the break opportunities, whereas, Perl requires
844 # determining if a given random spot is a break opportunity, without
845 # knowing all the entire string before it.
846 #
847 # The table is constructed in reverse order of the rules, to make the
848 # lower-numbered, higher priority ones override the later ones, as the
849 # algorithm stops at the earliest matching rule
850
851 my @lb_table;
852 my $table_size = @lb_short_enums;
853
854 # LB31. Break everywhere else
855 for my $i (0 .. $table_size - 1) {
856 for my $j (0 .. $table_size - 1) {
857 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
858 }
859 }
860
861 # LB30a. Don't break between Regional Indicators
289ce9cc
KW
862 $lb_table[$lb_enums{'Regional_Indicator'}]
863 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
864
865 # LB30 Do not break between letters, numbers, or ordinary symbols and
866 # opening or closing parentheses.
867 # (AL | HL | NU) × OP
289ce9cc
KW
868 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
869 = $lb_actions{'LB_NOBREAK'};
870 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
871 = $lb_actions{'LB_NOBREAK'};
872 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
873 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
874
875 # CP × (AL | HL | NU)
289ce9cc
KW
876 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
877 = $lb_actions{'LB_NOBREAK'};
878 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
879 = $lb_actions{'LB_NOBREAK'};
880 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
881 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
882
883 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
884 # IS × (AL | HL)
289ce9cc
KW
885 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
886 = $lb_actions{'LB_NOBREAK'};
887 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
888 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
889
890 # LB28 Do not break between alphabetics (“at”).
891 # (AL | HL) × (AL | HL)
289ce9cc
KW
892 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
893 = $lb_actions{'LB_NOBREAK'};
894 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
895 = $lb_actions{'LB_NOBREAK'};
896 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
897 = $lb_actions{'LB_NOBREAK'};
898 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
899 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
900
901 # LB27 Treat a Korean Syllable Block the same as ID.
902 # (JL | JV | JT | H2 | H3) × IN
289ce9cc
KW
903 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
904 = $lb_actions{'LB_NOBREAK'};
905 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
906 = $lb_actions{'LB_NOBREAK'};
907 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
908 = $lb_actions{'LB_NOBREAK'};
909 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
910 = $lb_actions{'LB_NOBREAK'};
911 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
912 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
913
914 # (JL | JV | JT | H2 | H3) × PO
289ce9cc
KW
915 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
916 = $lb_actions{'LB_NOBREAK'};
917 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
918 = $lb_actions{'LB_NOBREAK'};
919 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
920 = $lb_actions{'LB_NOBREAK'};
921 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
922 = $lb_actions{'LB_NOBREAK'};
923 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
924 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
925
926 # PR × (JL | JV | JT | H2 | H3)
289ce9cc
KW
927 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
928 = $lb_actions{'LB_NOBREAK'};
929 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
930 = $lb_actions{'LB_NOBREAK'};
931 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
932 = $lb_actions{'LB_NOBREAK'};
933 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
934 = $lb_actions{'LB_NOBREAK'};
935 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
936 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
937
938 # LB26 Do not break a Korean syllable.
939 # JL × (JL | JV | H2 | H3)
940 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
941 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
942 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
943 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
944
945 # (JV | H2) × (JV | JT)
946 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
947 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
948 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
949 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
950
951 # (JT | H3) × JT
952 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
953 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
954
955 # LB25 Do not break between the following pairs of classes relevant to
956 # numbers, as tailored by example 7 in
957 # http://www.unicode.org/reports/tr14/#Examples
958 # We follow that tailoring because Unicode's test cases expect it
959 # (PR | PO) × ( OP | HY )? NU
289ce9cc
KW
960 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
961 = $lb_actions{'LB_NOBREAK'};
962 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
963 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
964
965 # Given that (OP | HY )? is optional, we have to test for it in code.
966 # We add in the action (instead of overriding) for this, so that in
967 # the code we can recover the underlying break value.
289ce9cc 968 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 969 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 970 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 971 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 972 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339 973 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 974 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339
KW
975 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
976
977 # ( OP | HY ) × NU
289ce9cc
KW
978 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
979 = $lb_actions{'LB_NOBREAK'};
980 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
981 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
982
983 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
984 # which can be rewritten as:
985 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
289ce9cc
KW
986 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
987 = $lb_actions{'LB_NOBREAK'};
988 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
989 = $lb_actions{'LB_NOBREAK'};
990 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
991 = $lb_actions{'LB_NOBREAK'};
992 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
993 = $lb_actions{'LB_NOBREAK'};
994 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
995 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
996
997 # Like earlier where we have to test in code, we add in the action so
998 # that we can recover the underlying values. This is done in rules
999 # below, as well. The code assumes that we haven't added 2 actions.
1000 # Should a later Unicode release break that assumption, then tests
1001 # should start failing.
289ce9cc 1002 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
6b659339 1003 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1004 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
6b659339 1005 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1006 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
6b659339 1007 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1008 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
6b659339 1009 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1010 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
6b659339 1011 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1012 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
6b659339 1013 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1014 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
6b659339 1015 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1016 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
6b659339 1017 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1018 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
6b659339 1019 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1020 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
6b659339
KW
1021 += $lb_actions{'LB_SY_or_IS_then_various'};
1022
1023 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1024 # which can be rewritten as:
1025 # NU (SY | IS)* (CL | CP)? × (PO | PR)
289ce9cc
KW
1026 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1027 = $lb_actions{'LB_NOBREAK'};
1028 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1029 = $lb_actions{'LB_NOBREAK'};
6b659339 1030
289ce9cc 1031 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1032 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1033 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1034 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1035 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1036 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1037 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
6b659339
KW
1038 += $lb_actions{'LB_various_then_PO_or_PR'};
1039
289ce9cc 1040 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1041 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1042 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1043 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1044 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1045 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1046 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
6b659339
KW
1047 += $lb_actions{'LB_various_then_PO_or_PR'};
1048
1049 # LB24 Do not break between prefix and letters or ideographs.
1050 # PR × ID
289ce9cc
KW
1051 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1052 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1053
1054 # PR × (AL | HL)
289ce9cc
KW
1055 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1056 = $lb_actions{'LB_NOBREAK'};
1057 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1058 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1059
1060 # PO × (AL | HL)
289ce9cc
KW
1061 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1062 = $lb_actions{'LB_NOBREAK'};
1063 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1064 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1065
1066 # LB23 Do not break within ‘a9’, ‘3a’, or ‘H%’.
1067 # ID × PO
289ce9cc
KW
1068 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1069 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1070
1071 # (AL | HL) × NU
289ce9cc
KW
1072 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1073 = $lb_actions{'LB_NOBREAK'};
1074 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1075 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1076
1077 # NU × (AL | HL)
289ce9cc
KW
1078 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1079 = $lb_actions{'LB_NOBREAK'};
1080 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1081 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1082
1083 # LB22 Do not break between two ellipses, or between letters, numbers or
1084 # exclamations and ellipsis.
1085 # (AL | HL) × IN
289ce9cc
KW
1086 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1087 = $lb_actions{'LB_NOBREAK'};
1088 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1089 = $lb_actions{'LB_NOBREAK'};
6b659339 1090
289ce9cc
KW
1091 # Exclamation × IN
1092 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1093 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1094
1095 # ID × IN
289ce9cc
KW
1096 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1097 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1098
1099 # IN × IN
289ce9cc
KW
1100 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1101 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1102
1103 # NU × IN
289ce9cc
KW
1104 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1105 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1106
1107 # LB21b Don’t break between Solidus and Hebrew letters.
1108 # SY × HL
289ce9cc
KW
1109 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1110 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1111
1112 # LB21a Don't break after Hebrew + Hyphen.
1113 # HL (HY | BA) ×
1114 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1115 $lb_table[$lb_enums{'Hyphen'}][$i]
1116 += $lb_actions{'LB_HY_or_BA_then_foo'};
1117 $lb_table[$lb_enums{'Break_After'}][$i]
1118 += $lb_actions{'LB_HY_or_BA_then_foo'};
6b659339
KW
1119 }
1120
1121 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1122 # spaces, small kana, and other non-starters, or after acute accents.
1123 # × BA
1124 # × HY
1125 # × NS
1126 # BB ×
1127 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1128 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1129 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1130 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1131 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1132 }
1133
1134 # LB20 Break before and after unresolved CB.
1135 # ÷ CB
1136 # CB ÷
1137 # Conditional breaks should be resolved external to the line breaking
1138 # rules. However, the default action is to treat unresolved CB as breaking
1139 # before and after.
1140 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1141 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1142 = $lb_actions{'LB_BREAKABLE'};
1143 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1144 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1145 }
1146
1147 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1148 # × QU
1149 # QU ×
1150 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1151 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1152 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1153 }
1154
1155 # LB18 Break after spaces
1156 # SP ÷
1157 for my $i (0 .. @lb_table - 1) {
289ce9cc 1158 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1159 }
1160
1161 # LB17 Do not break within ‘——’, even with intervening spaces.
1162 # B2 SP* × B2
289ce9cc 1163 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
6b659339
KW
1164 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1165
1166 # LB16 Do not break between closing punctuation and a nonstarter even with
1167 # intervening spaces.
1168 # (CL | CP) SP* × NS
289ce9cc 1169 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
6b659339 1170 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1171 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
6b659339
KW
1172 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1173
1174
1175 # LB15 Do not break within ‘”[’, even with intervening spaces.
1176 # QU SP* × OP
289ce9cc 1177 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
6b659339
KW
1178 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1179
1180 # LB14 Do not break after ‘[’, even after spaces.
1181 # OP SP* ×
1182 for my $i (0 .. @lb_table - 1) {
289ce9cc 1183 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
6b659339
KW
1184 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1185 }
1186
1187 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1188 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1189 # [^NU] × CL
1190 # [^NU] × CP
1191 # × EX
1192 # [^NU] × IS
1193 # [^NU] × SY
1194 for my $i (0 .. @lb_table - 1) {
289ce9cc 1195 $lb_table[$i][$lb_enums{'Exclamation'}]
6b659339
KW
1196 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1197
289ce9cc 1198 next if $i == $lb_enums{'Numeric'};
6b659339 1199
289ce9cc 1200 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
6b659339 1201 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1202 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
6b659339 1203 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1204 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
6b659339 1205 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1206 $lb_table[$i][$lb_enums{'Break_Symbols'}]
6b659339
KW
1207 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1208 }
1209
1210 # LB12a Do not break before NBSP and related characters, except after
1211 # spaces and hyphens.
1212 # [^SP BA HY] × GL
1213 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1214 next if $i == $lb_enums{'Space'}
1215 || $i == $lb_enums{'Break_After'}
1216 || $i == $lb_enums{'Hyphen'};
6b659339
KW
1217
1218 # We don't break, but if a property above has said don't break even
1219 # with space between, don't override that (also in the next few rules)
289ce9cc 1220 next if $lb_table[$i][$lb_enums{'Glue'}]
6b659339 1221 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1222 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1223 }
1224
1225 # LB12 Do not break after NBSP and related characters.
1226 # GL ×
1227 for my $i (0 .. @lb_table - 1) {
289ce9cc 1228 next if $lb_table[$lb_enums{'Glue'}][$i]
6b659339 1229 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1230 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1231 }
1232
1233 # LB11 Do not break before or after Word joiner and related characters.
1234 # × WJ
1235 # WJ ×
1236 for my $i (0 .. @lb_table - 1) {
289ce9cc 1237 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
6b659339
KW
1238 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1239 {
289ce9cc 1240 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
6b659339 1241 }
289ce9cc 1242 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
6b659339
KW
1243 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1244 {
289ce9cc 1245 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1246 }
1247 }
1248
1249 # Special case this here to avoid having to do a special case in the code,
1250 # by making this the same as other things with a SP in front of them that
1251 # don't break, we avoid an extra test
289ce9cc 1252 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
6b659339
KW
1253 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1254
1255 # LB9 and LB10 are done in the same loop
1256 #
1257 # LB9 Do not break a combining character sequence; treat it as if it has
1258 # the line breaking class of the base character in all of the
1259 # higher-numbered rules.
1260 # Treat X CM* as if it were X.
1261 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1262
1263 # LB10 Treat any remaining combining mark as AL. This catches the case
1264 # where a CM is the first character on the line or follows SP, BK, CR, LF,
1265 # NL, or ZW.
1266 for my $i (0 .. @lb_table - 1) {
1267
1268 # When the CM is the first in the pair, we don't know without looking
1269 # behind whether the CM is going to inherit from an earlier character,
1270 # or not. So have to figure this out in the code
289ce9cc
KW
1271 $lb_table[$lb_enums{'Combining_Mark'}][$i] = $lb_actions{'LB_CM_foo'};
1272
1273 if ( $i == $lb_enums{'Mandatory_Break'}
1274 || $i == $lb_enums{'EDGE'}
1275 || $i == $lb_enums{'Carriage_Return'}
1276 || $i == $lb_enums{'Line_Feed'}
1277 || $i == $lb_enums{'Next_Line'}
1278 || $i == $lb_enums{'Space'}
1279 || $i == $lb_enums{'ZWSpace'})
6b659339
KW
1280 {
1281 # For these classes, a following CM doesn't combine, and should do
289ce9cc
KW
1282 # whatever 'Alphabetic' would do.
1283 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1284 = $lb_table[$i][$lb_enums{'Alphabetic'}];
6b659339
KW
1285 }
1286 else {
1287 # For these classes, the CM combines, so doesn't break, inheriting
1288 # the type of nobreak from the master character.
289ce9cc 1289 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
6b659339
KW
1290 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1291 {
289ce9cc
KW
1292 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1293 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1294 }
1295 }
1296 }
1297
1298 # LB8 Break before any character following a zero-width space, even if one
1299 # or more spaces intervene.
1300 # ZW SP* ÷
1301 for my $i (0 .. @lb_table - 1) {
289ce9cc 1302 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1303 }
1304
1305 # Because of LB8-10, we need to look at context for "SP x", and this must
1306 # be done in the code. So override the existing rules for that, by adding
1307 # a constant to get new rules that tell the code it needs to look at
1308 # context. By adding this action instead of replacing the existing one,
1309 # we can get back to the original rule if necessary.
1310 for my $i (0 .. @lb_table - 1) {
289ce9cc 1311 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
6b659339
KW
1312 }
1313
1314 # LB7 Do not break before spaces or zero width space.
1315 # × SP
1316 # × ZW
1317 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1318 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1319 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1320 }
1321
1322 # LB6 Do not break before hard line breaks.
1323 # × ( BK | CR | LF | NL )
1324 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1325 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1326 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1327 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1328 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1329 }
1330
1331 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1332 # CR × LF
1333 # CR !
1334 # LF !
1335 # NL !
1336 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1337 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1338 = $lb_actions{'LB_BREAKABLE'};
1339 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1340 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339 1341 }
289ce9cc
KW
1342 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1343 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1344
1345 # LB4 Always break after hard line breaks.
1346 # BK !
1347 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1348 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1349 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1350 }
1351
1352 # LB2 Never break at the start of text.
1353 # sot ×
1354 # LB3 Always break at the end of text.
1355 # ! eot
1356 # but these are reversed in the loop below, so that won't break if there
1357 # is no text
1358 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1359 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1360 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1361 }
1362
1363 # LB1 Assign a line breaking class to each code point of the input.
1364 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1365 # depending on criteria outside the scope of this algorithm.
1366 #
1367 # In the absence of such criteria all characters with a specific
1368 # combination of original class and General_Category property value are
1369 # resolved as follows:
1370 # Original Resolved General_Category
1371 # AI, SG, XX AL Any
1372 # SA CM Only Mn or Mc
1373 # SA AL Any except Mn and Mc
1374 # CJ NS Any
1375 #
1376 # This is done in mktables, so we never see any of the remapped-from
1377 # classes.
1378
289ce9cc
KW
1379 output_table_common('LB', \%lb_actions,
1380 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
6b659339
KW
1381}
1382
7e54b87f
KW
sub output_WB_table() {

    # Build the pair table used to determine Word Breaks (the rules are given
    # in http://www.unicode.org/reports/tr29/), and pass it, along with the
    # action values and enum names, to output_table_common() for output.
    #
    # The mechanism is the same one used for the other bounds tables this
    # file generates.  An action that could override a 0 or a 1 is *added* to
    # that number, so the underlying value remains recoverable; an action
    # that clearly doesn't depend on the underlying rule simply *overwrites*
    # the cell.
    my %wb_actions = (
        WB_NOBREAK                      => 0,
        WB_BREAKABLE                    => 1,
        WB_hs_then_hs                   => 2,
        WB_Ex_or_FO_then_foo            => 3,
        WB_DQ_then_HL                   => 4,
        WB_HL_then_DQ                   => 6,
        WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
        WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
        WB_MB_or_MN_or_SQ_then_NU       => 12,
        WB_NU_then_MB_or_MN_or_SQ       => 14,
    );

    # The pair table itself, indexed [class before][class after].
    my @pairs;

    # One less than the number of enums, because UNKNOWN isn't used
    my $dimension = @wb_short_enums - 1;

    # Groups of class names that recur in the rules below
    my @letters     = qw(ALetter Hebrew_Letter);
    my @mid_num     = qw(MidNumLet MidNum Single_Quote);
    my @mid_letter  = qw(MidNumLet MidLetter Single_Quote);
    my @white_space = qw(CR LF Newline Perl_Tailored_HSpace);

    # Overwrite, with the named action, every cell formed by pairing a class
    # from the first list (before the boundary) with one from the second
    # (after it).
    my $set = sub {
        my ($action, $befores, $afters) = @_;
        for my $before (@$befores) {
            for my $after (@$afters) {
                $pairs[$wb_enums{$before}][$wb_enums{$after}]
                                                    = $wb_actions{$action};
            }
        }
    };

    # Same pairing as $set, but adds the named action to what is already in
    # each cell instead of replacing it.
    my $add = sub {
        my ($action, $befores, $afters) = @_;
        for my $before (@$befores) {
            for my $after (@$afters) {
                $pairs[$wb_enums{$before}][$wb_enums{$after}]
                                                    += $wb_actions{$action};
            }
        }
    };

    # The rules are applied in reverse order, so that the lower-numbered,
    # higher priority ones override the later ones, as the algorithm stops at
    # the earliest matching rule.

    # Otherwise, break everywhere (including around ideographs).
    # WB14  Any  ÷  Any
    for my $row (0 .. $dimension - 1) {
        $pairs[$row] = [ ($wb_actions{'WB_BREAKABLE'}) x $dimension ];
    }

    # Do not break between regional indicator symbols.
    # WB13c  Regional_Indicator  ×  Regional_Indicator
    $set->('WB_NOBREAK', [ 'Regional_Indicator' ], [ 'Regional_Indicator' ]);

    # Do not break from extenders.
    # WB13b  ExtendNumLet  ×  (ALetter | Hebrew_Letter | Numeric | Katakana)
    $set->('WB_NOBREAK', [ 'ExtendNumLet' ],
                         [ @letters, 'Numeric', 'Katakana' ]);

    # WB13a  (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
    #        × ExtendNumLet
    $set->('WB_NOBREAK', [ @letters, 'Numeric', 'Katakana', 'ExtendNumLet' ],
                         [ 'ExtendNumLet' ]);

    # Do not break between Katakana.
    # WB13  Katakana  ×  Katakana
    $set->('WB_NOBREAK', [ 'Katakana' ], [ 'Katakana' ]);

    # Do not break within sequences, such as “3.2” or “3,456.789”.
    # WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric
    $add->('WB_NU_then_MB_or_MN_or_SQ', [ 'Numeric' ], \@mid_num);

    # WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
    $add->('WB_MB_or_MN_or_SQ_then_NU', \@mid_num, [ 'Numeric' ]);

    # Do not break within sequences of digits, or digits adjacent to letters
    # (“3a”, or “A3”).
    # WB10  Numeric  ×  (ALetter | Hebrew_Letter)
    $set->('WB_NOBREAK', [ 'Numeric' ], \@letters);

    # WB9  (ALetter | Hebrew_Letter)  ×  Numeric
    $set->('WB_NOBREAK', \@letters, [ 'Numeric' ]);

    # WB8  Numeric  ×  Numeric
    $set->('WB_NOBREAK', [ 'Numeric' ], [ 'Numeric' ]);

    # Do not break letters across certain punctuation.
    # WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter
    $add->('WB_DQ_then_HL', [ 'Double_Quote' ], [ 'Hebrew_Letter' ]);

    # WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter
    $add->('WB_HL_then_DQ', [ 'Hebrew_Letter' ], [ 'Double_Quote' ]);

    # WB7a  Hebrew_Letter  ×  Single_Quote
    $set->('WB_NOBREAK', [ 'Hebrew_Letter' ], [ 'Single_Quote' ]);

    # WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
    #      × (ALetter | Hebrew_Letter)
    $add->('WB_MB_or_ML_or_SQ_then_LE_or_HL', \@mid_letter, \@letters);

    # WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
    #      | Single_Quote) (ALetter | Hebrew_Letter)
    $add->('WB_LE_or_HL_then_MB_or_ML_or_SQ', \@letters, \@mid_letter);

    # Do not break between most letters.
    # WB5  (ALetter | Hebrew_Letter)  ×  (ALetter | Hebrew_Letter)
    $set->('WB_NOBREAK', \@letters, \@letters);

    # Ignore Format and Extend characters, except when they appear at the
    # beginning of a region of text.
    # WB4  X (Extend | Format)*  →  X
    {
        # The column range is fixed before any cell is written
        my $last_column = $#pairs;
        for my $ignorable (qw(Extend Format)) {
            for my $column (0 .. $last_column) {
                $pairs[$wb_enums{$ignorable}][$column]
                                    = $wb_actions{'WB_Ex_or_FO_then_foo'};
            }
        }
    }

    # Implied is that these attach to the character before them, except for
    # the characters that mark the end of a region of text.  The rules below
    # override the ones set up here, for all the characters that need
    # overriding.
    for my $row (0 .. $#pairs) {
        for my $ignorable (qw(Extend Format)) {
            $pairs[$row][$wb_enums{$ignorable}] = $wb_actions{'WB_NOBREAK'};
        }
    }

    # Break before and after white space
    # WB3b     ÷  (Newline | CR | LF)
    # WB3a  (Newline | CR | LF)  ÷
    # et al.
    for my $space_name (@white_space) {
        my $space = $wb_enums{$space_name};
        for my $other (0 .. $#pairs) {
            $pairs[$other][$space] = $wb_actions{'WB_BREAKABLE'};
            $pairs[$space][$other] = $wb_actions{'WB_BREAKABLE'};
        }
    }

    # But do not break within white space.
    # WB3  CR  ×  LF
    # et al.
    $set->('WB_NOBREAK', \@white_space, \@white_space);

    # And do not break horizontal space followed by Extend or Format
    $set->('WB_NOBREAK', [ 'Perl_Tailored_HSpace' ], [ qw(Extend Format) ]);
    $set->('WB_hs_then_hs', [ 'Perl_Tailored_HSpace' ],
                            [ 'Perl_Tailored_HSpace' ]);

    # Break at the start and end of text.
    # WB2     ÷  eot
    # WB1  sot  ÷
    {
        my $edge = $wb_enums{'EDGE'};
        for my $other (0 .. $#pairs) {
            $pairs[$other][$edge] = $wb_actions{'WB_BREAKABLE'};
            $pairs[$edge][$other] = $wb_actions{'WB_BREAKABLE'};
        }
    }

    # But, unspecified by Unicode, we shouldn't break on an empty string.
    $set->('WB_NOBREAK', [ 'EDGE' ], [ 'EDGE' ]);

    output_table_common('WB', \%wb_actions,
                        \@pairs, \@wb_short_enums, \%wb_abbreviations);
}
1606
9d9177be
KW
# Output the two fixed lists that split the code point space at 256.
# NOTE(review): presumably [ 0, 256 ] is an inversion list matching code
# points 0 through 255, and [ 256 ] one matching everything from 256 upwards
# -- confirm against output_invlist().
1607output_invlist("Latin1", [ 0, 256 ]);
1608output_invlist("AboveLatin1", [ 256 ]);
1609
# NOTE(review): presumably this closes whatever #if section is still open in
# the generated header at this point -- confirm against end_file_pound_if.
bffc0129 1610end_file_pound_if;
43b443dd 1611
3f427fd9
KW
1612# We construct lists for all the POSIX and backslash sequence character
1613# classes in two forms:
1614# 1) ones which match only in the ASCII range
1615# 2) ones which match either in the Latin1 range, or the entire Unicode range
1616#
1617# These get compiled in, and hence affect the memory footprint of every Perl
1618# program, even those not using Unicode. To minimize the size, currently
1619# the Latin1 version is generated for the beyond ASCII range except for those
1620# lists that are quite small for the entire range, such as for \s, which is 22
1621# UVs long plus 4 UVs (currently) for the header.
1622#
1623# To save even more memory, the ASCII versions could be derived from the
1624# larger ones at runtime, saving some memory (minus the expense of the machine
1625# instructions to do so), but these are all small anyway, so their total is
1626# about 100 UVs.
1627#
1628# In the list of properties below that get generated, the L1 prefix is a fake
1629# property that means just the Latin1 range of the full property (whose name
1630# has an X prefix instead of L1).
a02047bf
KW
1631#
1632# An initial & means to use the subroutine from this file instead of an
1633# official inversion list.
3f427fd9 1634
0c4ecf42
KW
# Main generation loop.  For every supported code page, and inside that code
# page's conditional-compilation section, output the inversion list (plus the
# inversion map, when the property turns out to be a map rather than a plain
# list) for each property named in the qw() list below, after translating the
# code points to that code page's native values.
#
# Property name syntax in the list:
#   &NAME           call the subroutine NAME in this file instead of looking
#                   up an official inversion list
#   NAME,E1,E2...   the comma-separated suffixes are extra enum names that are
#                   passed through to output_invmap()
#   L1... / NonL1...  restrict the result to the Latin1 range, or to the
#                   range above it, respectively
for my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    @a2n = @{get_a2n($charset)};

    # The qw() below deliberately contains commas and '&'
    no warnings 'qw';
                         # Ignore non-alpha in sort
    for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
                             ASCII
                             Cased
                             VertSpace
                             XPerlSpace
                             XPosixAlnum
                             XPosixAlpha
                             XPosixBlank
                             XPosixCntrl
                             XPosixDigit
                             XPosixGraph
                             XPosixLower
                             XPosixPrint
                             XPosixPunct
                             XPosixSpace
                             XPosixUpper
                             XPosixWord
                             XPosixXDigit
                             _Perl_Any_Folds
                             &NonL1_Perl_Non_Final_Folds
                             _Perl_Folds_To_Multi_Char
                             &UpperLatin1
                             _Perl_IDStart
                             _Perl_IDCont
                             _Perl_GCB,EDGE
                             _Perl_LB,EDGE
                             _Perl_SB,EDGE
                             _Perl_WB,EDGE,UNKNOWN
                    )
    ) {

        # For the Latin1 properties, we change to use the eXtended version of
        # the base property, then go through the result and get rid of
        # everything not in Latin1 (above 255).  Actually, we retain the
        # element for the range that crosses the 255/256 boundary if it is one
        # that matches the property.  For example, in the Word property, there
        # is a range of code points that start at U+00F8 and goes through
        # U+02C1.  Instead of artificially cutting that off at 256 because 256
        # is the first code point above Latin1, we let the range go to its
        # natural ending.  That gives us extra information with no added space
        # taken.  But if the range that crosses the boundary is one that
        # doesn't match the property, we don't start a new range above 255, as
        # that could be construed as going to infinity.  For example, the
        # Upper property doesn't include the character at 255, but does
        # include the one at 256.  We don't include the 256 one.
        my $prop_name = $prop;
        my $is_local_sub = $prop_name =~ s/^&//;

        # Strip off (and remember) any ',ENUM...' suffixes
        my $extra_enums = "";
        $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;

        my $lookup_prop = $prop_name;
        my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
                       or $lookup_prop =~ s/^L1//);
        my $nonl1_only = 0;
        $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;

        my @invlist;
        my @invmap;
        my $map_format;
        my $map_default;
        my $maps_to_code_point;
        my $to_adjust;
        if ($is_local_sub) {

            # The name is that of a subroutine in this file; string-eval it
            # to call that subroutine in list context.
            @invlist = eval $lookup_prop;
            die $@ if $@;
        }
        else {
            @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
            if (! @invlist) {

                # If couldn't find a non-empty inversion list, see if it is
                # instead an inversion map
                my ($list_ref, $map_ref, $format, $default)
                          = prop_invmap($lookup_prop, '_perl_core_internal_ok');
                if (! $list_ref) {
                    # An empty return here could mean an unknown property, or
                    # merely that the original inversion list is empty.  Call
                    # in scalar context to differentiate
                    my $count = prop_invlist($lookup_prop,
                                             '_perl_core_internal_ok');
                    die "Could not find inversion list for '$lookup_prop'"
                                                          unless defined $count;
                }
                else {
                    @invlist = @$list_ref;
                    @invmap = @$map_ref;
                    $map_format = $format;
                    $map_default = $default;
                    $maps_to_code_point = $map_format =~ /x/;
                    $to_adjust = $map_format =~ /a/;
                }
            }
        }


        # Short-circuit an empty inversion list.
        if (! @invlist) {
            output_invlist($prop_name, \@invlist, $charset);
            next;
        }

        # Re-order the Unicode code points to native ones for this platform.
        # This is only needed for code points below 256, because native code
        # points are only in that range.  For inversion maps of properties
        # where the mappings are adjusted (format =~ /a/), this reordering
        # could mess up the adjustment pattern that was in the input, so that
        # has to be dealt with.
        #
        # And inversion maps that map to code points need to eventually have
        # all those code points remapped to native, and it's better to do that
        # here, going through the whole list not just those below 256.  This
        # is because some inversion maps have adjustments (format =~ /a/)
        # which may be affected by the reordering.  This code needs to be done
        # both for when we are translating the inversion lists for < 256, and
        # for the inversion maps for everything.  By doing both in this loop,
        # we can share that code.
        #
        # So, we go through everything for an inversion map to code points;
        # otherwise, we can skip any remapping at all if we are going to
        # output only the above-Latin1 values, or if the range spans the whole
        # of 0..256, as the remap will also include all of 0..256  (256 not
        # 255 because a re-ordering could cause 256 to need to be in the same
        # range as 255.)
        if ((@invmap && $maps_to_code_point)
            || (! $nonl1_only || ($invlist[0] < 256
                                  && ! ($invlist[0] == 0 && $invlist[1] > 256))))
        {

            if (! @invmap) {    # Straight inversion list
                # Look at all the ranges that start before 257.
                my @latin1_list;
                while (@invlist) {
                    last if $invlist[0] > 256;
                    my $upper = @invlist > 1
                                ? $invlist[1] - 1      # In range

                                  # To infinity.  You may want to stop much
                                  # much earlier; going this high may expose
                                  # perl deficiencies with very large numbers.
                                : $Unicode::UCD::MAX_CP;
                    for my $j ($invlist[0] .. $upper) {
                        push @latin1_list, a2n($j);
                    }

                    shift @invlist; # Shift off the range that's in the list
                    shift @invlist; # Shift off the range not in the list
                }

                # Here @invlist contains all the ranges in the original that
                # start at code points above 256, and @latin1_list contains
                # all the native code points for ranges that start with a
                # Unicode code point below 257.  We sort the latter and
                # convert it to inversion list format.  Then simply prepend it
                # to the list of the higher code points.
                @latin1_list = sort { $a <=> $b } @latin1_list;
                @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
                unshift @invlist, @latin1_list;
            }
            else {  # Is an inversion map

                # This is a similar procedure as plain inversion list, but has
                # multiple buckets.  A plain inversion list just has two
                # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
                # pretty much can ignore the 2nd bucket, as it is completely
                # defined by the 1st.  But here, what we do is create buckets
                # which contain the code points that map to each, translated
                # to native and turned into an inversion list.  Thus each
                # bucket is an inversion list of native code points that map
                # to it or don't map to it.  We use these to create an
                # inversion map for the whole property.

                # As mentioned earlier, we use this procedure to not just
                # remap the inversion list to native values, but also the maps
                # of code points to native ones.  In the latter case we have
                # to look at the whole of the inversion map (or at least to
                # above Unicode; as the maps of code points above that should
                # all be to the default).
                my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;

                my %mapped_lists;   # A hash whose keys are the buckets.
                while (@invlist) {
                    last if $invlist[0] > $upper_limit;

                    # This shouldn't actually happen, as prop_invmap() returns
                    # an extra element at the end that is beyond $upper_limit
                    die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;

                    my $bucket;

                    # A hash key can't be a ref (we are only expecting arrays
                    # of scalars here), so convert any such to a string that
                    # will be converted back later (using a vertical tab as
                    # the separator).  Even if the mapping is to code points,
                    # we don't translate to native here because the code
                    # output_map() calls to output these arrays assumes the
                    # input is Unicode, not native.
                    if (ref $invmap[0]) {
                        $bucket = join "\cK", @{$invmap[0]};
                    }
                    elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {

                        # Do convert to native for maps to single code points.
                        # There are some properties that have a few outlier
                        # maps that aren't code points, so the above test
                        # skips those.
                        $bucket = a2n($invmap[0]);
                    } else {
                        $bucket = $invmap[0];
                    }

                    # We now have the bucket that all code points in the range
                    # map to, though possibly they need to be adjusted.  Go
                    # through the range and put each translated code point in
                    # it into its bucket.
                    my $base_map = $invmap[0];
                    for my $j ($invlist[0] .. $invlist[1] - 1) {
                        if ($to_adjust
                            # The 1st code point doesn't need adjusting
                            && $j > $invlist[0]

                            # Skip any non-numeric maps: these are outliers
                            # that aren't code points.
                            && $base_map =~ $numeric_re

                            #  'ne' because the default can be a string
                            && $base_map ne $map_default)
                        {
                            # We adjust, by incrementing each of the bucket
                            # and the map.  For code point maps, translate to
                            # native
                            $base_map++;
                            $bucket = ($maps_to_code_point)
                                      ? a2n($base_map)
                                      : $base_map;
                        }

                        # Add the native code point to the bucket for the
                        # current map
                        push @{$mapped_lists{$bucket}}, a2n($j);
                    } # End of loop through all code points in the range

                    # Get ready for the next range
                    shift @invlist;
                    shift @invmap;
                } # End of loop through all ranges in the map.

                # Here, @invlist and @invmap retain all the ranges from the
                # originals that start with code points above $upper_limit.
                # Each bucket in %mapped_lists contains all the code points
                # that map to that bucket.  If the bucket is for a map to a
                # single code point, the bucket has been converted to native.
                # If something else (including multiple code points), no
                # conversion is done.
                #
                # Now we recreate the inversion map into %xlated, but this
                # time for the native character set.
                my %xlated;
                foreach my $bucket (keys %mapped_lists) {

                    # Sort and convert this bucket to an inversion list.  The
                    # result will be that ranges that start with even-numbered
                    # indexes will be for code points that map to this bucket;
                    # odd ones map to some other bucket, and are discarded
                    # below.
                    @{$mapped_lists{$bucket}}
                                    = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
                    @{$mapped_lists{$bucket}}
                     = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});

                    # Add each even-numbered range in the bucket to %xlated;
                    # so that the keys of %xlated become the range start code
                    # points, and the values are their corresponding maps.
                    while (@{$mapped_lists{$bucket}}) {
                        my $range_start = $mapped_lists{$bucket}->[0];
                        if ($bucket =~ /\cK/) {
                            @{$xlated{$range_start}} = split /\cK/, $bucket;
                        }
                        else {
                            $xlated{$range_start} = $bucket;
                        }
                        shift @{$mapped_lists{$bucket}}; # Discard odd ranges
                        shift @{$mapped_lists{$bucket}}; # Get ready for next
                                                         # iteration
                    }
                } # End of loop through all the buckets.

                # Here %xlated's keys are the range starts of all the code
                # points in the inversion map.  Construct an inversion list
                # from them.
                my @new_invlist = sort { $a <=> $b } keys %xlated;

                # If the list is adjusted, we want to munge this list so that
                # we only have one entry for where consecutive code points map
                # to consecutive values.  We just skip the subsequent entries
                # where this is the case.
                if ($to_adjust) {
                    my @temp;
                    for my $i (0 .. @new_invlist - 1) {
                        next if $i > 0
                                && $new_invlist[$i-1] + 1 == $new_invlist[$i]
                                && $xlated{$new_invlist[$i-1]} =~ $numeric_re
                                && $xlated{$new_invlist[$i]} =~ $numeric_re
                                && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
                        push @temp, $new_invlist[$i];
                    }
                    @new_invlist = @temp;
                }

                # The inversion map comes from %xlated's values.  We can
                # unshift each onto the front of the untouched portion, in
                # reverse order of the portion we did process.
                foreach my $start (reverse @new_invlist) {
                    unshift @invmap, $xlated{$start};
                }

                # Finally prepend the inversion list we have just constructed
                # to the one that contains anything we didn't process.
                unshift @invlist, @new_invlist;
            }
        }

        # prop_invmap() returns an extra final entry, which we can now
        # discard.
        if (@invmap) {
            pop @invlist;
            pop @invmap;
        }

        if ($l1_only) {
            die "Unimplemented to do a Latin-1 only inversion map" if @invmap;

            # (The final element is deliberately not examined.)
            for my $i (0 .. $#invlist - 1) {
                next unless $invlist[$i] > 255;

                # In an inversion list, even-numbered elements give the code
                # points that begin ranges that match the property;
                # odd-numbered give ones that begin ranges that don't match.
                # If $i is odd, we are at the first code point above 255 that
                # doesn't match, which means the range it is ending does
                # match, and crosses the 255/256 boundary.  We want to include
                # this ending point, so cut after it.  Conversely, if $i is
                # even, it is the first code point above 255 that matches,
                # which means there was no matching range that crossed the
                # boundary, and we don't want to include this code point, so
                # cut before it.
                my $cut_at = ($i % 2 != 0) ? $i + 1 : $i;

                # Remove everything past this.  (@invmap is known to be empty
                # here, because of the die above, so needs no trimming.)
                splice @invlist, $cut_at;
                last;
            }
        }
        elsif ($nonl1_only) {
            my $found_nonl1 = 0;

            # (The final element is deliberately not examined.)
            for my $i (0 .. $#invlist - 1) {
                next if $invlist[$i] < 256;

                # Here, we have the first element in the array that indicates
                # an element above Latin1.  Get rid of all previous ones.
                splice @invlist, 0, $i;
                splice @invmap, 0, $i if @invmap;

                # If this one's index is not divisible by 2, it means that
                # this element is inverting away from being in the list, which
                # means all code points from 256 to this one are in this list
                # (or map to the default for inversion maps)
                if ($i % 2 != 0) {
                    unshift @invlist, 256;
                    unshift @invmap, $map_default if @invmap;
                }
                $found_nonl1 = 1;
                last;
            }
            die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
        }

        output_invlist($prop_name, \@invlist, $charset);
        output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
    }
    end_file_pound_if();
    print $out_fh "\n" . get_conditional_compile_line_end();
}
2023
973a28ed
KW
# Emit the break-boundary pair tables (GCB, LB and WB, in that order) inside
# their own preprocessor conditional section, then close that section.
switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');

for my $emit_table (\&output_GCB_table,
                    \&output_LB_table,
                    \&output_WB_table)
{
    $emit_table->();
}

end_file_pound_if();
2031
# Build the list of files that the generated header depends on, and hand it,
# along with the output handle, to read_only_bottom_close_and_rename().
my $sources_list = "lib/unicore/mktables.lst";
my @sources = ($0, qw(lib/unicore/mktables
                      lib/Unicode/UCD.pm
                      regen/charset_translations.pl
                      ));
{
    # Depend on mktables’ own sources.  It’s a shorter list of files than
    # those that Unicode::UCD uses.
    #
    # Use the three-argument form of open so that the file name can never be
    # interpreted as containing a mode or a pipe.
    if (! open my $mktables_list, '<', $sources_list) {

          # This should force a rebuild once $sources_list exists
          push @sources, $sources_list;
    }
    else {
        while (my $line = <$mktables_list>) {

            # Only the portion of the file before the '===' line is used
            last if $line =~ /===/;
            chomp $line;

            # Skip empty lines and lines beginning with '#'
            push @sources, "lib/unicore/$line" if $line =~ /^[^#]/;
        }
        close $mktables_list;
    }
}

read_only_bottom_close_and_rename($out_fh, \@sources);