This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Add defensive check
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
3d7c117d
MB
11require './regen/regen_lib.pl';
12require './regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
# Matches a decimal integer or float with an optional leading minus sign.
# The fractional part is an optional non-capturing group; /a restricts \d to
# the ASCII digits and /x lets the pattern contain white space.  (The group
# used to be spelled '(:?', which is a capturing group starting with an
# optional literal colon, so strings such as "1:.5" were wrongly accepted.)
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;

# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
289ce9cc
KW
43my $max_hdr_len = 3; # In headings, how wide a name is allowed?
44
9d9177be
KW
45print $out_fh "/* See the generating file for comments */\n\n";
46
bffc0129
KW
# Each symbol this program generates is currently defined in exactly one .c
# file.  The code knows the usual home for most of them; this hash lists the
# exceptions, keyed by symbol name, with the value being the PERL_IN_foo_C
# macro that should guard the symbol instead.
my %exceptions_to_where_to_define = (
    ( map { $_ => 'PERL_IN_REGCOMP_C' } qw(
            NonL1_Perl_Non_Final_Folds
            AboveLatin1
            Latin1
            UpperLatin1
            _Perl_Any_Folds
            _Perl_Folds_To_Multi_Char
        )
    ),
    ( map { $_ => 'PERL_IN_UTF8_C' } qw(
            _Perl_IDCont
            _Perl_IDStart
        )
    ),
);
015bb97c 60
# Properties whose enum values are referred to by name in hard-coded C code,
# keyed by the lowercased short property name.  The lists are needed so that
# when perl is compiled against an older Unicode data set, every enum value
# the C code expects is still present in the enum typedef, and hence the C
# code doesn't have to change.  An older Unicode version has no code points
# with the values it lacks, so the code handling those values simply never
# gets exercised, which is far better than having to #ifdef things.  The
# names are the long names of the property values, because regexec.c uses
# them as case labels, and the long name is generally more understandable
# than the short one.
my %hard_coded_enums = (
    gcb => [ qw(
                Control
                CR
                E_Base
                E_Base_GAZ
                E_Modifier
                Extend
                Glue_After_Zwj
                L
                LF
                LV
                LVT
                Other
                Prepend
                Regional_Indicator
                SpacingMark
                T
                V
                ZWJ
           ) ],
    lb => [ qw(
                Alphabetic
                Break_After
                Break_Before
                Break_Both
                Break_Symbols
                Carriage_Return
                Close_Parenthesis
                Close_Punctuation
                Combining_Mark
                Contingent_Break
                E_Base
                E_Modifier
                Exclamation
                Glue
                H2
                H3
                Hebrew_Letter
                Hyphen
                Ideographic
                Infix_Numeric
                Inseparable
                JL
                JT
                JV
                Line_Feed
                Mandatory_Break
                Next_Line
                Nonstarter
                Numeric
                Open_Punctuation
                Postfix_Numeric
                Prefix_Numeric
                Quotation
                Regional_Indicator
                Space
                Word_Joiner
                ZWJ
                ZWSpace
          ) ],
    sb => [ qw(
                ATerm
                Close
                CR
                Extend
                Format
                LF
                Lower
                Numeric
                OLetter
                Other
                SContinue
                Sep
                Sp
                STerm
                Upper
          ) ],
    wb => [ qw(
                ALetter
                CR
                Double_Quote
                E_Base
                E_Base_GAZ
                E_Modifier
                Extend
                ExtendNumLet
                Format
                Glue_After_Zwj
                Hebrew_Letter
                Katakana
                LF
                MidLetter
                MidNum
                MidNumLet
                Newline
                Numeric
                Other
                Perl_Tailored_HSpace
                Regional_Indicator
                Single_Quote
                ZWJ
          ) ],
);
175
973a28ed
KW
176my %gcb_enums;
177my @gcb_short_enums;
289ce9cc 178my %gcb_abbreviations;
6b659339
KW
179my %lb_enums;
180my @lb_short_enums;
289ce9cc 181my %lb_abbreviations;
7e54b87f
KW
182my %wb_enums;
183my @wb_short_enums;
289ce9cc 184my %wb_abbreviations;
6b659339 185
99f21fb9
KW
186my @a2n;
187
sub uniques {

    # Returns the input values with duplicates removed.  Only the first
    # occurrence of each value is kept, and the relative order of the
    # survivors is that of the input.  Values are compared as strings (they
    # are used as hash keys).

    my %already_seen;
    my @first_occurrences;
    for my $value (@_) {
        next if $already_seen{$value}++;
        push @first_occurrences, $value;
    }

    return @first_occurrences;
}
195
sub a2n($) {

    # Returns the input Unicode code point translated to native, by looking
    # it up in the file-level @a2n table.  Only inputs that look like a
    # number (per $numeric_re) and are no larger than 255 are translated;
    # anything else is returned unchanged.

    my ($code_point) = @_;

    if ($code_point =~ $numeric_re && $code_point <= 255) {
        return $a2n[$code_point];
    }

    return $code_point;
}
204
bffc0129
KW
sub end_file_pound_if {

    # If the output file is currently inside a '#if' that this program
    # opened, emit the matching '#endif' (annotated with the condition being
    # closed) and note that we are no longer inside one.  Does nothing
    # otherwise.

    if ($in_file_pound_if) {
        print {$out_fh} "\n#endif\t/* ", $in_file_pound_if, " */\n";
        $in_file_pound_if = 0;
    }
}
211
sub switch_pound_if ($$) {
    my $name = shift;
    my $new_pound_if = shift;

    # Switch to new #if given by the 2nd argument.  If there is an override
    # for this, it instead switches to that.  The 1st argument is the
    # static's name, used to look up the overrides

    if (exists $exceptions_to_where_to_define{$name}) {
        $new_pound_if = $exceptions_to_where_to_define{$name};
    }

    # Exit current #if if the new one is different from the old.  The
    # current condition has the form "defined(FOO)", so we look for the new
    # macro name inside it.  This is a literal substring test rather than a
    # pattern match, so that nothing in the name can be misinterpreted as a
    # regular expression metacharacter.
    if (   $in_file_pound_if
        && index($in_file_pound_if, $new_pound_if) < 0)
    {
        end_file_pound_if();
    }

    # Enter new #if, if not already in it.
    if (! $in_file_pound_if) {
        $in_file_pound_if = "defined($new_pound_if)";
        print $out_fh "\n#if $in_file_pound_if\n";
    }
}
237
sub output_invlist ($$;$) {

    # Writes to the output file a C array named "${name}_invlist" holding
    # the inversion list given by the array reference $invlist, in the exact
    # internal form used for inversion lists: a three element header (element
    # count, the version/data structure magic number, and the offset flag
    # described below) followed by the list's elements in hexadecimal.  The
    # optional $charset is the name of the character set, and is used only in
    # a comment.
    #
    # NOTE: if the list doesn't begin with 0, a 0 is prepended to the
    # caller's array (it is modified in place), and the header flag is set to
    # 1 to indicate this.

    my ($name, $invlist, $charset) = @_;
    $charset //= "";

    die "No inversion list for $name" unless defined $invlist
                                             && ref $invlist eq 'ARRAY';

    # Is the last element of the header 0, or 1 ?
    my $zero_or_one = 0;
    if (@$invlist && $invlist->[0] != 0) {
        unshift @$invlist, 0;
        $zero_or_one = 1;
    }
    my $count = @$invlist;

    switch_pound_if ($name, 'PERL_IN_PERL_C');

    # The declaration, with the character set (if any) as a trailing comment
    my $charset_comment = ($charset) ? " /* for $charset */" : "";
    print $out_fh "\nstatic const UV ${name}_invlist[] = {",
                  $charset_comment,
                  "\n";

    # The header
    print $out_fh "\t$count,\t/* Number of elements */\n",
                  "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n",
                  "\t", $zero_or_one,
                  ",\t/* 0 if the list starts at 0;",
                  "\n\t\t 1 if it starts at the element beyond 0 */\n";

    # The main body are the UVs passed in to this routine, one per line, with
    # a comma after each except the final one.
    my $final_index = $#{$invlist};
    for my $i (0 .. $final_index) {
        my $separator = ($i < $final_index) ? "," : "";
        printf $out_fh "\t0x%X%s\n", $invlist->[$i], $separator;
    }

    print $out_fh "};\n";
}
279
99f21fb9
KW
sub output_invmap ($$$$$$$) {
    my $name = shift;
    my $invmap = shift;     # Reference to inversion map array
    my $prop_name = shift;
    my $input_format = shift;   # The inversion map's format
    my $default = shift;        # The property value for code points who
                                # otherwise don't have a value specified.
    my $extra_enums = shift;    # comma-separated list of our additions to the
                                # property's standard possible values
    my $charset = shift // "";  # name of character set for comment

    # Output the inversion map $invmap for property $prop_name, but use $name
    # as the actual data structure's name.
    #
    # Only the 's' input format is implemented, and it is treated as an enum
    # property.  Three things are written to the output file: a #define of
    # the number of enum values, a C enum typedef of the property's values
    # (each prefixed by the property's short name), and a C array named
    # "${name}_invmap" containing the map's values.  Dies if the map is
    # missing or empty, if the format isn't implemented, if no enum values
    # can be found, or if %hard_coded_enums is out of date.
    #
    # As a side effect, for _Perl_GCB, _Perl_LB, and _Perl_WB, the file-level
    # %foo_enums, @foo_short_enums, and %foo_abbreviations variables for the
    # property get populated, for use in generating the pair tables.

    # Validate the map before anything dereferences it.  (This check used to
    # be made only just before the array was output, which was after the
    # reference had already been dereferenced, so it could never fire for an
    # undefined argument.)
    die "No inversion map for $prop_name" unless defined $invmap
                                             && ref $invmap eq 'ARRAY'
                                             && @$invmap;

    my $count = @$invmap;

    my $declaration_type;
    my %enums;
    my $name_prefix;

    if ($input_format eq 's') {
        my $orig_prop_name = $prop_name;
        $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
        my $short_name = (prop_aliases($prop_name))[0] // $prop_name;

        # If the name is unchanged by the above, take the possible values
        # from Unicode::UCD; otherwise use the values that actually occur in
        # the map.
        my @enums;
        if ($orig_prop_name eq $prop_name) {
            @enums = prop_values($prop_name);
        }
        else {
            @enums = uniques(@$invmap);
        }

        die "Only enum properties are currently handled; '$prop_name' isn't one"
                                                                unless @enums;

        # Properties without an entry in %hard_coded_enums have nothing
        # expected of them.  The '// []' keeps the dereference from dying
        # under 'strict refs' in that case, so that the test just below can
        # do its job.
        my @expected_enums = @{ $hard_coded_enums{lc $short_name} // [] };
        my @canonical_input_enums;
        if (@expected_enums) {
            if (@expected_enums < @enums) {
                die 'You need to update %hard_coded_enums to reflect new'
                . " entries in this Unicode version\n"
                . "Expected: " . join(", ", sort @expected_enums) . "\n"
                . " Got: " . join(", ", sort @enums);
            }

            if (! defined prop_aliases($prop_name)) {

                # Convert the input enums into canonical form and
                # save for use below
                @canonical_input_enums = map { lc ($_ =~ s/_//gr) }
                                                                @enums;
            }
            @enums = sort @expected_enums;
        }

        # The internal enums come last, and in the order specified
        my @extras;
        if ($extra_enums ne "") {
            @extras = split /,/, $extra_enums;
            push @enums, @extras;
        }

        # Assign a value to each element of the enum.  The default
        # value always gets 0; the others are arbitrarily assigned.
        my $enum_val = 0;
        my $canonical_default = prop_value_aliases($prop_name, $default);
        $default = $canonical_default if defined $canonical_default;
        $enums{$default} = $enum_val++;
        for my $enum (@enums) {
            $enums{$enum} = $enum_val++ unless exists $enums{$enum};
        }

        # Calculate the enum values for certain properties like
        # _Perl_GCB and _Perl_LB, because we output special tables for
        # them.
        if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {

            # We use string evals to allow the same code to work on
            # all tables we're doing.
            my $type = lc $prop_name;

            # We use lowercase single letter names for any property
            # values not in the release of Unicode being compiled now.
            my $placeholder = "a";

            # Skip if we've already done this code, which populated
            # this hash
            if (eval "! \%${type}_enums") {

                # For each enum ...
                foreach my $enum (sort keys %enums) {
                    my $value = $enums{$enum};
                    my $short;
                    my $abbreviated_from;

                    # Special case this wb property value to make the
                    # name more clear
                    if ($enum eq 'Perl_Tailored_HSpace') {
                        $short = 'hs';
                        $abbreviated_from = $enum;
                    }
                    elsif (grep { $_ eq $enum } @extras) {

                        # The 'short' name for one of the property
                        # values added by this file is just the
                        # lowercase of it
                        $short = lc $enum;
                    }
                    elsif (grep {$_ eq lc ( $enum =~ s/_//gr) }
                                                    @canonical_input_enums)
                    {   # On Unicode versions that predate the
                        # official property, we have set up this array
                        # to be the canonical form of each enum in the
                        # substitute property.  If the enum we're
                        # looking at is canonically the same as one of
                        # these, use its name instead of generating a
                        # placeholder one in the next clause (which
                        # will happen because prop_value_aliases()
                        # will fail because it only works on official
                        # properties)
                        $short = $enum;
                    }
                    else {
                        # Use the official short name for the other
                        # property values, which should all be
                        # official ones.
                        ($short) = prop_value_aliases($type, $enum);

                        # But create a placeholder for ones not in
                        # this Unicode version.
                        $short = $placeholder++ unless defined $short;
                    }

                    # If our short name is too long, or we already
                    # know that the name is an abbreviation, truncate
                    # to make sure it's short enough, and remember
                    # that we did this so we can later place in a
                    # comment in the generated file
                    if (   $abbreviated_from
                        || length $short > $max_hdr_len)
                    {
                        $short = substr($short, 0, $max_hdr_len);
                        $abbreviated_from = $enum
                                            unless $abbreviated_from;
                        # If the name we are to display conflicts, try
                        # another.
                        while (eval "exists
                                     \$${type}_abbreviations{$short}")
                        {
                            die $@ if $@;

                            # The increment operator on strings doesn't work
                            # on those containing an '_', so just use the
                            # final portion.
                            my @short = split '_', $short;
                            $short[-1]++;
                            $short = join "_", @short;
                        }

                        eval "\$${type}_abbreviations{$short} = '$enum'";
                        die $@ if $@;
                    }

                    # Remember the mapping from the property value
                    # (enum) name to its value.
                    eval "\$${type}_enums{$enum} = $value";
                    die $@ if $@;

                    # Remember the inverse mapping to the short name
                    # so that we can properly label the generated
                    # table's rows and columns
                    eval "\$${type}_short_enums[$value] = '$short'";
                    die $@ if $@;
                }
            }
        }

        # Inversion map stuff is currently used only by regexec
        switch_pound_if($name, 'PERL_IN_REGEXEC_C');

        # The short names tend to be two lower case letters, but it looks
        # better for those if they are upper. XXX
        $short_name = uc($short_name) if length($short_name) < 3
                                || substr($short_name, 0, 1) =~ /[[:lower:]]/;
        $name_prefix = "${short_name}_";
        my $enum_count = keys %enums;
        print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";

        # Output the typedef, with the enums ordered by their values
        print $out_fh "\ntypedef enum {\n";
        my @enum_list;
        foreach my $enum (keys %enums) {
            $enum_list[$enums{$enum}] = $enum;
        }
        foreach my $i (0 .. @enum_list - 1) {
            my $enum_name = $enum_list[$i];
            print $out_fh "\t${name_prefix}$enum_name = $i";
            print $out_fh "," if $i < $enum_count - 1;
            print $out_fh "\n";
        }
        $declaration_type = "${name_prefix}enum";
        print $out_fh "} $declaration_type;\n";
    }
    else {
        die "'$input_format' invmap() format for '$prop_name' unimplemented";
    }

    print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    # The main body are the scalars passed in to this routine.  Each is
    # converted to what prop_value_aliases() returns for it, when that is
    # defined, and is prefixed in the same way as the typedef's members.
    for my $i (0 .. $count - 1) {
        my $element = $invmap->[$i];
        my $full_element_name = prop_value_aliases($prop_name, $element);
        $element = $full_element_name if defined $full_element_name;
        $element = $name_prefix . $element;
        print $out_fh "\t$element";
        print $out_fh "," if $i < $count - 1;
        print $out_fh "\n";
    }
    print $out_fh "};\n";
}
510
sub mk_invlist_from_sorted_cp_list {

    # Returns an inversion list constructed from the sorted array of code
    # points that the single argument refers to.  The result is a flat list
    # in which each run of consecutive code points contributes two elements:
    # the first code point of the run, and one more than its final code
    # point.  Returns nothing if the input array is empty.  The input is not
    # modified.

    my ($sorted_cps_ref) = @_;

    return unless @$sorted_cps_ref;

    # The first code point begins the first range
    my ($first_cp, @remaining_cps) = @$sorted_cps_ref;
    my @inversion_list = ($first_cp, $first_cp + 1);

    for my $cp (@remaining_cps) {
        if ($cp == $inversion_list[-1]) {

            # Immediately follows the current range, so just extend that
            $inversion_list[-1]++;
        }
        else {  # Otherwise start a new range consisting of just it
            push @inversion_list, $cp, $cp + 1;
        }
    }

    return @inversion_list;
}
535
# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need:  @has_multi_char_fold gets each code point that has a
# multi-character fold; @is_non_final_fold gets, without duplicates and in
# order of first appearance, each code point that occurs anywhere but last
# in such a fold.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al'
                                                           || $format eq 'a';
my @has_multi_char_fold;
my @is_non_final_fold;

for my $i (0 .. $#{$folds_ref}) {
    my $fold = $folds_ref->[$i];
    next unless ref $fold;      # Skip single-char folds
    push @has_multi_char_fold, $cp_ref->[$i];

    # Everything in the fold except its final code point is in a non-final
    # position
    my @non_finals = @$fold;
    pop @non_finals;
    push @is_non_final_fold, @non_finals;
}

# The same code point can be non-final in more than one fold; keep just the
# first occurrence of each.
@is_non_final_fold = uniques(@is_non_final_fold);
557
a02047bf
KW
sub _Perl_Non_Final_Folds {

    # Returns the inversion list of the code points in the file-level
    # @is_non_final_fold array.  As a side effect, that array is left sorted
    # into ascending numeric order.

    my @ascending = sort { $a <=> $b } @is_non_final_fold;
    @is_non_final_fold = @ascending;

    return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
}
562
99f21fb9
KW
sub prop_name_for_cmp ($) { # Sort helper

    # Returns a key for comparing property names:  the input with everything
    # from the first comma onwards dropped, then every non-alphabetic
    # character removed, and finally lowercased.  The caller's value is not
    # changed.

    my ($name) = @_;

    return lc(  $name =~ s/,.*//r
                      =~ s/[[:^alpha:]]//gr);
}
573
sub UpperLatin1 {

    # Returns the inversion list for the code points 128 through 255,
    # inclusive.

    my @upper_half = (128 .. 255);

    return mk_invlist_from_sorted_cp_list(\@upper_half);
}
577
289ce9cc
KW
sub output_table_common {

    # Common subroutine to actually output the generated rules table.
    #
    # The parameters are:
    #   $property                   Name used to form the C array's name,
    #                               "${property}_table"
    #   $table_value_defines_ref    If true, a reference to a hash whose keys
    #                               are names to #define, and whose values
    #                               are the small non-negative integers they
    #                               are defined as
    #   $table_ref                  Reference to the square two-dimensional
    #                               table to output
    #   $names_ref                  Reference to an array of the names used
    #                               to label the rows and columns in comments
    #   $abbreviations_ref          Reference to a hash mapping abbreviated
    #                               names to what they stand for; these are
    #                               explained in a comment in the output
    #
    # The table's element type is 'bool' if its largest element is 1;
    # otherwise 'U8'.  Dies if an element takes more digits than a column is
    # wide.

    my ($property,
        $table_value_defines_ref,
        $table_ref,
        $names_ref,
        $abbreviations_ref) = @_;
    my $size = @$table_ref;

    # Output the #define list, sorted by numeric value
    if ($table_value_defines_ref) {
        my $max_name_length = 0;
        my @defines;

        # Put in order, and at the same time find the longest name
        while (my ($enum, $value) = each %$table_value_defines_ref) {
            $defines[$value] = $enum;

            my $length = length $enum;
            $max_name_length = $length if $length > $max_name_length;
        }

        print $out_fh "\n";

        # Output, so that the values are vertically aligned in a column after
        # the longest name
        foreach my $i (0 .. @defines - 1) {
            next unless defined $defines[$i];
            printf $out_fh "#define %-*s %2d\n",
                                      $max_name_length,
                                       $defines[$i],
                                          $i;
        }
    }

    my $column_width = 2;   # We currently allow 2 digits for the number

    # If the maximum value in the table is 1, it can be a bool.  (Being above
    # a U8 is not currently handled
    my $max_element = 0;
    for my $i (0 .. $size - 1) {
        for my $j (0 .. $size - 1) {
            next if $max_element >= $table_ref->[$i][$j];
            $max_element = $table_ref->[$i][$j];
        }
    }
    die "Need wider table column width given '$max_element'"
                                    if length $max_element > $column_width;

    my $table_type = ($max_element == 1)
                     ? 'bool'
                     : 'U8';

    # If a name is longer than the width set aside for a column, its column
    # needs to have increased spacing so that the name doesn't get truncated
    # nor run into an adjacent column
    my @spacers;

    # If we are being compiled on a Unicode version earlier than that which
    # this file was designed for, it may be that some of the property values
    # aren't in the current release, and so would be undefined if we didn't
    # define them ourselves.  Earlier code has done this, making them
    # lowercase characters of length one.  We look to see if any exist, so
    # that we can add an annotation to the output table
    my $has_placeholder = 0;

    for my $i (0 .. $size - 1) {
        no warnings 'numeric';
        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
        $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
    }

    print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";

    # Calculate the column heading line
    my $header_line = "/* "
                    . (" " x $max_hdr_len)  # We let the row heading meld to
                                            # the '*/' for those that are at
                                            # the max
                    . " " x 3;    # Space for '*/ '
    # Now each column
    for my $i (0 .. $size - 1) {
        $header_line .= sprintf "%s%*s",
                                $spacers[$i],
                                    $column_width + 1, # 1 for the ','
                                     $names_ref->[$i];
    }
    $header_line .= " */\n";

    # If we have annotations, output it now.
    if ($has_placeholder || scalar %$abbreviations_ref) {
        my $text = "";
        foreach my $abbr (sort keys %$abbreviations_ref) {
            $text .= "; " if $text;
            $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
        }
        if ($has_placeholder) {
            $text .= "; other " if $text;
            $text .= "lowercase names are placeholders for"
                  .  " property values not defined until a later Unicode"
                  .  " release, so are irrelevant in this one, as they are"
                  .  " not assigned to any code points";
        }

        my $indent = " " x 3;
        $text = $indent . "/* $text */";

        # Every line begins with the indent followed by a three character
        # comment leader ('/* ' here; ' * ' on continuation lines).  Breaking
        # a line at a blank inside that prefix would output nothing but the
        # prefix, and the text left over would be no shorter than before.
        my $prefix_length = length($indent) + 3;

        # Wrap the text so that it is no wider than the table, which the
        # header line gives.
        my $output_width = length $header_line;
        while (length $text > $output_width) {

            # Find the first blank back from the right end to wrap at, not
            # looking into the prefix.
            my $wrap_at;
            for (my $i = $output_width - 1; $i >= $prefix_length; $i--) {
                if (substr($text, $i, 1) eq " ") {
                    $wrap_at = $i;
                    last;
                }
            }

            # If there is nowhere to wrap, the first word is too long to fit.
            # Give up wrapping, so the rest gets output as one over-long
            # line, instead of looping forever.
            last unless defined $wrap_at;

            print $out_fh substr($text, 0, $wrap_at), "\n";

            # Set so will look at just the remaining tail (which will
            # be indented and have a '*' after the indent
            $text = $indent . " * " . substr($text, $wrap_at + 1);
        }

        # And any remaining
        print $out_fh $text, "\n" if $text;
    }

    # We calculated the header line earlier just to get its width so that we
    # could make sure the annotations fit into that.
    print $out_fh $header_line;

    # Now output the bulk of the table.
    for my $i (0 .. $size - 1) {

        # First the row heading.
        printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
        print $out_fh "{";  # Then the brace for this row

        # Then each column
        for my $j (0 .. $size -1) {
            print $out_fh $spacers[$j];
            printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
            print $out_fh "," if $j < $size - 1;
        }
        print $out_fh " }";
        print $out_fh "," if $i < $size - 1;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
734
973a28ed
KW
sub output_GCB_table() {

    # Create and output the pair table for use in determining Grapheme Cluster
    # Breaks, given in http://www.unicode.org/reports/tr29/.
    #
    # The table is indexed by [class before the position][class after it],
    # using the values in the file-level %gcb_enums.  Each element is one of
    # the actions below.
    my %gcb_actions = (
        GCB_NOBREAK     => 0,
        GCB_BREAKABLE   => 1,
        GCB_RI_then_RI  => 2,   # Rules 12 and 13
        GCB_EX_then_EM  => 3,   # Rule 10
    );

    # The table is constructed in reverse order of the rules, to make the
    # lower-numbered, higher priority ones override the later ones, as the
    # algorithm stops at the earliest matching rule

    my $table_size = @gcb_short_enums;

    # Otherwise, break everywhere.
    # GB99   Any ÷  Any
    my @gcb_table = map { [ (1) x $table_size ] } 1 .. $table_size;

    # Stores $action as what to do between a character of class $before and
    # a following one of class $after
    my $set_pair = sub {
        my ($before, $after, $action) = @_;
        $gcb_table[$gcb_enums{$before}][$gcb_enums{$after}] = $action;
    };

    # Do not break within emoji flag sequences. That is, do not break between
    # regional indicator (RI) symbols if there is an odd number of RI
    # characters before the break point.  Must be resolved in runtime code.
    #
    # GB12 sot (RI RI)* RI × RI
    # GB13 [^RI] (RI RI)* RI × RI
    $set_pair->('Regional_Indicator', 'Regional_Indicator',
                $gcb_actions{GCB_RI_then_RI});

    # Do not break within emoji modifier sequences or emoji zwj sequences.
    # GB11  ZWJ  × ( Glue_After_Zwj | E_Base_GAZ )
    $set_pair->('ZWJ', $_, 0) for qw(Glue_After_Zwj E_Base_GAZ);

    # GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier
    $set_pair->('Extend', 'E_Modifier', $gcb_actions{GCB_EX_then_EM});
    $set_pair->($_, 'E_Modifier', 0) for qw(E_Base E_Base_GAZ);

    # Do not break before extending characters or ZWJ.
    # Do not break before SpacingMarks, or after Prepend characters.
    # GB9b  Prepend  ×
    # GB9a  × SpacingMark
    # GB9   ×  ( Extend | ZWJ )
    for my $i (0 .. $#gcb_table) {
        $gcb_table[$gcb_enums{'Prepend'}][$i] = 0;
        $gcb_table[$i][$gcb_enums{$_}] = 0 for qw(SpacingMark Extend ZWJ);
    }

    # Do not break Hangul syllable sequences.
    # GB8  ( LVT | T)  ×  T
    $set_pair->($_, 'T', 0) for qw(LVT T);

    # GB7  ( LV | V )  ×  ( V | T )
    for my $before (qw(LV V)) {
        $set_pair->($before, $_, 0) for qw(V T);
    }

    # GB6  L  ×  ( L | V | LV | LVT )
    $set_pair->('L', $_, 0) for qw(L V LV LVT);

    # Do not break between a CR and LF. Otherwise, break before and after
    # controls.
    # GB5   ÷  ( Control | CR | LF )
    # GB4  ( Control | CR | LF )  ÷
    for my $i (0 .. $#gcb_table) {
        for my $class (qw(Control CR LF)) {
            $gcb_table[$i][$gcb_enums{$class}] = 1;
            $gcb_table[$gcb_enums{$class}][$i] = 1;
        }
    }

    # GB3  CR  ×  LF
    $set_pair->('CR', 'LF', 0);

    # Break at the start and end of text, unless the text is empty
    # GB1  sot  ÷
    # GB2   ÷  eot
    for my $i (0 .. $#gcb_table) {
        $gcb_table[$i][$gcb_enums{'EDGE'}] = 1;
        $gcb_table[$gcb_enums{'EDGE'}][$i] = 1;
    }
    $set_pair->('EDGE', 'EDGE', 0);

    output_table_common('GCB', \%gcb_actions,
                        \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
}
838
6b659339
KW
839sub output_LB_table() {
840
841 # Create and output the enums, #defines, and pair table for use in
842 # determining Line Breaks. This uses the default line break algorithm,
843 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
844 # in that page, as the Unicode-furnished tests assume that tailoring.
845
6b659339
KW
846 # The result is really just true or false. But we follow along with tr14,
847 # creating a rule which is false for something like X SP* X. That gets
848 # encoding 2. The rest of the actions are synthetic ones that indicate
849 # some context handling is required. These each are added to the
850 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
851 # value can be retrieved. Actually only rules from 7 through 18 (which
852 # are the ones where space matter) are possible to have 2 added to them.
853 # The others below add just 0 or 1. It might be possible for one
854 # synthetic rule to be added to another, yielding a larger value. This
855 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
856 # names of the middle grouping below, it is impossible for that to occur
857 # for them because they all start with mutually exclusive classes. That
858 # the final rule can't be added to any of the others isn't obvious from
859 # its name, so it is assigned a power of 2 higher than the others can get
860 # to so any addition would preserve all data. (And the code will reach an
861 # assert(0) on debugging builds should this happen.)
862 my %lb_actions = (
863 LB_NOBREAK => 0,
864 LB_BREAKABLE => 1,
865 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
866
b0e24409 867 LB_CM_ZWJ_foo => 3, # Rule 9
6b659339
KW
868 LB_SP_foo => 6, # Rule 18
869 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
870 LB_SY_or_IS_then_various => 11, # Rule 25
871 LB_HY_or_BA_then_foo => 13, # Rule 21
b0e24409 872 LB_RI_then_RI => 15, # Rule 30a
6b659339 873
b0e24409 874 LB_various_then_PO_or_PR => (1<<5), # Rule 25
6b659339
KW
875 );
876
6b659339
KW
877 # Construct the LB pair table. This is based on the rules in
878 # http://www.unicode.org/reports/tr14/, but modified as those rules are
879 # designed for someone taking a string of text and sequentially going
880 # through it to find the break opportunities, whereas, Perl requires
881 # determining if a given random spot is a break opportunity, without
882 # knowing all the entire string before it.
883 #
884 # The table is constructed in reverse order of the rules, to make the
885 # lower-numbered, higher priority ones override the later ones, as the
886 # algorithm stops at the earliest matching rule
887
888 my @lb_table;
889 my $table_size = @lb_short_enums;
890
891 # LB31. Break everywhere else
892 for my $i (0 .. $table_size - 1) {
893 for my $j (0 .. $table_size - 1) {
894 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
895 }
896 }
897
b0e24409
KW
898 # LB30b Do not break between an emoji base and an emoji modifier.
899 # EB × EM
900 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
901 = $lb_actions{'LB_NOBREAK'};
902
903 # LB30a Break between two regional indicator symbols if and only if there
904 # are an even number of regional indicators preceding the position of the
905 # break.
906 # sot (RI RI)* RI × RI
907 # [^RI] (RI RI)* RI × RI
289ce9cc 908 $lb_table[$lb_enums{'Regional_Indicator'}]
b0e24409 909 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
6b659339
KW
910
911 # LB30 Do not break between letters, numbers, or ordinary symbols and
912 # opening or closing parentheses.
913 # (AL | HL | NU) × OP
289ce9cc
KW
914 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
915 = $lb_actions{'LB_NOBREAK'};
916 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
917 = $lb_actions{'LB_NOBREAK'};
918 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
919 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
920
921 # CP × (AL | HL | NU)
289ce9cc
KW
922 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
923 = $lb_actions{'LB_NOBREAK'};
924 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
925 = $lb_actions{'LB_NOBREAK'};
926 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
927 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
928
929 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
930 # IS × (AL | HL)
289ce9cc
KW
931 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
932 = $lb_actions{'LB_NOBREAK'};
933 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
934 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
935
936 # LB28 Do not break between alphabetics (“at”).
937 # (AL | HL) × (AL | HL)
289ce9cc
KW
938 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
939 = $lb_actions{'LB_NOBREAK'};
940 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
941 = $lb_actions{'LB_NOBREAK'};
942 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
943 = $lb_actions{'LB_NOBREAK'};
944 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
945 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
946
947 # LB27 Treat a Korean Syllable Block the same as ID.
948 # (JL | JV | JT | H2 | H3) × IN
289ce9cc
KW
949 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
950 = $lb_actions{'LB_NOBREAK'};
951 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
952 = $lb_actions{'LB_NOBREAK'};
953 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
954 = $lb_actions{'LB_NOBREAK'};
955 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
956 = $lb_actions{'LB_NOBREAK'};
957 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
958 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
959
960 # (JL | JV | JT | H2 | H3) × PO
289ce9cc
KW
961 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
962 = $lb_actions{'LB_NOBREAK'};
963 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
964 = $lb_actions{'LB_NOBREAK'};
965 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
966 = $lb_actions{'LB_NOBREAK'};
967 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
968 = $lb_actions{'LB_NOBREAK'};
969 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
970 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
971
972 # PR × (JL | JV | JT | H2 | H3)
289ce9cc
KW
973 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
974 = $lb_actions{'LB_NOBREAK'};
975 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
976 = $lb_actions{'LB_NOBREAK'};
977 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
978 = $lb_actions{'LB_NOBREAK'};
979 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
980 = $lb_actions{'LB_NOBREAK'};
981 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
982 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
983
984 # LB26 Do not break a Korean syllable.
985 # JL × (JL | JV | H2 | H3)
986 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
987 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
988 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
989 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
990
991 # (JV | H2) × (JV | JT)
992 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
993 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
994 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
995 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
996
997 # (JT | H3) × JT
998 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
999 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
1000
1001 # LB25 Do not break between the following pairs of classes relevant to
1002 # numbers, as tailored by example 7 in
1003 # http://www.unicode.org/reports/tr14/#Examples
1004 # We follow that tailoring because Unicode's test cases expect it
1005 # (PR | PO) × ( OP | HY )? NU
289ce9cc
KW
1006 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
1007 = $lb_actions{'LB_NOBREAK'};
1008 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
1009 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1010
1011 # Given that (OP | HY )? is optional, we have to test for it in code.
1012 # We add in the action (instead of overriding) for this, so that in
1013 # the code we can recover the underlying break value.
289ce9cc 1014 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1015 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1016 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1017 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1018 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339 1019 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1020 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339
KW
1021 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1022
1023 # ( OP | HY ) × NU
289ce9cc
KW
1024 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
1025 = $lb_actions{'LB_NOBREAK'};
1026 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
1027 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1028
1029 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
1030 # which can be rewritten as:
1031 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
289ce9cc
KW
1032 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
1033 = $lb_actions{'LB_NOBREAK'};
1034 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
1035 = $lb_actions{'LB_NOBREAK'};
1036 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
1037 = $lb_actions{'LB_NOBREAK'};
1038 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
1039 = $lb_actions{'LB_NOBREAK'};
1040 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
1041 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1042
1043 # Like earlier where we have to test in code, we add in the action so
1044 # that we can recover the underlying values. This is done in rules
1045 # below, as well. The code assumes that we haven't added 2 actions.
1046 # Should a later Unicode release break that assumption, then tests
1047 # should start failing.
289ce9cc 1048 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
6b659339 1049 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1050 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
6b659339 1051 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1052 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
6b659339 1053 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1054 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
6b659339 1055 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1056 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
6b659339 1057 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1058 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
6b659339 1059 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1060 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
6b659339 1061 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1062 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
6b659339 1063 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1064 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
6b659339 1065 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1066 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
6b659339
KW
1067 += $lb_actions{'LB_SY_or_IS_then_various'};
1068
1069 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1070 # which can be rewritten as:
1071 # NU (SY | IS)* (CL | CP)? × (PO | PR)
289ce9cc
KW
1072 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1073 = $lb_actions{'LB_NOBREAK'};
1074 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1075 = $lb_actions{'LB_NOBREAK'};
6b659339 1076
289ce9cc 1077 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1078 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1079 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1080 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1081 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1082 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1083 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
6b659339
KW
1084 += $lb_actions{'LB_various_then_PO_or_PR'};
1085
289ce9cc 1086 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1087 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1088 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1089 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1090 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1091 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1092 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
6b659339
KW
1093 += $lb_actions{'LB_various_then_PO_or_PR'};
1094
b0e24409
KW
1095 # LB24 Do not break between numeric prefix/postfix and letters, or between
1096 # letters and prefix/postfix.
1097 # (PR | PO) × (AL | HL)
289ce9cc
KW
1098 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1099 = $lb_actions{'LB_NOBREAK'};
1100 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1101 = $lb_actions{'LB_NOBREAK'};
289ce9cc
KW
1102 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1103 = $lb_actions{'LB_NOBREAK'};
1104 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1105 = $lb_actions{'LB_NOBREAK'};
6b659339 1106
b0e24409
KW
1107 # (AL | HL) × (PR | PO)
1108 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
1109 = $lb_actions{'LB_NOBREAK'};
1110 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
1111 = $lb_actions{'LB_NOBREAK'};
1112 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
1113 = $lb_actions{'LB_NOBREAK'};
1114 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
1115 = $lb_actions{'LB_NOBREAK'};
1116
1117 # LB23a Do not break between numeric prefixes and ideographs, or between
1118 # ideographs and numeric postfixes.
1119 # PR × (ID | EB | EM)
1120 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1121 = $lb_actions{'LB_NOBREAK'};
1122 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
1123 = $lb_actions{'LB_NOBREAK'};
1124 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
1125 = $lb_actions{'LB_NOBREAK'};
1126
1127 # (ID | EB | EM) × PO
289ce9cc
KW
1128 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1129 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1130 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1131 = $lb_actions{'LB_NOBREAK'};
1132 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1133 = $lb_actions{'LB_NOBREAK'};
6b659339 1134
b0e24409 1135 # LB23 Do not break between digits and letters
6b659339 1136 # (AL | HL) × NU
289ce9cc
KW
1137 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1138 = $lb_actions{'LB_NOBREAK'};
1139 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1140 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1141
1142 # NU × (AL | HL)
289ce9cc
KW
1143 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1144 = $lb_actions{'LB_NOBREAK'};
1145 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1146 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1147
1148 # LB22 Do not break between two ellipses, or between letters, numbers or
1149 # exclamations and ellipsis.
1150 # (AL | HL) × IN
289ce9cc
KW
1151 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1152 = $lb_actions{'LB_NOBREAK'};
1153 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1154 = $lb_actions{'LB_NOBREAK'};
6b659339 1155
289ce9cc
KW
1156 # Exclamation × IN
1157 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1158 = $lb_actions{'LB_NOBREAK'};
6b659339 1159
b0e24409 1160 # (ID | EB | EM) × IN
289ce9cc
KW
1161 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1162 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1163 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1164 = $lb_actions{'LB_NOBREAK'};
1165 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1166 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1167
1168 # IN × IN
289ce9cc
KW
1169 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1170 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1171
1172 # NU × IN
289ce9cc
KW
1173 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1174 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1175
1176 # LB21b Don’t break between Solidus and Hebrew letters.
1177 # SY × HL
289ce9cc
KW
1178 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1179 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1180
1181 # LB21a Don't break after Hebrew + Hyphen.
1182 # HL (HY | BA) ×
1183 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1184 $lb_table[$lb_enums{'Hyphen'}][$i]
1185 += $lb_actions{'LB_HY_or_BA_then_foo'};
1186 $lb_table[$lb_enums{'Break_After'}][$i]
1187 += $lb_actions{'LB_HY_or_BA_then_foo'};
6b659339
KW
1188 }
1189
1190 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1191 # spaces, small kana, and other non-starters, or after acute accents.
1192 # × BA
1193 # × HY
1194 # × NS
1195 # BB ×
1196 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1197 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1198 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1199 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1200 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1201 }
1202
1203 # LB20 Break before and after unresolved CB.
1204 # ÷ CB
1205 # CB ÷
1206 # Conditional breaks should be resolved external to the line breaking
1207 # rules. However, the default action is to treat unresolved CB as breaking
1208 # before and after.
1209 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1210 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1211 = $lb_actions{'LB_BREAKABLE'};
1212 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1213 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1214 }
1215
1216 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1217 # × QU
1218 # QU ×
1219 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1220 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1221 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1222 }
1223
1224 # LB18 Break after spaces
1225 # SP ÷
1226 for my $i (0 .. @lb_table - 1) {
289ce9cc 1227 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1228 }
1229
1230 # LB17 Do not break within ‘——’, even with intervening spaces.
1231 # B2 SP* × B2
289ce9cc 1232 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
6b659339
KW
1233 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1234
1235 # LB16 Do not break between closing punctuation and a nonstarter even with
1236 # intervening spaces.
1237 # (CL | CP) SP* × NS
289ce9cc 1238 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
6b659339 1239 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1240 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
6b659339
KW
1241 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1242
1243
1244 # LB15 Do not break within ‘”[’, even with intervening spaces.
1245 # QU SP* × OP
289ce9cc 1246 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
6b659339
KW
1247 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1248
1249 # LB14 Do not break after ‘[’, even after spaces.
1250 # OP SP* ×
1251 for my $i (0 .. @lb_table - 1) {
289ce9cc 1252 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
6b659339
KW
1253 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1254 }
1255
1256 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1257 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1258 # [^NU] × CL
1259 # [^NU] × CP
1260 # × EX
1261 # [^NU] × IS
1262 # [^NU] × SY
1263 for my $i (0 .. @lb_table - 1) {
289ce9cc 1264 $lb_table[$i][$lb_enums{'Exclamation'}]
6b659339
KW
1265 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1266
289ce9cc 1267 next if $i == $lb_enums{'Numeric'};
6b659339 1268
289ce9cc 1269 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
6b659339 1270 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1271 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
6b659339 1272 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1273 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
6b659339 1274 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1275 $lb_table[$i][$lb_enums{'Break_Symbols'}]
6b659339
KW
1276 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1277 }
1278
1279 # LB12a Do not break before NBSP and related characters, except after
1280 # spaces and hyphens.
1281 # [^SP BA HY] × GL
1282 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1283 next if $i == $lb_enums{'Space'}
1284 || $i == $lb_enums{'Break_After'}
1285 || $i == $lb_enums{'Hyphen'};
6b659339
KW
1286
1287 # We don't break, but if a property above has said don't break even
1288 # with space between, don't override that (also in the next few rules)
289ce9cc 1289 next if $lb_table[$i][$lb_enums{'Glue'}]
6b659339 1290 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1291 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1292 }
1293
1294 # LB12 Do not break after NBSP and related characters.
1295 # GL ×
1296 for my $i (0 .. @lb_table - 1) {
289ce9cc 1297 next if $lb_table[$lb_enums{'Glue'}][$i]
6b659339 1298 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1299 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1300 }
1301
1302 # LB11 Do not break before or after Word joiner and related characters.
1303 # × WJ
1304 # WJ ×
1305 for my $i (0 .. @lb_table - 1) {
289ce9cc 1306 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
6b659339
KW
1307 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1308 {
289ce9cc 1309 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
6b659339 1310 }
289ce9cc 1311 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
6b659339
KW
1312 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1313 {
289ce9cc 1314 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1315 }
1316 }
1317
1318 # Special case this here to avoid having to do a special case in the code.
1319 # By making this the same as other things with a SP in front of them that
1320 # don't break, we avoid an extra test.
289ce9cc 1321 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
6b659339
KW
1322 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1323
1324 # LB9 and LB10 are done in the same loop
1325 #
1326 # LB9 Do not break a combining character sequence; treat it as if it has
1327 # the line breaking class of the base character in all of the
b0e24409
KW
1328 # higher-numbered rules. Treat ZWJ as if it were CM
1329 # Treat X (CM|ZWJ)* as if it were X.
6b659339
KW
1330 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1331
b0e24409
KW
1332 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the
1333 # case where a CM or ZWJ is the first character on the line or follows SP,
1334 # BK, CR, LF, NL, or ZW.
6b659339
KW
1335 for my $i (0 .. @lb_table - 1) {
1336
b0e24409
KW
1337 # When the CM or ZWJ is the first in the pair, we don't know without
1338 # looking behind whether the CM or ZWJ is going to attach to an
1339 # earlier character, or not. So have to figure this out at runtime in
1340 # the code
1341 $lb_table[$lb_enums{'Combining_Mark'}][$i]
1342 = $lb_actions{'LB_CM_ZWJ_foo'};
1343 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
289ce9cc
KW
1344
1345 if ( $i == $lb_enums{'Mandatory_Break'}
1346 || $i == $lb_enums{'EDGE'}
1347 || $i == $lb_enums{'Carriage_Return'}
1348 || $i == $lb_enums{'Line_Feed'}
1349 || $i == $lb_enums{'Next_Line'}
1350 || $i == $lb_enums{'Space'}
1351 || $i == $lb_enums{'ZWSpace'})
6b659339
KW
1352 {
1353 # For these classes, a following CM doesn't combine, and should do
289ce9cc
KW
1354 # whatever 'Alphabetic' would do.
1355 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1356 = $lb_table[$i][$lb_enums{'Alphabetic'}];
b0e24409
KW
1357 $lb_table[$i][$lb_enums{'ZWJ'}]
1358 = $lb_table[$i][$lb_enums{'Alphabetic'}];
6b659339
KW
1359 }
1360 else {
b0e24409
KW
1361 # For these classes, the CM or ZWJ combines, so doesn't break,
1362 # inheriting the type of nobreak from the master character.
289ce9cc 1363 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
6b659339
KW
1364 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1365 {
289ce9cc
KW
1366 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1367 = $lb_actions{'LB_NOBREAK'};
6b659339 1368 }
b0e24409
KW
1369 if ($lb_table[$i][$lb_enums{'ZWJ'}]
1370 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1371 {
1372 $lb_table[$i][$lb_enums{'ZWJ'}]
1373 = $lb_actions{'LB_NOBREAK'};
1374 }
6b659339
KW
1375 }
1376 }
1377
b0e24409
KW
1378 # LB8a Do not break between a zero width joiner and an ideograph, emoji
1379 # base or emoji modifier. This rule prevents breaks within emoji joiner
1380 # sequences.
1381 # ZWJ × (ID | EB | EM)
1382 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1383 = $lb_actions{'LB_NOBREAK'};
1384 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1385 = $lb_actions{'LB_NOBREAK'};
1386 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1387 = $lb_actions{'LB_NOBREAK'};
1388
6b659339
KW
1389 # LB8 Break before any character following a zero-width space, even if one
1390 # or more spaces intervene.
1391 # ZW SP* ÷
1392 for my $i (0 .. @lb_table - 1) {
289ce9cc 1393 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1394 }
1395
1396 # Because of LB8-10, we need to look at context for "SP x", and this must
1397 # be done in the code. So override the existing rules for that, by adding
1398 # a constant to get new rules that tell the code it needs to look at
1399 # context. By adding this action instead of replacing the existing one,
1400 # we can get back to the original rule if necessary.
1401 for my $i (0 .. @lb_table - 1) {
289ce9cc 1402 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
6b659339
KW
1403 }
1404
1405 # LB7 Do not break before spaces or zero width space.
1406 # × SP
1407 # × ZW
1408 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1409 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1410 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1411 }
1412
1413 # LB6 Do not break before hard line breaks.
1414 # × ( BK | CR | LF | NL )
1415 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1416 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1417 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1418 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1419 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1420 }
1421
1422 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1423 # CR × LF
1424 # CR !
1425 # LF !
1426 # NL !
1427 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1428 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1429 = $lb_actions{'LB_BREAKABLE'};
1430 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1431 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339 1432 }
289ce9cc
KW
1433 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1434 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1435
1436 # LB4 Always break after hard line breaks.
1437 # BK !
1438 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1439 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1440 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1441 }
1442
6b659339
KW
1443 # LB3 Always break at the end of text.
1444 # ! eot
b0e24409
KW
1445 # LB2 Never break at the start of text.
1446 # sot ×
6b659339 1447 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1448 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1449 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1450 }
1451
1452 # LB1 Assign a line breaking class to each code point of the input.
1453 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1454 # depending on criteria outside the scope of this algorithm.
1455 #
1456 # In the absence of such criteria all characters with a specific
1457 # combination of original class and General_Category property value are
1458 # resolved as follows:
1459 # Original Resolved General_Category
1460 # AI, SG, XX AL Any
1461 # SA CM Only Mn or Mc
1462 # SA AL Any except Mn and Mc
1463 # CJ NS Any
1464 #
1465 # This is done in mktables, so we never see any of the remapped-from
1466 # classes.
1467
289ce9cc
KW
1468 output_table_common('LB', \%lb_actions,
1469 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
6b659339
KW
1470}
1471
7e54b87f
KW
1472sub output_WB_table() {
1473
1474 # Create and output the enums, #defines, and pair table for use in
1475 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1476
1477 # This uses the same mechanism as the other bounds tables generated by
1478 # this file. The actions that could override a 0 or 1 are added to those
1479 # numbers; the actions that clearly don't depend on the underlying rule
1480 # simply overwrite
1481 my %wb_actions = (
1482 WB_NOBREAK => 0,
1483 WB_BREAKABLE => 1,
1484 WB_hs_then_hs => 2,
b0e24409 1485 WB_Ex_or_FO_or_ZWJ_then_foo => 3,
7e54b87f
KW
1486 WB_DQ_then_HL => 4,
1487 WB_HL_then_DQ => 6,
1488 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1489 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1490 WB_MB_or_MN_or_SQ_then_NU => 12,
1491 WB_NU_then_MB_or_MN_or_SQ => 14,
b0e24409 1492 WB_RI_then_RI => 16,
7e54b87f
KW
1493 );
1494
7e54b87f
KW
1495 # Construct the WB pair table.
1496 # The table is constructed in reverse order of the rules, to make the
1497 # lower-numbered, higher priority ones override the later ones, as the
1498 # algorithm stops at the earliest matching rule
1499
1500 my @wb_table;
1501 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
39c4defe 1502 die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i;
7e54b87f
KW
1503
1504 # Otherwise, break everywhere (including around ideographs).
b0e24409 1505 # WB99 Any ÷ Any
7e54b87f
KW
1506 for my $i (0 .. $table_size - 1) {
1507 for my $j (0 .. $table_size - 1) {
1508 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1509 }
1510 }
1511
b0e24409
KW
1512 # Do not break within emoji flag sequences. That is, do not break between
1513 # regional indicator (RI) symbols if there is an odd number of RI
1514 # characters before the break point.
1515 # WB16 [^RI] (RI RI)* RI × RI
c492f156 1516 # WB15 sot (RI RI)* RI × RI
289ce9cc 1517 $wb_table[$wb_enums{'Regional_Indicator'}]
b0e24409
KW
1518 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};
1519
1520 # Do not break within emoji modifier sequences.
1521 # WB14 ( E_Base | EBG ) × E_Modifier
1522 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}]
1523 = $wb_actions{'WB_NOBREAK'};
1524 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}]
1525 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1526
1527 # Do not break from extenders.
1528 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
289ce9cc
KW
1529 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1530 = $wb_actions{'WB_NOBREAK'};
1531 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1532 = $wb_actions{'WB_NOBREAK'};
1533 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1534 = $wb_actions{'WB_NOBREAK'};
1535 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1536 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1537
1538 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1539 # × ExtendNumLet
289ce9cc
KW
1540 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1541 = $wb_actions{'WB_NOBREAK'};
1542 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1543 = $wb_actions{'WB_NOBREAK'};
1544 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1545 = $wb_actions{'WB_NOBREAK'};
1546 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1547 = $wb_actions{'WB_NOBREAK'};
1548 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1549 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1550
1551 # Do not break between Katakana.
1552 # WB13 Katakana × Katakana
289ce9cc
KW
1553 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1554 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1555
1556 # Do not break within sequences, such as “3.2” or “3,456.789”.
1557 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
289ce9cc 1558 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
7e54b87f 1559 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1560 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
7e54b87f 1561 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1562 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1563 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1564
1565 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
289ce9cc 1566 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
7e54b87f 1567 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1568 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
7e54b87f 1569 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1570 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
7e54b87f
KW
1571 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1572
1573 # Do not break within sequences of digits, or digits adjacent to letters
1574 # (“3a”, or “A3”).
1575 # WB10 Numeric × (ALetter | Hebrew_Letter)
289ce9cc
KW
1576 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1577 = $wb_actions{'WB_NOBREAK'};
1578 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1579 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1580
1581 # WB9 (ALetter | Hebrew_Letter) × Numeric
289ce9cc
KW
1582 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1583 = $wb_actions{'WB_NOBREAK'};
1584 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1585 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1586
1587 # WB8 Numeric × Numeric
289ce9cc
KW
1588 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1589 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1590
1591 # Do not break letters across certain punctuation.
1592 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
289ce9cc
KW
1593 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1594 += $wb_actions{'WB_DQ_then_HL'};
7e54b87f
KW
1595
1596 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
289ce9cc
KW
1597 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1598 += $wb_actions{'WB_HL_then_DQ'};
7e54b87f
KW
1599
1600 # WB7a Hebrew_Letter × Single_Quote
289ce9cc
KW
1601 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1602 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1603
1604 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1605 # × (ALetter | Hebrew_Letter)
289ce9cc 1606 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
7e54b87f 1607 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1608 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1609 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1610 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
7e54b87f 1611 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1612 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1613 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1614 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
7e54b87f 1615 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1616 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f
KW
1617 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1618
1619 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1620 # | Single_Quote) (ALetter | Hebrew_Letter)
289ce9cc 1621 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1622 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1623 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1624 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1625 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
7e54b87f 1626 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1627 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
7e54b87f 1628 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1629 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
7e54b87f 1630 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1631 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1632 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1633
1634 # Do not break between most letters.
1635 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
289ce9cc
KW
1636 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1637 = $wb_actions{'WB_NOBREAK'};
1638 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1639 = $wb_actions{'WB_NOBREAK'};
1640 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1641 = $wb_actions{'WB_NOBREAK'};
1642 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1643 = $wb_actions{'WB_NOBREAK'};
7e54b87f 1644
b0e24409
KW
1645 # Ignore Format and Extend characters, except after sot, CR, LF, and
1646 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
1647 # WB4 X (Extend | Format | ZWJ)* → X
7e54b87f 1648 for my $i (0 .. @wb_table - 1) {
289ce9cc 1649 $wb_table[$wb_enums{'Extend'}][$i]
b0e24409 1650 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
289ce9cc 1651 $wb_table[$wb_enums{'Format'}][$i]
b0e24409
KW
1652 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1653 $wb_table[$wb_enums{'ZWJ'}][$i]
1654 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1655 }
1656 for my $i (0 .. @wb_table - 1) {
1657 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1658 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1659 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1660 }
1661
1662 # Implied is that these attach to the character before them, except for
1663 # the characters that mark the end of a region of text. The rules below
1664 # override the ones set up here, for all the characters that need
1665 # overriding.
1666 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1667 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1668 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1669 }
1670
b0e24409
KW
1671 # Do not break within emoji zwj sequences.
1672 # WB3c ZWJ × ( Glue_After_Zwj | EBG )
1673 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
1674 = $wb_actions{'WB_NOBREAK'};
1675 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
1676 = $wb_actions{'WB_NOBREAK'};
1677
7e54b87f
KW
1678 # Break before and after white space
1679 # WB3b ÷ (Newline | CR | LF)
1680 # WB3a (Newline | CR | LF) ÷
1681 # et al.
289ce9cc 1682 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1683 for my $j (0 .. @wb_table - 1) {
1684 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1685 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1686 }
1687 }
1688
1689 # But do not break within white space.
1690 # WB3 CR × LF
1691 # et al.
289ce9cc
KW
1692 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1693 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1694 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1695 }
1696 }
1697
b0e24409 1698 # And do not break horizontal space followed by Extend or Format or ZWJ
289ce9cc
KW
1699 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1700 = $wb_actions{'WB_NOBREAK'};
1701 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1702 = $wb_actions{'WB_NOBREAK'};
b0e24409
KW
1703 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}]
1704 = $wb_actions{'WB_NOBREAK'};
289ce9cc
KW
1705 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1706 [$wb_enums{'Perl_Tailored_HSpace'}]
1707 = $wb_actions{'WB_hs_then_hs'};
7e54b87f 1708
b0e24409
KW
1709 # Break at the start and end of text, unless the text is empty
1710 # WB2 Any ÷ eot
1711 # WB1 sot ÷ Any
7e54b87f 1712 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1713 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1714 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
7e54b87f 1715 }
289ce9cc 1716 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
7e54b87f 1717
289ce9cc
KW
1718 output_table_common('WB', \%wb_actions,
1719 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
7e54b87f
KW
1720}
1721
9d9177be
KW
# Emit two fixed inversion lists that are written out literally rather than
# being looked up.  In inversion-list form, [ 0, 256 ] matches code points 0
# through 255, and [ 256 ] matches 256 through infinity.
1722output_invlist("Latin1", [ 0, 256 ]);
1723output_invlist("AboveLatin1", [ 256 ]);
1724
# NOTE(review): presumably closes the #if section currently open in the
# output file; end_file_pound_if is defined outside this section -- confirm.
bffc0129 1725end_file_pound_if;
43b443dd 1726
3f427fd9
KW
1727# We construct lists for all the POSIX and backslash sequence character
1728# classes in two forms:
1729# 1) ones which match only in the ASCII range
1730# 2) ones which match either in the Latin1 range, or the entire Unicode range
1731#
1732# These get compiled in, and hence affect the memory footprint of every Perl
1733# program, even those not using Unicode. To minimize the size, currently
1734# the Latin1 version is generated for the beyond ASCII range except for those
1735# lists that are quite small for the entire range, such as for \s, which is 22
1736# UVs long plus 4 UVs (currently) for the header.
1737#
1738# To save even more memory, the ASCII versions could be derived from the
1739# larger ones at runtime, saving some memory (minus the expense of the machine
1740# instructions to do so), but these are all small anyway, so their total is
1741# about 100 UVs.
1742#
1743# In the list of properties below that get generated, the L1 prefix is a fake
1744# property that means just the Latin1 range of the full property (whose name
1745# has an X prefix instead of L1).
a02047bf
KW
1746#
1747# An initial & means to use the subroutine from this file instead of an
1748# official inversion list.
3f427fd9 1749
0c4ecf42
KW
# Main generation loop.  For each code page returned by
# get_supported_code_pages(), emit a conditionally-compiled section holding
# the inversion list (and, for properties that turn out to be inversion maps,
# the inversion map as well) of every property named in the qw() list below,
# with code points translated from Unicode to that code page's native values.
1750for my $charset (get_supported_code_pages()) {
1751 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1752
99f21fb9
KW
# Load the Unicode-to-native translation table for this code page.
# NOTE(review): @a2n is declared outside this section; presumably a2n()
# consults it -- confirm.
1753 @a2n = @{get_a2n($charset)};
# The qw() list below contains commas on purpose (they separate a property
# name from its extra enum names), which would otherwise draw a 'qw' warning.
1754 no warnings 'qw';
1755 # Ignore non-alpha in sort
1756 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
c0382778 1757 Assigned
1c8c3428
KW
1758 ASCII
1759 Cased
1760 VertSpace
1761 XPerlSpace
1762 XPosixAlnum
1763 XPosixAlpha
1764 XPosixBlank
1765 XPosixCntrl
1766 XPosixDigit
1767 XPosixGraph
1768 XPosixLower
1769 XPosixPrint
1770 XPosixPunct
1771 XPosixSpace
1772 XPosixUpper
1773 XPosixWord
1774 XPosixXDigit
1775 _Perl_Any_Folds
1776 &NonL1_Perl_Non_Final_Folds
1777 _Perl_Folds_To_Multi_Char
1778 &UpperLatin1
1779 _Perl_IDStart
1780 _Perl_IDCont
02f811dd 1781 _Perl_GCB,EDGE
ca8226cf 1782 _Perl_LB,EDGE
bf4268fa 1783 _Perl_SB,EDGE
190d69bb 1784 _Perl_WB,EDGE,UNKNOWN
1c8c3428 1785 )
0f5e3c71
KW
1786 ) {
1787
1788 # For the Latin1 properties, we change to use the eXtended version of the
1789 # base property, then go through the result and get rid of everything not
1790 # in Latin1 (above 255). Actually, we retain the element for the range
1791 # that crosses the 255/256 boundary if it is one that matches the
1792 # property. For example, in the Word property, there is a range of code
1793 # points that starts at U+00F8 and goes through U+02C1. Instead of
1794 # artificially cutting that off at 256 because 256 is the first code point
1795 # above Latin1, we let the range go to its natural ending. That gives us
1796 # extra information with no added space taken. But if the range that
1797 # crosses the boundary is one that doesn't match the property, we don't
1798 # start a new range above 255, as that could be construed as going to
1799 # infinity. For example, the Upper property doesn't include the character
1800 # at 255, but does include the one at 256. We don't include the 256 one.
1801 my $prop_name = $prop;
# A leading '&' marks a name to be evaluated as code in this file rather than
# looked up as a property; strip it and remember that it was there.
1802 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
# Anything after the first comma is a list of extra enum names, which is
# removed from the name here and later handed to output_invmap().
1803 my $extra_enums = "";
1804 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
# Derive the name to actually look up: an 'L1' prefix means only the Latin1
# range is wanted ('L1Posix...' is looked up as 'XPosix...'); otherwise a
# 'NonL1' prefix means only the range above Latin1 is wanted.
1805 my $lookup_prop = $prop_name;
1806 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1807 or $lookup_prop =~ s/^L1//);
1808 my $nonl1_only = 0;
1809 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
# NOTE(review): the greedy (.*) appears to consume the whole name, so
# $has_suffixes looks like it is always left undefined; any ',SUFFIX' part
# was already removed from $prop_name above, and $has_suffixes is not used
# again in this loop -- confirm whether this line is still needed.
99f21fb9 1810 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
1811
1812 my @invlist;
99f21fb9
KW
# The following are set only when the property is an inversion map
1813 my @invmap;
1814 my $map_format;
1815 my $map_default;
1816 my $maps_to_code_point;
1817 my $to_adjust;
0f5e3c71
KW
1818 if ($is_local_sub) {
# String-eval of the bare name; presumably this calls the like-named
# subroutine defined elsewhere in this file, which returns the list.
1819 @invlist = eval $lookup_prop;
289ce9cc 1820 die $@ if $@;
0f5e3c71
KW
1821 }
1822 else {
1823 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 1824 if (! @invlist) {
99f21fb9 1825
ad85f59a
KW
1826 # If we couldn't find a non-empty inversion list, see if it is
1827 # instead an inversion map
1828 my ($list_ref, $map_ref, $format, $default)
99f21fb9 1829 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
1830 if (! $list_ref) {
1831 # An empty return here could mean an unknown property, or
1832 # merely that the original inversion list is empty. Call
1833 # in scalar context to differentiate
1834 my $count = prop_invlist($lookup_prop,
1835 '_perl_core_internal_ok');
1836 die "Could not find inversion list for '$lookup_prop'"
1837 unless defined $count;
1838 }
1839 else {
18b852b3
KW
1840 @invlist = @$list_ref;
1841 @invmap = @$map_ref;
1842 $map_format = $format;
1843 $map_default = $default;
# An 'x' in the format is taken to mean the maps are to code points; an 'a'
# that the maps are adjusted (see the discussion of both further below).
1844 $maps_to_code_point = $map_format =~ /x/;
1845 $to_adjust = $map_format =~ /a/;
ad85f59a 1846 }
99f21fb9 1847 }
0f5e3c71 1848 }
ad85f59a
KW
1849
1850
1851 # Short-circuit an empty inversion list.
1852 if (! @invlist) {
1853 output_invlist($prop_name, \@invlist, $charset);
1854 next;
1855 }
ceb1de32 1856
99f21fb9
KW
1857 # Re-order the Unicode code points to native ones for this platform.
1858 # This is only needed for code points below 256, because native code
1859 # points are only in that range. For inversion maps of properties
1860 # where the mappings are adjusted (format =~ /a/), this reordering
1861 # could mess up the adjustment pattern that was in the input, so that
1862 # has to be dealt with.
1863 #
1864 # And inversion maps that map to code points need to eventually have
1865 # all those code points remapped to native, and it's better to do that
1866 # here, going through the whole list not just those below 256. This
1867 # is because some inversion maps have adjustments (format =~ /a/)
1868 # which may be affected by the reordering. This code needs to be done
1869 # both for when we are translating the inversion lists for < 256, and
1870 # for the inversion maps for everything. By doing both in this loop,
1871 # we can share that code.
1872 #
1873 # So, we go through everything for an inversion map to code points;
1874 # otherwise, we can skip any remapping at all if we are going to
1875 # output only the above-Latin1 values, or if the range spans the whole
1876 # of 0..256, as the remap will also include all of 0..256 (256 not
1877 # 255 because a re-ordering could cause 256 to need to be in the same
1878 # range as 255.)
1879 if ((@invmap && $maps_to_code_point)
1880 || (! $nonl1_only || ($invlist[0] < 256
1881 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 1882 {
fb4554ea 1883
99f21fb9 1884 if (! @invmap) { # Straight inversion list
fb4554ea
KW
1885 # Look at all the ranges that start before 257.
1886 my @latin1_list;
1887 while (@invlist) {
1888 last if $invlist[0] > 256;
1889 my $upper = @invlist > 1
1890 ? $invlist[1] - 1 # In range
8a6c81cf
KW
1891
1892 # To infinity. You may want to stop much much
1893 # earlier; going this high may expose perl
1894 # deficiencies with very large numbers.
1895 : $Unicode::UCD::MAX_CP;
# Translate every code point of this matching range individually
fb4554ea 1896 for my $j ($invlist[0] .. $upper) {
99f21fb9 1897 push @latin1_list, a2n($j);
0f5e3c71 1898 }
fb4554ea
KW
1899
1900 shift @invlist; # Shift off the range that's in the list
1901 shift @invlist; # Shift off the range not in the list
0c4ecf42 1902 }
fb4554ea
KW
1903
1904 # Here @invlist contains all the ranges in the original that start
1905 # at code points above 256, and @latin1_list contains all the
1906 # native code points for ranges that start with a Unicode code
1907 # point below 257. We sort the latter and convert it to inversion
1908 # list format. Then simply prepend it to the list of the higher
1909 # code points.
1910 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 1911 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 1912 unshift @invlist, @latin1_list;
99f21fb9
KW
1913 }
1914 else { # Is an inversion map
1915
1916 # This is a similar procedure as plain inversion list, but has
1917 # multiple buckets. A plain inversion list just has two
1918 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1919 # pretty much can ignore the 2nd bucket, as it is completely
1920 # defined by the 1st. But here, what we do is create buckets
1921 # which contain the code points that map to each, translated
1922 # to native and turned into an inversion list. Thus each
1923 # bucket is an inversion list of native code points that map
1924 # to it or don't map to it. We use these to create an
1925 # inversion map for the whole property.
1926
1927 # As mentioned earlier, we use this procedure to not just
1928 # remap the inversion list to native values, but also the maps
1929 # of code points to native ones. In the latter case we have
1930 # to look at the whole of the inversion map (or at least to
1931 # above Unicode; as the maps of code points above that should
1932 # all be to the default).
1933 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1934
1935 my %mapped_lists; # A hash whose keys are the buckets.
1936 while (@invlist) {
1937 last if $invlist[0] > $upper_limit;
1938
1939 # This shouldn't actually happen, as prop_invmap() returns
1940 # an extra element at the end that is beyond $upper_limit
1941 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1942
1943 my $bucket;
1944
1945 # A hash key can't be a ref (we are only expecting arrays
1946 # of scalars here), so convert any such to a string that
1947 # will be converted back later (using a vertical tab as
1948 # the separator). Even if the mapping is to code points,
1949 # we don't translate to native here because the code
d8049362 1950 # output_invmap() calls to output these arrays assumes the
99f21fb9
KW
1951 # input is Unicode, not native.
1952 if (ref $invmap[0]) {
1953 $bucket = join "\cK", @{$invmap[0]};
1954 }
1955 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1956
1957 # Do convert to native for maps to single code points.
1958 # There are some properties that have a few outlier
1959 # maps that aren't code points, so the above test
1960 # skips those.
1961 $bucket = a2n($invmap[0]);
1962 } else {
1963 $bucket = $invmap[0];
1964 }
1965
1966 # We now have the bucket that all code points in the range
1967 # map to, though possibly they need to be adjusted. Go
1968 # through the range and put each translated code point in
1969 # it into its bucket.
1970 my $base_map = $invmap[0];
1971 for my $j ($invlist[0] .. $invlist[1] - 1) {
1972 if ($to_adjust
1973 # The 1st code point doesn't need adjusting
1974 && $j > $invlist[0]
1975
1976 # Skip any non-numeric maps: these are outliers
1977 # that aren't code points.
1978 && $base_map =~ $numeric_re
1979
1980 # 'ne' because the default can be a string
1981 && $base_map ne $map_default)
1982 {
1983 # We adjust by incrementing the map and deriving the
1984 # bucket from it. For code point maps, translate to
1985 # native
1986 $base_map++;
1987 $bucket = ($maps_to_code_point)
1988 ? a2n($base_map)
1989 : $base_map;
1990 }
1991
1992 # Add the native code point to the bucket for the
1993 # current map
1994 push @{$mapped_lists{$bucket}}, a2n($j);
1995 } # End of loop through all code points in the range
1996
1997 # Get ready for the next range
1998 shift @invlist;
1999 shift @invmap;
2000 } # End of loop through all ranges in the map.
2001
2002 # Here, @invlist and @invmap retain all the ranges from the
2003 # originals that start with code points above $upper_limit.
2004 # Each bucket in %mapped_lists contains all the code points
2005 # that map to that bucket. If the bucket is for a map to a
2006 # single code point, the bucket has
2007 # been converted to native. If something else (including
2008 # multiple code points), no conversion is done.
2009 #
2010 # Now we recreate the inversion map into %xlated, but this
2011 # time for the native character set.
2012 my %xlated;
2013 foreach my $bucket (keys %mapped_lists) {
2014
2015 # Sort and convert this bucket to an inversion list. The
2016 # result will be that ranges that start with even-numbered
2017 # indexes will be for code points that map to this bucket;
2018 # odd ones map to some other bucket, and are discarded
2019 # below.
2020 @{$mapped_lists{$bucket}}
2021 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
2022 @{$mapped_lists{$bucket}}
2023 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
2024
2025 # Add each even-numbered range in the bucket to %xlated;
2026 # so that the keys of %xlated become the range start code
2027 # points, and the values are their corresponding maps.
2028 while (@{$mapped_lists{$bucket}}) {
2029 my $range_start = $mapped_lists{$bucket}->[0];
# A vertical tab in the bucket name means it was an array that was joined
# into a string above; turn it back into an array.
2030 if ($bucket =~ /\cK/) {
2031 @{$xlated{$range_start}} = split /\cK/, $bucket;
2032 }
2033 else {
2034 $xlated{$range_start} = $bucket;
2035 }
2036 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
2037 shift @{$mapped_lists{$bucket}}; # Get ready for next
2038 # iteration
2039 }
2040 } # End of loop through all the buckets.
2041
2042 # Here %xlated's keys are the range starts of all the code
2043 # points in the inversion map. Construct an inversion list
2044 # from them.
2045 my @new_invlist = sort { $a <=> $b } keys %xlated;
2046
2047 # If the list is adjusted, we want to munge this list so that
2048 # we only have one entry for where consecutive code points map
2049 # to consecutive values. We just skip the subsequent entries
2050 # where this is the case.
2051 if ($to_adjust) {
2052 my @temp;
2053 for my $i (0 .. @new_invlist - 1) {
2054 next if $i > 0
2055 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
2056 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
2057 && $xlated{$new_invlist[$i]} =~ $numeric_re
2058 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
2059 push @temp, $new_invlist[$i];
2060 }
2061 @new_invlist = @temp;
2062 }
2063
2064 # The inversion map comes from %xlated's values. We can
2065 # unshift each onto the front of the untouched portion, in
2066 # reverse order of the portion we did process.
2067 foreach my $start (reverse @new_invlist) {
2068 unshift @invmap, $xlated{$start};
2069 }
2070
2071 # Finally prepend the inversion list we have just constructed to the
2072 # one that contains anything we didn't process.
2073 unshift @invlist, @new_invlist;
2074 }
2075 }
2076
2077 # prop_invmap() returns an extra final entry, which we can now
2078 # discard.
2079 if (@invmap) {
2080 pop @invlist;
2081 pop @invmap;
ceb1de32 2082 }
0f5e3c71
KW
2083
2084 if ($l1_only) {
99f21fb9 2085 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
2086 for my $i (0 .. @invlist - 1 - 1) {
2087 if ($invlist[$i] > 255) {
2088
2089 # In an inversion list, even-numbered elements give the code
2090 # points that begin ranges that match the property;
2091 # odd-numbered give ones that begin ranges that don't match.
2092 # If $i is odd, we are at the first code point above 255 that
2093 # doesn't match, which means the range it is ending does
2094 # match, and crosses the 255/256 boundary. We want to include
2095 # this ending point, so increment $i, so the splice below
2096 # includes it. Conversely, if $i is even, it is the first
2097 # code point above 255 that matches, which means there was no
2098 # matching range that crossed the boundary, and we don't want
2099 # to include this code point, so splice before it.
2100 $i++ if $i % 2 != 0;
2101
2102 # Remove everything past this.
2103 splice @invlist, $i;
# NOTE(review): @invmap is necessarily empty here, as we died above
# otherwise, so this splice looks unreachable.
99f21fb9 2104 splice @invmap, $i if @invmap;
0f5e3c71
KW
2105 last;
2106 }
0c4ecf42
KW
2107 }
2108 }
0f5e3c71
KW
2109 elsif ($nonl1_only) {
2110 my $found_nonl1 = 0;
2111 for my $i (0 .. @invlist - 1 - 1) {
2112 next if $invlist[$i] < 256;
2113
2114 # Here, we have the first element in the array that indicates an
2115 # element above Latin1. Get rid of all previous ones.
2116 splice @invlist, 0, $i;
99f21fb9 2117 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
2118
2119 # If this one's index is not divisible by 2, it means that this
2120 # element is inverting away from being in the list, which means
99f21fb9
KW
2121 # all code points from 256 to this one are in this list (or
2122 # map to the default for inversion maps)
2123 if ($i % 2 != 0) {
2124 unshift @invlist, 256;
2125 unshift @invmap, $map_default if @invmap;
2126 }
0f5e3c71 2127 $found_nonl1 = 1;
3f427fd9
KW
2128 last;
2129 }
0f5e3c71 2130 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 2131 }
3f427fd9 2132
# Every property gets its inversion list output; those that are inversion
# maps additionally get the map output.
0f5e3c71 2133 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 2134 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 2135 }
# NOTE(review): presumably these close, respectively, any #if section still
# open in the output and the per-code-page conditional started at the top of
# this loop; both helpers are defined outside this section -- confirm.
bffc0129 2136 end_file_pound_if;
0c4ecf42 2137 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
2138}
2139
973a28ed
KW
# Emit the pair tables for the GCB, LB and WB boundary properties.
# NOTE(review): the switch_pound_if()/end_file_pound_if pair presumably
# brackets them in a 'Boundary_pair_tables' section compiled only when
# PERL_IN_REGEXEC_C is defined; both helpers are defined outside this
# section -- confirm.
2140switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
2141
2142output_GCB_table();
6b659339 2143output_LB_table();
7e54b87f 2144output_WB_table();
6b659339 2145
973a28ed
KW
2146end_file_pound_if;
2147
# Build the list of files that the generated header depends on, then hand it
# to read_only_bottom_close_and_rename() along with the output handle.  The
# list starts with this program itself ($0) and a fixed set of files.
2308ab83 2148my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
2149my @sources = ($0, qw(lib/unicore/mktables
2150 lib/Unicode/UCD.pm
2151 regen/charset_translations.pl
2152 ));
9a3da3ad
FC
2153{
2154 # Depend on mktables’ own sources. It’s a shorter list of files than
2155 # those that Unicode::UCD uses.
# A failure to open the list is tolerated rather than being fatal
1ae6ead9 2156 if (! open my $mktables_list, '<', $sources_list) {
2308ab83
KW
2157
2158 # This should force a rebuild once $sources_list exists
2159 push @sources, $sources_list;
2160 }
2161 else {
# Read file names, one per line, up to the first line containing '===';
# empty lines and lines beginning with '#' are skipped.  Each name is taken
# to be relative to lib/unicore/.
2162 while(<$mktables_list>) {
2163 last if /===/;
2164 chomp;
2165 push @sources, "lib/unicore/$_" if /^[^#]/;
2166 }
9a3da3ad
FC
2167 }
2168}
6b659339
KW
2169
# NOTE(review): judging by its name, this finishes the generated file (read-only
# trailer, close, rename into place); it is defined outside this section,
# presumably in regen/regen_lib.pl -- confirm.
2170read_only_bottom_close_and_rename($out_fh, \@sources);