This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Outdent code
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
3d7c117d
MB
11require './regen/regen_lib.pl';
12require './regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
# Magic number written into the header of every generated inversion list.
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;

# Matches a decimal integer or float, optionally negative ("12", "-3",
# "4.25").  The fractional part is an optional, non-capturing group.
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;

# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
my $enum_name_re = qr/ ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
# Handle on the generated header file.  open_new() is not defined in this
# file; presumably it comes from one of the regen/*.pl files required above.
my $out_fh = open_new(
    'charclass_invlists.h',
    '>',
    {
        style => '*',
        by    => $0,
        from  => "Unicode::UCD",
    },
);

# False while no "#if" block is open in the output; otherwise holds the
# condition text of the currently open one.
my $in_file_pound_if = 0;

my $max_hdr_len = 3;    # In headings, how wide a name is allowed?

print {$out_fh} "/* See the generating file for comments */\n\n";
46
bffc0129
KW
47# The symbols generated by this program are all currently defined only in a
48# single dot c each. The code knows where most of them go, but this hash
49# gives overrides for the exceptions to the typical place
# Maps a generated symbol's name to the PERL_IN_xxx_C guard it must be
# defined under, for the symbols that don't go in the default place.
my %exceptions_to_where_to_define = (
    ( map { ($_ => 'PERL_IN_REGCOMP_C') } qw(
            NonL1_Perl_Non_Final_Folds
            AboveLatin1
            Latin1
            UpperLatin1
            _Perl_Any_Folds
            _Perl_Folds_To_Multi_Char
      )
    ),
    ( map { ($_ => 'PERL_IN_UTF8_C') } qw(
            _Perl_IDCont
            _Perl_IDStart
      )
    ),
);
015bb97c 60
f79a09fc 61# This hash contains the properties with enums that have hard-coded references
# to them in C code.  It is needed to make sure that if perl is compiled
f79a09fc
KW
63# with an older Unicode data set, that all the enum values the code is
64# expecting will still be in the enum typedef. Thus the code doesn't have to
289ce9cc
KW
65# change. The Unicode version won't have any code points that have the enum
66# values not in that version, so the code that handles them will not get
67# exercised. This is far better than having to #ifdef things. The names here
68# should be the long names of the respective property values. The reason for
69# this is because regexec.c uses them as case labels, and the long name is
70# generally more understandable than the short.
f79a09fc
KW
# Keyed by lowercase short property name; each value lists the long names of
# the property values that must always be present in the generated enum.
my %hard_coded_enums = (
    gcb => [ qw(
        Control
        CR
        E_Base
        E_Base_GAZ
        E_Modifier
        Extend
        Glue_After_Zwj
        L
        LF
        LV
        LVT
        Other
        Prepend
        Regional_Indicator
        SpacingMark
        T
        V
        ZWJ
    ) ],
    lb => [ qw(
        Alphabetic
        Break_After
        Break_Before
        Break_Both
        Break_Symbols
        Carriage_Return
        Close_Parenthesis
        Close_Punctuation
        Combining_Mark
        Contingent_Break
        E_Base
        E_Modifier
        Exclamation
        Glue
        H2
        H3
        Hebrew_Letter
        Hyphen
        Ideographic
        Infix_Numeric
        Inseparable
        JL
        JT
        JV
        Line_Feed
        Mandatory_Break
        Next_Line
        Nonstarter
        Numeric
        Open_Punctuation
        Postfix_Numeric
        Prefix_Numeric
        Quotation
        Regional_Indicator
        Space
        Word_Joiner
        ZWJ
        ZWSpace
    ) ],
    sb => [ qw(
        ATerm
        Close
        CR
        Extend
        Format
        LF
        Lower
        Numeric
        OLetter
        Other
        SContinue
        Sep
        Sp
        STerm
        Upper
    ) ],
    wb => [ qw(
        ALetter
        CR
        Double_Quote
        E_Base
        E_Base_GAZ
        E_Modifier
        Extend
        ExtendNumLet
        Format
        Glue_After_Zwj
        Hebrew_Letter
        Katakana
        LF
        MidLetter
        MidNum
        MidNumLet
        Newline
        Numeric
        Other
        Perl_Tailored_HSpace
        Regional_Indicator
        Single_Quote
        ZWJ
    ) ],
);
175
973a28ed
KW
# Per-property bookkeeping for the GCB, LB and WB pair tables.  For each
# property: %xx_enums is used to turn a property value name into a table
# index, @xx_short_enums supplies the row/column labels, and
# %xx_abbreviations is passed along for the explanatory comment.
my (%gcb_enums, @gcb_short_enums, %gcb_abbreviations);
my (%lb_enums,  @lb_short_enums,  %lb_abbreviations);
my (%wb_enums,  @wb_short_enums,  %wb_abbreviations);

# Lookup table read by a2n() for code points 0..255.  Where it gets populated
# is not visible in this part of the file.
my @a2n;
187
sub uniques {
    # Returns the input values with repeats dropped: the first occurrence of
    # each value is kept and the original order is preserved.  (The idiom is
    # from "Perl Best Practices: Encapsulated Cleverness", p. 455 in the
    # first edition.)

    my %already_seen;
    my @distinct;
    for my $value (@_) {
        next if $already_seen{$value}++;
        push @distinct, $value;
    }
    return @distinct;
}
195
sub a2n($) {
    my $code_point = shift;

    # Translates the input through the @a2n table when it looks like a number
    # (per $numeric_re) and is no greater than 255; anything else is handed
    # back unchanged.

    if ($code_point =~ $numeric_re && $code_point <= 255) {
        return $a2n[$code_point];
    }

    return $code_point;
}
204
bffc0129
KW
sub end_file_pound_if {

    # Closes the "#if" block currently open in the output file, if any, and
    # records that none is open now.

    return 0 unless $in_file_pound_if;

    print {$out_fh} "\n#endif\t/* $in_file_pound_if */\n";
    $in_file_pound_if = 0;
    return 0;
}
211
sub switch_pound_if ($$) {
    my ($name, $new_pound_if) = @_;

    # Makes sure the output is inside an "#if defined(SYMBOL)" block.
    #   $name          the static's name, used to look up overrides
    #   $new_pound_if  the SYMBOL to switch to, unless
    #                  %exceptions_to_where_to_define has an override for
    #                  $name, in which case that is used instead
    # Nothing is printed if the wanted block is already the open one.

    if (exists $exceptions_to_where_to_define{$name}) {
        $new_pound_if = $exceptions_to_where_to_define{$name};
    }

    # Exit current #if if the new one is different from the old.  The symbol
    # is matched literally (\Q...\E) and only as a whole identifier, so that
    # one symbol being a prefix or substring of another isn't mistaken for
    # already being in the right block.
    if ($in_file_pound_if
        && $in_file_pound_if !~ /(?<!\w)\Q$new_pound_if\E(?!\w)/)
    {
        end_file_pound_if();
    }

    # Enter new #if, if not already in it.
    if (! $in_file_pound_if) {
        $in_file_pound_if = "defined($new_pound_if)";
        print $out_fh "\n#if $in_file_pound_if\n";
    }
}
237
sub output_invlist ($$;$) {
    my ($name, $invlist, $charset) = @_;
    $charset //= "";    # name of character set for comment

    # Writes the inversion list referenced by $invlist to the output file as
    # the C array '${name}_invlist', in the exact internal form used for
    # inversion lists: three header elements followed by the list itself.
    #
    # Note that when the list doesn't begin with 0, a 0 is unshifted onto the
    # CALLER's array, so the array is modified in place.
    #
    # Dies unless $invlist is an array reference.

    if (! defined $invlist || ref $invlist ne 'ARRAY') {
        die "No inversion list for $name";
    }

    # The last header element says whether the list proper begins at the
    # 0 element, or at the one beyond it.
    my $zero_or_one = (@$invlist && $invlist->[0] != 0) ? 1 : 0;
    unshift @$invlist, 0 if $zero_or_one;

    my $count = scalar @$invlist;

    switch_pound_if($name, 'PERL_IN_PERL_C');

    # Declaration line, with the character set as a trailing comment if known
    my $charset_comment = $charset ? " /* for $charset */" : "";
    print $out_fh "\nstatic const UV ${name}_invlist[] = {",
                  $charset_comment,
                  "\n";

    # The header
    print $out_fh "\t$count,\t/* Number of elements */\n";
    print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
    print $out_fh "\t", $zero_or_one,
                  ",\t/* 0 if the list starts at 0;",
                  "\n\t\t 1 if it starts at the element beyond 0 */\n";

    # The main body is the UVs passed in to this routine, one per line, with
    # a comma after every element but the final one.
    my $final_index = $#{$invlist};
    for my $index (0 .. $final_index) {
        my $separator = $index < $final_index ? "," : "";
        printf $out_fh "\t0x%X%s\n", $invlist->[$index], $separator;
    }

    print $out_fh "};\n";
}
279
99f21fb9
KW
sub output_invmap ($$$$$$$) {
    my ($name,
        $invmap,        # Reference to inversion map array
        $prop_name,
        $input_format,  # The inversion map's format
        $default,       # The property value for code points who
                        # otherwise don't have a value specified.
        $extra_enums,   # comma-separated list of our additions to the
                        # property's standard possible values
        $charset,       # name of character set for comment
       ) = @_;
    $charset //= "";

    # Output the inversion map $invmap for property $prop_name, but use $name
    # as the actual data structure's name.  What gets written is a
    # '#define xx_ENUM_COUNT', a 'typedef enum', and then the
    # '${name}_invmap' array itself.  For _Perl_GCB, _Perl_LB and _Perl_WB,
    # the file-level %xx_enums, @xx_short_enums and %xx_abbreviations are
    # also filled in, the first time through.
    #
    # Dies if the format isn't 's', if $invmap isn't a non-empty array
    # reference, if the property has no enum values, or if
    # %hard_coded_enums lacks entries for this Unicode version.

    # Validate the inputs before anything dereferences them or any output
    # gets written.
    die "'$input_format' invmap() format for '$prop_name' unimplemented"
                                                unless $input_format eq 's';
    die "No inversion map for $prop_name" unless defined $invmap
                                             && ref $invmap eq 'ARRAY'
                                             && @$invmap;

    my $count = @$invmap;
    my %enums;

    my $orig_prop_name = $prop_name;
    $prop_name = (prop_aliases($prop_name))[1]
                 // $prop_name =~ s/^_Perl_//r;     # Get full name
    my $short_name = (prop_aliases($prop_name))[0] // $prop_name;

    my @enums = ($orig_prop_name eq $prop_name)
                ? prop_values($prop_name)
                : uniques(@$invmap);

    die "Only enum properties are currently handled; '$prop_name' isn't one"
                                                                unless @enums;

    # A property without hard-coded enums simply has an empty expected list
    my @expected_enums = @{ $hard_coded_enums{lc $short_name} // [] };
    my @canonical_input_enums;
    if (@expected_enums) {
        if (@expected_enums < @enums) {
            die 'You need to update %hard_coded_enums to reflect new'
              . " entries in this Unicode version\n"
              . "Expected: " . join(", ", sort @expected_enums) . "\n"
              . " Got: " . join(", ", sort @enums);
        }

        if (! defined prop_aliases($prop_name)) {

            # Convert the input enums into canonical form and
            # save for use below
            @canonical_input_enums = map { lc ($_ =~ s/_//gr) } @enums;
        }
        @enums = sort @expected_enums;
    }

    # The internal enums come last, and in the order specified
    my @extras;
    if ($extra_enums ne "") {
        @extras = split /,/, $extra_enums;
        push @enums, @extras;
    }

    # Assign a value to each element of the enum.  The default
    # value always gets 0; the others are arbitrarily assigned.
    my $enum_val = 0;
    my $canonical_default = prop_value_aliases($prop_name, $default);
    $default = $canonical_default if defined $canonical_default;
    $enums{$default} = $enum_val++;
    for my $enum (@enums) {
        $enums{$enum} = $enum_val++ unless exists $enums{$enum};
    }

    # Calculate the enum values for certain properties like
    # _Perl_GCB and _Perl_LB, because we output special tables for
    # them.
    if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {

        # The file-level variables for each property, so that the same code
        # works on all the tables we're doing.
        my %tables_for = (
            gcb => { enums         => \%gcb_enums,
                     short_enums   => \@gcb_short_enums,
                     abbreviations => \%gcb_abbreviations,
                   },
            lb  => { enums         => \%lb_enums,
                     short_enums   => \@lb_short_enums,
                     abbreviations => \%lb_abbreviations,
                   },
            wb  => { enums         => \%wb_enums,
                     short_enums   => \@wb_short_enums,
                     abbreviations => \%wb_abbreviations,
                   },
        );
        my $tables = $tables_for{lc $prop_name};
        my $type = lc $prop_name;

        # We use lowercase single letter names for any property
        # values not in the release of Unicode being compiled now.
        my $placeholder = "a";

        # Skip if there are no such tables, or if we've already done this
        # code, which populated the enums hash
        if ($tables && ! %{ $tables->{enums} }) {
            my $abbreviations = $tables->{abbreviations};

            # For each enum ...
            foreach my $enum (sort keys %enums) {
                my $value = $enums{$enum};
                my $short;
                my $is_abbreviation = 0;

                # Special case this wb property value to make the
                # name more clear
                if ($enum eq 'Perl_Tailored_HSpace') {
                    $short = 'hs';
                    $is_abbreviation = 1;
                }
                elsif (grep { $_ eq $enum } @extras) {

                    # The 'short' name for one of the property
                    # values added by this file is just the
                    # lowercase of it
                    $short = lc $enum;
                }
                elsif (grep { $_ eq lc ($enum =~ s/_//gr) }
                                                    @canonical_input_enums)
                {   # On Unicode versions that predate the
                    # official property, we have set up this array
                    # to be the canonical form of each enum in the
                    # substitute property.  If the enum we're
                    # looking at is canonically the same as one of
                    # these, use its name instead of generating a
                    # placeholder one in the next clause (which
                    # will happen because prop_value_aliases()
                    # will fail because it only works on official
                    # properties)
                    $short = $enum;
                }
                else {
                    # Use the official short name for the other
                    # property values, which should all be
                    # official ones.
                    ($short) = prop_value_aliases($type, $enum);

                    # But create a placeholder for ones not in
                    # this Unicode version.
                    $short = $placeholder++ unless defined $short;
                }

                # If our short name is too long, or we already
                # know that the name is an abbreviation, truncate
                # to make sure it's short enough, and remember
                # that we did this so we can later place in a
                # comment in the generated file
                if ($is_abbreviation || length $short > $max_hdr_len) {
                    $short = substr($short, 0, $max_hdr_len);

                    # If the name we are to display conflicts, try
                    # another.
                    $short++ while exists $abbreviations->{$short};

                    $abbreviations->{$short} = $enum;
                }

                # Remember the mapping from the property value
                # (enum) name to its value.
                $tables->{enums}{$enum} = $value;

                # Remember the inverse mapping to the short name
                # so that we can properly label the generated
                # table's rows and columns
                $tables->{short_enums}[$value] = $short;
            }
        }
    }

    # Inversion map stuff is currently used only by regexec
    switch_pound_if($name, 'PERL_IN_REGEXEC_C');

    # The short names tend to be two lower case letters, but it looks
    # better for those if they are upper. XXX
    $short_name = uc($short_name) if length($short_name) < 3
                            || substr($short_name, 0, 1) =~ /[[:lower:]]/;
    my $name_prefix = "${short_name}_";
    my $enum_count = keys %enums;
    print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", $enum_count, "\n";

    # Output the typedef, with the enums in order of their values
    print $out_fh "\ntypedef enum {\n";
    my @enum_list;
    foreach my $enum (keys %enums) {
        $enum_list[$enums{$enum}] = $enum;
    }
    foreach my $i (0 .. @enum_list - 1) {
        my $enum_name = $enum_list[$i];
        print $out_fh "\t${name_prefix}$enum_name = $i";
        print $out_fh "," if $i < $enum_count - 1;
        print $out_fh "\n";
    }
    my $declaration_type = "${name_prefix}enum";
    print $out_fh "} $declaration_type;\n";

    print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    # The main body are the scalars passed in to this routine.
    for my $i (0 .. $count - 1) {
        my $element = $invmap->[$i];
        my $full_element_name = prop_value_aliases($prop_name, $element);
        $element = $full_element_name if defined $full_element_name;
        $element = $name_prefix . $element;
        print $out_fh "\t$element";
        print $out_fh "," if $i < $count - 1;
        print $out_fh "\n";
    }
    print $out_fh "};\n";
}
505
sub mk_invlist_from_sorted_cp_list {

    # Returns an inversion list constructed from the sorted input array of
    # code points.  The single argument is a reference to that array, which
    # is not modified.  Each run of consecutive code points becomes one
    # (start, end + 1) pair in the result.  An empty input array yields an
    # empty list.  A code point that merely repeats the one just before it
    # is ignored.

    my $list_ref = shift;

    return unless @$list_ref;

    my @invlist;
    for my $cp (@$list_ref) {
        if (! @invlist) {

            # Initialize to just the first element
            push @invlist, $cp, $cp + 1;
        }
        elsif ($cp == $invlist[-1]) {

            # Extends the current range
            $invlist[-1]++;
        }
        elsif ($cp == $invlist[-1] - 1) {

            # Duplicate of the preceding element, which is the final one in
            # the current range.  Adding it again would create a malformed
            # list, whose elements are supposed to be strictly increasing.
            next;
        }
        else {

            # Starts a new range
            push @invlist, $cp, $cp + 1;
        }
    }

    return @invlist;
}
530
531# Read in the Case Folding rules, and construct arrays of code points for the
532# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al'
                                                           || $format eq 'a';

# The code points whose fold is to multiple characters
my @has_multi_char_fold;

# The code points that occur anywhere but last in a multi-character fold,
# each listed once, in the order first encountered
my @is_non_final_fold;

{
    # Lets us tell in constant time if a code point is already on the
    # non-finals list, instead of scanning the list each time
    my %seen_non_final;

    for my $i (0 .. @$folds_ref - 1) {
        next unless ref $folds_ref->[$i];   # Skip single-char folds
        push @has_multi_char_fold, $cp_ref->[$i];

        # Add to the non-finals list each code point that is in a non-final
        # position
        my @non_finals = @{$folds_ref->[$i]};
        pop @non_finals;
        for my $cp (@non_finals) {
            push @is_non_final_fold, $cp unless $seen_non_final{$cp}++;
        }
    }
}
552
a02047bf
KW
sub _Perl_Non_Final_Folds {

    # Returns the inversion list for the code points in @is_non_final_fold.
    # As a side effect, that array is left sorted into ascending numeric
    # order.

    my @ascending = sort { $a <=> $b } @is_non_final_fold;
    @is_non_final_fold = @ascending;

    return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
}
557
99f21fb9
KW
sub prop_name_for_cmp ($) { # Sort helper
    my ($raw_name) = @_;

    # Returns the key to compare on: the input cut off at its first comma,
    # stripped of every non-alphabetic character, and lowercased.

    my $before_comma = $raw_name =~ s/,.*//r;
    my $alphas_only = $before_comma =~ s/[[:^alpha:]]//gr;

    return lc $alphas_only;
}
568
sub UpperLatin1 {

    # Returns the inversion list for the code points 128 through 255.

    my @upper_half = (128 .. 255);
    return mk_invlist_from_sorted_cp_list(\@upper_half);
}
572
289ce9cc
KW
sub output_table_common {

    # Common subroutine to actually output the generated rules table.
    #
    #   $property                 used to name the C array '${property}_table'
    #   $table_value_defines_ref  if true, a hash reference of name => number;
    #                             each is output as a '#define'
    #   $table_ref                reference to the square table to output, an
    #                             array of row array references
    #   $names_ref                reference to the array of row/column labels,
    #                             indexed like the table is
    #   $abbreviations_ref        reference to a hash of label => what it
    #                             stands for, explained in a comment
    #
    # Dies if any table element needs more than two characters to print.

    my ($property,
        $table_value_defines_ref,
        $table_ref,
        $names_ref,
        $abbreviations_ref) = @_;
    my $size = @$table_ref;

    # Output the #define list, sorted by numeric value
    if ($table_value_defines_ref) {
        my $max_name_length = 0;
        my @defines;

        # Put in order, and at the same time find the longest name
        while (my ($enum, $value) = each %$table_value_defines_ref) {
            $defines[$value] = $enum;

            my $length = length $enum;
            $max_name_length = $length if $length > $max_name_length;
        }

        print $out_fh "\n";

        # Output, so that the values are vertically aligned in a column after
        # the longest name
        foreach my $i (0 .. @defines - 1) {
            next unless defined $defines[$i];
            printf $out_fh "#define %-*s %2d\n",
                                      $max_name_length,
                                      $defines[$i],
                                      $i;
        }
    }

    my $column_width = 2;   # We currently allow 2 digits for the number

    # If the maximum value in the table is 1, it can be a bool.  (Being above
    # a U8 is not currently handled
    my $max_element = 0;
    for my $i (0 .. $size - 1) {
        for my $j (0 .. $size - 1) {
            next if $max_element >= $table_ref->[$i][$j];
            $max_element = $table_ref->[$i][$j];
        }
    }
    die "Need wider table column width given '$max_element'"
                                    if length $max_element > $column_width;

    my $table_type = ($max_element == 1)
                     ? 'bool'
                     : 'U8';

    # If a name is longer than the width set aside for a column, its column
    # needs to have increased spacing so that the name doesn't get truncated
    # nor run into an adjacent column
    my @spacers;

    # If we are being compiled on a Unicode version earlier than that which
    # this file was designed for, it may be that some of the property values
    # aren't in the current release, and so would be undefined if we didn't
    # define them ourselves.  Earlier code has done this, making them
    # lowercase characters of length one.  We look to see if any exist, so
    # that we can add an annotation to the output table
    my $has_placeholder = 0;

    for my $i (0 .. $size - 1) {
        no warnings 'numeric';
        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;

        # A repeat count that isn't positive yields an empty spacer
        $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
    }

    print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";

    # Calculate the column heading line
    my $header_line = "/* "
                    . (" " x $max_hdr_len)  # We let the row heading meld to
                                            # the '*/' for those that are at
                                            # the max
                    . " " x 3;              # Space for '*/ '
    # Now each column
    for my $i (0 .. $size - 1) {
        $header_line .= sprintf "%s%*s",
                                $spacers[$i],
                                $column_width + 1, # 1 for the ','
                                $names_ref->[$i];
    }
    $header_line .= " */\n";

    # If we have annotations, output it now.
    if ($has_placeholder || scalar %$abbreviations_ref) {
        my $text = "";
        foreach my $abbr (sort keys %$abbreviations_ref) {
            $text .= "; " if $text;
            $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
        }
        if ($has_placeholder) {
            $text .= "; other " if $text;
            $text .= "lowercase names are placeholders for"
                  .  " property values not defined until a later Unicode"
                  .  " release, so are irrelevant in this one, as they are"
                  .  " not assigned to any code points";
        }

        my $indent = " " x 3;
        $text = $indent . "/* $text */";

        # Every line begins with the indent followed by a 3-character comment
        # leader ("/* " or " * ").  Wrapping at a blank inside that would
        # leave the remaining text unchanged, and so never finish.
        my $min_wrap_pos = length($indent) + 3;

        # Wrap the text so that it is no wider than the table, which the
        # header line gives.
        my $output_width = length $header_line;
        while (length $text > $output_width) {

            # Find the first blank back from the right end to wrap at.
            my $wrap_pos = rindex($text, " ", $output_width - 1);

            # A word too wide to wrap has to be output as-is, below
            last if $wrap_pos < $min_wrap_pos;

            print $out_fh substr($text, 0, $wrap_pos), "\n";

            # Set so will look at just the remaining tail (which will
            # be indented and have a '*' after the indent
            $text = $indent . " * " . substr($text, $wrap_pos + 1);
        }

        # And any remaining
        print $out_fh $text, "\n" if $text;
    }

    # We calculated the header line earlier just to get its width so that we
    # could make sure the annotations fit into that.
    print $out_fh $header_line;

    # Now output the bulk of the table.
    for my $i (0 .. $size - 1) {

        # First the row heading.
        printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
        print $out_fh "{";  # Then the brace for this row

        # Then each column
        for my $j (0 .. $size - 1) {
            print $out_fh $spacers[$j];
            printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
            print $out_fh "," if $j < $size - 1;
        }
        print $out_fh " }";
        print $out_fh "," if $i < $size - 1;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
729
973a28ed
KW
sub output_GCB_table() {

    # Create and output the pair table for use in determining Grapheme Cluster
    # Breaks, given in http://www.unicode.org/reports/tr29/.  Rows are
    # indexed by the class before the potential break; columns by the class
    # after it.  Class names are turned into indices through %gcb_enums.
    my %gcb_actions = (
        GCB_NOBREAK    => 0,
        GCB_BREAKABLE  => 1,
        GCB_RI_then_RI => 2,    # Rules 12 and 13
        GCB_EX_then_EM => 3,    # Rule 10
    );

    # The table is constructed in reverse order of the rules, to make the
    # lower-numbered, higher priority ones override the later ones, as the
    # algorithm stops at the earliest matching rule

    my @gcb_table;
    my $table_size = @gcb_short_enums;

    # Stores $action in the cell for the pair of named classes
    my $set_pair = sub {
        my ($before, $after, $action) = @_;
        $gcb_table[ $gcb_enums{$before} ][ $gcb_enums{$after} ] = $action;
    };

    # Otherwise, break everywhere.
    # GB99 Any ÷ Any
    for my $row (0 .. $table_size - 1) {
        $gcb_table[$row] = [ (1) x $table_size ];
    }

    # Do not break within emoji flag sequences.  That is, do not break between
    # regional indicator (RI) symbols if there is an odd number of RI
    # characters before the break point.  Must be resolved in runtime code.
    #
    # GB12 sot (RI RI)* RI × RI
    # GB13 [^RI] (RI RI)* RI × RI
    $set_pair->('Regional_Indicator', 'Regional_Indicator',
                $gcb_actions{GCB_RI_then_RI});

    # Do not break within emoji modifier sequences or emoji zwj sequences.
    # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
    $set_pair->('ZWJ', $_, 0) for qw(Glue_After_Zwj E_Base_GAZ);

    # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier
    $set_pair->('Extend', 'E_Modifier', $gcb_actions{GCB_EX_then_EM});
    $set_pair->($_, 'E_Modifier', 0) for qw(E_Base E_Base_GAZ);

    # Do not break before extending characters or ZWJ.
    # Do not break before SpacingMarks, or after Prepend characters.
    # GB9b Prepend ×
    # GB9a × SpacingMark
    # GB9 × ( Extend | ZWJ )
    for my $i (0 .. $#gcb_table) {
        $gcb_table[ $gcb_enums{'Prepend'} ][$i] = 0;
        $gcb_table[$i][ $gcb_enums{$_} ] = 0 for qw(SpacingMark Extend ZWJ);
    }

    # Do not break Hangul syllable sequences.
    # GB8 ( LVT | T) × T
    $set_pair->($_, 'T', 0) for qw(LVT T);

    # GB7 ( LV | V ) × ( V | T )
    for my $before (qw(LV V)) {
        $set_pair->($before, $_, 0) for qw(V T);
    }

    # GB6 L × ( L | V | LV | LVT )
    $set_pair->('L', $_, 0) for qw(L V LV LVT);

    # Do not break between a CR and LF.  Otherwise, break before and after
    # controls.
    # GB5 ÷ ( Control | CR | LF )
    # GB4 ( Control | CR | LF ) ÷
    for my $i (0 .. $#gcb_table) {
        for my $class (qw(Control CR LF)) {
            $gcb_table[$i][ $gcb_enums{$class} ] = 1;
            $gcb_table[ $gcb_enums{$class} ][$i] = 1;
        }
    }

    # GB3 CR × LF
    $set_pair->('CR', 'LF', 0);

    # Break at the start and end of text, unless the text is empty
    # GB1 sot ÷
    # GB2 ÷ eot
    for my $i (0 .. $#gcb_table) {
        $gcb_table[$i][ $gcb_enums{'EDGE'} ] = 1;
        $gcb_table[ $gcb_enums{'EDGE'} ][$i] = 1;
    }
    $set_pair->('EDGE', 'EDGE', 0);

    output_table_common('GCB', \%gcb_actions,
                        \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
}
833
6b659339
KW
834sub output_LB_table() {
835
836 # Create and output the enums, #defines, and pair table for use in
837 # determining Line Breaks. This uses the default line break algorithm,
838 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
839 # in that page, as the Unicode-furnished tests assume that tailoring.
840
6b659339
KW
841 # The result is really just true or false. But we follow along with tr14,
842 # creating a rule which is false for something like X SP* X. That gets
843 # encoding 2. The rest of the actions are synthetic ones that indicate
844 # some context handling is required. These each are added to the
845 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
846 # value can be retrieved. Actually only rules from 7 through 18 (which
847 # are the ones where space matter) are possible to have 2 added to them.
848 # The others below add just 0 or 1. It might be possible for one
849 # synthetic rule to be added to another, yielding a larger value. This
850 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
851 # names of the middle grouping below, it is impossible for that to occur
852 # for them because they all start with mutually exclusive classes. That
853 # the final rule can't be added to any of the others isn't obvious from
854 # its name, so it is assigned a power of 2 higher than the others can get
855 # to so any addition would preserve all data. (And the code will reach an
856 # assert(0) on debugging builds should this happen.)
857 my %lb_actions = (
858 LB_NOBREAK => 0,
859 LB_BREAKABLE => 1,
860 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
861
b0e24409 862 LB_CM_ZWJ_foo => 3, # Rule 9
6b659339
KW
863 LB_SP_foo => 6, # Rule 18
864 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
865 LB_SY_or_IS_then_various => 11, # Rule 25
866 LB_HY_or_BA_then_foo => 13, # Rule 21
b0e24409 867 LB_RI_then_RI => 15, # Rule 30a
6b659339 868
b0e24409 869 LB_various_then_PO_or_PR => (1<<5), # Rule 25
6b659339
KW
870 );
871
6b659339
KW
872 # Construct the LB pair table. This is based on the rules in
873 # http://www.unicode.org/reports/tr14/, but modified as those rules are
874 # designed for someone taking a string of text and sequentially going
875 # through it to find the break opportunities, whereas, Perl requires
876 # determining if a given random spot is a break opportunity, without
877 # knowing all the entire string before it.
878 #
879 # The table is constructed in reverse order of the rules, to make the
880 # lower-numbered, higher priority ones override the later ones, as the
881 # algorithm stops at the earliest matching rule
882
883 my @lb_table;
884 my $table_size = @lb_short_enums;
885
886 # LB31. Break everywhere else
887 for my $i (0 .. $table_size - 1) {
888 for my $j (0 .. $table_size - 1) {
889 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
890 }
891 }
892
b0e24409
KW
893 # LB30b Do not break between an emoji base and an emoji modifier.
894 # EB × EM
895 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
896 = $lb_actions{'LB_NOBREAK'};
897
898 # LB30a Break between two regional indicator symbols if and only if there
899 # are an even number of regional indicators preceding the position of the
900 # break.
901 # sot (RI RI)* RI × RI
902 # [^RI] (RI RI)* RI × RI
289ce9cc 903 $lb_table[$lb_enums{'Regional_Indicator'}]
b0e24409 904 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
6b659339
KW
905
906 # LB30 Do not break between letters, numbers, or ordinary symbols and
907 # opening or closing parentheses.
908 # (AL | HL | NU) × OP
289ce9cc
KW
909 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
910 = $lb_actions{'LB_NOBREAK'};
911 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
912 = $lb_actions{'LB_NOBREAK'};
913 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
914 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
915
916 # CP × (AL | HL | NU)
289ce9cc
KW
917 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
918 = $lb_actions{'LB_NOBREAK'};
919 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
920 = $lb_actions{'LB_NOBREAK'};
921 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
922 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
923
924 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
925 # IS × (AL | HL)
289ce9cc
KW
926 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
927 = $lb_actions{'LB_NOBREAK'};
928 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
929 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
930
931 # LB28 Do not break between alphabetics (“at”).
932 # (AL | HL) × (AL | HL)
289ce9cc
KW
933 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
934 = $lb_actions{'LB_NOBREAK'};
935 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
936 = $lb_actions{'LB_NOBREAK'};
937 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
938 = $lb_actions{'LB_NOBREAK'};
939 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
940 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
941
942 # LB27 Treat a Korean Syllable Block the same as ID.
943 # (JL | JV | JT | H2 | H3) × IN
289ce9cc
KW
944 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
945 = $lb_actions{'LB_NOBREAK'};
946 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
947 = $lb_actions{'LB_NOBREAK'};
948 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
949 = $lb_actions{'LB_NOBREAK'};
950 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
951 = $lb_actions{'LB_NOBREAK'};
952 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
953 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
954
955 # (JL | JV | JT | H2 | H3) × PO
289ce9cc
KW
956 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
957 = $lb_actions{'LB_NOBREAK'};
958 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
959 = $lb_actions{'LB_NOBREAK'};
960 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
961 = $lb_actions{'LB_NOBREAK'};
962 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
963 = $lb_actions{'LB_NOBREAK'};
964 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
965 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
966
967 # PR × (JL | JV | JT | H2 | H3)
289ce9cc
KW
968 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
969 = $lb_actions{'LB_NOBREAK'};
970 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
971 = $lb_actions{'LB_NOBREAK'};
972 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
973 = $lb_actions{'LB_NOBREAK'};
974 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
975 = $lb_actions{'LB_NOBREAK'};
976 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
977 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
978
979 # LB26 Do not break a Korean syllable.
980 # JL × (JL | JV | H2 | H3)
981 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
982 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
983 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
984 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
985
986 # (JV | H2) × (JV | JT)
987 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
988 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
989 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
990 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
991
992 # (JT | H3) × JT
993 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
994 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
995
996 # LB25 Do not break between the following pairs of classes relevant to
997 # numbers, as tailored by example 7 in
998 # http://www.unicode.org/reports/tr14/#Examples
999 # We follow that tailoring because Unicode's test cases expect it
1000 # (PR | PO) × ( OP | HY )? NU
289ce9cc
KW
1001 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
1002 = $lb_actions{'LB_NOBREAK'};
1003 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
1004 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1005
1006 # Given that (OP | HY )? is optional, we have to test for it in code.
1007 # We add in the action (instead of overriding) for this, so that in
1008 # the code we can recover the underlying break value.
289ce9cc 1009 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1010 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1011 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1012 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1013 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339 1014 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1015 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339
KW
1016 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1017
1018 # ( OP | HY ) × NU
289ce9cc
KW
1019 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
1020 = $lb_actions{'LB_NOBREAK'};
1021 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
1022 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1023
1024 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
1025 # which can be rewritten as:
1026 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
289ce9cc
KW
1027 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
1028 = $lb_actions{'LB_NOBREAK'};
1029 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
1030 = $lb_actions{'LB_NOBREAK'};
1031 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
1032 = $lb_actions{'LB_NOBREAK'};
1033 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
1034 = $lb_actions{'LB_NOBREAK'};
1035 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
1036 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1037
1038 # Like earlier where we have to test in code, we add in the action so
1039 # that we can recover the underlying values. This is done in rules
1040 # below, as well. The code assumes that we haven't added 2 actions.
1041 # Should a later Unicode release break that assumption, then tests
1042 # should start failing.
289ce9cc 1043 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
6b659339 1044 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1045 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
6b659339 1046 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1047 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
6b659339 1048 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1049 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
6b659339 1050 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1051 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
6b659339 1052 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1053 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
6b659339 1054 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1055 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
6b659339 1056 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1057 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
6b659339 1058 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1059 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
6b659339 1060 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1061 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
6b659339
KW
1062 += $lb_actions{'LB_SY_or_IS_then_various'};
1063
1064 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1065 # which can be rewritten as:
1066 # NU (SY | IS)* (CL | CP)? × (PO | PR)
289ce9cc
KW
1067 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1068 = $lb_actions{'LB_NOBREAK'};
1069 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1070 = $lb_actions{'LB_NOBREAK'};
6b659339 1071
289ce9cc 1072 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1073 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1074 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1075 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1076 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1077 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1078 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
6b659339
KW
1079 += $lb_actions{'LB_various_then_PO_or_PR'};
1080
289ce9cc 1081 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1082 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1083 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1084 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1085 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1086 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1087 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
6b659339
KW
1088 += $lb_actions{'LB_various_then_PO_or_PR'};
1089
b0e24409
KW
1090 # LB24 Do not break between numeric prefix/postfix and letters, or between
1091 # letters and prefix/postfix.
1092 # (PR | PO) × (AL | HL)
289ce9cc
KW
1093 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1094 = $lb_actions{'LB_NOBREAK'};
1095 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1096 = $lb_actions{'LB_NOBREAK'};
289ce9cc
KW
1097 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1098 = $lb_actions{'LB_NOBREAK'};
1099 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1100 = $lb_actions{'LB_NOBREAK'};
6b659339 1101
b0e24409
KW
1102 # (AL | HL) × (PR | PO)
1103 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
1104 = $lb_actions{'LB_NOBREAK'};
1105 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
1106 = $lb_actions{'LB_NOBREAK'};
1107 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
1108 = $lb_actions{'LB_NOBREAK'};
1109 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
1110 = $lb_actions{'LB_NOBREAK'};
1111
1112 # LB23a Do not break between numeric prefixes and ideographs, or between
1113 # ideographs and numeric postfixes.
1114 # PR × (ID | EB | EM)
1115 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1116 = $lb_actions{'LB_NOBREAK'};
1117 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
1118 = $lb_actions{'LB_NOBREAK'};
1119 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
1120 = $lb_actions{'LB_NOBREAK'};
1121
1122 # (ID | EB | EM) × PO
289ce9cc
KW
1123 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1124 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1125 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1126 = $lb_actions{'LB_NOBREAK'};
1127 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1128 = $lb_actions{'LB_NOBREAK'};
6b659339 1129
b0e24409 1130 # LB23 Do not break between digits and letters
6b659339 1131 # (AL | HL) × NU
289ce9cc
KW
1132 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1133 = $lb_actions{'LB_NOBREAK'};
1134 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1135 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1136
1137 # NU × (AL | HL)
289ce9cc
KW
1138 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1139 = $lb_actions{'LB_NOBREAK'};
1140 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1141 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1142
1143 # LB22 Do not break between two ellipses, or between letters, numbers or
1144 # exclamations and ellipsis.
1145 # (AL | HL) × IN
289ce9cc
KW
1146 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1147 = $lb_actions{'LB_NOBREAK'};
1148 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1149 = $lb_actions{'LB_NOBREAK'};
6b659339 1150
289ce9cc
KW
1151 # Exclamation × IN
1152 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1153 = $lb_actions{'LB_NOBREAK'};
6b659339 1154
b0e24409 1155 # (ID | EB | EM) × IN
289ce9cc
KW
1156 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1157 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1158 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1159 = $lb_actions{'LB_NOBREAK'};
1160 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1161 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1162
1163 # IN × IN
289ce9cc
KW
1164 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1165 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1166
1167 # NU × IN
289ce9cc
KW
1168 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1169 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1170
1171 # LB21b Don’t break between Solidus and Hebrew letters.
1172 # SY × HL
289ce9cc
KW
1173 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1174 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1175
1176 # LB21a Don't break after Hebrew + Hyphen.
1177 # HL (HY | BA) ×
1178 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1179 $lb_table[$lb_enums{'Hyphen'}][$i]
1180 += $lb_actions{'LB_HY_or_BA_then_foo'};
1181 $lb_table[$lb_enums{'Break_After'}][$i]
1182 += $lb_actions{'LB_HY_or_BA_then_foo'};
6b659339
KW
1183 }
1184
1185 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1186 # spaces, small kana, and other non-starters, or after acute accents.
1187 # × BA
1188 # × HY
1189 # × NS
1190 # BB ×
1191 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1192 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1193 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1194 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1195 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1196 }
1197
1198 # LB20 Break before and after unresolved CB.
1199 # ÷ CB
1200 # CB ÷
1201 # Conditional breaks should be resolved external to the line breaking
1202 # rules. However, the default action is to treat unresolved CB as breaking
1203 # before and after.
1204 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1205 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1206 = $lb_actions{'LB_BREAKABLE'};
1207 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1208 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1209 }
1210
1211 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1212 # × QU
1213 # QU ×
1214 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1215 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1216 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1217 }
1218
1219 # LB18 Break after spaces
1220 # SP ÷
1221 for my $i (0 .. @lb_table - 1) {
289ce9cc 1222 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1223 }
1224
1225 # LB17 Do not break within ‘——’, even with intervening spaces.
1226 # B2 SP* × B2
289ce9cc 1227 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
6b659339
KW
1228 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1229
1230 # LB16 Do not break between closing punctuation and a nonstarter even with
1231 # intervening spaces.
1232 # (CL | CP) SP* × NS
289ce9cc 1233 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
6b659339 1234 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1235 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
6b659339
KW
1236 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1237
1238
1239 # LB15 Do not break within ‘”[’, even with intervening spaces.
1240 # QU SP* × OP
289ce9cc 1241 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
6b659339
KW
1242 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1243
1244 # LB14 Do not break after ‘[’, even after spaces.
1245 # OP SP* ×
1246 for my $i (0 .. @lb_table - 1) {
289ce9cc 1247 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
6b659339
KW
1248 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1249 }
1250
1251 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1252 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1253 # [^NU] × CL
1254 # [^NU] × CP
1255 # × EX
1256 # [^NU] × IS
1257 # [^NU] × SY
1258 for my $i (0 .. @lb_table - 1) {
289ce9cc 1259 $lb_table[$i][$lb_enums{'Exclamation'}]
6b659339
KW
1260 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1261
289ce9cc 1262 next if $i == $lb_enums{'Numeric'};
6b659339 1263
289ce9cc 1264 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
6b659339 1265 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1266 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
6b659339 1267 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1268 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
6b659339 1269 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1270 $lb_table[$i][$lb_enums{'Break_Symbols'}]
6b659339
KW
1271 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1272 }
1273
1274 # LB12a Do not break before NBSP and related characters, except after
1275 # spaces and hyphens.
1276 # [^SP BA HY] × GL
1277 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1278 next if $i == $lb_enums{'Space'}
1279 || $i == $lb_enums{'Break_After'}
1280 || $i == $lb_enums{'Hyphen'};
6b659339
KW
1281
1282 # We don't break, but if a property above has said don't break even
1283 # with space between, don't override that (also in the next few rules)
289ce9cc 1284 next if $lb_table[$i][$lb_enums{'Glue'}]
6b659339 1285 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1286 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1287 }
1288
1289 # LB12 Do not break after NBSP and related characters.
1290 # GL ×
1291 for my $i (0 .. @lb_table - 1) {
289ce9cc 1292 next if $lb_table[$lb_enums{'Glue'}][$i]
6b659339 1293 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1294 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1295 }
1296
1297 # LB11 Do not break before or after Word joiner and related characters.
1298 # × WJ
1299 # WJ ×
1300 for my $i (0 .. @lb_table - 1) {
289ce9cc 1301 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
6b659339
KW
1302 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1303 {
289ce9cc 1304 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
6b659339 1305 }
289ce9cc 1306 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
6b659339
KW
1307 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1308 {
289ce9cc 1309 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1310 }
1311 }
1312
1313 # Special case this here to avoid having to do a special case in the code.
1314 # By making this the same as other things with a SP in front of them that
1315 # don't break, we avoid an extra test.
289ce9cc 1316 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
6b659339
KW
1317 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1318
1319 # LB9 and LB10 are done in the same loop
1320 #
1321 # LB9 Do not break a combining character sequence; treat it as if it has
1322 # the line breaking class of the base character in all of the
b0e24409
KW
1323 # higher-numbered rules. Treat ZWJ as if it were CM
1324 # Treat X (CM|ZWJ)* as if it were X.
6b659339
KW
1325 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1326
b0e24409
KW
1327 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the
1328 # case where a CM or ZWJ is the first character on the line or follows SP,
1329 # BK, CR, LF, NL, or ZW.
6b659339
KW
1330 for my $i (0 .. @lb_table - 1) {
1331
b0e24409
KW
1332 # When the CM or ZWJ is the first in the pair, we don't know without
1333 # looking behind whether the CM or ZWJ is going to attach to an
1334 # earlier character, or not. So have to figure this out at runtime in
1335 # the code
1336 $lb_table[$lb_enums{'Combining_Mark'}][$i]
1337 = $lb_actions{'LB_CM_ZWJ_foo'};
1338 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
289ce9cc
KW
1339
1340 if ( $i == $lb_enums{'Mandatory_Break'}
1341 || $i == $lb_enums{'EDGE'}
1342 || $i == $lb_enums{'Carriage_Return'}
1343 || $i == $lb_enums{'Line_Feed'}
1344 || $i == $lb_enums{'Next_Line'}
1345 || $i == $lb_enums{'Space'}
1346 || $i == $lb_enums{'ZWSpace'})
6b659339
KW
1347 {
1348 # For these classes, a following CM doesn't combine, and should do
289ce9cc
KW
1349 # whatever 'Alphabetic' would do.
1350 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1351 = $lb_table[$i][$lb_enums{'Alphabetic'}];
b0e24409
KW
1352 $lb_table[$i][$lb_enums{'ZWJ'}]
1353 = $lb_table[$i][$lb_enums{'Alphabetic'}];
6b659339
KW
1354 }
1355 else {
b0e24409
KW
1356 # For these classes, the CM or ZWJ combines, so doesn't break,
1357 # inheriting the type of nobreak from the master character.
289ce9cc 1358 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
6b659339
KW
1359 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1360 {
289ce9cc
KW
1361 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1362 = $lb_actions{'LB_NOBREAK'};
6b659339 1363 }
b0e24409
KW
1364 if ($lb_table[$i][$lb_enums{'ZWJ'}]
1365 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1366 {
1367 $lb_table[$i][$lb_enums{'ZWJ'}]
1368 = $lb_actions{'LB_NOBREAK'};
1369 }
6b659339
KW
1370 }
1371 }
1372
b0e24409
KW
1373 # LB8a Do not break between a zero width joiner and an ideograph, emoji
1374 # base or emoji modifier. This rule prevents breaks within emoji joiner
1375 # sequences.
1376 # ZWJ × (ID | EB | EM)
1377 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1378 = $lb_actions{'LB_NOBREAK'};
1379 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1380 = $lb_actions{'LB_NOBREAK'};
1381 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1382 = $lb_actions{'LB_NOBREAK'};
1383
6b659339
KW
1384 # LB8 Break before any character following a zero-width space, even if one
1385 # or more spaces intervene.
1386 # ZW SP* ÷
1387 for my $i (0 .. @lb_table - 1) {
289ce9cc 1388 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1389 }
1390
1391 # Because of LB8-10, we need to look at context for "SP x", and this must
1392 # be done in the code. So override the existing rules for that, by adding
1393 # a constant to get new rules that tell the code it needs to look at
1394 # context. By adding this action instead of replacing the existing one,
1395 # we can get back to the original rule if necessary.
1396 for my $i (0 .. @lb_table - 1) {
289ce9cc 1397 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
6b659339
KW
1398 }
1399
1400 # LB7 Do not break before spaces or zero width space.
1401 # × SP
1402 # × ZW
1403 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1404 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1405 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1406 }
1407
1408 # LB6 Do not break before hard line breaks.
1409 # × ( BK | CR | LF | NL )
1410 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1411 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1412 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1413 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1414 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1415 }
1416
1417 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1418 # CR × LF
1419 # CR !
1420 # LF !
1421 # NL !
1422 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1423 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1424 = $lb_actions{'LB_BREAKABLE'};
1425 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1426 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339 1427 }
289ce9cc
KW
1428 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1429 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1430
1431 # LB4 Always break after hard line breaks.
1432 # BK !
1433 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1434 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1435 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1436 }
1437
6b659339
KW
1438 # LB3 Always break at the end of text.
1439 # ! eot
b0e24409
KW
1440 # LB2 Never break at the start of text.
1441 # sot ×
6b659339 1442 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1443 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1444 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1445 }
1446
1447 # LB1 Assign a line breaking class to each code point of the input.
1448 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1449 # depending on criteria outside the scope of this algorithm.
1450 #
1451 # In the absence of such criteria all characters with a specific
1452 # combination of original class and General_Category property value are
1453 # resolved as follows:
1454 # Original Resolved General_Category
1455 # AI, SG, XX AL Any
1456 # SA CM Only Mn or Mc
1457 # SA AL Any except Mn and Mc
1458 # CJ NS Any
1459 #
1460 # This is done in mktables, so we never see any of the remapped-from
1461 # classes.
1462
289ce9cc
KW
1463 output_table_common('LB', \%lb_actions,
1464 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
6b659339
KW
1465}
1466
7e54b87f
KW
1467sub output_WB_table() {
1468
1469 # Create and output the enums, #defines, and pair table for use in
1470 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1471
1472 # This uses the same mechanism as the other bounds tables generated by
1473 # this file. The actions that could override a 0 or 1 are added to those
1474 # numbers; the actions that clearly don't depend on the underlying rule
1475 # simply overwrite
1476 my %wb_actions = (
1477 WB_NOBREAK => 0,
1478 WB_BREAKABLE => 1,
1479 WB_hs_then_hs => 2,
b0e24409 1480 WB_Ex_or_FO_or_ZWJ_then_foo => 3,
7e54b87f
KW
1481 WB_DQ_then_HL => 4,
1482 WB_HL_then_DQ => 6,
1483 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1484 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1485 WB_MB_or_MN_or_SQ_then_NU => 12,
1486 WB_NU_then_MB_or_MN_or_SQ => 14,
b0e24409 1487 WB_RI_then_RI => 16,
7e54b87f
KW
1488 );
1489
7e54b87f
KW
1490 # Construct the WB pair table.
1491 # The table is constructed in reverse order of the rules, to make the
1492 # lower-numbered, higher priority ones override the later ones, as the
1493 # algorithm stops at the earliest matching rule
1494
1495 my @wb_table;
1496 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
1497
1498 # Otherwise, break everywhere (including around ideographs).
b0e24409 1499 # WB99 Any ÷ Any
7e54b87f
KW
1500 for my $i (0 .. $table_size - 1) {
1501 for my $j (0 .. $table_size - 1) {
1502 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1503 }
1504 }
1505
b0e24409
KW
1506 # Do not break within emoji flag sequences. That is, do not break between
1507 # regional indicator (RI) symbols if there is an odd number of RI
1508 # characters before the break point.
1509 # WB16 [^RI] (RI RI)* RI × RI
c492f156 1510 # WB15 sot (RI RI)* RI × RI
289ce9cc 1511 $wb_table[$wb_enums{'Regional_Indicator'}]
b0e24409
KW
1512 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};
1513
1514 # Do not break within emoji modifier sequences.
1515 # WB14 ( E_Base | EBG ) × E_Modifier
1516 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}]
1517 = $wb_actions{'WB_NOBREAK'};
1518 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}]
1519 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1520
1521 # Do not break from extenders.
1522 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
289ce9cc
KW
1523 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1524 = $wb_actions{'WB_NOBREAK'};
1525 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1526 = $wb_actions{'WB_NOBREAK'};
1527 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1528 = $wb_actions{'WB_NOBREAK'};
1529 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1530 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1531
1532 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1533 # × ExtendNumLet
289ce9cc
KW
1534 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1535 = $wb_actions{'WB_NOBREAK'};
1536 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1537 = $wb_actions{'WB_NOBREAK'};
1538 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1539 = $wb_actions{'WB_NOBREAK'};
1540 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1541 = $wb_actions{'WB_NOBREAK'};
1542 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1543 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1544
1545 # Do not break between Katakana.
1546 # WB13 Katakana × Katakana
289ce9cc
KW
1547 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1548 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1549
1550 # Do not break within sequences, such as “3.2” or “3,456.789”.
1551 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
289ce9cc 1552 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
7e54b87f 1553 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1554 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
7e54b87f 1555 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1556 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1557 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1558
1559 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
289ce9cc 1560 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
7e54b87f 1561 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1562 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
7e54b87f 1563 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1564 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
7e54b87f
KW
1565 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1566
1567 # Do not break within sequences of digits, or digits adjacent to letters
1568 # (“3a”, or “A3”).
1569 # WB10 Numeric × (ALetter | Hebrew_Letter)
289ce9cc
KW
1570 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1571 = $wb_actions{'WB_NOBREAK'};
1572 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1573 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1574
1575 # WB9 (ALetter | Hebrew_Letter) × Numeric
289ce9cc
KW
1576 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1577 = $wb_actions{'WB_NOBREAK'};
1578 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1579 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1580
1581 # WB8 Numeric × Numeric
289ce9cc
KW
1582 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1583 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1584
1585 # Do not break letters across certain punctuation.
1586 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
289ce9cc
KW
1587 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1588 += $wb_actions{'WB_DQ_then_HL'};
7e54b87f
KW
1589
1590 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
289ce9cc
KW
1591 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1592 += $wb_actions{'WB_HL_then_DQ'};
7e54b87f
KW
1593
1594 # WB7a Hebrew_Letter × Single_Quote
289ce9cc
KW
1595 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1596 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1597
1598 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1599 # × (ALetter | Hebrew_Letter)
289ce9cc 1600 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
7e54b87f 1601 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1602 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1603 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1604 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
7e54b87f 1605 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1606 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1607 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1608 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
7e54b87f 1609 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1610 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f
KW
1611 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1612
1613 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1614 # | Single_Quote) (ALetter | Hebrew_Letter)
289ce9cc 1615 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1616 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1617 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1618 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1619 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
7e54b87f 1620 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1621 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
7e54b87f 1622 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1623 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
7e54b87f 1624 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1625 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1626 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1627
1628 # Do not break between most letters.
1629 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
289ce9cc
KW
1630 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1631 = $wb_actions{'WB_NOBREAK'};
1632 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1633 = $wb_actions{'WB_NOBREAK'};
1634 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1635 = $wb_actions{'WB_NOBREAK'};
1636 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1637 = $wb_actions{'WB_NOBREAK'};
7e54b87f 1638
b0e24409
KW
1639 # Ignore Format and Extend characters, except after sot, CR, LF, and
1640 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
1641 # WB4 X (Extend | Format | ZWJ)* → X
7e54b87f 1642 for my $i (0 .. @wb_table - 1) {
289ce9cc 1643 $wb_table[$wb_enums{'Extend'}][$i]
b0e24409 1644 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
289ce9cc 1645 $wb_table[$wb_enums{'Format'}][$i]
b0e24409
KW
1646 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1647 $wb_table[$wb_enums{'ZWJ'}][$i]
1648 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1649 }
1650 for my $i (0 .. @wb_table - 1) {
1651 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1652 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1653 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1654 }
1655
1656 # Implied is that these attach to the character before them, except for
1657 # the characters that mark the end of a region of text. The rules below
1658 # override the ones set up here, for all the characters that need
1659 # overriding.
1660 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1661 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1662 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1663 }
1664
b0e24409
KW
1665 # Do not break within emoji zwj sequences.
1666 # WB3c ZWJ × ( Glue_After_Zwj | EBG )
1667 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
1668 = $wb_actions{'WB_NOBREAK'};
1669 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
1670 = $wb_actions{'WB_NOBREAK'};
1671
7e54b87f
KW
1672 # Break before and after white space
1673 # WB3b ÷ (Newline | CR | LF)
1674 # WB3a (Newline | CR | LF) ÷
1675 # et. al.
289ce9cc 1676 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1677 for my $j (0 .. @wb_table - 1) {
1678 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1679 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1680 }
1681 }
1682
1683 # But do not break within white space.
1684 # WB3 CR × LF
1685 # et.al.
289ce9cc
KW
1686 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1687 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1688 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1689 }
1690 }
1691
b0e24409 1692 # And do not break horizontal space followed by Extend or Format or ZWJ
289ce9cc
KW
1693 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1694 = $wb_actions{'WB_NOBREAK'};
1695 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1696 = $wb_actions{'WB_NOBREAK'};
b0e24409
KW
1697 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}]
1698 = $wb_actions{'WB_NOBREAK'};
289ce9cc
KW
1699 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1700 [$wb_enums{'Perl_Tailored_HSpace'}]
1701 = $wb_actions{'WB_hs_then_hs'};
7e54b87f 1702
b0e24409
KW
1703 # Break at the start and end of text, unless the text is empty
1704 # WB2 Any ÷ eot
1705 # WB1 sot ÷ Any
7e54b87f 1706 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1707 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1708 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
7e54b87f 1709 }
289ce9cc 1710 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
7e54b87f 1711
289ce9cc
KW
1712 output_table_common('WB', \%wb_actions,
1713 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
7e54b87f
KW
1714}
1715
9d9177be
KW
# Two fixed lists, emitted once and without a character-set argument.  Each
# array is already in inversion list form: [ 0, 256 ] is the single range
# 0..255, and [ 256 ] is the range that starts at 256 and never ends.
output_invlist('Latin1',      [ 0, 256 ]);
output_invlist('AboveLatin1', [ 256 ]);

# Presumably closes the '#if' section currently open in the output file.
end_file_pound_if();
43b443dd 1720
3f427fd9
KW
1721# We construct lists for all the POSIX and backslash sequence character
1722# classes in two forms:
1723# 1) ones which match only in the ASCII range
1724# 2) ones which match either in the Latin1 range, or the entire Unicode range
1725#
1726# These get compiled in, and hence affect the memory footprint of every Perl
1727# program, even those not using Unicode. To minimize the size, currently
1728# the Latin1 version is generated for the beyond ASCII range except for those
1729# lists that are quite small for the entire range, such as for \s, which is 22
1730# UVs long plus 4 UVs (currently) for the header.
1731#
1732# To save even more memory, the ASCII versions could be derived from the
1733# larger ones at runtime, saving some memory (minus the expense of the machine
1734# instructions to do so), but these are all small anyway, so their total is
1735# about 100 UVs.
1736#
1737# In the list of properties below that get generated, the L1 prefix is a fake
1738# property that means just the Latin1 range of the full property (whose name
1739# has an X prefix instead of L1).
a02047bf
KW
1740#
1741# An initial & means to use the subroutine from this file instead of an
1742# official inversion list.
3f427fd9 1743
0c4ecf42
KW
# For every supported character set, generate the inversion list (and, when
# the property is really a mapping, the inversion map) of each property named
# in the qw() list below, translated to that character set's native code
# points, and write them to $out_fh.
#
# Syntax of the entries in the qw() list, as parsed at the top of the inner
# loop:
#   &NAME       call the sub NAME in this file to get the list, instead of
#               asking Unicode::UCD for a property called NAME
#   NonL1NAME   look up NAME, but output only the part above Latin1
#   L1NAME      look up NAME (L1PosixFOO looks up XPosixFOO), but output only
#               the Latin1 part
#   NAME,A,B    everything after the first comma is handed unchanged to
#               output_invmap() as its "extra enums" argument
for my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    @a2n = @{get_a2n($charset)};

    # The commas in some of the words below would otherwise draw a warning.
    no warnings 'qw';

    # Ignore non-alpha in sort
    for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
                             Assigned
                             ASCII
                             Cased
                             VertSpace
                             XPerlSpace
                             XPosixAlnum
                             XPosixAlpha
                             XPosixBlank
                             XPosixCntrl
                             XPosixDigit
                             XPosixGraph
                             XPosixLower
                             XPosixPrint
                             XPosixPunct
                             XPosixSpace
                             XPosixUpper
                             XPosixWord
                             XPosixXDigit
                             _Perl_Any_Folds
                             &NonL1_Perl_Non_Final_Folds
                             _Perl_Folds_To_Multi_Char
                             &UpperLatin1
                             _Perl_IDStart
                             _Perl_IDCont
                             _Perl_GCB,EDGE
                             _Perl_LB,EDGE
                             _Perl_SB,EDGE
                             _Perl_WB,EDGE,UNKNOWN
                           )
    ) {

        # For the Latin1 properties, we change to use the eXtended version of
        # the base property, then go through the result and get rid of
        # everything not in Latin1 (above 255).  Actually, we retain the
        # element for the range that crosses the 255/256 boundary if it is one
        # that matches the property.  For example, in the Word property, there
        # is a range of code points that start at U+00F8 and goes through
        # U+02C1.  Instead of artificially cutting that off at 256 because 256
        # is the first code point above Latin1, we let the range go to its
        # natural ending.  That gives us extra information with no added space
        # taken.  But if the range that crosses the boundary is one that
        # doesn't match the property, we don't start a new range above 255, as
        # that could be construed as going to infinity.  For example, the
        # Upper property doesn't include the character at 255, but does
        # include the one at 256.  We don't include the 256 one.
        my $prop_name = $prop;
        my $is_local_sub = $prop_name =~ s/^&//;

        # Strip off (and remember) any ",ENUM,..." suffix.  After this,
        # neither $prop_name nor $lookup_prop contains a comma.
        my $extra_enums = "";
        $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;

        my $lookup_prop = $prop_name;
        my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
                       or $lookup_prop =~ s/^L1//);
        my $nonl1_only = 0;
        $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;

        my @invlist;
        my @invmap;
        my $map_format;
        my $map_default;
        my $maps_to_code_point;
        my $to_adjust;
        if ($is_local_sub) {

            # String eval of the bare name: calls the sub of that name that
            # is defined in this file.  The names come only from the literal
            # list above, never from external input.
            @invlist = eval $lookup_prop;
            die $@ if $@;
        }
        else {
            @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
            if (! @invlist) {

                # If couldn't find a non-empty inversion list, see if it is
                # instead an inversion map
                my ($list_ref, $map_ref, $format, $default)
                          = prop_invmap($lookup_prop, '_perl_core_internal_ok');
                if (! $list_ref) {
                    # An empty return here could mean an unknown property, or
                    # merely that the original inversion list is empty.  Call
                    # in scalar context to differentiate
                    my $count = prop_invlist($lookup_prop,
                                             '_perl_core_internal_ok');
                    die "Could not find inversion list for '$lookup_prop'"
                                                          unless defined $count;
                }
                else {
                    @invlist = @$list_ref;
                    @invmap = @$map_ref;
                    $map_format = $format;
                    $map_default = $default;
                    $maps_to_code_point = $map_format =~ /x/;
                    $to_adjust = $map_format =~ /a/;
                }
            }
        }

        # Short-circuit an empty inversion list.
        if (! @invlist) {
            output_invlist($prop_name, \@invlist, $charset);
            next;
        }

        # Re-order the Unicode code points to native ones for this platform.
        # This is only needed for code points below 256, because native code
        # points are only in that range.  For inversion maps of properties
        # where the mappings are adjusted (format =~ /a/), this reordering
        # could mess up the adjustment pattern that was in the input, so that
        # has to be dealt with.
        #
        # And inversion maps that map to code points need to eventually have
        # all those code points remapped to native, and it's better to do that
        # here, going through the whole list not just those below 256.  This
        # is because some inversion maps have adjustments (format =~ /a/)
        # which may be affected by the reordering.  This code needs to be done
        # both for when we are translating the inversion lists for < 256, and
        # for the inversion maps for everything.  By doing both in this loop,
        # we can share that code.
        #
        # So, we go through everything for an inversion map to code points;
        # otherwise, we can skip any remapping at all if we are going to
        # output only the above-Latin1 values, or if the range spans the whole
        # of 0..256, as the remap will also include all of 0..256  (256 not
        # 255 because a re-ordering could cause 256 to need to be in the same
        # range as 255.)
        if ((@invmap && $maps_to_code_point)
            || (! $nonl1_only || ($invlist[0] < 256
                                  && ! ($invlist[0] == 0 && $invlist[1] > 256))))
        {

            if (! @invmap) {    # Straight inversion list

                # Look at all the ranges that start before 257.
                my @latin1_list;
                while (@invlist) {
                    last if $invlist[0] > 256;
                    my $upper = @invlist > 1
                                ? $invlist[1] - 1      # In range

                                  # To infinity.  You may want to stop much
                                  # much earlier; going this high may expose
                                  # perl deficiencies with very large numbers.
                                : $Unicode::UCD::MAX_CP;
                    for my $j ($invlist[0] .. $upper) {
                        push @latin1_list, a2n($j);
                    }

                    shift @invlist; # Shift off the range that's in the list
                    shift @invlist; # Shift off the range not in the list
                }

                # Here @invlist contains all the ranges in the original that
                # start at code points above 256, and @latin1_list contains
                # all the native code points for ranges that start with a
                # Unicode code point below 257.  We sort the latter and
                # convert it to inversion list format.  Then simply prepend it
                # to the list of the higher code points.
                @latin1_list = sort { $a <=> $b } @latin1_list;
                @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
                unshift @invlist, @latin1_list;
            }
            else {  # Is an inversion map

                # This is a similar procedure as plain inversion list, but has
                # multiple buckets.  A plain inversion list just has two
                # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
                # pretty much can ignore the 2nd bucket, as it is completely
                # defined by the 1st.  But here, what we do is create buckets
                # which contain the code points that map to each, translated
                # to native and turned into an inversion list.  Thus each
                # bucket is an inversion list of native code points that map
                # to it or don't map to it.  We use these to create an
                # inversion map for the whole property.

                # As mentioned earlier, we use this procedure to not just
                # remap the inversion list to native values, but also the maps
                # of code points to native ones.  In the latter case we have
                # to look at the whole of the inversion map (or at least to
                # above Unicode; as the maps of code points above that should
                # all be to the default).
                my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;

                my %mapped_lists;   # A hash whose keys are the buckets.
                while (@invlist) {
                    last if $invlist[0] > $upper_limit;

                    # This shouldn't actually happen, as prop_invmap() returns
                    # an extra element at the end that is beyond $upper_limit
                    die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;

                    my $bucket;

                    # A hash key can't be a ref (we are only expecting arrays
                    # of scalars here), so convert any such to a string that
                    # will be converted back later (using a vertical tab as
                    # the separator).  Even if the mapping is to code points,
                    # we don't translate to native here because the code
                    # output_invmap() calls to output these arrays assumes the
                    # input is Unicode, not native.
                    if (ref $invmap[0]) {
                        $bucket = join "\cK", @{$invmap[0]};
                    }
                    elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {

                        # Do convert to native for maps to single code points.
                        # There are some properties that have a few outlier
                        # maps that aren't code points, so the above test
                        # skips those.
                        $bucket = a2n($invmap[0]);
                    } else {
                        $bucket = $invmap[0];
                    }

                    # We now have the bucket that all code points in the range
                    # map to, though possibly they need to be adjusted.  Go
                    # through the range and put each translated code point in
                    # it into its bucket.
                    my $base_map = $invmap[0];
                    for my $j ($invlist[0] .. $invlist[1] - 1) {
                        if ($to_adjust
                               # The 1st code point doesn't need adjusting
                            && $j > $invlist[0]

                               # Skip any non-numeric maps: these are outliers
                               # that aren't code points.
                            && $base_map =~ $numeric_re

                               #  'ne' because the default can be a string
                            && $base_map ne $map_default)
                        {
                            # We adjust, by incrementing each the bucket and
                            # the map.  For code point maps, translate to
                            # native
                            $base_map++;
                            $bucket = ($maps_to_code_point)
                                      ? a2n($base_map)
                                      : $base_map;
                        }

                        # Add the native code point to the bucket for the
                        # current map
                        push @{$mapped_lists{$bucket}}, a2n($j);
                    } # End of loop through all code points in the range

                    # Get ready for the next range
                    shift @invlist;
                    shift @invmap;
                } # End of loop through all ranges in the map.

                # Here, @invlist and @invmap retain all the ranges from the
                # originals that start with code points above $upper_limit.
                # Each bucket in %mapped_lists contains all the code points
                # that map to that bucket.  If the bucket is for a map to a
                # single code point, the bucket has been converted to native.
                # If something else (including multiple code points), no
                # conversion is done.
                #
                # Now we recreate the inversion map into %xlated, but this
                # time for the native character set.
                my %xlated;
                foreach my $bucket (keys %mapped_lists) {

                    # Sort and convert this bucket to an inversion list.  The
                    # result will be that ranges that start with even-numbered
                    # indexes will be for code points that map to this bucket;
                    # odd ones map to some other bucket, and are discarded
                    # below.
                    @{$mapped_lists{$bucket}}
                                    = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
                    @{$mapped_lists{$bucket}}
                     = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});

                    # Add each even-numbered range in the bucket to %xlated;
                    # so that the keys of %xlated become the range start code
                    # points, and the values are their corresponding maps.
                    while (@{$mapped_lists{$bucket}}) {
                        my $range_start = $mapped_lists{$bucket}->[0];
                        if ($bucket =~ /\cK/) {
                            @{$xlated{$range_start}} = split /\cK/, $bucket;
                        }
                        else {
                            $xlated{$range_start} = $bucket;
                        }
                        shift @{$mapped_lists{$bucket}}; # Discard odd ranges
                        shift @{$mapped_lists{$bucket}}; # Get ready for next
                                                         # iteration
                    }
                } # End of loop through all the buckets.

                # Here %xlated's keys are the range starts of all the code
                # points in the inversion map.  Construct an inversion list
                # from them.
                my @new_invlist = sort { $a <=> $b } keys %xlated;

                # If the list is adjusted, we want to munge this list so that
                # we only have one entry for where consecutive code points map
                # to consecutive values.  We just skip the subsequent entries
                # where this is the case.
                if ($to_adjust) {
                    my @temp;
                    for my $i (0 .. @new_invlist - 1) {
                        next if $i > 0
                                && $new_invlist[$i-1] + 1 == $new_invlist[$i]
                                && $xlated{$new_invlist[$i-1]} =~ $numeric_re
                                && $xlated{$new_invlist[$i]} =~ $numeric_re
                                && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
                        push @temp, $new_invlist[$i];
                    }
                    @new_invlist = @temp;
                }

                # The inversion map comes from %xlated's values.  We can
                # unshift each onto the front of the untouched portion, in
                # reverse order of the portion we did process.
                foreach my $start (reverse @new_invlist) {
                    unshift @invmap, $xlated{$start};
                }

                # Finally prepend the inversion list we have just constructed
                # to the one that contains anything we didn't process.
                unshift @invlist, @new_invlist;
            }
        }

        # prop_invmap() returns an extra final entry, which we can now
        # discard.
        if (@invmap) {
            pop @invlist;
            pop @invmap;
        }

        if ($l1_only) {

            # From here to the end of this branch @invmap is known to be
            # empty, so only @invlist needs trimming.
            die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
            for my $i (0 .. @invlist - 1 - 1) {
                if ($invlist[$i] > 255) {

                    # In an inversion list, even-numbered elements give the
                    # code points that begin ranges that match the property;
                    # odd-numbered give ones that begin ranges that don't
                    # match.  If $i is odd, we are at the first code point
                    # above 255 that doesn't match, which means the range it
                    # is ending does match, and crosses the 255/256 boundary.
                    # We want to include this ending point, so increment $i,
                    # so the splice below includes it.  Conversely, if $i is
                    # even, it is the first code point above 255 that matches,
                    # which means there was no matching range that crossed the
                    # boundary, and we don't want to include this code point,
                    # so splice before it.
                    $i++ if $i % 2 != 0;

                    # Remove everything past this.
                    splice @invlist, $i;
                    last;
                }
            }
        }
        elsif ($nonl1_only) {
            my $found_nonl1 = 0;
            for my $i (0 .. @invlist - 1 - 1) {
                next if $invlist[$i] < 256;

                # Here, we have the first element in the array that indicates
                # an element above Latin1.  Get rid of all previous ones.
                splice @invlist, 0, $i;
                splice @invmap, 0, $i if @invmap;

                # If this one's index is not divisible by 2, it means that
                # this element is inverting away from being in the list, which
                # means all code points from 256 to this one are in this list
                # (or map to the default for inversion maps)
                if ($i % 2 != 0) {
                    unshift @invlist, 256;
                    unshift @invmap, $map_default if @invmap;
                }
                $found_nonl1 = 1;
                last;
            }
            die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
        }

        output_invlist($prop_name, \@invlist, $charset);
        output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
    }
    end_file_pound_if;
    print $out_fh "\n" . get_conditional_compile_line_end();
}
2133
973a28ed
KW
# Write the three boundary pair tables, in the order GCB, LB, WB.  The
# switch_pound_if()/end_file_pound_if() pair presumably brackets them in an
# '#if' section of the output tied to PERL_IN_REGEXEC_C.
switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');

for my $emit_table (\&output_GCB_table,
                    \&output_LB_table,
                    \&output_WB_table)
{
    $emit_table->();
}

end_file_pound_if();
2141
# Build the list of files the generated header depends on, then finish the
# output file, handing that list to read_only_bottom_close_and_rename().
my $sources_list = "lib/unicore/mktables.lst";
my @sources = ($0,
               'lib/unicore/mktables',
               'lib/Unicode/UCD.pm',
               'regen/charset_translations.pl',
              );
{
    # Depend on mktables' own sources.  It's a shorter list of files than
    # those that Unicode::UCD uses.
    if (open my $list_fh, '<', $sources_list) {

        # Only the part of the file before the first line containing '==='
        # is used.  Blank lines and lines starting with '#' are ignored;
        # every other line names a file relative to lib/unicore.
        LINE:
        while (my $entry = <$list_fh>) {
            last LINE if $entry =~ /===/;
            chomp $entry;
            next LINE if $entry !~ /^[^#]/;
            push @sources, "lib/unicore/$entry";
        }
    }
    else {

        # The list could not be opened.  Depending on it should force a
        # rebuild once $sources_list exists
        push @sources, $sources_list;
    }
}

read_only_bottom_close_and_rename($out_fh, \@sources);