This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Fix typo
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
3d7c117d
MB
11require './regen/regen_lib.pl';
12require './regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
# Matches an integer or float: an optional leading minus sign, one or more
# digits, and optionally a decimal point followed by one or more digits.  The
# fractional part is a non-capturing group.
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
289ce9cc
KW
43my $max_hdr_len = 3; # In headings, how wide a name is allowed?
44
9d9177be
KW
45print $out_fh "/* See the generating file for comments */\n\n";
46
bffc0129
KW
47# The symbols generated by this program are all currently defined only in a
48# single dot c each. The code knows where most of them go, but this hash
49# gives overrides for the exceptions to the typical place
50my %exceptions_to_where_to_define =
51 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
52 AboveLatin1 => 'PERL_IN_REGCOMP_C',
53 Latin1 => 'PERL_IN_REGCOMP_C',
54 UpperLatin1 => 'PERL_IN_REGCOMP_C',
55 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
56 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
57 _Perl_IDCont => 'PERL_IN_UTF8_C',
58 _Perl_IDStart => 'PERL_IN_UTF8_C',
59 );
015bb97c 60
f79a09fc 61# This hash contains the properties with enums that have hard-coded references
289ce9cc 62# to them in C code. It is needed to make sure that if perl is compiled
f79a09fc
KW
63# with an older Unicode data set, that all the enum values the code is
64# expecting will still be in the enum typedef. Thus the code doesn't have to
289ce9cc
KW
65# change. The Unicode version won't have any code points that have the enum
66# values not in that version, so the code that handles them will not get
67# exercised. This is far better than having to #ifdef things. The names here
68# should be the long names of the respective property values. The reason for
69# this is because regexec.c uses them as case labels, and the long name is
70# generally more understandable than the short.
f79a09fc
KW
71my %hard_coded_enums =
72 ( gcb => [
73 'Control',
74 'CR',
b0e24409
KW
75 'E_Base',
76 'E_Base_GAZ',
77 'E_Modifier',
f79a09fc 78 'Extend',
b0e24409 79 'Glue_After_Zwj',
f79a09fc
KW
80 'L',
81 'LF',
82 'LV',
83 'LVT',
84 'Other',
85 'Prepend',
86 'Regional_Indicator',
87 'SpacingMark',
88 'T',
89 'V',
b0e24409 90 'ZWJ',
f79a09fc 91 ],
ca8226cf
KW
92 lb => [
93 'Alphabetic',
94 'Break_After',
95 'Break_Before',
96 'Break_Both',
97 'Break_Symbols',
98 'Carriage_Return',
99 'Close_Parenthesis',
100 'Close_Punctuation',
101 'Combining_Mark',
102 'Contingent_Break',
b0e24409
KW
103 'E_Base',
104 'E_Modifier',
ca8226cf
KW
105 'Exclamation',
106 'Glue',
107 'H2',
108 'H3',
109 'Hebrew_Letter',
110 'Hyphen',
111 'Ideographic',
112 'Infix_Numeric',
113 'Inseparable',
114 'JL',
115 'JT',
116 'JV',
117 'Line_Feed',
118 'Mandatory_Break',
119 'Next_Line',
120 'Nonstarter',
121 'Numeric',
122 'Open_Punctuation',
123 'Postfix_Numeric',
124 'Prefix_Numeric',
125 'Quotation',
126 'Regional_Indicator',
127 'Space',
128 'Word_Joiner',
b0e24409 129 'ZWJ',
ca8226cf
KW
130 'ZWSpace',
131 ],
f79a09fc
KW
132 sb => [
133 'ATerm',
134 'Close',
135 'CR',
136 'Extend',
137 'Format',
138 'LF',
139 'Lower',
140 'Numeric',
141 'OLetter',
142 'Other',
143 'SContinue',
144 'Sep',
145 'Sp',
146 'STerm',
147 'Upper',
148 ],
149 wb => [
150 'ALetter',
151 'CR',
152 'Double_Quote',
b0e24409
KW
153 'E_Base',
154 'E_Base_GAZ',
155 'E_Modifier',
f79a09fc
KW
156 'Extend',
157 'ExtendNumLet',
158 'Format',
b0e24409 159 'Glue_After_Zwj',
f79a09fc
KW
160 'Hebrew_Letter',
161 'Katakana',
162 'LF',
163 'MidLetter',
164 'MidNum',
165 'MidNumLet',
166 'Newline',
167 'Numeric',
168 'Other',
f1f6961f 169 'Perl_Tailored_HSpace',
f79a09fc
KW
170 'Regional_Indicator',
171 'Single_Quote',
b0e24409 172 'ZWJ',
f79a09fc
KW
173 ],
174);
175
973a28ed
KW
176my %gcb_enums;
177my @gcb_short_enums;
289ce9cc 178my %gcb_abbreviations;
6b659339
KW
179my %lb_enums;
180my @lb_short_enums;
289ce9cc 181my %lb_abbreviations;
7e54b87f
KW
182my %wb_enums;
183my @wb_short_enums;
289ce9cc 184my %wb_abbreviations;
6b659339 185
99f21fb9
KW
186my @a2n;
187
sub uniques {
    # Returns the input values with duplicates dropped.  The first occurrence
    # of each value is the one kept, and the input order is preserved.  This
    # gives the same result as the idiom from "Perl Best Practices:
    # Encapsulated Cleverness", p. 455 in the first edition.

    my %already_returned;
    my @distinct;
    for my $value (@_) {
        next if $already_returned{$value}++;
        push @distinct, $value;
    }
    return @distinct;
}
195
sub a2n($) {
    # Returns the input Unicode code point translated to native.  An input
    # that doesn't match $numeric_re, or that is above 255, is returned
    # unchanged; anything else is looked up in @a2n.

    my ($code_point) = @_;

    if ($code_point =~ $numeric_re && $code_point <= 255) {
        return $a2n[$code_point];
    }

    return $code_point;
}
204
bffc0129
KW
sub end_file_pound_if {
    # If an '#if' is currently open in the output file, writes the '#endif'
    # that closes it (with a comment naming the condition), and records that
    # none is open any more.

    return unless $in_file_pound_if;

    print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
    $in_file_pound_if = 0;
}
211
sub switch_pound_if ($$) {
    my $name = shift;
    my $new_pound_if = shift;

    # Switch to new #if given by the 2nd argument.  If there is an override
    # for this in %exceptions_to_where_to_define, it instead switches to that.
    # The 1st argument is the static's name, used to look up the overrides.
    # Nothing is output if the file is already inside the wanted #if.

    if (exists $exceptions_to_where_to_define{$name}) {
        $new_pound_if = $exceptions_to_where_to_define{$name};
    }

    my $wanted_pound_if = "defined($new_pound_if)";

    # Exit current #if if the new one is different from the old.  The
    # comparison is of the complete condition strings, so a symbol that is
    # merely a substring of the open one, or that contains characters special
    # to regular expressions, can't be mistaken for it.
    if ($in_file_pound_if && $in_file_pound_if ne $wanted_pound_if) {
        end_file_pound_if();
    }

    # Enter new #if, if not already in it.
    if (! $in_file_pound_if) {
        $in_file_pound_if = $wanted_pound_if;
        print $out_fh "\n#if $in_file_pound_if\n";
    }
}
237
sub output_invlist ($$;$) {
    # Outputs the inversion list referred to by the 2nd argument as a C array
    # of UVs named "${name}_invlist", where $name is the 1st argument.  It is
    # output in the exact internal form for inversion lists: a three element
    # header followed by the list itself.  The optional 3rd argument is the
    # name of the character set, used only in a comment.

    my ($name, $invlist, $charset) = @_;
    $charset //= "";

    die "No inversion list for $name"
                        unless defined $invlist && ref $invlist eq 'ARRAY';

    # The last element of the header is 1 if the list as passed in is
    # non-empty and doesn't start at 0; otherwise it is 0.  When it is 1, a 0
    # is added to the front of the caller's array, and so is included in the
    # count and in the body output below.
    my $zero_or_one = (@$invlist && $invlist->[0] != 0) ? 1 : 0;
    unshift @$invlist, 0 if $zero_or_one;
    my $count = scalar @$invlist;

    switch_pound_if ($name, 'PERL_IN_PERL_C');

    my $charset_comment = $charset ? " /* for $charset */" : "";
    print $out_fh "\nstatic const UV ${name}_invlist[] = {",
                  $charset_comment,
                  "\n";

    # The header
    print $out_fh "\t$count,\t/* Number of elements */\n",
                  "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n",
                  "\t$zero_or_one,\t/* 0 if the list starts at 0;",
                  "\n\t\t 1 if it starts at the element beyond 0 */\n";

    # The main body are the UVs passed in to this routine, one per line in
    # hex, with a comma after each but the final one
    my @body_lines = map { sprintf "\t0x%X", $_ } @$invlist;
    print $out_fh join(",\n", @body_lines), "\n" if @body_lines;

    print $out_fh "};\n";
}
279
99f21fb9
KW
sub output_invmap ($$$$$$$) {
    my $name = shift;
    my $invmap = shift;     # Reference to inversion map array
    my $prop_name = shift;
    my $input_format = shift;   # The inversion map's format
    my $default = shift;        # The property value for code points who
                                # otherwise don't have a value specified.
    my $extra_enums = shift;    # comma-separated list of our additions to the
                                # property's standard possible values
    my $charset = shift // "";  # name of character set for comment

    # Output the inversion map $invmap for property $prop_name, but use $name
    # as the actual data structure's name.  Only the 's' input format is
    # implemented.  What gets output is a '#define' giving the number of enum
    # values, a C enum typedef of them, and then the map itself as an array of
    # that enum type.  When $name is one of _Perl_GCB, _Perl_LB, or _Perl_WB,
    # this also populates the file-level hashes and array for that property
    # (the ones whose names begin with the lowercased property name) that
    # give each enum's value, short display name, and any abbreviation.

    # Make sure we have a usable map before anything dereferences it or
    # generates any output.
    die "No inversion map for $prop_name" unless defined $invmap
                                             && ref $invmap eq 'ARRAY'
                                             && @$invmap;
    my $count = @$invmap;

    die "'$input_format' invmap() format for '$prop_name' unimplemented"
                                                unless $input_format eq 's';

    my %enums;

    my $orig_prop_name = $prop_name;
    $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
    my $short_name = (prop_aliases($prop_name))[0] // $prop_name;

    # If the name didn't change, take the possible values from
    # prop_values(); otherwise use the values that actually occur in the map
    my @enums = ($orig_prop_name eq $prop_name)
                ? prop_values($prop_name)
                : uniques(@$invmap);

    if (! @enums) {
        die "Only enum properties are currently handled; '$prop_name' isn't one";
    }

    my @expected_enums = @{$hard_coded_enums{lc $short_name}};
    my @canonical_input_enums;
    if (@expected_enums) {
        if (@expected_enums < @enums) {
            die 'You need to update %hard_coded_enums to reflect new'
              . " entries in this Unicode version\n"
              . "Expected: " . join(", ", sort @expected_enums) . "\n"
              . " Got: " . join(", ", sort @enums);
        }

        if (! defined prop_aliases($prop_name)) {

            # Convert the input enums into canonical form and save for use
            # below
            @canonical_input_enums = map { lc ($_ =~ s/_//gr) } @enums;
        }
        @enums = sort @expected_enums;
    }

    # The internal enums come last, and in the order specified
    my @extras;
    if ($extra_enums ne "") {
        @extras = split /,/, $extra_enums;
        push @enums, @extras;
    }

    # Assign a value to each element of the enum.  The default value always
    # gets 0; the others are arbitrarily assigned.
    my $enum_val = 0;
    my $canonical_default = prop_value_aliases($prop_name, $default);
    $default = $canonical_default if defined $canonical_default;
    $enums{$default} = $enum_val++;
    for my $enum (@enums) {
        $enums{$enum} = $enum_val++ unless exists $enums{$enum};
    }

    # Calculate the enum values for certain properties like _Perl_GCB and
    # _Perl_LB, because we output special tables for them.
    if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {

        # We use string evals to allow the same code to work on all tables
        # we're doing.
        my $type = lc $prop_name;

        # We use lowercase single letter names for any property values not in
        # the release of Unicode being compiled now.
        my $placeholder = "a";

        # Skip if we've already done this code, which populated this hash
        if (eval "! \%${type}_enums") {

            # For each enum ...
            foreach my $enum (sort keys %enums) {
                my $value = $enums{$enum};
                my $short;
                my $abbreviated_from;

                # Special case this wb property value to make the name more
                # clear
                if ($enum eq 'Perl_Tailored_HSpace') {
                    $short = 'hs';
                    $abbreviated_from = $enum;
                }
                elsif (grep { $_ eq $enum } @extras) {

                    # The 'short' name for one of the property values added
                    # by this file is just the lowercase of it
                    $short = lc $enum;
                }
                elsif (grep {$_ eq lc ( $enum =~ s/_//gr) }
                                                    @canonical_input_enums)
                {   # On Unicode versions that predate the official property,
                    # we have set up this array to be the canonical form of
                    # each enum in the substitute property.  If the enum
                    # we're looking at is canonically the same as one of
                    # these, use its name instead of generating a placeholder
                    # one in the next clause (which will happen because
                    # prop_value_aliases() will fail because it only works on
                    # official properties)
                    $short = $enum;
                }
                else {
                    # Use the official short name for the other property
                    # values, which should all be official ones.
                    ($short) = prop_value_aliases($type, $enum);

                    # But create a placeholder for ones not in this Unicode
                    # version.
                    $short = $placeholder++ unless defined $short;
                }

                # If our short name is too long, or we already know that the
                # name is an abbreviation, truncate to make sure it's short
                # enough, and remember that we did this so we can later place
                # in a comment in the generated file
                if (   $abbreviated_from
                    || length $short > $max_hdr_len)
                {
                    $short = substr($short, 0, $max_hdr_len);
                    $abbreviated_from = $enum unless $abbreviated_from;

                    # If the name we are to display conflicts, try another.
                    while (eval "exists \$${type}_abbreviations{$short}") {
                        die $@ if $@;
                        $short++;
                    }

                    eval "\$${type}_abbreviations{$short} = '$enum'";
                    die $@ if $@;
                }

                # Remember the mapping from the property value (enum) name to
                # its value.
                eval "\$${type}_enums{$enum} = $value";
                die $@ if $@;

                # Remember the inverse mapping to the short name so that we
                # can properly label the generated table's rows and columns
                eval "\$${type}_short_enums[$value] = '$short'";
                die $@ if $@;
            }
        }
    }

    # Inversion map stuff is currently used only by regexec
    switch_pound_if($name, 'PERL_IN_REGEXEC_C');

    # The short names tend to be two lower case letters, but it looks better
    # for those if they are upper. XXX
    $short_name = uc($short_name) if length($short_name) < 3
                            || substr($short_name, 0, 1) =~ /[[:lower:]]/;
    my $name_prefix = "${short_name}_";
    my $enum_count = keys %enums;
    print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", $enum_count, "\n";

    # Output the typedef, with the enums in order of their values
    print $out_fh "\ntypedef enum {\n";
    my @enum_list;
    foreach my $enum (keys %enums) {
        $enum_list[$enums{$enum}] = $enum;
    }
    foreach my $i (0 .. @enum_list - 1) {
        my $enum_name = $enum_list[$i];
        print $out_fh "\t${name_prefix}$enum_name = $i";
        print $out_fh "," if $i < $enum_count - 1;
        print $out_fh "\n";
    }
    my $declaration_type = "${name_prefix}enum";
    print $out_fh "} $declaration_type;\n";

    print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    # The main body are the scalars passed in to this routine.  Each is
    # replaced by what prop_value_aliases() returns for it in scalar context,
    # when that is defined, and is output with the enum prefix added.
    for my $i (0 .. $count - 1) {
        my $element = $invmap->[$i];
        my $full_element_name = prop_value_aliases($prop_name, $element);
        $element = $full_element_name if defined $full_element_name;
        $element = $name_prefix . $element;
        print $out_fh "\t$element";
        print $out_fh "," if $i < $count - 1;
        print $out_fh "\n";
    }
    print $out_fh "};\n";
}
507
sub mk_invlist_from_sorted_cp_list {

    # Returns an inversion list constructed from the sorted input array of
    # code points, which is passed by reference.  The input must be in
    # ascending numeric order; a code point that is repeated is treated as if
    # it had occurred just once.  An empty input yields an empty return.

    my $list_ref = shift;

    return unless @$list_ref;

    # Initialize to just the first element
    my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);

    # For each succeeding element, if it extends the previous range, adjust
    # up, otherwise add it.
    for my $i (1 .. @$list_ref - 1) {
        my $cp = $list_ref->[$i];

        # The final element of @invlist is one beyond the top of the range
        # being built, so anything below it is already in that range.  In
        # sorted input that can only be a repeat of the previous code point.
        next if $cp < $invlist[-1];

        if ($invlist[-1] == $cp) {
            $invlist[-1]++;
        }
        else {
            push @invlist, $cp, $cp + 1;
        }
    }
    return @invlist;
}
532
# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al'
                                                           || $format eq 'a';
my @has_multi_char_fold;
my @is_non_final_fold;

{
    # The code points already placed in @is_non_final_fold, so that each is
    # added just once without having to search the array every time
    my %seen_non_final;

    for my $i (0 .. @$folds_ref - 1) {
        next unless ref $folds_ref->[$i];   # Skip single-char folds
        push @has_multi_char_fold, $cp_ref->[$i];

        # Add to the non-finals list each code point that is in a non-final
        # position
        for my $j (0 .. @{$folds_ref->[$i]} - 2) {
            my $non_final = $folds_ref->[$i][$j];
            push @is_non_final_fold, $non_final
                                    unless $seen_non_final{$non_final}++;
        }
    }
}
554
a02047bf
KW
sub _Perl_Non_Final_Folds {
    # Returns the inversion list built from @is_non_final_fold.  That array
    # is itself left sorted into ascending numeric order as a side effect.

    my $non_finals_ref = \@is_non_final_fold;
    @$non_finals_ref = sort { $a <=> $b } @$non_finals_ref;

    return mk_invlist_from_sorted_cp_list($non_finals_ref);
}
559
99f21fb9
KW
sub prop_name_for_cmp ($) { # Sort helper
    # Returns the key to compare the input name by: everything from the first
    # comma on is dropped, then every character that isn't alphabetic is
    # removed, and the result is lowercased.

    my ($name) = @_;

    my $before_comma = ($name =~ s/,.*//r);
    my $alphas_only = ($before_comma =~ s/[[:^alpha:]]//gr);

    return lc $alphas_only;
}
570
sub UpperLatin1 {
    # Returns the inversion list for the code points 128 through 255

    my @upper_half = (128 .. 255);
    return mk_invlist_from_sorted_cp_list(\@upper_half);
}
574
289ce9cc
KW
sub output_table_common {

    # Common subroutine to actually output the generated rules table.  The
    # arguments are:
    #   $property                   the C array is named "${property}_table"
    #   $table_value_defines_ref    reference to a hash whose keys are names
    #                               and whose values are the numbers they
    #                               stand for.  A '#define' is output for
    #                               each.  If false, none are output.
    #   $table_ref                  reference to the table, an array of
    #                               arrays, treated as being square
    #   $names_ref                  reference to the array of the names used
    #                               for the row and column headings
    #   $abbreviations_ref          reference to a hash mapping each
    #                               abbreviated heading to what it stands for

    my ($property,
        $table_value_defines_ref,
        $table_ref,
        $names_ref,
        $abbreviations_ref) = @_;
    my $size = @$table_ref;

    # Output the #define list, sorted by numeric value
    if ($table_value_defines_ref) {
        my $max_name_length = 0;
        my @defines;

        # Put in order, and at the same time find the longest name
        while (my ($enum, $value) = each %$table_value_defines_ref) {
            $defines[$value] = $enum;

            my $length = length $enum;
            $max_name_length = $length if $length > $max_name_length;
        }

        print $out_fh "\n";

        # Output, so that the values are vertically aligned in a column after
        # the longest name
        foreach my $i (0 .. @defines - 1) {
            next unless defined $defines[$i];
            printf $out_fh "#define %-*s %2d\n",
                                      $max_name_length,
                                      $defines[$i],
                                      $i;
        }
    }

    my $column_width = 2;   # We currently allow 2 digits for the number

    # If the maximum value in the table is 1, it can be a bool.  (Being above
    # a U8 is not currently handled.)
    my $max_element = 0;
    for my $i (0 .. $size - 1) {
        for my $j (0 .. $size - 1) {
            next if $max_element >= $table_ref->[$i][$j];
            $max_element = $table_ref->[$i][$j];
        }
    }
    die "Need wider table column width given '$max_element'"
                                    if length $max_element > $column_width;

    my $table_type = ($max_element == 1)
                     ? 'bool'
                     : 'U8';

    # If a name is longer than the width set aside for a column, its column
    # needs to have increased spacing so that the name doesn't get truncated
    # nor run into an adjacent column
    my @spacers;

    # If we are being compiled on a Unicode version earlier than that which
    # this file was designed for, it may be that some of the property values
    # aren't in the current release, and so would be undefined if we didn't
    # define them ourselves.  Earlier code has done this, making them
    # lowercase characters of length one.  We look to see if any exist, so
    # that we can add an annotation to the output table
    my $has_placeholder = 0;

    for my $i (0 .. $size - 1) {

        # A name shorter than the column width makes the repeat count below
        # negative, which yields an empty spacer
        no warnings 'numeric';
        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
        $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
    }

    print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";

    # Calculate the column heading line
    my $header_line = "/* "
                    . (" " x $max_hdr_len)  # We let the row heading meld to
                                            # the '*/' for those that are at
                                            # the max
                    . " " x 3;              # Space for '*/ '
    # Now each column
    for my $i (0 .. $size - 1) {
        $header_line .= sprintf "%s%*s",
                                $spacers[$i],
                                $column_width + 1, # 1 for the ','
                                $names_ref->[$i];
    }
    $header_line .= " */\n";

    # If we have annotations, output it now.
    if ($has_placeholder || scalar %$abbreviations_ref) {
        my $text = "";
        foreach my $abbr (sort keys %$abbreviations_ref) {
            $text .= "; " if $text;
            $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
        }
        if ($has_placeholder) {
            $text .= "; other " if $text;
            $text .= "lowercase names are placeholders for"
                  .  " property values not defined until a later Unicode"
                  .  " release, so are irrelevant in this one, as they are"
                  .  " not assigned to any code points";
        }

        my $indent = " " x 3;
        $text = $indent . "/* $text */";

        # Every line starts with the indent plus three characters of comment
        # leader: '/* ' on the first line and ' * ' on the rest.
        my $prefix_length = length($indent) + 3;

        # Wrap the text so that it is no wider than the table, which the
        # header line gives.
        my $output_width = length $header_line;
        while (length $text > $output_width) {

            # Find the first blank back from the right end to wrap at.
            my $wrap_at = rindex($text, " ", $output_width - 1);

            # Wrapping at a blank within the line's own prefix would make no
            # progress, and the loop would never finish.  If that is all
            # there is, give up on wrapping and let the rest be output as-is,
            # over-long.
            last if $wrap_at < $prefix_length;

            print $out_fh substr($text, 0, $wrap_at), "\n";

            # Set so will look at just the remaining tail (which will be
            # indented and have a '*' after the indent)
            $text = $indent . " * " . substr($text, $wrap_at + 1);
        }

        # And any remaining
        print $out_fh $text, "\n" if $text;
    }

    # We calculated the header line earlier just to get its width so that we
    # could make sure the annotations fit into that.
    print $out_fh $header_line;

    # Now output the bulk of the table.
    for my $i (0 .. $size - 1) {

        # First the row heading.
        printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
        print $out_fh "{";  # Then the brace for this row

        # Then each column
        for my $j (0 .. $size -1) {
            print $out_fh $spacers[$j];
            printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
            print $out_fh "," if $j < $size - 1;
        }
        print $out_fh " }";
        print $out_fh "," if $i < $size - 1;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
731
973a28ed
KW
sub output_GCB_table() {

    # Create and output the pair table for use in determining Grapheme Cluster
    # Breaks, given in http://www.unicode.org/reports/tr29/.  Each entry is
    # indexed by the %gcb_enums values of the class before the position in
    # question, and of the class after it, and contains one of the actions
    # below.
    my %gcb_actions = (
        GCB_NOBREAK                 => 0,
        GCB_BREAKABLE               => 1,
        GCB_RI_then_RI              => 2,   # Rules 12 and 13
        GCB_EX_then_EM              => 3,   # Rule 10
    );
    my $no_break  = $gcb_actions{GCB_NOBREAK};
    my $breakable = $gcb_actions{GCB_BREAKABLE};

    my @gcb_table;
    my $table_size = @gcb_short_enums;

    # Helpers to set, respectively: the single entry for class $before
    # followed by class $after; every entry where $before is the first of
    # the pair; and every entry where $after is the second.
    my $set_pair = sub {
        my ($before, $after, $action) = @_;
        $gcb_table[$gcb_enums{$before}][$gcb_enums{$after}] = $action;
    };
    my $set_row = sub {
        my ($before, $action) = @_;
        $gcb_table[$gcb_enums{$before}][$_] = $action for 0 .. $#gcb_table;
    };
    my $set_column = sub {
        my ($after, $action) = @_;
        $gcb_table[$_][$gcb_enums{$after}] = $action for 0 .. $#gcb_table;
    };

    # The table is constructed in reverse order of the rules, to make the
    # lower-numbered, higher priority ones override the later ones, as the
    # algorithm stops at the earliest matching rule

    # Otherwise, break everywhere.
    # GB99   Any ÷ Any
    for my $row (0 .. $table_size - 1) {
        $gcb_table[$row] = [ ($breakable) x $table_size ];
    }

    # Do not break within emoji flag sequences.  That is, do not break between
    # regional indicator (RI) symbols if there is an odd number of RI
    # characters before the break point.  Must be resolved in runtime code.
    #
    # GB12 sot (RI RI)* RI × RI
    # GB13 [^RI] (RI RI)* RI × RI
    $set_pair->('Regional_Indicator', 'Regional_Indicator',
                $gcb_actions{GCB_RI_then_RI});

    # Do not break within emoji modifier sequences or emoji zwj sequences.
    # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
    $set_pair->('ZWJ', $_, $no_break) for 'Glue_After_Zwj', 'E_Base_GAZ';

    # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier
    $set_pair->('Extend', 'E_Modifier', $gcb_actions{GCB_EX_then_EM});
    $set_pair->($_, 'E_Modifier', $no_break) for 'E_Base', 'E_Base_GAZ';

    # Do not break before extending characters or ZWJ.
    # Do not break before SpacingMarks, or after Prepend characters.
    # GB9b Prepend ×
    # GB9a × SpacingMark
    # GB9 × ( Extend | ZWJ )
    $set_row->('Prepend', $no_break);
    $set_column->($_, $no_break) for 'SpacingMark', 'Extend', 'ZWJ';

    # Do not break Hangul syllable sequences.
    # GB8 ( LVT | T) × T
    $set_pair->($_, 'T', $no_break) for 'LVT', 'T';

    # GB7 ( LV | V ) × ( V | T )
    for my $before ('LV', 'V') {
        $set_pair->($before, $_, $no_break) for 'V', 'T';
    }

    # GB6 L × ( L | V | LV | LVT )
    $set_pair->('L', $_, $no_break) for 'L', 'V', 'LV', 'LVT';

    # Do not break between a CR and LF.  Otherwise, break before and after
    # controls.
    # GB5 ÷ ( Control | CR | LF )
    # GB4 ( Control | CR | LF ) ÷
    $set_column->($_, $breakable) for 'Control', 'CR', 'LF';
    $set_row->($_, $breakable) for 'Control', 'CR', 'LF';

    # GB3 CR × LF
    $set_pair->('CR', 'LF', $no_break);

    # Break at the start and end of text, unless the text is empty
    # GB1 sot ÷
    # GB2 ÷ eot
    $set_column->('EDGE', $breakable);
    $set_row->('EDGE', $breakable);
    $set_pair->('EDGE', 'EDGE', $no_break);

    output_table_common('GCB', \%gcb_actions,
                        \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
}
835
6b659339
KW
836sub output_LB_table() {
837
838 # Create and output the enums, #defines, and pair table for use in
839 # determining Line Breaks. This uses the default line break algorithm,
840 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
841 # in that page, as the Unicode-furnished tests assume that tailoring.
842
6b659339
KW
843 # The result is really just true or false. But we follow along with tr14,
844 # creating a rule which is false for something like X SP* X. That gets
845 # encoding 2. The rest of the actions are synthetic ones that indicate
846 # some context handling is required. These each are added to the
847 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
848 # value can be retrieved. Actually only rules from 7 through 18 (which
849 # are the ones where space matter) are possible to have 2 added to them.
850 # The others below add just 0 or 1. It might be possible for one
851 # synthetic rule to be added to another, yielding a larger value. This
852 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
853 # names of the middle grouping below, it is impossible for that to occur
854 # for them because they all start with mutually exclusive classes. That
855 # the final rule can't be added to any of the others isn't obvious from
856 # its name, so it is assigned a power of 2 higher than the others can get
857 # to so any addition would preserve all data. (And the code will reach an
858 # assert(0) on debugging builds should this happen.)
859 my %lb_actions = (
860 LB_NOBREAK => 0,
861 LB_BREAKABLE => 1,
862 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
863
b0e24409 864 LB_CM_ZWJ_foo => 3, # Rule 9
6b659339
KW
865 LB_SP_foo => 6, # Rule 18
866 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
867 LB_SY_or_IS_then_various => 11, # Rule 25
868 LB_HY_or_BA_then_foo => 13, # Rule 21
b0e24409 869 LB_RI_then_RI => 15, # Rule 30a
6b659339 870
b0e24409 871 LB_various_then_PO_or_PR => (1<<5), # Rule 25
6b659339
KW
872 );
873
6b659339
KW
874 # Construct the LB pair table. This is based on the rules in
875 # http://www.unicode.org/reports/tr14/, but modified as those rules are
876 # designed for someone taking a string of text and sequentially going
877 # through it to find the break opportunities, whereas, Perl requires
878 # determining if a given random spot is a break opportunity, without
879 # knowing all the entire string before it.
880 #
881 # The table is constructed in reverse order of the rules, to make the
882 # lower-numbered, higher priority ones override the later ones, as the
883 # algorithm stops at the earliest matching rule
884
885 my @lb_table;
886 my $table_size = @lb_short_enums;
887
888 # LB31. Break everywhere else
889 for my $i (0 .. $table_size - 1) {
890 for my $j (0 .. $table_size - 1) {
891 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
892 }
893 }
894
b0e24409
KW
895 # LB30b Do not break between an emoji base and an emoji modifier.
896 # EB × EM
897 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
898 = $lb_actions{'LB_NOBREAK'};
899
900 # LB30a Break between two regional indicator symbols if and only if there
901 # are an even number of regional indicators preceding the position of the
902 # break.
903 # sot (RI RI)* RI × RI
904 # [^RI] (RI RI)* RI × RI
289ce9cc 905 $lb_table[$lb_enums{'Regional_Indicator'}]
b0e24409 906 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
6b659339
KW
907
908 # LB30 Do not break between letters, numbers, or ordinary symbols and
909 # opening or closing parentheses.
910 # (AL | HL | NU) × OP
289ce9cc
KW
911 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
912 = $lb_actions{'LB_NOBREAK'};
913 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
914 = $lb_actions{'LB_NOBREAK'};
915 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
916 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
917
918 # CP × (AL | HL | NU)
289ce9cc
KW
919 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
920 = $lb_actions{'LB_NOBREAK'};
921 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
922 = $lb_actions{'LB_NOBREAK'};
923 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
924 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
925
926 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
927 # IS × (AL | HL)
289ce9cc
KW
928 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
929 = $lb_actions{'LB_NOBREAK'};
930 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
931 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
932
933 # LB28 Do not break between alphabetics (“at”).
934 # (AL | HL) × (AL | HL)
289ce9cc
KW
935 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
936 = $lb_actions{'LB_NOBREAK'};
937 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
938 = $lb_actions{'LB_NOBREAK'};
939 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
940 = $lb_actions{'LB_NOBREAK'};
941 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
942 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
943
944 # LB27 Treat a Korean Syllable Block the same as ID.
945 # (JL | JV | JT | H2 | H3) × IN
289ce9cc
KW
946 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
947 = $lb_actions{'LB_NOBREAK'};
948 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
949 = $lb_actions{'LB_NOBREAK'};
950 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
951 = $lb_actions{'LB_NOBREAK'};
952 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
953 = $lb_actions{'LB_NOBREAK'};
954 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
955 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
956
957 # (JL | JV | JT | H2 | H3) × PO
289ce9cc
KW
958 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
959 = $lb_actions{'LB_NOBREAK'};
960 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
961 = $lb_actions{'LB_NOBREAK'};
962 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
963 = $lb_actions{'LB_NOBREAK'};
964 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
965 = $lb_actions{'LB_NOBREAK'};
966 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
967 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
968
969 # PR × (JL | JV | JT | H2 | H3)
289ce9cc
KW
970 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
971 = $lb_actions{'LB_NOBREAK'};
972 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
973 = $lb_actions{'LB_NOBREAK'};
974 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
975 = $lb_actions{'LB_NOBREAK'};
976 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
977 = $lb_actions{'LB_NOBREAK'};
978 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
979 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
980
981 # LB26 Do not break a Korean syllable.
982 # JL × (JL | JV | H2 | H3)
983 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
984 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
985 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
986 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
987
988 # (JV | H2) × (JV | JT)
989 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
990 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
991 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
992 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
993
994 # (JT | H3) × JT
995 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
996 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
997
998 # LB25 Do not break between the following pairs of classes relevant to
999 # numbers, as tailored by example 7 in
1000 # http://www.unicode.org/reports/tr14/#Examples
1001 # We follow that tailoring because Unicode's test cases expect it
1002 # (PR | PO) × ( OP | HY )? NU
289ce9cc
KW
1003 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
1004 = $lb_actions{'LB_NOBREAK'};
1005 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
1006 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1007
1008 # Given that (OP | HY )? is optional, we have to test for it in code.
1009 # We add in the action (instead of overriding) for this, so that in
1010 # the code we can recover the underlying break value.
289ce9cc 1011 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1012 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1013 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1014 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1015 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339 1016 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1017 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339
KW
1018 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1019
1020 # ( OP | HY ) × NU
289ce9cc
KW
1021 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
1022 = $lb_actions{'LB_NOBREAK'};
1023 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
1024 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1025
1026 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
1027 # which can be rewritten as:
1028 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
289ce9cc
KW
1029 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
1030 = $lb_actions{'LB_NOBREAK'};
1031 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
1032 = $lb_actions{'LB_NOBREAK'};
1033 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
1034 = $lb_actions{'LB_NOBREAK'};
1035 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
1036 = $lb_actions{'LB_NOBREAK'};
1037 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
1038 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1039
1040 # Like earlier where we have to test in code, we add in the action so
1041 # that we can recover the underlying values. This is done in rules
1042 # below, as well. The code assumes that we haven't added 2 actions.
1043 # Should a later Unicode release break that assumption, then tests
1044 # should start failing.
289ce9cc 1045 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
6b659339 1046 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1047 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
6b659339 1048 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1049 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
6b659339 1050 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1051 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
6b659339 1052 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1053 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
6b659339 1054 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1055 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
6b659339 1056 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1057 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
6b659339 1058 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1059 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
6b659339 1060 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1061 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
6b659339 1062 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1063 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
6b659339
KW
1064 += $lb_actions{'LB_SY_or_IS_then_various'};
1065
1066 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1067 # which can be rewritten as:
1068 # NU (SY | IS)* (CL | CP)? × (PO | PR)
289ce9cc
KW
1069 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1070 = $lb_actions{'LB_NOBREAK'};
1071 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1072 = $lb_actions{'LB_NOBREAK'};
6b659339 1073
289ce9cc 1074 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1075 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1076 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1077 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1078 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1079 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1080 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
6b659339
KW
1081 += $lb_actions{'LB_various_then_PO_or_PR'};
1082
289ce9cc 1083 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1084 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1085 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1086 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1087 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1088 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1089 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
6b659339
KW
1090 += $lb_actions{'LB_various_then_PO_or_PR'};
1091
b0e24409
KW
1092 # LB24 Do not break between numeric prefix/postfix and letters, or between
1093 # letters and prefix/postfix.
1094 # (PR | PO) × (AL | HL)
289ce9cc
KW
1095 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1096 = $lb_actions{'LB_NOBREAK'};
1097 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1098 = $lb_actions{'LB_NOBREAK'};
289ce9cc
KW
1099 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1100 = $lb_actions{'LB_NOBREAK'};
1101 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1102 = $lb_actions{'LB_NOBREAK'};
6b659339 1103
b0e24409
KW
1104 # (AL | HL) × (PR | PO)
1105 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
1106 = $lb_actions{'LB_NOBREAK'};
1107 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
1108 = $lb_actions{'LB_NOBREAK'};
1109 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
1110 = $lb_actions{'LB_NOBREAK'};
1111 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
1112 = $lb_actions{'LB_NOBREAK'};
1113
1114 # LB23a Do not break between numeric prefixes and ideographs, or between
1115 # ideographs and numeric postfixes.
1116 # PR × (ID | EB | EM)
1117 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1118 = $lb_actions{'LB_NOBREAK'};
1119 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
1120 = $lb_actions{'LB_NOBREAK'};
1121 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
1122 = $lb_actions{'LB_NOBREAK'};
1123
1124 # (ID | EB | EM) × PO
289ce9cc
KW
1125 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1126 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1127 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1128 = $lb_actions{'LB_NOBREAK'};
1129 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1130 = $lb_actions{'LB_NOBREAK'};
6b659339 1131
b0e24409 1132 # LB23 Do not break between digits and letters
6b659339 1133 # (AL | HL) × NU
289ce9cc
KW
1134 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1135 = $lb_actions{'LB_NOBREAK'};
1136 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1137 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1138
1139 # NU × (AL | HL)
289ce9cc
KW
1140 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1141 = $lb_actions{'LB_NOBREAK'};
1142 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1143 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1144
1145 # LB22 Do not break between two ellipses, or between letters, numbers or
1146 # exclamations and ellipsis.
1147 # (AL | HL) × IN
289ce9cc
KW
1148 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1149 = $lb_actions{'LB_NOBREAK'};
1150 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1151 = $lb_actions{'LB_NOBREAK'};
6b659339 1152
289ce9cc
KW
1153 # Exclamation × IN
1154 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1155 = $lb_actions{'LB_NOBREAK'};
6b659339 1156
b0e24409 1157 # (ID | EB | EM) × IN
289ce9cc
KW
1158 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1159 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1160 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1161 = $lb_actions{'LB_NOBREAK'};
1162 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1163 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1164
1165 # IN × IN
289ce9cc
KW
1166 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1167 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1168
1169 # NU × IN
289ce9cc
KW
1170 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1171 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1172
1173 # LB21b Don’t break between Solidus and Hebrew letters.
1174 # SY × HL
289ce9cc
KW
1175 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1176 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1177
1178 # LB21a Don't break after Hebrew + Hyphen.
1179 # HL (HY | BA) ×
1180 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1181 $lb_table[$lb_enums{'Hyphen'}][$i]
1182 += $lb_actions{'LB_HY_or_BA_then_foo'};
1183 $lb_table[$lb_enums{'Break_After'}][$i]
1184 += $lb_actions{'LB_HY_or_BA_then_foo'};
6b659339
KW
1185 }
1186
1187 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1188 # spaces, small kana, and other non-starters, or after acute accents.
1189 # × BA
1190 # × HY
1191 # × NS
1192 # BB ×
1193 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1194 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1195 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1196 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1197 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1198 }
1199
1200 # LB20 Break before and after unresolved CB.
1201 # ÷ CB
1202 # CB ÷
1203 # Conditional breaks should be resolved external to the line breaking
1204 # rules. However, the default action is to treat unresolved CB as breaking
1205 # before and after.
1206 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1207 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1208 = $lb_actions{'LB_BREAKABLE'};
1209 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1210 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1211 }
1212
1213 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1214 # × QU
1215 # QU ×
1216 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1217 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1218 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1219 }
1220
1221 # LB18 Break after spaces
1222 # SP ÷
1223 for my $i (0 .. @lb_table - 1) {
289ce9cc 1224 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1225 }
1226
1227 # LB17 Do not break within ‘——’, even with intervening spaces.
1228 # B2 SP* × B2
289ce9cc 1229 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
6b659339
KW
1230 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1231
1232 # LB16 Do not break between closing punctuation and a nonstarter even with
1233 # intervening spaces.
1234 # (CL | CP) SP* × NS
289ce9cc 1235 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
6b659339 1236 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1237 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
6b659339
KW
1238 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1239
1240
1241 # LB15 Do not break within ‘”[’, even with intervening spaces.
1242 # QU SP* × OP
289ce9cc 1243 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
6b659339
KW
1244 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1245
1246 # LB14 Do not break after ‘[’, even after spaces.
1247 # OP SP* ×
1248 for my $i (0 .. @lb_table - 1) {
289ce9cc 1249 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
6b659339
KW
1250 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1251 }
1252
1253 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1254 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1255 # [^NU] × CL
1256 # [^NU] × CP
1257 # × EX
1258 # [^NU] × IS
1259 # [^NU] × SY
1260 for my $i (0 .. @lb_table - 1) {
289ce9cc 1261 $lb_table[$i][$lb_enums{'Exclamation'}]
6b659339
KW
1262 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1263
289ce9cc 1264 next if $i == $lb_enums{'Numeric'};
6b659339 1265
289ce9cc 1266 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
6b659339 1267 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1268 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
6b659339 1269 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1270 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
6b659339 1271 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1272 $lb_table[$i][$lb_enums{'Break_Symbols'}]
6b659339
KW
1273 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1274 }
1275
1276 # LB12a Do not break before NBSP and related characters, except after
1277 # spaces and hyphens.
1278 # [^SP BA HY] × GL
1279 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1280 next if $i == $lb_enums{'Space'}
1281 || $i == $lb_enums{'Break_After'}
1282 || $i == $lb_enums{'Hyphen'};
6b659339
KW
1283
1284 # We don't break, but if a property above has said don't break even
1285 # with space between, don't override that (also in the next few rules)
289ce9cc 1286 next if $lb_table[$i][$lb_enums{'Glue'}]
6b659339 1287 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1288 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1289 }
1290
1291 # LB12 Do not break after NBSP and related characters.
1292 # GL ×
1293 for my $i (0 .. @lb_table - 1) {
289ce9cc 1294 next if $lb_table[$lb_enums{'Glue'}][$i]
6b659339 1295 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1296 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1297 }
1298
1299 # LB11 Do not break before or after Word joiner and related characters.
1300 # × WJ
1301 # WJ ×
1302 for my $i (0 .. @lb_table - 1) {
289ce9cc 1303 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
6b659339
KW
1304 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1305 {
289ce9cc 1306 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
6b659339 1307 }
289ce9cc 1308 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
6b659339
KW
1309 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1310 {
289ce9cc 1311 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1312 }
1313 }
1314
1315 # Special case this here to avoid having to do a special case in the code,
1316 # by making this the same as other things with a SP in front of them that
1317 # don't break, we avoid an extra test
289ce9cc 1318 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
6b659339
KW
1319 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1320
1321 # LB9 and LB10 are done in the same loop
1322 #
1323 # LB9 Do not break a combining character sequence; treat it as if it has
1324 # the line breaking class of the base character in all of the
b0e24409
KW
1325 # higher-numbered rules. Treat ZWJ as if it were CM
1326 # Treat X (CM|ZWJ)* as if it were X.
6b659339
KW
1327 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1328
b0e24409
KW
1329 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the
1330 # case where a CM or ZWJ is the first character on the line or follows SP,
1331 # BK, CR, LF, NL, or ZW.
6b659339
KW
1332 for my $i (0 .. @lb_table - 1) {
1333
b0e24409
KW
1334 # When the CM or ZWJ is the first in the pair, we don't know without
1335 # looking behind whether the CM or ZWJ is going to attach to an
1336 # earlier character, or not. So have to figure this out at runtime in
1337 # the code
1338 $lb_table[$lb_enums{'Combining_Mark'}][$i]
1339 = $lb_actions{'LB_CM_ZWJ_foo'};
1340 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
289ce9cc
KW
1341
1342 if ( $i == $lb_enums{'Mandatory_Break'}
1343 || $i == $lb_enums{'EDGE'}
1344 || $i == $lb_enums{'Carriage_Return'}
1345 || $i == $lb_enums{'Line_Feed'}
1346 || $i == $lb_enums{'Next_Line'}
1347 || $i == $lb_enums{'Space'}
1348 || $i == $lb_enums{'ZWSpace'})
6b659339
KW
1349 {
1350 # For these classes, a following CM doesn't combine, and should do
289ce9cc
KW
1351 # whatever 'Alphabetic' would do.
1352 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1353 = $lb_table[$i][$lb_enums{'Alphabetic'}];
b0e24409
KW
1354 $lb_table[$i][$lb_enums{'ZWJ'}]
1355 = $lb_table[$i][$lb_enums{'Alphabetic'}];
6b659339
KW
1356 }
1357 else {
b0e24409
KW
1358 # For these classes, the CM or ZWJ combines, so doesn't break,
1359 # inheriting the type of nobreak from the master character.
289ce9cc 1360 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
6b659339
KW
1361 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1362 {
289ce9cc
KW
1363 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1364 = $lb_actions{'LB_NOBREAK'};
6b659339 1365 }
b0e24409
KW
1366 if ($lb_table[$i][$lb_enums{'ZWJ'}]
1367 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1368 {
1369 $lb_table[$i][$lb_enums{'ZWJ'}]
1370 = $lb_actions{'LB_NOBREAK'};
1371 }
6b659339
KW
1372 }
1373 }
1374
b0e24409
KW
1375 # LB8a Do not break between a zero width joiner and an ideograph, emoji
1376 # base or emoji modifier. This rule prevents breaks within emoji joiner
1377 # sequences.
1378 # ZWJ × (ID | EB | EM)
1379 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1380 = $lb_actions{'LB_NOBREAK'};
1381 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1382 = $lb_actions{'LB_NOBREAK'};
1383 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1384 = $lb_actions{'LB_NOBREAK'};
1385
6b659339
KW
1386 # LB8 Break before any character following a zero-width space, even if one
1387 # or more spaces intervene.
1388 # ZW SP* ÷
1389 for my $i (0 .. @lb_table - 1) {
289ce9cc 1390 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1391 }
1392
1393 # Because of LB8-10, we need to look at context for "SP x", and this must
1394 # be done in the code. So override the existing rules for that, by adding
1395 # a constant to get new rules that tell the code it needs to look at
1396 # context. By adding this action instead of replacing the existing one,
1397 # we can get back to the original rule if necessary.
1398 for my $i (0 .. @lb_table - 1) {
289ce9cc 1399 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
6b659339
KW
1400 }
1401
1402 # LB7 Do not break before spaces or zero width space.
1403 # × SP
1404 # × ZW
1405 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1406 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1407 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1408 }
1409
1410 # LB6 Do not break before hard line breaks.
1411 # × ( BK | CR | LF | NL )
1412 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1413 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1414 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1415 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1416 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1417 }
1418
1419 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1420 # CR × LF
1421 # CR !
1422 # LF !
1423 # NL !
1424 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1425 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1426 = $lb_actions{'LB_BREAKABLE'};
1427 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1428 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339 1429 }
289ce9cc
KW
1430 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1431 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1432
1433 # LB4 Always break after hard line breaks.
1434 # BK !
1435 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1436 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1437 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1438 }
1439
6b659339
KW
1440 # LB3 Always break at the end of text.
1441 # ! eot
b0e24409
KW
1442 # LB2 Never break at the start of text.
1443 # sot ×
6b659339 1444 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1445 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1446 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1447 }
1448
1449 # LB1 Assign a line breaking class to each code point of the input.
1450 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1451 # depending on criteria outside the scope of this algorithm.
1452 #
1453 # In the absence of such criteria all characters with a specific
1454 # combination of original class and General_Category property value are
1455 # resolved as follows:
1456 # Original Resolved General_Category
1457 # AI, SG, XX AL Any
1458 # SA CM Only Mn or Mc
1459 # SA AL Any except Mn and Mc
1460 # CJ NS Any
1461 #
1462 # This is done in mktables, so we never see any of the remapped-from
1463 # classes.
1464
289ce9cc
KW
1465 output_table_common('LB', \%lb_actions,
1466 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
6b659339
KW
1467}
1468
7e54b87f
KW
1469sub output_WB_table() {
1470
1471 # Create and output the enums, #defines, and pair table for use in
1472 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1473
1474 # This uses the same mechanism as the other bounds tables generated by
1475 # this file. The actions that could override a 0 or 1 are added to those
1476 # numbers; the actions that clearly don't depend on the underlying rule
1477 # simply overwrite
1478 my %wb_actions = (
1479 WB_NOBREAK => 0,
1480 WB_BREAKABLE => 1,
1481 WB_hs_then_hs => 2,
b0e24409 1482 WB_Ex_or_FO_or_ZWJ_then_foo => 3,
7e54b87f
KW
1483 WB_DQ_then_HL => 4,
1484 WB_HL_then_DQ => 6,
1485 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1486 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1487 WB_MB_or_MN_or_SQ_then_NU => 12,
1488 WB_NU_then_MB_or_MN_or_SQ => 14,
b0e24409 1489 WB_RI_then_RI => 16,
7e54b87f
KW
1490 );
1491
7e54b87f
KW
1492 # Construct the WB pair table.
1493 # The table is constructed in reverse order of the rules, to make the
1494 # lower-numbered, higher priority ones override the later ones, as the
1495 # algorithm stops at the earliest matching rule
1496
1497 my @wb_table;
1498 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
1499
1500 # Otherwise, break everywhere (including around ideographs).
b0e24409 1501 # WB99 Any ÷ Any
7e54b87f
KW
1502 for my $i (0 .. $table_size - 1) {
1503 for my $j (0 .. $table_size - 1) {
1504 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1505 }
1506 }
1507
b0e24409
KW
1508 # Do not break within emoji flag sequences. That is, do not break between
1509 # regional indicator (RI) symbols if there is an odd number of RI
1510 # characters before the break point.
1511 # WB16 [^RI] (RI RI)* RI × RI
c492f156 1512 # WB15 sot (RI RI)* RI × RI
289ce9cc 1513 $wb_table[$wb_enums{'Regional_Indicator'}]
b0e24409
KW
1514 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};
1515
1516 # Do not break within emoji modifier sequences.
1517 # WB14 ( E_Base | EBG ) × E_Modifier
1518 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}]
1519 = $wb_actions{'WB_NOBREAK'};
1520 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}]
1521 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1522
1523 # Do not break from extenders.
1524 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
289ce9cc
KW
1525 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1526 = $wb_actions{'WB_NOBREAK'};
1527 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1528 = $wb_actions{'WB_NOBREAK'};
1529 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1530 = $wb_actions{'WB_NOBREAK'};
1531 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1532 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1533
1534 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1535 # × ExtendNumLet
289ce9cc
KW
1536 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1537 = $wb_actions{'WB_NOBREAK'};
1538 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1539 = $wb_actions{'WB_NOBREAK'};
1540 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1541 = $wb_actions{'WB_NOBREAK'};
1542 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1543 = $wb_actions{'WB_NOBREAK'};
1544 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1545 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1546
1547 # Do not break between Katakana.
1548 # WB13 Katakana × Katakana
289ce9cc
KW
1549 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1550 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1551
1552 # Do not break within sequences, such as “3.2” or “3,456.789”.
1553 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
289ce9cc 1554 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
7e54b87f 1555 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1556 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
7e54b87f 1557 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1558 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1559 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1560
1561 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
289ce9cc 1562 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
7e54b87f 1563 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1564 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
7e54b87f 1565 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1566 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
7e54b87f
KW
1567 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1568
1569 # Do not break within sequences of digits, or digits adjacent to letters
1570 # (“3a”, or “A3”).
1571 # WB10 Numeric × (ALetter | Hebrew_Letter)
289ce9cc
KW
1572 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1573 = $wb_actions{'WB_NOBREAK'};
1574 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1575 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1576
1577 # WB9 (ALetter | Hebrew_Letter) × Numeric
289ce9cc
KW
1578 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1579 = $wb_actions{'WB_NOBREAK'};
1580 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1581 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1582
1583 # WB8 Numeric × Numeric
289ce9cc
KW
1584 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1585 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1586
1587 # Do not break letters across certain punctuation.
1588 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
289ce9cc
KW
1589 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1590 += $wb_actions{'WB_DQ_then_HL'};
7e54b87f
KW
1591
1592 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
289ce9cc
KW
1593 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1594 += $wb_actions{'WB_HL_then_DQ'};
7e54b87f
KW
1595
1596 # WB7a Hebrew_Letter × Single_Quote
289ce9cc
KW
1597 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1598 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1599
1600 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1601 # × (ALetter | Hebrew_Letter)
289ce9cc 1602 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
7e54b87f 1603 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1604 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1605 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1606 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
7e54b87f 1607 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1608 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1609 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1610 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
7e54b87f 1611 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1612 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f
KW
1613 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1614
1615 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1616 # | Single_Quote) (ALetter | Hebrew_Letter)
289ce9cc 1617 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1618 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1619 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1620 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1621 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
7e54b87f 1622 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1623 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
7e54b87f 1624 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1625 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
7e54b87f 1626 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1627 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1628 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1629
1630 # Do not break between most letters.
1631 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
289ce9cc
KW
1632 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1633 = $wb_actions{'WB_NOBREAK'};
1634 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1635 = $wb_actions{'WB_NOBREAK'};
1636 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1637 = $wb_actions{'WB_NOBREAK'};
1638 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1639 = $wb_actions{'WB_NOBREAK'};
7e54b87f 1640
b0e24409
KW
1641 # Ignore Format and Extend characters, except after sot, CR, LF, and
1642 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
1643 # WB4 X (Extend | Format | ZWJ)* → X
7e54b87f 1644 for my $i (0 .. @wb_table - 1) {
289ce9cc 1645 $wb_table[$wb_enums{'Extend'}][$i]
b0e24409 1646 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
289ce9cc 1647 $wb_table[$wb_enums{'Format'}][$i]
b0e24409
KW
1648 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1649 $wb_table[$wb_enums{'ZWJ'}][$i]
1650 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1651 }
1652 for my $i (0 .. @wb_table - 1) {
1653 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1654 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1655 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1656 }
1657
1658 # Implied is that these attach to the character before them, except for
1659 # the characters that mark the end of a region of text. The rules below
1660 # override the ones set up here, for all the characters that need
1661 # overriding.
1662 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1663 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1664 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1665 }
1666
b0e24409
KW
1667 # Do not break within emoji zwj sequences.
1668 # WB3c ZWJ × ( Glue_After_Zwj | EBG )
1669 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
1670 = $wb_actions{'WB_NOBREAK'};
1671 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
1672 = $wb_actions{'WB_NOBREAK'};
1673
7e54b87f
KW
1674 # Break before and after white space
1675 # WB3b ÷ (Newline | CR | LF)
1676 # WB3a (Newline | CR | LF) ÷
1677 # et al.
289ce9cc 1678 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1679 for my $j (0 .. @wb_table - 1) {
1680 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1681 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1682 }
1683 }
1684
1685 # But do not break within white space.
1686 # WB3 CR × LF
1687 # et al.
289ce9cc
KW
1688 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1689 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1690 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1691 }
1692 }
1693
b0e24409 1694 # And do not break horizontal space followed by Extend or Format or ZWJ
289ce9cc
KW
1695 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1696 = $wb_actions{'WB_NOBREAK'};
1697 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1698 = $wb_actions{'WB_NOBREAK'};
b0e24409
KW
1699 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}]
1700 = $wb_actions{'WB_NOBREAK'};
289ce9cc
KW
1701 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1702 [$wb_enums{'Perl_Tailored_HSpace'}]
1703 = $wb_actions{'WB_hs_then_hs'};
7e54b87f 1704
b0e24409
KW
1705 # Break at the start and end of text, unless the text is empty
1706 # WB2 Any ÷ eot
1707 # WB1 sot ÷ Any
7e54b87f 1708 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1709 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1710 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
7e54b87f 1711 }
289ce9cc 1712 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
7e54b87f 1713
289ce9cc
KW
1714 output_table_common('WB', \%wb_actions,
1715 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
7e54b87f
KW
1716}
1717
9d9177be
KW
1718output_invlist("Latin1", [ 0, 256 ]);
1719output_invlist("AboveLatin1", [ 256 ]);
1720
bffc0129 1721end_file_pound_if;
43b443dd 1722
3f427fd9
KW
1723# We construct lists for all the POSIX and backslash sequence character
1724# classes in two forms:
1725# 1) ones which match only in the ASCII range
1726# 2) ones which match either in the Latin1 range, or the entire Unicode range
1727#
1728# These get compiled in, and hence affect the memory footprint of every Perl
1729# program, even those not using Unicode. To minimize the size, currently
1730# the Latin1 version is generated for the beyond ASCII range except for those
1731# lists that are quite small for the entire range, such as for \s, which is 22
1732# UVs long plus 4 UVs (currently) for the header.
1733#
1734# To save even more memory, the ASCII versions could be derived from the
1735# larger ones at runtime, saving some memory (minus the expense of the machine
1736# instructions to do so), but these are all small anyway, so their total is
1737# about 100 UVs.
1738#
1739# In the list of properties below that get generated, the L1 prefix is a fake
1740# property that means just the Latin1 range of the full property (whose name
1741# has an X prefix instead of L1).
a02047bf
KW
1742#
1743# An initial & means to use the subroutine from this file instead of an
1744# official inversion list.
3f427fd9 1745
0c4ecf42
KW
1746for my $charset (get_supported_code_pages()) {
1747 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1748
99f21fb9
KW
1749 @a2n = @{get_a2n($charset)};
1750 no warnings 'qw';
1751 # Ignore non-alpha in sort
1752 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
c0382778 1753 Assigned
1c8c3428
KW
1754 ASCII
1755 Cased
1756 VertSpace
1757 XPerlSpace
1758 XPosixAlnum
1759 XPosixAlpha
1760 XPosixBlank
1761 XPosixCntrl
1762 XPosixDigit
1763 XPosixGraph
1764 XPosixLower
1765 XPosixPrint
1766 XPosixPunct
1767 XPosixSpace
1768 XPosixUpper
1769 XPosixWord
1770 XPosixXDigit
1771 _Perl_Any_Folds
1772 &NonL1_Perl_Non_Final_Folds
1773 _Perl_Folds_To_Multi_Char
1774 &UpperLatin1
1775 _Perl_IDStart
1776 _Perl_IDCont
02f811dd 1777 _Perl_GCB,EDGE
ca8226cf 1778 _Perl_LB,EDGE
bf4268fa 1779 _Perl_SB,EDGE
190d69bb 1780 _Perl_WB,EDGE,UNKNOWN
1c8c3428 1781 )
0f5e3c71
KW
1782 ) {
1783
1784 # For the Latin1 properties, we change to use the eXtended version of the
1785 # base property, then go through the result and get rid of everything not
1786 # in Latin1 (above 255). Actually, we retain the element for the range
1787 # that crosses the 255/256 boundary if it is one that matches the
1788 # property. For example, in the Word property, there is a range of code
1789 # points that starts at U+00F8 and goes through U+02C1. Instead of
1790 # artificially cutting that off at 256 because 256 is the first code point
1791 # above Latin1, we let the range go to its natural ending. That gives us
1792 # extra information with no added space taken. But if the range that
1793 # crosses the boundary is one that doesn't match the property, we don't
1794 # start a new range above 255, as that could be construed as going to
1795 # infinity. For example, the Upper property doesn't include the character
1796 # at 255, but does include the one at 256. We don't include the 256 one.
1797 my $prop_name = $prop;
1798 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
1799 my $extra_enums = "";
1800 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
1801 my $lookup_prop = $prop_name;
1802 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1803 or $lookup_prop =~ s/^L1//);
1804 my $nonl1_only = 0;
1805 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 1806 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
1807
1808 my @invlist;
99f21fb9
KW
1809 my @invmap;
1810 my $map_format;
1811 my $map_default;
1812 my $maps_to_code_point;
1813 my $to_adjust;
0f5e3c71
KW
1814 if ($is_local_sub) {
1815 @invlist = eval $lookup_prop;
289ce9cc 1816 die $@ if $@;
0f5e3c71
KW
1817 }
1818 else {
1819 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 1820 if (! @invlist) {
99f21fb9 1821
ad85f59a
KW
1822 # If we couldn't find a non-empty inversion list, see if it is
1823 # instead an inversion map
1824 my ($list_ref, $map_ref, $format, $default)
99f21fb9 1825 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
1826 if (! $list_ref) {
1827 # An empty return here could mean an unknown property, or
1828 # merely that the original inversion list is empty. Call
1829 # in scalar context to differentiate
1830 my $count = prop_invlist($lookup_prop,
1831 '_perl_core_internal_ok');
1832 die "Could not find inversion list for '$lookup_prop'"
1833 unless defined $count;
1834 }
1835 else {
18b852b3
KW
1836 @invlist = @$list_ref;
1837 @invmap = @$map_ref;
1838 $map_format = $format;
1839 $map_default = $default;
1840 $maps_to_code_point = $map_format =~ /x/;
1841 $to_adjust = $map_format =~ /a/;
ad85f59a 1842 }
99f21fb9 1843 }
0f5e3c71 1844 }
ad85f59a
KW
1845
1846
1847 # Short-circuit an empty inversion list.
1848 if (! @invlist) {
1849 output_invlist($prop_name, \@invlist, $charset);
1850 next;
1851 }
ceb1de32 1852
99f21fb9
KW
1853 # Re-order the Unicode code points to native ones for this platform.
1854 # This is only needed for code points below 256, because native code
1855 # points are only in that range. For inversion maps of properties
1856 # where the mappings are adjusted (format =~ /a/), this reordering
1857 # could mess up the adjustment pattern that was in the input, so that
1858 # has to be dealt with.
1859 #
1860 # And inversion maps that map to code points need to eventually have
1861 # all those code points remapped to native, and it's better to do that
1862 # here, going through the whole list not just those below 256. This
1863 # is because some inversion maps have adjustments (format =~ /a/)
1864 # which may be affected by the reordering. This code needs to be done
1865 # both for when we are translating the inversion lists for < 256, and
1866 # for the inversion maps for everything. By doing both in this loop,
1867 # we can share that code.
1868 #
1869 # So, we go through everything for an inversion map to code points;
1870 # otherwise, we can skip any remapping at all if we are going to
1871 # output only the above-Latin1 values, or if the range spans the whole
1872 # of 0..256, as the remap will also include all of 0..256 (256 not
1873 # 255 because a re-ordering could cause 256 to need to be in the same
1874 # range as 255.)
1875 if ((@invmap && $maps_to_code_point)
1876 || (! $nonl1_only || ($invlist[0] < 256
1877 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 1878 {
fb4554ea 1879
99f21fb9 1880 if (! @invmap) { # Straight inversion list
fb4554ea
KW
1881 # Look at all the ranges that start before 257.
1882 my @latin1_list;
1883 while (@invlist) {
1884 last if $invlist[0] > 256;
1885 my $upper = @invlist > 1
1886 ? $invlist[1] - 1 # In range
8a6c81cf
KW
1887
1888 # To infinity. You may want to stop much much
1889 # earlier; going this high may expose perl
1890 # deficiencies with very large numbers.
1891 : $Unicode::UCD::MAX_CP;
fb4554ea 1892 for my $j ($invlist[0] .. $upper) {
99f21fb9 1893 push @latin1_list, a2n($j);
0f5e3c71 1894 }
fb4554ea
KW
1895
1896 shift @invlist; # Shift off the range that's in the list
1897 shift @invlist; # Shift off the range not in the list
0c4ecf42 1898 }
fb4554ea
KW
1899
1900 # Here @invlist contains all the ranges in the original that start
1901 # at code points above 256, and @latin1_list contains all the
1902 # native code points for ranges that start with a Unicode code
1903 # point below 257. We sort the latter and convert it to inversion
1904 # list format. Then simply prepend it to the list of the higher
1905 # code points.
1906 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 1907 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 1908 unshift @invlist, @latin1_list;
99f21fb9
KW
1909 }
1910 else { # Is an inversion map
1911
1912 # This is a similar procedure as plain inversion list, but has
1913 # multiple buckets. A plain inversion list just has two
1914 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1915 # pretty much can ignore the 2nd bucket, as it is completely
1916 # defined by the 1st. But here, what we do is create buckets
1917 # which contain the code points that map to each, translated
1918 # to native and turned into an inversion list. Thus each
1919 # bucket is an inversion list of native code points that map
1920 # to it or don't map to it. We use these to create an
1921 # inversion map for the whole property.
1922
1923 # As mentioned earlier, we use this procedure to not just
1924 # remap the inversion list to native values, but also the maps
1925 # of code points to native ones. In the latter case we have
1926 # to look at the whole of the inversion map (or at least to
1927 # above Unicode; as the maps of code points above that should
1928 # all be to the default).
1929 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1930
1931 my %mapped_lists; # A hash whose keys are the buckets.
1932 while (@invlist) {
1933 last if $invlist[0] > $upper_limit;
1934
1935 # This shouldn't actually happen, as prop_invmap() returns
1936 # an extra element at the end that is beyond $upper_limit
1937 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1938
1939 my $bucket;
1940
1941 # A hash key can't be a ref (we are only expecting arrays
1942 # of scalars here), so convert any such to a string that
1943 # will be converted back later (using a vertical tab as
1944 # the separator). Even if the mapping is to code points,
1945 # we don't translate to native here because the code
d8049362 1946 # output_invmap() calls to output these arrays assumes the
99f21fb9
KW
1947 # input is Unicode, not native.
1948 if (ref $invmap[0]) {
1949 $bucket = join "\cK", @{$invmap[0]};
1950 }
1951 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1952
1953 # Do convert to native for maps to single code points.
1954 # There are some properties that have a few outlier
1955 # maps that aren't code points, so the above test
1956 # skips those.
1957 $bucket = a2n($invmap[0]);
1958 } else {
1959 $bucket = $invmap[0];
1960 }
1961
1962 # We now have the bucket that all code points in the range
1963 # map to, though possibly they need to be adjusted. Go
1964 # through the range and put each translated code point in
1965 # it into its bucket.
1966 my $base_map = $invmap[0];
1967 for my $j ($invlist[0] .. $invlist[1] - 1) {
1968 if ($to_adjust
1969 # The 1st code point doesn't need adjusting
1970 && $j > $invlist[0]
1971
1972 # Skip any non-numeric maps: these are outliers
1973 # that aren't code points.
1974 && $base_map =~ $numeric_re
1975
1976 # 'ne' because the default can be a string
1977 && $base_map ne $map_default)
1978 {
1979 # We adjust, by incrementing each of the bucket and
1980 # the map. For code point maps, translate to
1981 # native
1982 $base_map++;
1983 $bucket = ($maps_to_code_point)
1984 ? a2n($base_map)
1985 : $base_map;
1986 }
1987
1988 # Add the native code point to the bucket for the
1989 # current map
1990 push @{$mapped_lists{$bucket}}, a2n($j);
1991 } # End of loop through all code points in the range
1992
1993 # Get ready for the next range
1994 shift @invlist;
1995 shift @invmap;
1996 } # End of loop through all ranges in the map.
1997
1998 # Here, @invlist and @invmap retain all the ranges from the
1999 # originals that start with code points above $upper_limit.
2000 # Each bucket in %mapped_lists contains all the code points
2001 # that map to that bucket. If the bucket is for a map to a
2002 # single code point, the bucket has
2003 # been converted to native. If something else (including
2004 # multiple code points), no conversion is done.
2005 #
2006 # Now we recreate the inversion map into %xlated, but this
2007 # time for the native character set.
2008 my %xlated;
2009 foreach my $bucket (keys %mapped_lists) {
2010
2011 # Sort and convert this bucket to an inversion list. The
2012 # result will be that ranges that start with even-numbered
2013 # indexes will be for code points that map to this bucket;
2014 # odd ones map to some other bucket, and are discarded
2015 # below.
2016 @{$mapped_lists{$bucket}}
2017 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
2018 @{$mapped_lists{$bucket}}
2019 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
2020
2021 # Add each even-numbered range in the bucket to %xlated;
2022 # so that the keys of %xlated become the range start code
2023 # points, and the values are their corresponding maps.
2024 while (@{$mapped_lists{$bucket}}) {
2025 my $range_start = $mapped_lists{$bucket}->[0];
2026 if ($bucket =~ /\cK/) {
2027 @{$xlated{$range_start}} = split /\cK/, $bucket;
2028 }
2029 else {
2030 $xlated{$range_start} = $bucket;
2031 }
2032 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
2033 shift @{$mapped_lists{$bucket}}; # Get ready for next
2034 # iteration
2035 }
2036 } # End of loop through all the buckets.
2037
2038 # Here %xlated's keys are the range starts of all the code
2039 # points in the inversion map. Construct an inversion list
2040 # from them.
2041 my @new_invlist = sort { $a <=> $b } keys %xlated;
2042
2043 # If the list is adjusted, we want to munge this list so that
2044 # we only have one entry for where consecutive code points map
2045 # to consecutive values. We just skip the subsequent entries
2046 # where this is the case.
2047 if ($to_adjust) {
2048 my @temp;
2049 for my $i (0 .. @new_invlist - 1) {
2050 next if $i > 0
2051 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
2052 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
2053 && $xlated{$new_invlist[$i]} =~ $numeric_re
2054 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
2055 push @temp, $new_invlist[$i];
2056 }
2057 @new_invlist = @temp;
2058 }
2059
2060 # The inversion map comes from %xlated's values. We can
2061 # unshift each onto the front of the untouched portion, in
2062 # reverse order of the portion we did process.
2063 foreach my $start (reverse @new_invlist) {
2064 unshift @invmap, $xlated{$start};
2065 }
2066
2067 # Finally prepend the inversion list we have just constructed to the
2068 # one that contains anything we didn't process.
2069 unshift @invlist, @new_invlist;
2070 }
2071 }
2072
2073 # prop_invmap() returns an extra final entry, which we can now
2074 # discard.
2075 if (@invmap) {
2076 pop @invlist;
2077 pop @invmap;
ceb1de32 2078 }
0f5e3c71
KW
2079
2080 if ($l1_only) {
99f21fb9 2081 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
2082 for my $i (0 .. @invlist - 1 - 1) {
2083 if ($invlist[$i] > 255) {
2084
2085 # In an inversion list, even-numbered elements give the code
2086 # points that begin ranges that match the property;
2087 # odd-numbered give ones that begin ranges that don't match.
2088 # If $i is odd, we are at the first code point above 255 that
2089 # doesn't match, which means the range it is ending does
2090 # match, and crosses the 255/256 boundary. We want to include
2091 # this ending point, so increment $i, so the splice below
2092 # includes it. Conversely, if $i is even, it is the first
2093 # code point above 255 that matches, which means there was no
2094 # matching range that crossed the boundary, and we don't want
2095 # to include this code point, so splice before it.
2096 $i++ if $i % 2 != 0;
2097
2098 # Remove everything past this.
2099 splice @invlist, $i;
99f21fb9 2100 splice @invmap, $i if @invmap;
0f5e3c71
KW
2101 last;
2102 }
0c4ecf42
KW
2103 }
2104 }
0f5e3c71
KW
2105 elsif ($nonl1_only) {
2106 my $found_nonl1 = 0;
2107 for my $i (0 .. @invlist - 1 - 1) {
2108 next if $invlist[$i] < 256;
2109
2110 # Here, we have the first element in the array that indicates an
2111 # element above Latin1. Get rid of all previous ones.
2112 splice @invlist, 0, $i;
99f21fb9 2113 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
2114
2115 # If this one's index is not divisible by 2, it means that this
2116 # element is inverting away from being in the list, which means
99f21fb9
KW
2117 # all code points from 256 to this one are in this list (or
2118 # map to the default for inversion maps)
2119 if ($i % 2 != 0) {
2120 unshift @invlist, 256;
2121 unshift @invmap, $map_default if @invmap;
2122 }
0f5e3c71 2123 $found_nonl1 = 1;
3f427fd9
KW
2124 last;
2125 }
0f5e3c71 2126 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 2127 }
3f427fd9 2128
0f5e3c71 2129 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 2130 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 2131 }
bffc0129 2132 end_file_pound_if;
0c4ecf42 2133 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
2134}
2135
973a28ed
KW
2136switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
2137
2138output_GCB_table();
6b659339 2139output_LB_table();
7e54b87f 2140output_WB_table();
6b659339 2141
973a28ed
KW
2142end_file_pound_if;
2143
2308ab83 2144my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
2145my @sources = ($0, qw(lib/unicore/mktables
2146 lib/Unicode/UCD.pm
2147 regen/charset_translations.pl
2148 ));
9a3da3ad
FC
2149{
2150 # Depend on mktables’ own sources. It’s a shorter list of files than
2151 # those that Unicode::UCD uses.
1ae6ead9 2152 if (! open my $mktables_list, '<', $sources_list) {
2308ab83
KW
2153
2154 # This should force a rebuild once $sources_list exists
2155 push @sources, $sources_list;
2156 }
2157 else {
2158 while(<$mktables_list>) {
2159 last if /===/;
2160 chomp;
2161 push @sources, "lib/unicore/$_" if /^[^#]/;
2162 }
9a3da3ad
FC
2163 }
2164}
6b659339
KW
2165
2166read_only_bottom_close_and_rename($out_fh, \@sources);