This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Generate _Perl_SCX property
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
3d7c117d
MB
11require './regen/regen_lib.pl';
12require './regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
# out-of-sync, or the wrong data structure being passed.  That random number
# is the value of $VERSION_DATA_STRUCTURE_TYPE, set below.
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
# Random number identifying the version and layout of the inversion list data
# structure; it is written into the header of every inversion list output.
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;

# Matches an integer or a float: an optional minus sign, digits, and an
# optional fractional part.  The fractional part is a non-capturing group, so
# matching sets no capture variables.
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;

# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
# In headings of the generated tables, how wide a name is allowed?
my $max_hdr_len = 3;

# The '#if' condition currently open in the output file, or 0 when none is
# open.  Maintained by switch_pound_if() and end_file_pound_if().
my $in_file_pound_if = 0;

# Handle onto the header file being generated.
my $out_fh = open_new(
    'charclass_invlists.h',
    '>',
    { style => '*', by => $0, from => "Unicode::UCD" }
);

print {$out_fh} "/* See the generating file for comments */\n\n";
46
bffc0129
KW
# Each symbol this program generates is currently defined in just one dot c
# file.  The code knows the usual home for most of them; this hash lists the
# symbols that instead belong under a different PERL_IN_foo_C guard.
my %exceptions_to_where_to_define = (
    ( map { ($_ => 'PERL_IN_REGCOMP_C') } qw(
            NonL1_Perl_Non_Final_Folds
            AboveLatin1
            Latin1
            UpperLatin1
            _Perl_Any_Folds
            _Perl_Folds_To_Multi_Char
      )
    ),
    ( map { ($_ => 'PERL_IN_UTF8_C') } qw(
            _Perl_IDCont
            _Perl_IDStart
      )
    ),
);
015bb97c 60
973a28ed
KW
# State gathered by output_invmap() for the GCB, LB and WB properties and
# consumed when the corresponding pair tables are output:
#   %xx_enums          property value name => its enum value
#   @xx_short_enums    enum value          => short name used for headings
#   %xx_abbreviations  truncated heading   => the name it stands for
my (%gcb_enums, %lb_enums, %wb_enums);
my (@gcb_short_enums, @lb_short_enums, @wb_short_enums);
my (%gcb_abbreviations, %lb_abbreviations, %wb_abbreviations);

# Translation table consulted by a2n(), indexed by code points 0 .. 255.
# It is not filled in by any code visible in this part of the file.
my @a2n;
72
sub uniques {

    # Returns the input values with repeats dropped: the first occurrence of
    # each value is kept and the original order is preserved.  (In scalar
    # context the result is the number of distinct values.)

    my %already_seen;
    my @distinct;
    foreach my $value (@_) {
        next if $already_seen{$value}++;
        push @distinct, $value;
    }

    return @distinct;
}
80
sub a2n($) {

    # Returns the input Unicode code point translated to native, using the
    # file-level @a2n table.  Anything that doesn't look like a number, or
    # that is above 255, is handed back unchanged.

    my ($code_point) = @_;

    my $needs_translation = $code_point =~ $numeric_re
                         && $code_point <= 255;

    return $needs_translation ? $a2n[$code_point] : $code_point;
}
89
bffc0129
KW
sub end_file_pound_if {

    # If an '#if' is currently open in the output file, emit its matching
    # '#endif' and record that none is open any more.  Does nothing otherwise.

    return $in_file_pound_if unless $in_file_pound_if;

    print {$out_fh} "\n#endif\t/* $in_file_pound_if */\n";
    return $in_file_pound_if = 0;
}
96
sub switch_pound_if ($$) {

    # Makes sure the output file is inside the '#if defined(...)' section
    # appropriate for the symbol about to be output.
    #
    #   $name           name of the static being output; used only to look up
    #                   an override in %exceptions_to_where_to_define
    #   $new_pound_if   the preprocessor symbol to guard with, unless an
    #                   override exists for $name, in which case the override
    #                   is used instead
    #
    # If a different '#if' is currently open, it is closed first.  Nothing is
    # printed when the file is already inside the wanted section.

    my ($name, $new_pound_if) = @_;

    if (exists $exceptions_to_where_to_define{$name}) {
        $new_pound_if = $exceptions_to_where_to_define{$name};
    }

    my $wanted = "defined($new_pound_if)";

    # Exit current #if if the new one is different from the old.  This is an
    # exact comparison; interpolating the symbol into a pattern would treat
    # it as a regex and would also accept it as a mere substring of the
    # current condition.
    if ($in_file_pound_if && $in_file_pound_if ne $wanted) {
        end_file_pound_if();
    }

    # Enter new #if, if not already in it.
    if (! $in_file_pound_if) {
        $in_file_pound_if = $wanted;
        print $out_fh "\n#if $in_file_pound_if\n";
    }
}
122
sub output_invlist ($$;$) {

    # Output the inversion list $invlist using the name $name for it.
    # It is output in the exact internal form for inversion lists.
    #
    #   $name       base name; the C array is called "${name}_invlist"
    #   $invlist    reference to the inversion list array
    #   $charset    optional name of the character set, for a comment
    #
    # Dies if $invlist is not an array reference.  The caller's array is left
    # untouched: the leading 0 that the internal form requires is added to a
    # private copy.

    my $name = shift;
    my $invlist = shift;     # Reference to inversion list array
    my $charset = shift // "";  # name of character set for comment

    die "No inversion list for $name" unless defined $invlist
                                             && ref $invlist eq 'ARRAY';

    my @elements = @$invlist;

    # Is the last element of the header 0, or 1 ?
    my $zero_or_one = 0;
    if (@elements && $elements[0] != 0) {
        unshift @elements, 0;
        $zero_or_one = 1;
    }
    my $count = @elements;

    switch_pound_if($name, 'PERL_IN_PERL_C');

    print $out_fh "\nstatic const UV ${name}_invlist[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    print $out_fh "\t$count,\t/* Number of elements */\n";
    print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
    print $out_fh "\t", $zero_or_one,
                  ",\t/* 0 if the list starts at 0;",
                  "\n\t\t 1 if it starts at the element beyond 0 */\n";

    # The main body are the UVs passed in to this routine.  Every element but
    # the final one is followed by a comma.
    for my $i (0 .. $#elements) {
        printf $out_fh "\t0x%X", $elements[$i];
        print $out_fh "," if $i < $#elements;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
164
99f21fb9
KW
sub output_invmap ($$$$$$$) {

    # Output the inversion map $invmap for property $prop_name, but use $name
    # as the actual data structure's name.
    #
    # What gets printed, in order:
    #   1) a typedef'd enum with one value per property value (the default
    #      value is always 0), plus, for 'sl' maps, one negative value per
    #      distinct list found in the map;
    #   2) for 'sl' maps that contain lists, the auxiliary tables holding
    #      those lists, a table of pointers to them, and a table of their
    #      lengths;
    #   3) the array "${name}_invmap" itself.
    #
    # Side effects beyond the output:
    #   * when $name is _Perl_GCB, _Perl_LB or _Perl_WB, the corresponding
    #     file-level %xx_enums, @xx_short_enums and %xx_abbreviations are
    #     filled in (once only), for later use in generating the pair tables;
    #   * for 'sl' maps, each list element of @$invmap is replaced in place by
    #     the name of the enum that stands for its auxiliary table.
    #
    # Dies if the format isn't 's' or 'sl', or if $invmap isn't a reference
    # to a non-empty array.

    my $name = shift;
    my $invmap = shift;         # Reference to inversion map array
    my $prop_name = shift;
    my $input_format = shift;   # The inversion map's format
    my $default = shift;        # The property value for code points who
                                # otherwise don't have a value specified.
    my $extra_enums = shift;    # comma-separated list of our additions to the
                                # property's standard possible values
    my $charset = shift // "";  # name of character set for comment

    # Check the arguments before anything is dereferenced or output.
    die "'$input_format' invmap() format for '$prop_name' unimplemented"
                                    unless $input_format =~ / ^ s l? $ /x;

    die "No inversion map for $prop_name" unless defined $invmap
                                             && ref $invmap eq 'ARRAY'
                                             && @$invmap;

    my $count = @$invmap;

    $prop_name = (prop_aliases($prop_name))[1]
                 // $prop_name =~ s/^_Perl_//r;     # Get full name
    my $short_name = (prop_aliases($prop_name))[0] // $prop_name;

    # Find all the possible input values.  These become the enum names
    # that comprise the inversion map.  For inputs that don't have sub
    # lists, we can just get the unique values.  Otherwise, we have to
    # expand the sublists first.
    my @raw_values;
    foreach my $element (@$invmap) {
        if ($input_format eq 'sl' && ref $element) {
            push @raw_values, @$element;
        }
        else {
            push @raw_values, $element;
        }
    }
    my @input_enums = sort(uniques(@raw_values));

    # The internal enums come last, and in the order specified.
    my @enums = @input_enums;
    if ($extra_enums ne "") {

        # Don't add if already there.
        foreach my $this_extra (split /,/, $extra_enums) {
            next if grep { $_ eq $this_extra } @enums;

            push @enums, $this_extra;
        }
    }

    # Assign a value to each element of the enum type we are creating.
    # The default value always gets 0; the others are arbitrarily
    # assigned.
    my %enums;
    my $enum_val = 0;
    my $canonical_default = prop_value_aliases($prop_name, $default);
    $default = $canonical_default if defined $canonical_default;
    $enums{$default} = $enum_val++;

    for my $enum (@enums) {
        $enums{$enum} = $enum_val++ unless exists $enums{$enum};
    }

    # Calculate the data for the special tables output for these properties.
    if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {

        # The data includes the hashes %gcb_enums, %lb_enums, etc.
        # Similarly we calculate column headings for the tables.
        #
        # The same code works on all the tables by going through references
        # to the file-level variables, selected by the lowercased property
        # name.  A name with no entry here is skipped.
        my %break_tables = (
            gcb => { enums         => \%gcb_enums,
                     short_enums   => \@gcb_short_enums,
                     abbreviations => \%gcb_abbreviations,
                   },
            lb  => { enums         => \%lb_enums,
                     short_enums   => \@lb_short_enums,
                     abbreviations => \%lb_abbreviations,
                   },
            wb  => { enums         => \%wb_enums,
                     short_enums   => \@wb_short_enums,
                     abbreviations => \%wb_abbreviations,
                   },
        );
        my $type = lc $prop_name;
        my $tables = $break_tables{$type};

        my $placeholder = "a";

        # Skip if we've already done this code, which populated
        # this hash
        if ($tables && ! %{ $tables->{enums} }) {

            # For each enum in the type ...
            foreach my $enum (sort keys %enums) {
                my $value = $enums{$enum};
                my $short;
                my $abbreviated_from;

                # Special case this wb property value to make the
                # name more clear
                if ($enum eq 'Perl_Tailored_HSpace') {
                    $short = 'hs';
                    $abbreviated_from = $enum;
                }
                else {

                    # Use the official short name, if found.
                    ($short) = prop_value_aliases($type, $enum);

                    if (! defined $short) {

                        # But if there is no official name, use the name
                        # that came from the data (if any).  Otherwise,
                        # the name had to come from the extras list.
                        # There are two types of values in that list.
                        #
                        # First are those enums that are not part of the
                        # property, but are defined by this code.  By
                        # convention these have all-caps names of at least
                        # 4 characters.  We use the lowercased name for
                        # these.
                        #
                        # Second are enums that are needed to get
                        # regexec.c to compile, but don't exist in all
                        # Unicode releases.  To get here, we must be
                        # compiling an earlier Unicode release that
                        # doesn't have that enum, so just use a unique
                        # anonymous name for it.
                        if (grep { $_ eq $enum } @input_enums) {
                            $short = $enum
                        }
                        elsif ($enum !~ / ^ [A-Z]{4,} $ /x) {
                            $short = $placeholder++;
                        }
                        else {
                            $short = lc $enum;
                        }
                    }
                }

                # If our short name is too long, or we already
                # know that the name is an abbreviation, truncate
                # to make sure it's short enough, and remember
                # that we did this so we can later add a comment in the
                # generated file
                if (   $abbreviated_from
                    || length $short > $max_hdr_len)
                {
                    $short = substr($short, 0, $max_hdr_len);
                    $abbreviated_from = $enum
                                        unless $abbreviated_from;

                    # If the name we are to display conflicts, try
                    # another.
                    while (exists $tables->{abbreviations}{$short}) {

                        # The increment operator on strings doesn't work
                        # on those containing an '_', so just use the
                        # final portion.
                        my @short = split '_', $short;
                        $short[-1]++;
                        $short = join "_", @short;
                    }

                    $tables->{abbreviations}{$short} = $enum;
                }

                # Remember the mapping from the property value
                # (enum) name to its value.
                $tables->{enums}{$enum} = $value;

                # Remember the inverse mapping to the short name
                # so that we can properly label the generated
                # table's rows and columns
                $tables->{short_enums}[$value] = $short;
            }
        }
    }

    # Inversion map stuff is currently used only by regexec
    switch_pound_if($name, 'PERL_IN_REGEXEC_C');

    # The short names tend to be two lower case letters, but it looks
    # better for those if they are upper. XXX
    $short_name = uc($short_name) if length($short_name) < 3
                            || substr($short_name, 0, 1) =~ /[[:lower:]]/;
    my $name_prefix = "${short_name}_";

    # Currently unneeded
    #print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";

    if ($input_format eq 'sl') {
        print $out_fh
        "\n/* Negative enum values indicate the need to use an auxiliary"
        . " table\n * consisting of the list of enums this one expands to."
        . "  The absolute\n * values of the negative enums are indices into"
        . " a table of the auxiliary\n * tables' addresses */";
    }

    # Start the enum definition for this map
    print $out_fh "\ntypedef enum {\n";
    my @enum_list;
    foreach my $enum (keys %enums) {
        $enum_list[$enums{$enum}] = $enum;
    }
    foreach my $i (0 .. @enum_list - 1) {
        print $out_fh ",\n" if $i > 0;

        my $enum_name = $enum_list[$i];
        print $out_fh "\t${name_prefix}$enum_name = $i";
    }

    # For an 'sl' property, we need extra enums, because some of the
    # elements are lists.  Each such distinct list is placed in its own
    # auxiliary map table.  Here, we go through the inversion map, and for
    # each distinct list found, create an enum value for it, numbered -1,
    # -2, ....
    my %multiples;
    my $aux_table_prefix = "AUX_TABLE_";
    if ($input_format eq 'sl') {
        foreach my $element (@$invmap) {

            # A regular scalar is not one of the lists we're looking for
            # at this stage.
            next unless ref $element;

            my $joined = join ",", sort @$element;
            my $already_found = exists $multiples{$joined};

            my $i;
            if ($already_found) {   # Use any existing one
                $i = $multiples{$joined};
            }
            else {  # Otherwise increment to get a new table number
                $i = keys(%multiples) + 1;
                $multiples{$joined} = $i;
            }

            # This changes the inversion map for this entry to not be the
            # list.  ($element aliases the entry in the caller's array.)
            $element = "use_$aux_table_prefix$i";

            # And add to the enum values
            if (! $already_found) {
                print $out_fh ",\n\t${name_prefix}$element = -$i";
            }
        }
    }

    print $out_fh "\n";
    my $declaration_type = "${name_prefix}enum";
    print $out_fh "} $declaration_type;\n";
    # Finished with the enum definition.

    # If there are auxiliary tables, output them.
    if (%multiples) {

        print $out_fh "\n#define HAS_${name_prefix}AUX_TABLES\n";

        # Invert keys and values
        my %inverted_mults;
        while (my ($key, $value) = each %multiples) {
            $inverted_mults{$value} = $key;
        }

        # Output them in sorted order
        my @sorted_table_list = sort { $a <=> $b } keys %inverted_mults;

        # Keep track of how big each aux table is
        my @aux_counts;

        # Output each aux table.
        foreach my $table_number (@sorted_table_list) {
            my $table = $inverted_mults{$table_number};
            print $out_fh "\nstatic const $declaration_type $name_prefix$aux_table_prefix$table_number\[] = {\n";

            # Earlier, we joined the elements of this table together with a comma
            my @elements = split ",", $table;

            $aux_counts[$table_number] = scalar @elements;
            for my $i (0 .. @elements - 1) {
                print $out_fh ",\n" if $i > 0;
                print $out_fh "\t${name_prefix}$elements[$i]";
            }
            print $out_fh "\n};\n";
        }

        # Output the table that is indexed by the absolute value of the
        # aux table enum and contains pointers to the tables output just
        # above
        print $out_fh "\nstatic const $declaration_type * const ${name_prefix}${aux_table_prefix}ptrs\[] = {\n";
        print $out_fh "\tNULL,\t/* Placeholder */\n";
        for my $i (1 .. @sorted_table_list) {
            print $out_fh ",\n" if $i > 1;
            print $out_fh  "\t$name_prefix$aux_table_prefix$i";
        }
        print $out_fh "\n};\n";

        print $out_fh
          "\n/* Parallel table to the above, giving the number of elements"
        . " in each table\n * pointed to */\n";
        print $out_fh "static const U8 ${name_prefix}${aux_table_prefix}lengths\[] = {\n";
        print $out_fh "\t0,\t/* Placeholder */\n";
        for my $i (1 .. @sorted_table_list) {
            print $out_fh ",\n" if $i > 1;
            print $out_fh "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */";
        }
        print $out_fh "\n};\n";
    } # End of outputting the auxiliary and associated tables

    # Now output the inversion map proper
    print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    # The main body are the scalars passed in to this routine.
    for my $i (0 .. $count - 1) {
        my $element = $invmap->[$i];
        my $full_element_name = prop_value_aliases($prop_name, $element);
        $element = $full_element_name if defined $full_element_name;
        $element = $name_prefix . $element;
        print $out_fh "\t$element";
        print $out_fh "," if $i < $count - 1;
        print $out_fh  "\n";
    }
    print $out_fh "};\n";
}
502
sub mk_invlist_from_sorted_cp_list {

    # Returns an inversion list constructed from the sorted input array of
    # code points.
    #
    #   $list_ref   reference to an array of code points in ascending order;
    #               repeated values are tolerated and ignored
    #
    # The result is a list of range boundaries: each range of consecutive
    # code points contributes its first member and one past its last member.
    # Returns an empty list (undef in scalar context) for empty input.

    my $list_ref = shift;

    return unless @$list_ref;

    # Initialize to just the first element
    my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);

    # For each succeeding element, if it extends the previous range, adjust
    # up, otherwise add it.
    for my $i (1 .. @$list_ref - 1) {
        my $cp = $list_ref->[$i];

        # In sorted input, anything below the end of the current range is a
        # repeat of a code point already in it.  Starting a new range for it
        # would yield overlapping ranges, which is not a valid inversion list.
        next if $cp < $invlist[-1];

        if ($invlist[-1] == $cp) {
            $invlist[-1]++;
        }
        else {
            push @invlist, $cp, $cp + 1;
        }
    }
    return @invlist;
}
527
# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need:
#   @has_multi_char_fold    code points whose fold is more than one character
#   @is_non_final_fold      each distinct code point that occurs anywhere but
#                           last in such a fold, in order of first appearance
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al'
                                                           || $format eq 'a';
my @has_multi_char_fold;
my @is_non_final_fold;

{
    # Tracks which code points are already in @is_non_final_fold
    my %is_recorded;

    for my $i (0 .. $#{$folds_ref}) {
        my $fold = $folds_ref->[$i];
        next unless ref $fold;      # Skip single-char folds
        push @has_multi_char_fold, $cp_ref->[$i];

        # Add to the non-finals list each code point that is in a non-final
        # position, that is every member of the fold except its last
        for my $j (0 .. @$fold - 2) {
            my $non_final = $fold->[$j];
            next if $is_recorded{$non_final}++;
            push @is_non_final_fold, $non_final;
        }
    }
}
549
a02047bf
KW
sub _Perl_Non_Final_Folds {

    # Returns the inversion list of the code points gathered in the
    # file-level @is_non_final_fold.  As a side effect that array is left
    # sorted in ascending numeric order.

    my @ascending = sort { $a <=> $b } @is_non_final_fold;
    @is_non_final_fold = @ascending;

    return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
}
554
99f21fb9
KW
sub prop_name_for_cmp ($) { # Sort helper

    # Returns the comparison key for the input name: everything from the
    # first comma on is dropped, then every non-alphabetic character is
    # removed, and what is left is lowercased.

    my ($name) = @_;

    return lc($name =~ s/,.*//r =~ s/[[:^alpha:]]//gr);
}
565
sub UpperLatin1 {

    # Returns the inversion list for the code points 128 through 255.

    my @upper_half = (128 .. 255);
    return mk_invlist_from_sorted_cp_list(\@upper_half);
}
569
289ce9cc
KW
sub output_table_common {

    # Common subroutine to actually output the generated rules table.
    #
    #   $property                   name used for the C array, which is
    #                               called "${property}_table"
    #   $table_value_defines_ref    if true, reference to a hash of
    #                               symbol => numeric value, each of which is
    #                               output as a #define ahead of the table
    #   $table_ref                  reference to the table itself, an array
    #                               of rows; it is output as a square with
    #                               as many columns as there are rows
    #   $names_ref                  reference to an array giving the heading
    #                               of each row and column
    #   $abbreviations_ref          reference to a hash of abbreviated
    #                               heading => what it stands for, used to
    #                               annotate the table
    #
    # Dies if some element of the table has more digits than a column has
    # room for.

    my ($property,
        $table_value_defines_ref,
        $table_ref,
        $names_ref,
        $abbreviations_ref) = @_;
    my $size = @$table_ref;

    # Output the #define list, sorted by numeric value
    if ($table_value_defines_ref) {
        my $max_name_length = 0;
        my @defines;

        # Put in order, and at the same time find the longest name
        foreach my $enum (keys %$table_value_defines_ref) {
            $defines[ $table_value_defines_ref->{$enum} ] = $enum;

            my $length = length $enum;
            $max_name_length = $length if $length > $max_name_length;
        }

        print $out_fh "\n";

        # Output, so that the values are vertically aligned in a column after
        # the longest name
        foreach my $i (0 .. @defines - 1) {
            next unless defined $defines[$i];
            printf $out_fh "#define %-*s  %2d\n",
                                      $max_name_length,
                                       $defines[$i],
                                          $i;
        }
    }

    my $column_width = 2;   # We currently allow 2 digits for the number

    # If the maximum value in the table is 1, it can be a bool.  (Being above
    # a U8 is not currently handled)
    my $max_element = 0;
    for my $i (0 .. $size - 1) {
        for my $j (0 .. $size - 1) {
            next if $max_element >= $table_ref->[$i][$j];
            $max_element = $table_ref->[$i][$j];
        }
    }
    die "Need wider table column width given '$max_element'"
                                    if length $max_element > $column_width;

    my $table_type = ($max_element == 1)
                     ? 'bool'
                     : 'U8';

    # If a name is longer than the width set aside for a column, its column
    # needs to have increased spacing so that the name doesn't get truncated
    # nor run into an adjacent column
    my @spacers;

    # If we are being compiled on a Unicode version earlier than that which
    # this file was designed for, it may be that some of the property values
    # aren't in the current release, and so would be undefined if we didn't
    # define them ourselves.  Earlier code has done this, making them
    # lowercase characters of length one.  We look to see if any exist, so
    # that we can add an annotation to the output table
    my $has_placeholder = 0;

    for my $i (0 .. $size - 1) {

        # A name narrower than the column makes the repeat count below
        # negative, which yields the wanted empty string but would warn
        no warnings 'numeric';
        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
        $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
    }

    print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";

    # Calculate the column heading line
    my $header_line = "/* "
                    . (" " x $max_hdr_len)  # We let the row heading meld to
                                            # the '*/' for those that are at
                                            # the max
                    . " " x 3;    # Space for '*/ '
    # Now each column
    for my $i (0 .. $size - 1) {
        $header_line .= sprintf "%s%*s",
                                $spacers[$i],
                                    $column_width + 1, # 1 for the ','
                                     $names_ref->[$i];
    }
    $header_line .= " */\n";

    # If we have annotations, output it now.
    if ($has_placeholder || scalar %$abbreviations_ref) {
        my $text = "";
        foreach my $abbr (sort keys %$abbreviations_ref) {
            $text .= "; " if $text;
            $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
        }
        if ($has_placeholder) {
            $text .= "; other " if $text;
            $text .= "lowercase names are placeholders for"
                  .  " property values not defined until a later Unicode"
                  .  " release, so are irrelevant in this one, as they are"
                  .  " not assigned to any code points";
        }

        my $indent = " " x 3;
        $text = $indent . "/* $text */";

        # Every output line starts with the indent followed by a 3 character
        # comment leader ('/* ' or ' * ').
        my $leader_len = length($indent) + 3;

        # Wrap the text so that it is no wider than the table, which the
        # header line gives.
        my $output_width = length $header_line;
        while (length $text > $output_width) {

            # Find the first blank back from the right end to wrap at.
            my $wrap_at = rindex($text, " ", $output_width - 1);

            # A blank within the leader isn't a real break point: wrapping
            # there removes nothing from the text, so the loop would never
            # finish.  Give up and let the line be too wide instead.
            last if $wrap_at < $leader_len;

            print $out_fh substr($text, 0, $wrap_at), "\n";

            # Set so will look at just the remaining tail (which will
            # be indented and have a '*' after the indent
            $text = $indent . " * " . substr($text, $wrap_at + 1);
        }

        # And any remaining
        print $out_fh $text, "\n" if $text;
    }

    # We calculated the header line earlier just to get its width so that we
    # could make sure the annotations fit into that.
    print $out_fh $header_line;

    # Now output the bulk of the table.
    for my $i (0 .. $size - 1) {

        # First the row heading.
        printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
        print $out_fh "{";  # Then the brace for this row

        # Then each column
        for my $j (0 .. $size -1) {
            print $out_fh $spacers[$j];
            printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
            print $out_fh "," if $j < $size - 1;
        }
        print $out_fh " }";
        print $out_fh "," if $i < $size - 1;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
726
973a28ed
KW
sub output_GCB_table() {

    # Create and output the pair table for use in determining Grapheme Cluster
    # Breaks, given in http://www.unicode.org/reports/tr29/.
    #
    # Each cell, indexed [class before the position][class after it], holds
    # one of these actions.
    my %gcb_actions = (
        GCB_NOBREAK                      => 0,
        GCB_BREAKABLE                    => 1,
        GCB_RI_then_RI                   => 2,   # Rules 12 and 13
        GCB_EX_then_EM                   => 3,   # Rule 10
    );
    my $no_break  = 0;
    my $breakable = 1;

    # The table is constructed in reverse order of the rules, to make the
    # lower-numbered, higher priority ones override the later ones, as the
    # algorithm stops at the earliest matching rule

    my @gcb_table;
    my $table_size = @gcb_short_enums;

    # Stores the action for each [ before, after ] pair of class names given.
    my $set_pairs = sub {
        my ($action, @pairs) = @_;
        foreach my $pair (@pairs) {
            my ($before, $after) = @$pair;
            $gcb_table[ $gcb_enums{$before} ][ $gcb_enums{$after} ] = $action;
        }
    };

    # Otherwise, break everywhere.
    # GB99   Any ÷  Any
    foreach my $row (0 .. $table_size - 1) {
        $gcb_table[$row] = [ ($breakable) x $table_size ];
    }

    # Do not break within emoji flag sequences. That is, do not break between
    # regional indicator (RI) symbols if there is an odd number of RI
    # characters before the break point.  Must be resolved in runtime code.
    #
    # GB12 sot (RI RI)* RI × RI
    # GB13 [^RI] (RI RI)* RI × RI
    $set_pairs->($gcb_actions{GCB_RI_then_RI},
                 [ qw(Regional_Indicator Regional_Indicator) ]);

    # Do not break within emoji modifier sequences or emoji zwj sequences.
    # GB11  ZWJ  × ( Glue_After_Zwj | E_Base_GAZ )
    $set_pairs->($no_break, [ qw(ZWJ Glue_After_Zwj) ],
                            [ qw(ZWJ E_Base_GAZ) ]);

    # GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier
    $set_pairs->($gcb_actions{GCB_EX_then_EM}, [ qw(Extend E_Modifier) ]);
    $set_pairs->($no_break, [ qw(E_Base E_Modifier) ],
                            [ qw(E_Base_GAZ E_Modifier) ]);

    # Do not break before extending characters or ZWJ.
    # Do not break before SpacingMarks, or after Prepend characters.
    # GB9b  Prepend  ×
    # GB9a  × SpacingMark
    # GB9   ×  ( Extend | ZWJ )
    foreach my $i (0 .. $#gcb_table) {
        $gcb_table[ $gcb_enums{'Prepend'} ][$i] = $no_break;
        foreach my $after (qw(SpacingMark Extend ZWJ)) {
            $gcb_table[$i][ $gcb_enums{$after} ] = $no_break;
        }
    }

    # Do not break Hangul syllable sequences.
    # GB8  ( LVT | T)  ×  T
    $set_pairs->($no_break, [ qw(LVT T) ],
                            [ qw(T   T) ]);

    # GB7  ( LV | V )  ×  ( V | T )
    $set_pairs->($no_break, [ qw(LV V) ],
                            [ qw(LV T) ],
                            [ qw(V  V) ],
                            [ qw(V  T) ]);

    # GB6  L  ×  ( L | V | LV | LVT )
    $set_pairs->($no_break, [ qw(L L)   ],
                            [ qw(L V)   ],
                            [ qw(L LV)  ],
                            [ qw(L LVT) ]);

    # Do not break between a CR and LF.  Otherwise, break before and after
    # controls.
    # GB5   ÷  ( Control | CR | LF )
    # GB4  ( Control | CR | LF )  ÷
    foreach my $i (0 .. $#gcb_table) {
        foreach my $class (qw(Control CR LF)) {
            $gcb_table[$i][ $gcb_enums{$class} ] = $breakable;
            $gcb_table[ $gcb_enums{$class} ][$i] = $breakable;
        }
    }

    # GB3  CR  ×  LF
    $set_pairs->($no_break, [ qw(CR LF) ]);

    # Break at the start and end of text, unless the text is empty
    # GB1  sot  ÷
    # GB2   ÷  eot
    foreach my $i (0 .. $#gcb_table) {
        $gcb_table[$i][ $gcb_enums{'EDGE'} ] = $breakable;
        $gcb_table[ $gcb_enums{'EDGE'} ][$i] = $breakable;
    }
    $set_pairs->($no_break, [ qw(EDGE EDGE) ]);

    output_table_common('GCB', \%gcb_actions,
                        \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
}
830
6b659339
KW
831sub output_LB_table() {
832
833 # Create and output the enums, #defines, and pair table for use in
834 # determining Line Breaks. This uses the default line break algorithm,
835 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
836 # in that page, as the Unicode-furnished tests assume that tailoring.
837
6b659339
KW
838 # The result is really just true or false. But we follow along with tr14,
839 # creating a rule which is false for something like X SP* X. That gets
840 # encoding 2. The rest of the actions are synthetic ones that indicate
841 # some context handling is required. These each are added to the
842 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
843 # value can be retrieved. Actually only rules from 7 through 18 (which
844 # are the ones where space matter) are possible to have 2 added to them.
845 # The others below add just 0 or 1. It might be possible for one
846 # synthetic rule to be added to another, yielding a larger value. This
847 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
848 # names of the middle grouping below, it is impossible for that to occur
849 # for them because they all start with mutually exclusive classes. That
850 # the final rule can't be added to any of the others isn't obvious from
851 # its name, so it is assigned a power of 2 higher than the others can get
852 # to so any addition would preserve all data. (And the code will reach an
853 # assert(0) on debugging builds should this happen.)
854 my %lb_actions = (
855 LB_NOBREAK => 0,
856 LB_BREAKABLE => 1,
857 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
858
b0e24409 859 LB_CM_ZWJ_foo => 3, # Rule 9
6b659339
KW
860 LB_SP_foo => 6, # Rule 18
861 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
862 LB_SY_or_IS_then_various => 11, # Rule 25
863 LB_HY_or_BA_then_foo => 13, # Rule 21
b0e24409 864 LB_RI_then_RI => 15, # Rule 30a
6b659339 865
b0e24409 866 LB_various_then_PO_or_PR => (1<<5), # Rule 25
6b659339
KW
867 );
868
6b659339
KW
869 # Construct the LB pair table. This is based on the rules in
870 # http://www.unicode.org/reports/tr14/, but modified as those rules are
871 # designed for someone taking a string of text and sequentially going
872 # through it to find the break opportunities, whereas, Perl requires
873 # determining if a given random spot is a break opportunity, without
874 # knowing all the entire string before it.
875 #
876 # The table is constructed in reverse order of the rules, to make the
877 # lower-numbered, higher priority ones override the later ones, as the
878 # algorithm stops at the earliest matching rule
879
880 my @lb_table;
881 my $table_size = @lb_short_enums;
882
883 # LB31. Break everywhere else
884 for my $i (0 .. $table_size - 1) {
885 for my $j (0 .. $table_size - 1) {
886 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
887 }
888 }
889
b0e24409
KW
890 # LB30b Do not break between an emoji base and an emoji modifier.
891 # EB × EM
892 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
893 = $lb_actions{'LB_NOBREAK'};
894
895 # LB30a Break between two regional indicator symbols if and only if there
896 # are an even number of regional indicators preceding the position of the
897 # break.
898 # sot (RI RI)* RI × RI
899 # [^RI] (RI RI)* RI × RI
289ce9cc 900 $lb_table[$lb_enums{'Regional_Indicator'}]
b0e24409 901 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
6b659339
KW
902
903 # LB30 Do not break between letters, numbers, or ordinary symbols and
904 # opening or closing parentheses.
905 # (AL | HL | NU) × OP
289ce9cc
KW
906 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
907 = $lb_actions{'LB_NOBREAK'};
908 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
909 = $lb_actions{'LB_NOBREAK'};
910 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
911 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
912
913 # CP × (AL | HL | NU)
289ce9cc
KW
914 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
915 = $lb_actions{'LB_NOBREAK'};
916 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
917 = $lb_actions{'LB_NOBREAK'};
918 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
919 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
920
921 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
922 # IS × (AL | HL)
289ce9cc
KW
923 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
924 = $lb_actions{'LB_NOBREAK'};
925 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
926 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
927
928 # LB28 Do not break between alphabetics (“at”).
929 # (AL | HL) × (AL | HL)
289ce9cc
KW
930 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
931 = $lb_actions{'LB_NOBREAK'};
932 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
933 = $lb_actions{'LB_NOBREAK'};
934 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
935 = $lb_actions{'LB_NOBREAK'};
936 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
937 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
938
939 # LB27 Treat a Korean Syllable Block the same as ID.
940 # (JL | JV | JT | H2 | H3) × IN
289ce9cc
KW
941 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
942 = $lb_actions{'LB_NOBREAK'};
943 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
944 = $lb_actions{'LB_NOBREAK'};
945 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
946 = $lb_actions{'LB_NOBREAK'};
947 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
948 = $lb_actions{'LB_NOBREAK'};
949 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
950 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
951
952 # (JL | JV | JT | H2 | H3) × PO
289ce9cc
KW
953 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
954 = $lb_actions{'LB_NOBREAK'};
955 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
956 = $lb_actions{'LB_NOBREAK'};
957 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
958 = $lb_actions{'LB_NOBREAK'};
959 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
960 = $lb_actions{'LB_NOBREAK'};
961 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
962 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
963
964 # PR × (JL | JV | JT | H2 | H3)
289ce9cc
KW
965 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
966 = $lb_actions{'LB_NOBREAK'};
967 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
968 = $lb_actions{'LB_NOBREAK'};
969 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
970 = $lb_actions{'LB_NOBREAK'};
971 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
972 = $lb_actions{'LB_NOBREAK'};
973 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
974 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
975
976 # LB26 Do not break a Korean syllable.
977 # JL × (JL | JV | H2 | H3)
978 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
979 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
980 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
981 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
982
983 # (JV | H2) × (JV | JT)
984 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
985 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
986 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
987 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
988
989 # (JT | H3) × JT
990 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
991 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
992
993 # LB25 Do not break between the following pairs of classes relevant to
994 # numbers, as tailored by example 7 in
995 # http://www.unicode.org/reports/tr14/#Examples
996 # We follow that tailoring because Unicode's test cases expect it
997 # (PR | PO) × ( OP | HY )? NU
289ce9cc
KW
998 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
999 = $lb_actions{'LB_NOBREAK'};
1000 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
1001 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1002
1003 # Given that (OP | HY )? is optional, we have to test for it in code.
1004 # We add in the action (instead of overriding) for this, so that in
1005 # the code we can recover the underlying break value.
289ce9cc 1006 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1007 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1008 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1009 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1010 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339 1011 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1012 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339
KW
1013 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1014
1015 # ( OP | HY ) × NU
289ce9cc
KW
1016 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
1017 = $lb_actions{'LB_NOBREAK'};
1018 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
1019 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1020
1021 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
1022 # which can be rewritten as:
1023 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
289ce9cc
KW
1024 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
1025 = $lb_actions{'LB_NOBREAK'};
1026 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
1027 = $lb_actions{'LB_NOBREAK'};
1028 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
1029 = $lb_actions{'LB_NOBREAK'};
1030 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
1031 = $lb_actions{'LB_NOBREAK'};
1032 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
1033 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1034
1035 # Like earlier where we have to test in code, we add in the action so
1036 # that we can recover the underlying values. This is done in rules
1037 # below, as well. The code assumes that we haven't added 2 actions.
1038 # Should a later Unicode release break that assumption, then tests
1039 # should start failing.
289ce9cc 1040 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
6b659339 1041 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1042 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
6b659339 1043 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1044 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
6b659339 1045 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1046 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
6b659339 1047 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1048 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
6b659339 1049 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1050 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
6b659339 1051 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1052 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
6b659339 1053 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1054 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
6b659339 1055 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1056 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
6b659339 1057 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1058 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
6b659339
KW
1059 += $lb_actions{'LB_SY_or_IS_then_various'};
1060
1061 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1062 # which can be rewritten as:
1063 # NU (SY | IS)* (CL | CP)? × (PO | PR)
289ce9cc
KW
1064 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1065 = $lb_actions{'LB_NOBREAK'};
1066 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1067 = $lb_actions{'LB_NOBREAK'};
6b659339 1068
289ce9cc 1069 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1070 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1071 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1072 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1073 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1074 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1075 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
6b659339
KW
1076 += $lb_actions{'LB_various_then_PO_or_PR'};
1077
289ce9cc 1078 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1079 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1080 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1081 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1082 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1083 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1084 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
6b659339
KW
1085 += $lb_actions{'LB_various_then_PO_or_PR'};
1086
b0e24409
KW
1087 # LB24 Do not break between numeric prefix/postfix and letters, or between
1088 # letters and prefix/postfix.
1089 # (PR | PO) × (AL | HL)
289ce9cc
KW
1090 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1091 = $lb_actions{'LB_NOBREAK'};
1092 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1093 = $lb_actions{'LB_NOBREAK'};
289ce9cc
KW
1094 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1095 = $lb_actions{'LB_NOBREAK'};
1096 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1097 = $lb_actions{'LB_NOBREAK'};
6b659339 1098
b0e24409
KW
1099 # (AL | HL) × (PR | PO)
1100 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
1101 = $lb_actions{'LB_NOBREAK'};
1102 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
1103 = $lb_actions{'LB_NOBREAK'};
1104 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
1105 = $lb_actions{'LB_NOBREAK'};
1106 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
1107 = $lb_actions{'LB_NOBREAK'};
1108
1109 # LB23a Do not break between numeric prefixes and ideographs, or between
1110 # ideographs and numeric postfixes.
1111 # PR × (ID | EB | EM)
1112 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1113 = $lb_actions{'LB_NOBREAK'};
1114 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
1115 = $lb_actions{'LB_NOBREAK'};
1116 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
1117 = $lb_actions{'LB_NOBREAK'};
1118
1119 # (ID | EB | EM) × PO
289ce9cc
KW
1120 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1121 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1122 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1123 = $lb_actions{'LB_NOBREAK'};
1124 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1125 = $lb_actions{'LB_NOBREAK'};
6b659339 1126
b0e24409 1127 # LB23 Do not break between digits and letters
6b659339 1128 # (AL | HL) × NU
289ce9cc
KW
1129 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1130 = $lb_actions{'LB_NOBREAK'};
1131 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1132 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1133
1134 # NU × (AL | HL)
289ce9cc
KW
1135 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1136 = $lb_actions{'LB_NOBREAK'};
1137 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1138 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1139
1140 # LB22 Do not break between two ellipses, or between letters, numbers or
1141 # exclamations and ellipsis.
1142 # (AL | HL) × IN
289ce9cc
KW
1143 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1144 = $lb_actions{'LB_NOBREAK'};
1145 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1146 = $lb_actions{'LB_NOBREAK'};
6b659339 1147
289ce9cc
KW
1148 # Exclamation × IN
1149 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1150 = $lb_actions{'LB_NOBREAK'};
6b659339 1151
b0e24409 1152 # (ID | EB | EM) × IN
289ce9cc
KW
1153 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1154 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1155 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1156 = $lb_actions{'LB_NOBREAK'};
1157 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1158 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1159
1160 # IN × IN
289ce9cc
KW
1161 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1162 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1163
1164 # NU × IN
289ce9cc
KW
1165 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1166 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1167
1168 # LB21b Don’t break between Solidus and Hebrew letters.
1169 # SY × HL
289ce9cc
KW
1170 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1171 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1172
1173 # LB21a Don't break after Hebrew + Hyphen.
1174 # HL (HY | BA) ×
1175 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1176 $lb_table[$lb_enums{'Hyphen'}][$i]
1177 += $lb_actions{'LB_HY_or_BA_then_foo'};
1178 $lb_table[$lb_enums{'Break_After'}][$i]
1179 += $lb_actions{'LB_HY_or_BA_then_foo'};
6b659339
KW
1180 }
1181
1182 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1183 # spaces, small kana, and other non-starters, or after acute accents.
1184 # × BA
1185 # × HY
1186 # × NS
1187 # BB ×
1188 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1189 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1190 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1191 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1192 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1193 }
1194
1195 # LB20 Break before and after unresolved CB.
1196 # ÷ CB
1197 # CB ÷
1198 # Conditional breaks should be resolved external to the line breaking
1199 # rules. However, the default action is to treat unresolved CB as breaking
1200 # before and after.
1201 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1202 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1203 = $lb_actions{'LB_BREAKABLE'};
1204 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1205 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1206 }
1207
1208 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1209 # × QU
1210 # QU ×
1211 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1212 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1213 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1214 }
1215
1216 # LB18 Break after spaces
1217 # SP ÷
1218 for my $i (0 .. @lb_table - 1) {
289ce9cc 1219 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1220 }
1221
1222 # LB17 Do not break within ‘——’, even with intervening spaces.
1223 # B2 SP* × B2
289ce9cc 1224 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
6b659339
KW
1225 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1226
1227 # LB16 Do not break between closing punctuation and a nonstarter even with
1228 # intervening spaces.
1229 # (CL | CP) SP* × NS
289ce9cc 1230 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
6b659339 1231 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1232 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
6b659339
KW
1233 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1234
1235
1236 # LB15 Do not break within ‘”[’, even with intervening spaces.
1237 # QU SP* × OP
289ce9cc 1238 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
6b659339
KW
1239 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1240
1241 # LB14 Do not break after ‘[’, even after spaces.
1242 # OP SP* ×
1243 for my $i (0 .. @lb_table - 1) {
289ce9cc 1244 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
6b659339
KW
1245 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1246 }
1247
1248 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1249 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1250 # [^NU] × CL
1251 # [^NU] × CP
1252 # × EX
1253 # [^NU] × IS
1254 # [^NU] × SY
1255 for my $i (0 .. @lb_table - 1) {
289ce9cc 1256 $lb_table[$i][$lb_enums{'Exclamation'}]
6b659339
KW
1257 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1258
289ce9cc 1259 next if $i == $lb_enums{'Numeric'};
6b659339 1260
289ce9cc 1261 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
6b659339 1262 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1263 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
6b659339 1264 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1265 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
6b659339 1266 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1267 $lb_table[$i][$lb_enums{'Break_Symbols'}]
6b659339
KW
1268 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1269 }
1270
1271 # LB12a Do not break before NBSP and related characters, except after
1272 # spaces and hyphens.
1273 # [^SP BA HY] × GL
1274 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1275 next if $i == $lb_enums{'Space'}
1276 || $i == $lb_enums{'Break_After'}
1277 || $i == $lb_enums{'Hyphen'};
6b659339
KW
1278
1279 # We don't break, but if a property above has said don't break even
1280 # with space between, don't override that (also in the next few rules)
289ce9cc 1281 next if $lb_table[$i][$lb_enums{'Glue'}]
6b659339 1282 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1283 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1284 }
1285
1286 # LB12 Do not break after NBSP and related characters.
1287 # GL ×
1288 for my $i (0 .. @lb_table - 1) {
289ce9cc 1289 next if $lb_table[$lb_enums{'Glue'}][$i]
6b659339 1290 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1291 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1292 }
1293
1294 # LB11 Do not break before or after Word joiner and related characters.
1295 # × WJ
1296 # WJ ×
1297 for my $i (0 .. @lb_table - 1) {
289ce9cc 1298 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
6b659339
KW
1299 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1300 {
289ce9cc 1301 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
6b659339 1302 }
289ce9cc 1303 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
6b659339
KW
1304 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1305 {
289ce9cc 1306 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1307 }
1308 }
1309
1310 # Special case this here to avoid having to do a special case in the code,
1311 # by making this the same as other things with a SP in front of them that
1312 # don't break, we avoid an extra test
289ce9cc 1313 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
6b659339
KW
1314 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1315
1316 # LB9 and LB10 are done in the same loop
1317 #
1318 # LB9 Do not break a combining character sequence; treat it as if it has
1319 # the line breaking class of the base character in all of the
b0e24409
KW
1320 # higher-numbered rules. Treat ZWJ as if it were CM
1321 # Treat X (CM|ZWJ)* as if it were X.
6b659339
KW
1322 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1323
b0e24409
KW
1324 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the
1325 # case where a CM or ZWJ is the first character on the line or follows SP,
1326 # BK, CR, LF, NL, or ZW.
6b659339
KW
1327 for my $i (0 .. @lb_table - 1) {
1328
b0e24409
KW
1329 # When the CM or ZWJ is the first in the pair, we don't know without
1330 # looking behind whether the CM or ZWJ is going to attach to an
1331 # earlier character, or not. So have to figure this out at runtime in
1332 # the code
1333 $lb_table[$lb_enums{'Combining_Mark'}][$i]
1334 = $lb_actions{'LB_CM_ZWJ_foo'};
1335 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
289ce9cc
KW
1336
1337 if ( $i == $lb_enums{'Mandatory_Break'}
1338 || $i == $lb_enums{'EDGE'}
1339 || $i == $lb_enums{'Carriage_Return'}
1340 || $i == $lb_enums{'Line_Feed'}
1341 || $i == $lb_enums{'Next_Line'}
1342 || $i == $lb_enums{'Space'}
1343 || $i == $lb_enums{'ZWSpace'})
6b659339
KW
1344 {
1345 # For these classes, a following CM doesn't combine, and should do
289ce9cc
KW
1346 # whatever 'Alphabetic' would do.
1347 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1348 = $lb_table[$i][$lb_enums{'Alphabetic'}];
b0e24409
KW
1349 $lb_table[$i][$lb_enums{'ZWJ'}]
1350 = $lb_table[$i][$lb_enums{'Alphabetic'}];
6b659339
KW
1351 }
1352 else {
b0e24409
KW
1353 # For these classes, the CM or ZWJ combines, so doesn't break,
1354 # inheriting the type of nobreak from the master character.
289ce9cc 1355 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
6b659339
KW
1356 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1357 {
289ce9cc
KW
1358 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1359 = $lb_actions{'LB_NOBREAK'};
6b659339 1360 }
b0e24409
KW
1361 if ($lb_table[$i][$lb_enums{'ZWJ'}]
1362 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1363 {
1364 $lb_table[$i][$lb_enums{'ZWJ'}]
1365 = $lb_actions{'LB_NOBREAK'};
1366 }
6b659339
KW
1367 }
1368 }
1369
b0e24409
KW
1370 # LB8a Do not break between a zero width joiner and an ideograph, emoji
1371 # base or emoji modifier. This rule prevents breaks within emoji joiner
1372 # sequences.
1373 # ZWJ × (ID | EB | EM)
1374 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1375 = $lb_actions{'LB_NOBREAK'};
1376 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1377 = $lb_actions{'LB_NOBREAK'};
1378 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1379 = $lb_actions{'LB_NOBREAK'};
1380
6b659339
KW
1381 # LB8 Break before any character following a zero-width space, even if one
1382 # or more spaces intervene.
1383 # ZW SP* ÷
1384 for my $i (0 .. @lb_table - 1) {
289ce9cc 1385 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1386 }
1387
1388 # Because of LB8-10, we need to look at context for "SP x", and this must
1389 # be done in the code. So override the existing rules for that, by adding
1390 # a constant to get new rules that tell the code it needs to look at
1391 # context. By adding this action instead of replacing the existing one,
1392 # we can get back to the original rule if necessary.
1393 for my $i (0 .. @lb_table - 1) {
289ce9cc 1394 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
6b659339
KW
1395 }
1396
1397 # LB7 Do not break before spaces or zero width space.
1398 # × SP
1399 # × ZW
1400 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1401 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1402 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1403 }
1404
1405 # LB6 Do not break before hard line breaks.
1406 # × ( BK | CR | LF | NL )
1407 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1408 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1409 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1410 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1411 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1412 }
1413
1414 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1415 # CR × LF
1416 # CR !
1417 # LF !
1418 # NL !
1419 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1420 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1421 = $lb_actions{'LB_BREAKABLE'};
1422 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1423 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339 1424 }
289ce9cc
KW
1425 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1426 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1427
1428 # LB4 Always break after hard line breaks.
1429 # BK !
1430 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1431 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1432 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1433 }
1434
6b659339
KW
1435 # LB3 Always break at the end of text.
1436 # ! eot
b0e24409
KW
1437 # LB2 Never break at the start of text.
1438 # sot ×
6b659339 1439 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1440 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1441 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1442 }
1443
1444 # LB1 Assign a line breaking class to each code point of the input.
1445 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1446 # depending on criteria outside the scope of this algorithm.
1447 #
1448 # In the absence of such criteria all characters with a specific
1449 # combination of original class and General_Category property value are
1450 # resolved as follows:
1451 # Original Resolved General_Category
1452 # AI, SG, XX AL Any
1453 # SA CM Only Mn or Mc
1454 # SA AL Any except Mn and Mc
1455 # CJ NS Any
1456 #
1457 # This is done in mktables, so we never see any of the remapped-from
1458 # classes.
1459
289ce9cc
KW
1460 output_table_common('LB', \%lb_actions,
1461 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
6b659339
KW
1462}
1463
7e54b87f
KW
1464sub output_WB_table() {
1465
1466 # Create and output the enums, #defines, and pair table for use in
1467 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1468
1469 # This uses the same mechanism in the other bounds tables generated by
1470 # this file. The actions that could override a 0 or 1 are added to those
1471 # numbers; the actions that clearly don't depend on the underlying rule
1472 # simply overwrite
1473 my %wb_actions = (
1474 WB_NOBREAK => 0,
1475 WB_BREAKABLE => 1,
1476 WB_hs_then_hs => 2,
b0e24409 1477 WB_Ex_or_FO_or_ZWJ_then_foo => 3,
7e54b87f
KW
1478 WB_DQ_then_HL => 4,
1479 WB_HL_then_DQ => 6,
1480 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1481 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1482 WB_MB_or_MN_or_SQ_then_NU => 12,
1483 WB_NU_then_MB_or_MN_or_SQ => 14,
b0e24409 1484 WB_RI_then_RI => 16,
7e54b87f
KW
1485 );
1486
7e54b87f
KW
1487 # Construct the WB pair table.
1488 # The table is constructed in reverse order of the rules, to make the
1489 # lower-numbered, higher priority ones override the later ones, as the
1490 # algorithm stops at the earliest matching rule
1491
1492 my @wb_table;
1493 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
39c4defe 1494 die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i;
7e54b87f
KW
1495
1496 # Otherwise, break everywhere (including around ideographs).
b0e24409 1497 # WB99 Any ÷ Any
7e54b87f
KW
1498 for my $i (0 .. $table_size - 1) {
1499 for my $j (0 .. $table_size - 1) {
1500 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1501 }
1502 }
1503
b0e24409
KW
1504 # Do not break within emoji flag sequences. That is, do not break between
1505 # regional indicator (RI) symbols if there is an odd number of RI
1506 # characters before the break point.
1507 # WB16 [^RI] (RI RI)* RI × RI
c492f156 1508 # WB15 sot (RI RI)* RI × RI
289ce9cc 1509 $wb_table[$wb_enums{'Regional_Indicator'}]
b0e24409
KW
1510 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};
1511
1512 # Do not break within emoji modifier sequences.
1513 # WB14 ( E_Base | EBG ) × E_Modifier
1514 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}]
1515 = $wb_actions{'WB_NOBREAK'};
1516 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}]
1517 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1518
1519 # Do not break from extenders.
1520 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
289ce9cc
KW
1521 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1522 = $wb_actions{'WB_NOBREAK'};
1523 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1524 = $wb_actions{'WB_NOBREAK'};
1525 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1526 = $wb_actions{'WB_NOBREAK'};
1527 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1528 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1529
1530 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1531 # × # ExtendNumLet
289ce9cc
KW
1532 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1533 = $wb_actions{'WB_NOBREAK'};
1534 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1535 = $wb_actions{'WB_NOBREAK'};
1536 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1537 = $wb_actions{'WB_NOBREAK'};
1538 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1539 = $wb_actions{'WB_NOBREAK'};
1540 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1541 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1542
1543 # Do not break between Katakana.
1544 # WB13 Katakana × Katakana
289ce9cc
KW
1545 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1546 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1547
1548 # Do not break within sequences, such as “3.2” or “3,456.789”.
1549 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
289ce9cc 1550 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
7e54b87f 1551 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1552 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
7e54b87f 1553 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1554 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1555 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1556
1557 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
289ce9cc 1558 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
7e54b87f 1559 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1560 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
7e54b87f 1561 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1562 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
7e54b87f
KW
1563 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1564
1565 # Do not break within sequences of digits, or digits adjacent to letters
1566 # (“3a”, or “A3”).
1567 # WB10 Numeric × (ALetter | Hebrew_Letter)
289ce9cc
KW
1568 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1569 = $wb_actions{'WB_NOBREAK'};
1570 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1571 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1572
1573 # WB9 (ALetter | Hebrew_Letter) × Numeric
289ce9cc
KW
1574 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1575 = $wb_actions{'WB_NOBREAK'};
1576 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1577 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1578
1579 # WB8 Numeric × Numeric
289ce9cc
KW
1580 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1581 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1582
1583 # Do not break letters across certain punctuation.
1584 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
289ce9cc
KW
1585 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1586 += $wb_actions{'WB_DQ_then_HL'};
7e54b87f
KW
1587
1588 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
289ce9cc
KW
1589 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1590 += $wb_actions{'WB_HL_then_DQ'};
7e54b87f
KW
1591
1592 # WB7a Hebrew_Letter × Single_Quote
289ce9cc
KW
1593 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1594 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1595
1596 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1597 # × (ALetter | Hebrew_Letter)
289ce9cc 1598 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
7e54b87f 1599 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1600 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1601 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1602 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
7e54b87f 1603 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1604 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1605 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1606 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
7e54b87f 1607 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1608 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f
KW
1609 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1610
1611 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1612 # | Single_Quote) (ALetter | Hebrew_Letter)
289ce9cc 1613 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1614 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1615 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1616 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1617 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
7e54b87f 1618 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1619 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
7e54b87f 1620 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1621 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
7e54b87f 1622 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1623 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1624 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1625
1626 # Do not break between most letters.
1627 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
289ce9cc
KW
1628 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1629 = $wb_actions{'WB_NOBREAK'};
1630 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1631 = $wb_actions{'WB_NOBREAK'};
1632 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1633 = $wb_actions{'WB_NOBREAK'};
1634 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1635 = $wb_actions{'WB_NOBREAK'};
7e54b87f 1636
b0e24409
KW
1637 # Ignore Format and Extend characters, except after sot, CR, LF, and
1638 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
1639 # WB4 X (Extend | Format | ZWJ)* → X
7e54b87f 1640 for my $i (0 .. @wb_table - 1) {
289ce9cc 1641 $wb_table[$wb_enums{'Extend'}][$i]
b0e24409 1642 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
289ce9cc 1643 $wb_table[$wb_enums{'Format'}][$i]
b0e24409
KW
1644 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1645 $wb_table[$wb_enums{'ZWJ'}][$i]
1646 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1647 }
1648 for my $i (0 .. @wb_table - 1) {
1649 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1650 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1651 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1652 }
1653
1654 # Implied is that these attach to the character before them, except for
1655 # the characters that mark the end of a region of text. The rules below
1656 # override the ones set up here, for all the characters that need
1657 # overriding.
1658 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1659 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1660 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1661 }
1662
b0e24409
KW
1663 # Do not break within emoji zwj sequences.
1664 # WB3c ZWJ × ( Glue_After_Zwj | EBG )
1665 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
1666 = $wb_actions{'WB_NOBREAK'};
1667 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
1668 = $wb_actions{'WB_NOBREAK'};
1669
7e54b87f
KW
1670 # Break before and after white space
1671 # WB3b ÷ (Newline | CR | LF)
1672 # WB3a (Newline | CR | LF) ÷
1673 # et al.
289ce9cc 1674 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1675 for my $j (0 .. @wb_table - 1) {
1676 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1677 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1678 }
1679 }
1680
1681 # But do not break within white space.
1682 # WB3 CR × LF
1683 # et al.
289ce9cc
KW
1684 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1685 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1686 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1687 }
1688 }
1689
b0e24409 1690 # And do not break horizontal space followed by Extend or Format or ZWJ
289ce9cc
KW
1691 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1692 = $wb_actions{'WB_NOBREAK'};
1693 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1694 = $wb_actions{'WB_NOBREAK'};
b0e24409
KW
1695 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}]
1696 = $wb_actions{'WB_NOBREAK'};
289ce9cc
KW
1697 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1698 [$wb_enums{'Perl_Tailored_HSpace'}]
1699 = $wb_actions{'WB_hs_then_hs'};
7e54b87f 1700
b0e24409
KW
1701 # Break at the start and end of text, unless the text is empty
1702 # WB2 Any ÷ eot
1703 # WB1 sot ÷ Any
7e54b87f 1704 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1705 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1706 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
7e54b87f 1707 }
289ce9cc 1708 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
7e54b87f 1709
289ce9cc
KW
1710 output_table_common('WB', \%wb_actions,
1711 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
7e54b87f
KW
1712}
1713
9d9177be
KW
1714output_invlist("Latin1", [ 0, 256 ]);
1715output_invlist("AboveLatin1", [ 256 ]);
1716
bffc0129 1717end_file_pound_if;
43b443dd 1718
3f427fd9
KW
1719# We construct lists for all the POSIX and backslash sequence character
1720# classes in two forms:
1721# 1) ones which match only in the ASCII range
1722# 2) ones which match either in the Latin1 range, or the entire Unicode range
1723#
1724# These get compiled in, and hence affect the memory footprint of every Perl
1725# program, even those not using Unicode. To minimize the size, currently
1726# the Latin1 version is generated for the beyond ASCII range except for those
1727# lists that are quite small for the entire range, such as for \s, which is 22
1728# UVs long plus 4 UVs (currently) for the header.
1729#
1730# To save even more memory, the ASCII versions could be derived from the
1731# larger ones at runtime, saving some memory (minus the expense of the machine
1732# instructions to do so), but these are all small anyway, so their total is
1733# about 100 UVs.
1734#
1735# In the list of properties below that get generated, the L1 prefix is a fake
1736# property that means just the Latin1 range of the full property (whose name
1737# has an X prefix instead of L1).
a02047bf
KW
1738#
1739# An initial & means to use the subroutine from this file instead of an
1740# official inversion list.
3f427fd9 1741
0c4ecf42
KW
1742for my $charset (get_supported_code_pages()) {
1743 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1744
99f21fb9 1745 @a2n = @{get_a2n($charset)};
226b74db
KW
1746 # Below is the list of property names to generate. '&' means to use the
1747 # subroutine to generate the inversion list instead of the generic code
1748 # below. Some properties have a comma-separated list after the name;
1749 # these are extra enums to add to those found in the Unicode tables.
99f21fb9
KW
1750 no warnings 'qw';
1751 # Ignore non-alpha in sort
1752 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
c0382778 1753 Assigned
1c8c3428
KW
1754 ASCII
1755 Cased
1756 VertSpace
1757 XPerlSpace
1758 XPosixAlnum
1759 XPosixAlpha
1760 XPosixBlank
1761 XPosixCntrl
1762 XPosixDigit
1763 XPosixGraph
1764 XPosixLower
1765 XPosixPrint
1766 XPosixPunct
1767 XPosixSpace
1768 XPosixUpper
1769 XPosixWord
1770 XPosixXDigit
1771 _Perl_Any_Folds
1772 &NonL1_Perl_Non_Final_Folds
1773 _Perl_Folds_To_Multi_Char
1774 &UpperLatin1
1775 _Perl_IDStart
1776 _Perl_IDCont
226b74db
KW
1777 _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
1778 _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
1779 _Perl_SB,SContinue,CR,Extend,LF,EDGE
1780 _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
1c8c3428 1781 )
226b74db
KW
1782 # NOTE that the convention is that extra enum
1783 # values come after the property name, separated by
1784 # commas, with the enums that aren't ever defined
1785 # by Unicode coming last, at least 4 all-uppercase
1786 # characters. The others are enum names that are
1787 # needed by perl, but aren't in all Unicode
1788 # releases.
0f5e3c71
KW
1789 ) {
1790
1791 # For the Latin1 properties, we change to use the eXtended version of the
1792 # base property, then go through the result and get rid of everything not
1793 # in Latin1 (above 255). Actually, we retain the element for the range
1794 # that crosses the 255/256 boundary if it is one that matches the
1795 # property. For example, in the Word property, there is a range of code
1796 # points that starts at U+00F8 and goes through U+02C1. Instead of
1797 # artificially cutting that off at 256 because 256 is the first code point
1798 # above Latin1, we let the range go to its natural ending. That gives us
1799 # extra information with no added space taken. But if the range that
1800 # crosses the boundary is one that doesn't match the property, we don't
1801 # start a new range above 255, as that could be construed as going to
1802 # infinity. For example, the Upper property doesn't include the character
1803 # at 255, but does include the one at 256. We don't include the 256 one.
1804 my $prop_name = $prop;
1805 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
1806 my $extra_enums = "";
1807 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
1808 my $lookup_prop = $prop_name;
1809 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1810 or $lookup_prop =~ s/^L1//);
1811 my $nonl1_only = 0;
1812 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 1813 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
1814
1815 my @invlist;
99f21fb9
KW
1816 my @invmap;
1817 my $map_format;
1818 my $map_default;
1819 my $maps_to_code_point;
1820 my $to_adjust;
0f5e3c71
KW
1821 if ($is_local_sub) {
1822 @invlist = eval $lookup_prop;
289ce9cc 1823 die $@ if $@;
0f5e3c71
KW
1824 }
1825 else {
1826 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 1827 if (! @invlist) {
99f21fb9 1828
ad85f59a
KW
1829 # If couldn't find a non-empty inversion list, see if it is
1830 # instead an inversion map
1831 my ($list_ref, $map_ref, $format, $default)
99f21fb9 1832 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
1833 if (! $list_ref) {
1834 # An empty return here could mean an unknown property, or
1835 # merely that the original inversion list is empty. Call
1836 # in scalar context to differentiate
1837 my $count = prop_invlist($lookup_prop,
1838 '_perl_core_internal_ok');
1839 die "Could not find inversion list for '$lookup_prop'"
1840 unless defined $count;
1841 }
1842 else {
18b852b3
KW
1843 @invlist = @$list_ref;
1844 @invmap = @$map_ref;
1845 $map_format = $format;
1846 $map_default = $default;
1847 $maps_to_code_point = $map_format =~ /x/;
1848 $to_adjust = $map_format =~ /a/;
ad85f59a 1849 }
99f21fb9 1850 }
0f5e3c71 1851 }
ad85f59a
KW
1852
1853
1854 # Short-circuit an empty inversion list.
1855 if (! @invlist) {
1856 output_invlist($prop_name, \@invlist, $charset);
1857 next;
1858 }
ceb1de32 1859
99f21fb9
KW
1860 # Re-order the Unicode code points to native ones for this platform.
1861 # This is only needed for code points below 256, because native code
1862 # points are only in that range. For inversion maps of properties
1863 # where the mappings are adjusted (format =~ /a/), this reordering
1864 # could mess up the adjustment pattern that was in the input, so that
1865 # has to be dealt with.
1866 #
1867 # And inversion maps that map to code points need to eventually have
1868 # all those code points remapped to native, and it's better to do that
1869 # here, going through the whole list not just those below 256. This
1870 # is because some inversion maps have adjustments (format =~ /a/)
1871 # which may be affected by the reordering. This code needs to be done
1872 # both for when we are translating the inversion lists for < 256, and
1873 # for the inversion maps for everything. By doing both in this loop,
1874 # we can share that code.
1875 #
1876 # So, we go through everything for an inversion map to code points;
1877 # otherwise, we can skip any remapping at all if we are going to
1878 # output only the above-Latin1 values, or if the range spans the whole
1879 # of 0..256, as the remap will also include all of 0..256 (256 not
1880 # 255 because a re-ordering could cause 256 to need to be in the same
1881 # range as 255.)
1882 if ((@invmap && $maps_to_code_point)
1883 || (! $nonl1_only || ($invlist[0] < 256
1884 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 1885 {
fb4554ea 1886
99f21fb9 1887 if (! @invmap) { # Straight inversion list
fb4554ea
KW
1888 # Look at all the ranges that start before 257.
1889 my @latin1_list;
1890 while (@invlist) {
1891 last if $invlist[0] > 256;
1892 my $upper = @invlist > 1
1893 ? $invlist[1] - 1 # In range
8a6c81cf
KW
1894
1895 # To infinity. You may want to stop much much
1896 # earlier; going this high may expose perl
1897 # deficiencies with very large numbers.
1898 : $Unicode::UCD::MAX_CP;
fb4554ea 1899 for my $j ($invlist[0] .. $upper) {
99f21fb9 1900 push @latin1_list, a2n($j);
0f5e3c71 1901 }
fb4554ea
KW
1902
1903 shift @invlist; # Shift off the range that's in the list
1904 shift @invlist; # Shift off the range not in the list
0c4ecf42 1905 }
fb4554ea
KW
1906
1907 # Here @invlist contains all the ranges in the original that start
1908 # at code points above 256, and @latin1_list contains all the
1909 # native code points for ranges that start with a Unicode code
1910 # point below 257. We sort the latter and convert it to inversion
1911 # list format. Then simply prepend it to the list of the higher
1912 # code points.
1913 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 1914 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 1915 unshift @invlist, @latin1_list;
99f21fb9
KW
1916 }
1917 else { # Is an inversion map
1918
1919 # This is a similar procedure as plain inversion list, but has
1920 # multiple buckets. A plain inversion list just has two
1921 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1922 # pretty much can ignore the 2nd bucket, as it is completely
1923 # defined by the 1st. But here, what we do is create buckets
1924 # which contain the code points that map to each, translated
1925 # to native and turned into an inversion list. Thus each
1926 # bucket is an inversion list of native code points that map
1927 # to it or don't map to it. We use these to create an
1928 # inversion map for the whole property.
1929
1930 # As mentioned earlier, we use this procedure to not just
1931 # remap the inversion list to native values, but also the maps
1932 # of code points to native ones. In the latter case we have
1933 # to look at the whole of the inversion map (or at least to
1934 # above Unicode; as the maps of code points above that should
1935 # all be to the default).
1936 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1937
1938 my %mapped_lists; # A hash whose keys are the buckets.
1939 while (@invlist) {
1940 last if $invlist[0] > $upper_limit;
1941
1942 # This shouldn't actually happen, as prop_invmap() returns
1943 # an extra element at the end that is beyond $upper_limit
1944 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1945
1946 my $bucket;
1947
1948 # A hash key can't be a ref (we are only expecting arrays
1949 # of scalars here), so convert any such to a string that
1950 # will be converted back later (using a vertical tab as
1951 # the separator). Even if the mapping is to code points,
1952 # we don't translate to native here because the code
d8049362 1953 # output_invmap() calls to output these arrays assumes the
99f21fb9
KW
1954 # input is Unicode, not native.
1955 if (ref $invmap[0]) {
1956 $bucket = join "\cK", @{$invmap[0]};
1957 }
1958 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1959
1960 # Do convert to native for maps to single code points.
1961 # There are some properties that have a few outlier
1962 # maps that aren't code points, so the above test
1963 # skips those.
1964 $bucket = a2n($invmap[0]);
1965 } else {
1966 $bucket = $invmap[0];
1967 }
1968
1969 # We now have the bucket that all code points in the range
1970 # map to, though possibly they need to be adjusted. Go
1971 # through the range and put each translated code point in
1972 # it into its bucket.
1973 my $base_map = $invmap[0];
1974 for my $j ($invlist[0] .. $invlist[1] - 1) {
1975 if ($to_adjust
1976 # The 1st code point doesn't need adjusting
1977 && $j > $invlist[0]
1978
1979 # Skip any non-numeric maps: these are outliers
1980 # that aren't code points.
1981 && $base_map =~ $numeric_re
1982
1983 # 'ne' because the default can be a string
1984 && $base_map ne $map_default)
1985 {
1986 # We adjust, by incrementing both the bucket and
1987 # the map. For code point maps, translate to
1988 # native
1989 $base_map++;
1990 $bucket = ($maps_to_code_point)
1991 ? a2n($base_map)
1992 : $base_map;
1993 }
1994
1995 # Add the native code point to the bucket for the
1996 # current map
1997 push @{$mapped_lists{$bucket}}, a2n($j);
1998 } # End of loop through all code points in the range
1999
2000 # Get ready for the next range
2001 shift @invlist;
2002 shift @invmap;
2003 } # End of loop through all ranges in the map.
2004
2005 # Here, @invlist and @invmap retain all the ranges from the
2006 # originals that start with code points above $upper_limit.
2007 # Each bucket in %mapped_lists contains all the code points
2008 # that map to that bucket. If the bucket is for a map to a
2009 # single code point, the bucket has
2010 # been converted to native. If something else (including
2011 # multiple code points), no conversion is done.
2012 #
2013 # Now we recreate the inversion map into %xlated, but this
2014 # time for the native character set.
2015 my %xlated;
2016 foreach my $bucket (keys %mapped_lists) {
2017
2018 # Sort and convert this bucket to an inversion list. The
2019 # result will be that ranges that start with even-numbered
2020 # indexes will be for code points that map to this bucket;
2021 # odd ones map to some other bucket, and are discarded
2022 # below.
2023 @{$mapped_lists{$bucket}}
2024 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
2025 @{$mapped_lists{$bucket}}
2026 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
2027
2028 # Add each even-numbered range in the bucket to %xlated;
2029 # so that the keys of %xlated become the range start code
2030 # points, and the values are their corresponding maps.
2031 while (@{$mapped_lists{$bucket}}) {
2032 my $range_start = $mapped_lists{$bucket}->[0];
2033 if ($bucket =~ /\cK/) {
2034 @{$xlated{$range_start}} = split /\cK/, $bucket;
2035 }
2036 else {
2037 $xlated{$range_start} = $bucket;
2038 }
2039 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
2040 shift @{$mapped_lists{$bucket}}; # Get ready for next
2041 # iteration
2042 }
2043 } # End of loop through all the buckets.
2044
2045 # Here %xlated's keys are the range starts of all the code
2046 # points in the inversion map. Construct an inversion list
2047 # from them.
2048 my @new_invlist = sort { $a <=> $b } keys %xlated;
2049
2050 # If the list is adjusted, we want to munge this list so that
2051 # we only have one entry for where consecutive code points map
2052 # to consecutive values. We just skip the subsequent entries
2053 # where this is the case.
2054 if ($to_adjust) {
2055 my @temp;
2056 for my $i (0 .. @new_invlist - 1) {
2057 next if $i > 0
2058 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
2059 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
2060 && $xlated{$new_invlist[$i]} =~ $numeric_re
2061 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
2062 push @temp, $new_invlist[$i];
2063 }
2064 @new_invlist = @temp;
2065 }
2066
2067 # The inversion map comes from %xlated's values. We can
2068 # unshift each onto the front of the untouched portion, in
2069 # reverse order of the portion we did process.
2070 foreach my $start (reverse @new_invlist) {
2071 unshift @invmap, $xlated{$start};
2072 }
2073
2074 # Finally prepend the inversion list we have just constructed to the
2075 # one that contains anything we didn't process.
2076 unshift @invlist, @new_invlist;
2077 }
2078 }
2079
2080 # prop_invmap() returns an extra final entry, which we can now
2081 # discard.
2082 if (@invmap) {
2083 pop @invlist;
2084 pop @invmap;
ceb1de32 2085 }
0f5e3c71
KW
2086
2087 if ($l1_only) {
99f21fb9 2088 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
2089 for my $i (0 .. @invlist - 1 - 1) {
2090 if ($invlist[$i] > 255) {
2091
2092 # In an inversion list, even-numbered elements give the code
2093 # points that begin ranges that match the property;
2094 # odd-numbered give ones that begin ranges that don't match.
2095 # If $i is odd, we are at the first code point above 255 that
2096 # doesn't match, which means the range it is ending does
2097 # match, and crosses the 255/256 boundary. We want to include
2098 # this ending point, so increment $i, so the splice below
2099 # includes it. Conversely, if $i is even, it is the first
2100 # code point above 255 that matches, which means there was no
2101 # matching range that crossed the boundary, and we don't want
2102 # to include this code point, so splice before it.
2103 $i++ if $i % 2 != 0;
2104
2105 # Remove everything past this.
2106 splice @invlist, $i;
99f21fb9 2107 splice @invmap, $i if @invmap;
0f5e3c71
KW
2108 last;
2109 }
0c4ecf42
KW
2110 }
2111 }
0f5e3c71
KW
2112 elsif ($nonl1_only) {
2113 my $found_nonl1 = 0;
2114 for my $i (0 .. @invlist - 1 - 1) {
2115 next if $invlist[$i] < 256;
2116
2117 # Here, we have the first element in the array that indicates an
2118 # element above Latin1. Get rid of all previous ones.
2119 splice @invlist, 0, $i;
99f21fb9 2120 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
2121
2122 # If this one's index is not divisible by 2, it means that this
2123 # element is inverting away from being in the list, which means
99f21fb9
KW
2124 # all code points from 256 to this one are in this list (or
2125 # map to the default for inversion maps)
2126 if ($i % 2 != 0) {
2127 unshift @invlist, 256;
2128 unshift @invmap, $map_default if @invmap;
2129 }
0f5e3c71 2130 $found_nonl1 = 1;
3f427fd9
KW
2131 last;
2132 }
0f5e3c71 2133 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 2134 }
3f427fd9 2135
0f5e3c71 2136 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 2137 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 2138 }
bffc0129 2139 end_file_pound_if;
0c4ecf42 2140 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
2141}
2142
973a28ed
KW
2143switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
2144
2145output_GCB_table();
6b659339 2146output_LB_table();
7e54b87f 2147output_WB_table();
6b659339 2148
973a28ed
KW
2149end_file_pound_if;
2150
# Build the list of files that is handed to
# read_only_bottom_close_and_rename() at the end of this script: this
# program itself, a fixed set of Unicode tooling files, and whatever
# mktables records in its own list file.
my $sources_list = "lib/unicore/mktables.lst";
my @sources = ($0,
               'lib/unicore/mktables',
               'lib/Unicode/UCD.pm',
               'regen/charset_translations.pl',
              );
{
    # Depend on mktables’ own sources.  It’s a shorter list of files than
    # those that Unicode::UCD uses.
    if (open my $list_fh, '<', $sources_list) {

        # Read up to the first line containing '==='.  Each non-empty line
        # before it that does not start with '#' is taken as a path
        # relative to lib/unicore.
        while (<$list_fh>) {
            last if /===/;
            chomp;
            next unless /^[^#]/;
            push @sources, "lib/unicore/$_";
        }
    }
    else {

        # This should force a rebuild once $sources_list exists
        push @sources, $sources_list;
    }
}
6b659339
KW
2172
2173read_only_bottom_close_and_rename($out_fh, \@sources);