This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Allow override of where enums get defined
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
463b4a67 10 charprop
99f21fb9 11 );
3d7c117d
MB
12require './regen/regen_lib.pl';
13require './regen/charset_translations.pl';
9d9177be
KW
14
15# This program outputs charclass_invlists.h, which contains various inversion
16# lists in the form of C arrays that are to be used as-is for inversion lists.
17# Thus, the lists it contains are essentially pre-compiled, and need only a
18# light-weight fast wrapper to make them usable at run-time.
19
20# As such, this code knows about the internal structure of these lists, and
21# any change made to that has to be done here as well. A random number stored
22# in the headers is used to minimize the possibility of things getting
23# out-of-sync, or the wrong data structure being passed. Currently that
24# random number is:
99f21fb9
KW
25
26# charclass_invlists.h now also has a partial implementation of inversion
27# maps; enough to generate tables for the line break properties, such as GCB
28
0a07b44b 29my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 30
99f21fb9
KW
31# integer or float
32my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
33
34# Matches valid C language enum names: begins with ASCII alphabetic, then any
35# ASCII \w
36my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
37
9d9177be
KW
38my $out_fh = open_new('charclass_invlists.h', '>',
39 {style => '*', by => $0,
40 from => "Unicode::UCD"});
41
bffc0129 42my $in_file_pound_if = 0;
43b443dd 43
289ce9cc
KW
44my $max_hdr_len = 3; # In headings, how wide a name is allowed?
45
9d9177be
KW
46print $out_fh "/* See the generating file for comments */\n\n";
47
bffc0129
KW
48# The symbols generated by this program are all currently defined only in a
49# single dot c each. The code knows where most of them go, but this hash
50# gives overrides for the exceptions to the typical place
51my %exceptions_to_where_to_define =
52 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
53 AboveLatin1 => 'PERL_IN_REGCOMP_C',
54 Latin1 => 'PERL_IN_REGCOMP_C',
55 UpperLatin1 => 'PERL_IN_REGCOMP_C',
56 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
57 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
58 _Perl_IDCont => 'PERL_IN_UTF8_C',
59 _Perl_IDStart => 'PERL_IN_UTF8_C',
9abae7a5 60 Currency_Symbol => 'PERL_IN_LOCALE_C',
bffc0129 61 );
dd0ee239 62my %where_to_define_enums = ();
015bb97c 63
973a28ed
KW
64my %gcb_enums;
65my @gcb_short_enums;
289ce9cc 66my %gcb_abbreviations;
6b659339
KW
67my %lb_enums;
68my @lb_short_enums;
289ce9cc 69my %lb_abbreviations;
7e54b87f
KW
70my %wb_enums;
71my @wb_short_enums;
289ce9cc 72my %wb_abbreviations;
6b659339 73
99f21fb9
KW
74my @a2n;
75
76sub uniques {
77 # Returns non-duplicated input values. From "Perl Best Practices:
78 # Encapsulated Cleverness". p. 455 in first edition.
79
80 my %seen;
81 return grep { ! $seen{$_}++ } @_;
82}
83
84sub a2n($) {
85 my $cp = shift;
86
87 # Returns the input Unicode code point translated to native.
88
89 return $cp if $cp !~ $numeric_re || $cp > 255;
90 return $a2n[$cp];
91}
92
bffc0129
KW
93sub end_file_pound_if {
94 if ($in_file_pound_if) {
95 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
96 $in_file_pound_if = 0;
97 }
98}
99
100sub switch_pound_if ($$) {
101 my $name = shift;
102 my $new_pound_if = shift;
62a54bb7
KW
103 my @new_pound_if = ref ($new_pound_if)
104 ? @$new_pound_if
105 : $new_pound_if;
bffc0129
KW
106
107 # Switch to new #if given by the 2nd argument. If there is an override
108 # for this, it instead switches to that. The 1st argument is the
109 # static's name, used to look up the overrides
110
111 if (exists $exceptions_to_where_to_define{$name}) {
62a54bb7 112 @new_pound_if = $exceptions_to_where_to_define{$name};
bffc0129 113 }
62a54bb7 114 $new_pound_if = join "", @new_pound_if;
bffc0129
KW
115
116 # Exit current #if if the new one is different from the old
62a54bb7 117 if ( $in_file_pound_if
bffc0129
KW
118 && $in_file_pound_if !~ /$new_pound_if/)
119 {
120 end_file_pound_if;
121 }
122
123 # Enter new #if, if not already in it.
124 if (! $in_file_pound_if) {
62a54bb7
KW
125 foreach my $element (@new_pound_if) {
126 $element = "defined($element)";
127 }
128 $in_file_pound_if = join " || ", @new_pound_if;
bffc0129 129 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
130 }
131}
132
0c4ecf42 133sub output_invlist ($$;$) {
9d9177be
KW
134 my $name = shift;
135 my $invlist = shift; # Reference to inversion list array
0c4ecf42 136 my $charset = shift // ""; # name of character set for comment
9d9177be 137
76d3994c 138 die "No inversion list for $name" unless defined $invlist
ad85f59a 139 && ref $invlist eq 'ARRAY';
76d3994c 140
9d9177be
KW
141 # Output the inversion list $invlist using the name $name for it.
142 # It is output in the exact internal form for inversion lists.
143
a0316a6c
KW
144 # Is the last element of the header 0, or 1 ?
145 my $zero_or_one = 0;
ad85f59a 146 if (@$invlist && $invlist->[0] != 0) {
a0316a6c 147 unshift @$invlist, 0;
9d9177be
KW
148 $zero_or_one = 1;
149 }
0a07b44b 150 my $count = @$invlist;
9d9177be 151
bffc0129 152 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 153
0c4ecf42
KW
154 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
155 print $out_fh " /* for $charset */" if $charset;
156 print $out_fh "\n";
9d9177be 157
a0316a6c 158 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
159 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
160 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
161 ",\t/* 0 if the list starts at 0;",
162 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
163
164 # The main body are the UVs passed in to this routine. Do the final
165 # element separately
47d53124
KW
166 for my $i (0 .. @$invlist - 1) {
167 printf $out_fh "\t0x%X", $invlist->[$i];
168 print $out_fh "," if $i < @$invlist - 1;
169 print $out_fh "\n";
9d9177be
KW
170 }
171
9d9177be
KW
172 print $out_fh "};\n";
173}
174
99f21fb9
KW
175sub output_invmap ($$$$$$$) {
176 my $name = shift;
177 my $invmap = shift; # Reference to inversion map array
178 my $prop_name = shift;
179 my $input_format = shift; # The inversion map's format
180 my $default = shift; # The property value for code points who
181 # otherwise don't have a value specified.
182 my $extra_enums = shift; # comma-separated list of our additions to the
183 # property's standard possible values
184 my $charset = shift // ""; # name of character set for comment
185
186 # Output the inversion map $invmap for property $prop_name, but use $name
187 # as the actual data structure's name.
188
189 my $count = @$invmap;
190
191 my $output_format;
192 my $declaration_type;
193 my %enums;
194 my $name_prefix;
195
34623dbb 196 if ($input_format =~ / ^ s l? $ /x) {
02f811dd
KW
197 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
198 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
226b74db 199 my @input_enums;
f79a09fc 200
226b74db 201 # Find all the possible input values. These become the enum names
34623dbb
KW
202 # that comprise the inversion map. For inputs that don't have sub
203 # lists, we can just get the unique values. Otherwise, we have to
204 # expand the sublists first.
205 if ($input_format ne 'sl') {
226b74db 206 @input_enums = sort(uniques(@$invmap));
34623dbb
KW
207 }
208 else {
209 foreach my $element (@$invmap) {
210 if (ref $element) {
211 push @input_enums, @$element;
212 }
213 else {
214 push @input_enums, $element;
215 }
216 }
217 @input_enums = sort(uniques(@input_enums));
218 }
6b659339 219
226b74db
KW
220 # The internal enums come last, and in the order specified.
221 my @enums = @input_enums;
27a619f7
KW
222 my @extras;
223 if ($extra_enums ne "") {
224 @extras = split /,/, $extra_enums;
226b74db
KW
225
226 # Don't add if already there.
227 foreach my $this_extra (@extras) {
228 next if grep { $_ eq $this_extra } @enums;
229
230 push @enums, $this_extra;
231 }
27a619f7 232 }
289ce9cc 233
226b74db
KW
234 # Assign a value to each element of the enum type we are creating.
235 # The default value always gets 0; the others are arbitrarily
236 # assigned.
27a619f7
KW
237 my $enum_val = 0;
238 my $canonical_default = prop_value_aliases($prop_name, $default);
239 $default = $canonical_default if defined $canonical_default;
240 $enums{$default} = $enum_val++;
226b74db 241
27a619f7
KW
242 for my $enum (@enums) {
243 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
244 }
245
226b74db 246 # Calculate the data for the special tables output for these properties.
27a619f7
KW
247 if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {
248
226b74db
KW
249 # The data includes the hashes %gcb_enums, %lb_enums, etc.
250 # Similarly we calculate column headings for the tables.
251 #
27a619f7 252 # We use string evals to allow the same code to work on
226b74db 253 # all the tables
27a619f7
KW
254 my $type = lc $prop_name;
255
27a619f7
KW
256 my $placeholder = "a";
257
258 # Skip if we've already done this code, which populated
259 # this hash
260 if (eval "! \%${type}_enums") {
261
226b74db 262 # For each enum in the type ...
27a619f7
KW
263 foreach my $enum (sort keys %enums) {
264 my $value = $enums{$enum};
265 my $short;
266 my $abbreviated_from;
267
268 # Special case this wb property value to make the
269 # name more clear
270 if ($enum eq 'Perl_Tailored_HSpace') {
271 $short = 'hs';
272 $abbreviated_from = $enum;
273 }
27a619f7 274 else {
226b74db
KW
275
276 # Use the official short name, if found.
27a619f7
KW
277 ($short) = prop_value_aliases($type, $enum);
278
226b74db
KW
279 if (! defined $short) {
280
281 # But if there is no official name, use the name
282 # that came from the data (if any). Otherwise,
283 # the name had to come from the extras list.
284 # There are two types of values in that list.
285 #
286 # First are those enums that are not part of the
287 # property, but are defined by this code. By
288 # convention these have all-caps names of at least
289 # 4 characters. We use the lowercased name for
290 # thse.
291 #
292 # Second are enums that are needed to get
293 # regexec.c to compile, but don't exist in all
294 # Unicode releases. To get here, we must be
295 # compiling an earlier Unicode release that
296 # doesn't have that enum, so just use a unique
297 # anonymous name for it.
298 if (grep { $_ eq $enum } @input_enums) {
299 $short = $enum
300 }
301 elsif ($enum !~ / ^ [A-Z]{4,} $ /x) {
302 $short = $placeholder++;
303 }
304 else {
305 $short = lc $enum;
306 }
307 }
27a619f7
KW
308 }
309
310 # If our short name is too long, or we already
311 # know that the name is an abbreviation, truncate
312 # to make sure it's short enough, and remember
226b74db
KW
313 # that we did this so we can later add a comment in the
314 # generated file
27a619f7
KW
315 if ( $abbreviated_from
316 || length $short > $max_hdr_len)
317 {
318 $short = substr($short, 0, $max_hdr_len);
319 $abbreviated_from = $enum
320 unless $abbreviated_from;
321 # If the name we are to display conflicts, try
322 # another.
323 while (eval "exists
324 \$${type}_abbreviations{$short}")
325 {
289ce9cc 326 die $@ if $@;
256fceb3
KW
327
328 # The increment operator on strings doesn't work
329 # on those containing an '_', so just use the
330 # final portion.
331 my @short = split '_', $short;
332 $short[-1]++;
333 $short = join "_", @short;
289ce9cc 334 }
19a5f1d5 335
27a619f7 336 eval "\$${type}_abbreviations{$short} = '$enum'";
19a5f1d5 337 die $@ if $@;
7e54b87f 338 }
27a619f7
KW
339
340 # Remember the mapping from the property value
341 # (enum) name to its value.
342 eval "\$${type}_enums{$enum} = $value";
343 die $@ if $@;
344
345 # Remember the inverse mapping to the short name
346 # so that we can properly label the generated
347 # table's rows and columns
348 eval "\$${type}_short_enums[$value] = '$short'";
349 die $@ if $@;
7e54b87f 350 }
99f21fb9 351 }
19a5f1d5 352 }
99f21fb9 353
dd0ee239
KW
354 # Inversion map stuff is used only by regexec, unless it is in the
355 # enum exception list
356 my $where = (exists $where_to_define_enums{$name})
357 ? $where_to_define_enums{$name}
358 : 'PERL_IN_REGEXEC_C';
359 switch_pound_if($name, $where);
19a5f1d5
KW
360
361 # The short names tend to be two lower case letters, but it looks
362 # better for those if they are upper. XXX
363 $short_name = uc($short_name) if length($short_name) < 3
226b74db 364 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
19a5f1d5 365 $name_prefix = "${short_name}_";
cdc243dd
KW
366
367 # Currently unneeded
368 #print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
19a5f1d5 369
34623dbb
KW
370 if ($input_format eq 'sl') {
371 print $out_fh
372 "\n/* Negative enum values indicate the need to use an auxiliary"
373 . " table\n * consisting of the list of enums this one expands to."
374 . " The absolute\n * values of the negative enums are indices into"
375 . " a table of the auxiliary\n * tables' addresses */";
376 }
377
226b74db 378 # Start the enum definition for this map
19a5f1d5
KW
379 print $out_fh "\ntypedef enum {\n";
380 my @enum_list;
381 foreach my $enum (keys %enums) {
382 $enum_list[$enums{$enum}] = $enum;
99f21fb9 383 }
19a5f1d5 384 foreach my $i (0 .. @enum_list - 1) {
34623dbb
KW
385 print $out_fh ",\n" if $i > 0;
386
19a5f1d5
KW
387 my $name = $enum_list[$i];
388 print $out_fh "\t${name_prefix}$name = $i";
19a5f1d5 389 }
34623dbb
KW
390
391 # For an 'sl' property, we need extra enums, because some of the
392 # elements are lists. Each such distinct list is placed in its own
393 # auxiliary map table. Here, we go through the inversion map, and for
394 # each distinct list found, create an enum value for it, numbered -1,
395 # -2, ....
396 my %multiples;
397 my $aux_table_prefix = "AUX_TABLE_";
398 if ($input_format eq 'sl') {
399 foreach my $element (@$invmap) {
400
401 # A regular scalar is not one of the lists we're looking for
402 # at this stage.
403 next unless ref $element;
404
405 my $joined = join ",", sort @$element;
406 my $already_found = exists $multiples{$joined};
407
408 my $i;
409 if ($already_found) { # Use any existing one
410 $i = $multiples{$joined};
411 }
412 else { # Otherwise increment to get a new table number
413 $i = keys(%multiples) + 1;
414 $multiples{$joined} = $i;
415 }
416
417 # This changes the inversion map for this entry to not be the
418 # list
419 $element = "use_$aux_table_prefix$i";
420
421 # And add to the enum values
422 if (! $already_found) {
423 print $out_fh ",\n\t${name_prefix}$element = -$i";
424 }
425 }
426 }
427
428 print $out_fh "\n";
19a5f1d5
KW
429 $declaration_type = "${name_prefix}enum";
430 print $out_fh "} $declaration_type;\n";
226b74db 431 # Finished with the enum defintion.
19a5f1d5
KW
432
433 $output_format = "${name_prefix}%s";
34623dbb
KW
434
435 # If there are auxiliary tables, output them.
436 if (%multiples) {
437
438 print $out_fh "\n#define HAS_${name_prefix}AUX_TABLES\n";
439
440 # Invert keys and values
441 my %inverted_mults;
442 while (my ($key, $value) = each %multiples) {
443 $inverted_mults{$value} = $key;
444 }
445
446 # Output them in sorted order
447 my @sorted_table_list = sort { $a <=> $b } keys %inverted_mults;
448
449 # Keep track of how big each aux table is
450 my @aux_counts;
451
452 # Output each aux table.
453 foreach my $table_number (@sorted_table_list) {
454 my $table = $inverted_mults{$table_number};
455 print $out_fh "\nstatic const $declaration_type $name_prefix$aux_table_prefix$table_number\[] = {\n";
456
457 # Earlier, we joined the elements of this table together with a comma
458 my @elements = split ",", $table;
459
460 $aux_counts[$table_number] = scalar @elements;
461 for my $i (0 .. @elements - 1) {
462 print $out_fh ",\n" if $i > 0;
463 print $out_fh "\t${name_prefix}$elements[$i]";
464 }
465 print $out_fh "\n};\n";
466 }
467
468 # Output the table that is indexed by the absolute value of the
469 # aux table enum and contains pointers to the tables output just
470 # above
471 print $out_fh "\nstatic const $declaration_type * const ${name_prefix}${aux_table_prefix}ptrs\[] = {\n";
472 print $out_fh "\tNULL,\t/* Placeholder */\n";
473 for my $i (1 .. @sorted_table_list) {
474 print $out_fh ",\n" if $i > 1;
475 print $out_fh "\t$name_prefix$aux_table_prefix$i";
476 }
477 print $out_fh "\n};\n";
478
479 print $out_fh
480 "\n/* Parallel table to the above, giving the number of elements"
481 . " in each table\n * pointed to */\n";
482 print $out_fh "static const U8 ${name_prefix}${aux_table_prefix}lengths\[] = {\n";
483 print $out_fh "\t0,\t/* Placeholder */\n";
484 for my $i (1 .. @sorted_table_list) {
485 print $out_fh ",\n" if $i > 1;
486 print $out_fh "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */";
487 }
488 print $out_fh "\n};\n";
489 } # End of outputting the auxiliary and associated tables
463b4a67
KW
490
491 # The scx property used in regexec.c needs a specialized table which
492 # is most convenient to output here, while the data structures set up
493 # above are still extant. This table contains the code point that is
494 # the zero digit of each script, indexed by script enum value.
495 if (lc $short_name eq 'scx') {
496 my @decimals_invlist = prop_invlist("Numeric_Type=Decimal");
497 my %script_zeros;
498
499 # Find all the decimal digits. The 0 of each range is always the
500 # 0th element, except in some early Unicode releases, so check for
501 # that.
502 for (my $i = 0; $i < @decimals_invlist; $i += 2) {
503 my $code_point = $decimals_invlist[$i];
504 next if chr($code_point) !~ /\p{Nv=0}/;
505
506 # Turn the scripts this zero is in into a list.
507 my @scripts = split ",",
508 charprop($code_point, "_Perl_SCX", '_perl_core_internal_ok');
509 $code_point = sprintf("0x%x", $code_point);
510
511 foreach my $script (@scripts) {
512 if (! exists $script_zeros{$script}) {
513 $script_zeros{$script} = $code_point;
514 }
515 elsif (ref $script_zeros{$script}) {
516 push $script_zeros{$script}->@*, $code_point;
517 }
518 else { # Turn into a list if this is the 2nd zero of the
519 # script
520 my $existing = $script_zeros{$script};
521 undef $script_zeros{$script};
522 push $script_zeros{$script}->@*, $existing, $code_point;
523 }
524 }
525 }
526
527 # @script_zeros contains the zero, sorted by the script's enum
528 # value
529 my @script_zeros;
530 foreach my $script (keys %script_zeros) {
531 my $enum_value = $enums{$script};
532 $script_zeros[$enum_value] = $script_zeros{$script};
533 }
534
535 print $out_fh
536 "\n/* This table, indexed by the script enum, gives the zero"
537 . " code point for that\n * script; 0 if the script has multiple"
538 . " digit sequences. Scripts without a\n * digit sequence use"
539 . " ASCII [0-9], hence are marked '0' */\n";
540 print $out_fh "static const UV script_zeros[] = {\n";
541 for my $i (0 .. @script_zeros - 1) {
542 my $code_point = $script_zeros[$i];
543 if (defined $code_point) {
544 $code_point = " 0" if ref $code_point;
545 print $out_fh "\t$code_point";
546 }
547 elsif (lc $enum_list[$i] eq 'inherited') {
548 print $out_fh "\t 0";
549 }
550 else { # The only digits a script without its own set accepts
551 # is [0-9]
552 print $out_fh "\t'0'";
553 }
554 print $out_fh "," if $i < @script_zeros - 1;
555 print $out_fh "\t/* $enum_list[$i] */";
556 print $out_fh "\n";
557 }
558 print $out_fh "};\n";
559 } # End of special handling of scx
99f21fb9
KW
560 }
561 else {
562 die "'$input_format' invmap() format for '$prop_name' unimplemented";
563 }
564
565 die "No inversion map for $prop_name" unless defined $invmap
566 && ref $invmap eq 'ARRAY'
567 && $count;
568
226b74db 569 # Now output the inversion map proper
99f21fb9
KW
570 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
571 print $out_fh " /* for $charset */" if $charset;
572 print $out_fh "\n";
573
574 # The main body are the scalars passed in to this routine.
575 for my $i (0 .. $count - 1) {
576 my $element = $invmap->[$i];
02f811dd
KW
577 my $full_element_name = prop_value_aliases($prop_name, $element);
578 $element = $full_element_name if defined $full_element_name;
579 $element = $name_prefix . $element;
99f21fb9
KW
580 print $out_fh "\t$element";
581 print $out_fh "," if $i < $count - 1;
582 print $out_fh "\n";
583 }
584 print $out_fh "};\n";
99f21fb9
KW
585}
586
5a7e5385 587sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
588
589 # Returns an inversion list constructed from the sorted input array of
590 # code points
591
592 my $list_ref = shift;
593
99f21fb9
KW
594 return unless @$list_ref;
595
a02047bf
KW
596 # Initialize to just the first element
597 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
598
599 # For each succeeding element, if it extends the previous range, adjust
600 # up, otherwise add it.
601 for my $i (1 .. @$list_ref - 1) {
602 if ($invlist[-1] == $list_ref->[$i]) {
603 $invlist[-1]++;
604 }
605 else {
606 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
607 }
608 }
609 return @invlist;
610}
611
612# Read in the Case Folding rules, and construct arrays of code points for the
613# properties we need.
614my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
615die "Could not find inversion map for Case_Folding" unless defined $format;
616die "Incorrect format '$format' for Case_Folding inversion map"
347b9066
KW
617 unless $format eq 'al'
618 || $format eq 'a';
a02047bf
KW
619my @has_multi_char_fold;
620my @is_non_final_fold;
621
622for my $i (0 .. @$folds_ref - 1) {
623 next unless ref $folds_ref->[$i]; # Skip single-char folds
624 push @has_multi_char_fold, $cp_ref->[$i];
625
b6a6e956 626 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
627 # position
628 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
629 push @is_non_final_fold, $folds_ref->[$i][$j]
630 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
631 }
632}
633
a02047bf
KW
634sub _Perl_Non_Final_Folds {
635 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 636 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
637}
638
99f21fb9
KW
639sub prop_name_for_cmp ($) { # Sort helper
640 my $name = shift;
641
642 # Returns the input lowercased, with non-alphas removed, as well as
643 # everything starting with a comma
644
645 $name =~ s/,.*//;
646 $name =~ s/[[:^alpha:]]//g;
647 return lc $name;
648}
649
892d8259 650sub UpperLatin1 {
5a7e5385 651 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
652}
653
289ce9cc
KW
654sub output_table_common {
655
656 # Common subroutine to actually output the generated rules table.
657
658 my ($property,
659 $table_value_defines_ref,
660 $table_ref,
661 $names_ref,
662 $abbreviations_ref) = @_;
663 my $size = @$table_ref;
664
665 # Output the #define list, sorted by numeric value
666 if ($table_value_defines_ref) {
667 my $max_name_length = 0;
668 my @defines;
669
670 # Put in order, and at the same time find the longest name
671 while (my ($enum, $value) = each %$table_value_defines_ref) {
672 $defines[$value] = $enum;
673
674 my $length = length $enum;
675 $max_name_length = $length if $length > $max_name_length;
676 }
677
678 print $out_fh "\n";
679
680 # Output, so that the values are vertically aligned in a column after
681 # the longest name
682 foreach my $i (0 .. @defines - 1) {
683 next unless defined $defines[$i];
684 printf $out_fh "#define %-*s %2d\n",
685 $max_name_length,
686 $defines[$i],
687 $i;
688 }
689 }
690
691 my $column_width = 2; # We currently allow 2 digits for the number
692
693 # If the maximum value in the table is 1, it can be a bool. (Being above
694 # a U8 is not currently handled
695 my $max_element = 0;
696 for my $i (0 .. $size - 1) {
697 for my $j (0 .. $size - 1) {
698 next if $max_element >= $table_ref->[$i][$j];
699 $max_element = $table_ref->[$i][$j];
700 }
701 }
702 die "Need wider table column width given '$max_element"
703 if length $max_element > $column_width;
704
705 my $table_type = ($max_element == 1)
706 ? 'bool'
707 : 'U8';
708
709 # If a name is longer than the width set aside for a column, its column
710 # needs to have increased spacing so that the name doesn't get truncated
711 # nor run into an adjacent column
712 my @spacers;
713
714 # If we are being compiled on a Unicode version earlier than that which
715 # this file was designed for, it may be that some of the property values
716 # aren't in the current release, and so would be undefined if we didn't
717 # define them ourselves. Earlier code has done this, making them
718 # lowercase characters of length one. We look to see if any exist, so
719 # that we can add an annotation to the output table
720 my $has_placeholder = 0;
721
722 for my $i (0 .. $size - 1) {
723 no warnings 'numeric';
724 $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
725 $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
726 }
727
728 print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
729
730 # Calculate the column heading line
731 my $header_line = "/* "
732 . (" " x $max_hdr_len) # We let the row heading meld to
733 # the '*/' for those that are at
734 # the max
735 . " " x 3; # Space for '*/ '
736 # Now each column
737 for my $i (0 .. $size - 1) {
738 $header_line .= sprintf "%s%*s",
739 $spacers[$i],
740 $column_width + 1, # 1 for the ','
741 $names_ref->[$i];
742 }
743 $header_line .= " */\n";
744
745 # If we have annotations, output it now.
746 if ($has_placeholder || scalar %$abbreviations_ref) {
747 my $text = "";
748 foreach my $abbr (sort keys %$abbreviations_ref) {
749 $text .= "; " if $text;
750 $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
751 }
752 if ($has_placeholder) {
753 $text .= "; other " if $text;
754 $text .= "lowercase names are placeholders for"
755 . " property values not defined until a later Unicode"
756 . " release, so are irrelevant in this one, as they are"
757 . " not assigned to any code points";
758 }
759
760 my $indent = " " x 3;
761 $text = $indent . "/* $text */";
762
763 # Wrap the text so that it is no wider than the table, which the
764 # header line gives.
765 my $output_width = length $header_line;
766 while (length $text > $output_width) {
767 my $cur_line = substr($text, 0, $output_width);
768
769 # Find the first blank back from the right end to wrap at.
770 for (my $i = $output_width -1; $i > 0; $i--) {
771 if (substr($text, $i, 1) eq " ") {
772 print $out_fh substr($text, 0, $i), "\n";
773
774 # Set so will look at just the remaining tail (which will
775 # be indented and have a '*' after the indent
776 $text = $indent . " * " . substr($text, $i + 1);
777 last;
778 }
779 }
780 }
781
782 # And any remaining
783 print $out_fh $text, "\n" if $text;
784 }
785
786 # We calculated the header line earlier just to get its width so that we
787 # could make sure the annotations fit into that.
788 print $out_fh $header_line;
789
790 # Now output the bulk of the table.
791 for my $i (0 .. $size - 1) {
792
793 # First the row heading.
794 printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
795 print $out_fh "{"; # Then the brace for this row
796
797 # Then each column
798 for my $j (0 .. $size -1) {
799 print $out_fh $spacers[$j];
800 printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
801 print $out_fh "," if $j < $size - 1;
802 }
803 print $out_fh " }";
804 print $out_fh "," if $i < $size - 1;
805 print $out_fh "\n";
806 }
807
808 print $out_fh "};\n";
809}
810
973a28ed
KW
811sub output_GCB_table() {
812
813 # Create and output the pair table for use in determining Grapheme Cluster
814 # Breaks, given in http://www.unicode.org/reports/tr29/.
b0e24409
KW
815 my %gcb_actions = (
816 GCB_NOBREAK => 0,
817 GCB_BREAKABLE => 1,
818 GCB_RI_then_RI => 2, # Rules 12 and 13
819 GCB_EX_then_EM => 3, # Rule 10
820 );
973a28ed
KW
821
822 # The table is constructed in reverse order of the rules, to make the
823 # lower-numbered, higher priority ones override the later ones, as the
824 # algorithm stops at the earliest matching rule
825
826 my @gcb_table;
827 my $table_size = @gcb_short_enums;
828
829 # Otherwise, break everywhere.
b0e24409 830 # GB99 Any ÷ Any
973a28ed
KW
831 for my $i (0 .. $table_size - 1) {
832 for my $j (0 .. $table_size - 1) {
833 $gcb_table[$i][$j] = 1;
834 }
835 }
836
b0e24409
KW
837 # Do not break within emoji flag sequences. That is, do not break between
838 # regional indicator (RI) symbols if there is an odd number of RI
839 # characters before the break point. Must be resolved in runtime code.
840 #
c492f156 841 # GB12 sot (RI RI)* RI × RI
b0e24409
KW
842 # GB13 [^RI] (RI RI)* RI × RI
843 $gcb_table[$gcb_enums{'Regional_Indicator'}]
844 [$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI};
845
846 # Do not break within emoji modifier sequences or emoji zwj sequences.
847 # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
848 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0;
849 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0;
850
851 # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier
852 $gcb_table[$gcb_enums{'Extend'}][$gcb_enums{'E_Modifier'}]
853 = $gcb_actions{GCB_EX_then_EM};
854 $gcb_table[$gcb_enums{'E_Base'}][$gcb_enums{'E_Modifier'}] = 0;
855 $gcb_table[$gcb_enums{'E_Base_GAZ'}][$gcb_enums{'E_Modifier'}] = 0;
856
857 # Do not break before extending characters or ZWJ.
973a28ed 858 # Do not break before SpacingMarks, or after Prepend characters.
973a28ed 859 # GB9b Prepend ×
b0e24409
KW
860 # GB9a × SpacingMark
861 # GB9 × ( Extend | ZWJ )
973a28ed 862 for my $i (0 .. @gcb_table - 1) {
289ce9cc 863 $gcb_table[$gcb_enums{'Prepend'}][$i] = 0;
b0e24409
KW
864 $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0;
865 $gcb_table[$i][$gcb_enums{'Extend'}] = 0;
866 $gcb_table[$i][$gcb_enums{'ZWJ'}] = 0;
973a28ed
KW
867 }
868
973a28ed
KW
869 # Do not break Hangul syllable sequences.
870 # GB8 ( LVT | T) × T
871 $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
872 $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
873
874 # GB7 ( LV | V ) × ( V | T )
875 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
876 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
877 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
878 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
879
880 # GB6 L × ( L | V | LV | LVT )
881 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
882 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
883 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
884 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
885
289ce9cc
KW
886 # Do not break between a CR and LF. Otherwise, break before and after
887 # controls.
973a28ed
KW
888 # GB5 ÷ ( Control | CR | LF )
889 # GB4 ( Control | CR | LF ) ÷
890 for my $i (0 .. @gcb_table - 1) {
289ce9cc 891 $gcb_table[$i][$gcb_enums{'Control'}] = 1;
973a28ed
KW
892 $gcb_table[$i][$gcb_enums{'CR'}] = 1;
893 $gcb_table[$i][$gcb_enums{'LF'}] = 1;
289ce9cc 894 $gcb_table[$gcb_enums{'Control'}][$i] = 1;
973a28ed
KW
895 $gcb_table[$gcb_enums{'CR'}][$i] = 1;
896 $gcb_table[$gcb_enums{'LF'}][$i] = 1;
897 }
898
899 # GB3 CR × LF
900 $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
901
b0e24409 902 # Break at the start and end of text, unless the text is empty
973a28ed
KW
903 # GB1 sot ÷
904 # GB2 ÷ eot
905 for my $i (0 .. @gcb_table - 1) {
289ce9cc
KW
906 $gcb_table[$i][$gcb_enums{'EDGE'}] = 1;
907 $gcb_table[$gcb_enums{'EDGE'}][$i] = 1;
973a28ed 908 }
289ce9cc 909 $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0;
973a28ed 910
b0e24409 911 output_table_common('GCB', \%gcb_actions,
289ce9cc 912 \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
973a28ed
KW
913}
914
6b659339
KW
915sub output_LB_table() {
916
917 # Create and output the enums, #defines, and pair table for use in
918 # determining Line Breaks. This uses the default line break algorithm,
919 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
920 # in that page, as the Unicode-furnished tests assume that tailoring.
921
6b659339
KW
922 # The result is really just true or false. But we follow along with tr14,
923 # creating a rule which is false for something like X SP* X. That gets
924 # encoding 2. The rest of the actions are synthetic ones that indicate
925 # some context handling is required. These each are added to the
926 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
927 # value can be retrieved. Actually only rules from 7 through 18 (which
928 # are the ones where space matter) are possible to have 2 added to them.
929 # The others below add just 0 or 1. It might be possible for one
930 # synthetic rule to be added to another, yielding a larger value. This
931 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
932 # names of the middle grouping below, it is impossible for that to occur
933 # for them because they all start with mutually exclusive classes. That
934 # the final rule can't be added to any of the others isn't obvious from
935 # its name, so it is assigned a power of 2 higher than the others can get
936 # to so any addition would preserve all data. (And the code will reach an
937 # assert(0) on debugging builds should this happen.)
938 my %lb_actions = (
939 LB_NOBREAK => 0,
940 LB_BREAKABLE => 1,
941 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
942
b0e24409 943 LB_CM_ZWJ_foo => 3, # Rule 9
6b659339
KW
944 LB_SP_foo => 6, # Rule 18
945 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
946 LB_SY_or_IS_then_various => 11, # Rule 25
947 LB_HY_or_BA_then_foo => 13, # Rule 21
b0e24409 948 LB_RI_then_RI => 15, # Rule 30a
6b659339 949
b0e24409 950 LB_various_then_PO_or_PR => (1<<5), # Rule 25
6b659339
KW
951 );
952
6b659339
KW
953 # Construct the LB pair table. This is based on the rules in
954 # http://www.unicode.org/reports/tr14/, but modified as those rules are
955 # designed for someone taking a string of text and sequentially going
956 # through it to find the break opportunities, whereas, Perl requires
957 # determining if a given random spot is a break opportunity, without
958 # knowing all the entire string before it.
959 #
960 # The table is constructed in reverse order of the rules, to make the
961 # lower-numbered, higher priority ones override the later ones, as the
962 # algorithm stops at the earliest matching rule
963
964 my @lb_table;
965 my $table_size = @lb_short_enums;
966
967 # LB31. Break everywhere else
968 for my $i (0 .. $table_size - 1) {
969 for my $j (0 .. $table_size - 1) {
970 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
971 }
972 }
973
b0e24409
KW
974 # LB30b Do not break between an emoji base and an emoji modifier.
975 # EB × EM
976 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
977 = $lb_actions{'LB_NOBREAK'};
978
979 # LB30a Break between two regional indicator symbols if and only if there
980 # are an even number of regional indicators preceding the position of the
981 # break.
982 # sot (RI RI)* RI × RI
983 # [^RI] (RI RI)* RI × RI
289ce9cc 984 $lb_table[$lb_enums{'Regional_Indicator'}]
b0e24409 985 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
6b659339
KW
986
987 # LB30 Do not break between letters, numbers, or ordinary symbols and
988 # opening or closing parentheses.
989 # (AL | HL | NU) × OP
289ce9cc
KW
990 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
991 = $lb_actions{'LB_NOBREAK'};
992 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
993 = $lb_actions{'LB_NOBREAK'};
994 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
995 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
996
997 # CP × (AL | HL | NU)
289ce9cc
KW
998 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
999 = $lb_actions{'LB_NOBREAK'};
1000 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
1001 = $lb_actions{'LB_NOBREAK'};
1002 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
1003 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1004
1005 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
1006 # IS × (AL | HL)
289ce9cc
KW
1007 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
1008 = $lb_actions{'LB_NOBREAK'};
1009 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1010 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1011
1012 # LB28 Do not break between alphabetics (“at”).
1013 # (AL | HL) × (AL | HL)
289ce9cc
KW
1014 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
1015 = $lb_actions{'LB_NOBREAK'};
1016 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
1017 = $lb_actions{'LB_NOBREAK'};
1018 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
1019 = $lb_actions{'LB_NOBREAK'};
1020 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
1021 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1022
1023 # LB27 Treat a Korean Syllable Block the same as ID.
1024 # (JL | JV | JT | H2 | H3) × IN
289ce9cc
KW
1025 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
1026 = $lb_actions{'LB_NOBREAK'};
1027 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
1028 = $lb_actions{'LB_NOBREAK'};
1029 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
1030 = $lb_actions{'LB_NOBREAK'};
1031 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
1032 = $lb_actions{'LB_NOBREAK'};
1033 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
1034 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1035
1036 # (JL | JV | JT | H2 | H3) × PO
289ce9cc
KW
1037 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
1038 = $lb_actions{'LB_NOBREAK'};
1039 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
1040 = $lb_actions{'LB_NOBREAK'};
1041 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
1042 = $lb_actions{'LB_NOBREAK'};
1043 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
1044 = $lb_actions{'LB_NOBREAK'};
1045 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
1046 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1047
1048 # PR × (JL | JV | JT | H2 | H3)
289ce9cc
KW
1049 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
1050 = $lb_actions{'LB_NOBREAK'};
1051 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
1052 = $lb_actions{'LB_NOBREAK'};
1053 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
1054 = $lb_actions{'LB_NOBREAK'};
1055 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
1056 = $lb_actions{'LB_NOBREAK'};
1057 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
1058 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1059
1060 # LB26 Do not break a Korean syllable.
1061 # JL × (JL | JV | H2 | H3)
1062 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
1063 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
1064 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
1065 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
1066
1067 # (JV | H2) × (JV | JT)
1068 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
1069 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
1070 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
1071 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
1072
1073 # (JT | H3) × JT
1074 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
1075 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
1076
1077 # LB25 Do not break between the following pairs of classes relevant to
1078 # numbers, as tailored by example 7 in
1079 # http://www.unicode.org/reports/tr14/#Examples
1080 # We follow that tailoring because Unicode's test cases expect it
1081 # (PR | PO) × ( OP | HY )? NU
289ce9cc
KW
1082 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
1083 = $lb_actions{'LB_NOBREAK'};
1084 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
1085 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1086
1087 # Given that (OP | HY )? is optional, we have to test for it in code.
1088 # We add in the action (instead of overriding) for this, so that in
1089 # the code we can recover the underlying break value.
289ce9cc 1090 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1091 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1092 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1093 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1094 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339 1095 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1096 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339
KW
1097 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1098
1099 # ( OP | HY ) × NU
289ce9cc
KW
1100 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
1101 = $lb_actions{'LB_NOBREAK'};
1102 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
1103 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1104
1105 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
1106 # which can be rewritten as:
1107 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
289ce9cc
KW
1108 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
1109 = $lb_actions{'LB_NOBREAK'};
1110 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
1111 = $lb_actions{'LB_NOBREAK'};
1112 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
1113 = $lb_actions{'LB_NOBREAK'};
1114 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
1115 = $lb_actions{'LB_NOBREAK'};
1116 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
1117 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1118
1119 # Like earlier where we have to test in code, we add in the action so
1120 # that we can recover the underlying values. This is done in rules
1121 # below, as well. The code assumes that we haven't added 2 actions.
1122 # Shoul a later Unicode release break that assumption, then tests
1123 # should start failing.
289ce9cc 1124 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
6b659339 1125 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1126 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
6b659339 1127 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1128 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
6b659339 1129 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1130 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
6b659339 1131 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1132 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
6b659339 1133 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1134 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
6b659339 1135 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1136 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
6b659339 1137 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1138 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
6b659339 1139 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1140 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
6b659339 1141 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1142 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
6b659339
KW
1143 += $lb_actions{'LB_SY_or_IS_then_various'};
1144
1145 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1146 # which can be rewritten as:
1147 # NU (SY | IS)* (CL | CP)? × (PO | PR)
289ce9cc
KW
1148 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1149 = $lb_actions{'LB_NOBREAK'};
1150 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1151 = $lb_actions{'LB_NOBREAK'};
6b659339 1152
289ce9cc 1153 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1154 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1155 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1156 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1157 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1158 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1159 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
6b659339
KW
1160 += $lb_actions{'LB_various_then_PO_or_PR'};
1161
289ce9cc 1162 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1163 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1164 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1165 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1166 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1167 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1168 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
6b659339
KW
1169 += $lb_actions{'LB_various_then_PO_or_PR'};
1170
b0e24409
KW
1171 # LB24 Do not break between numeric prefix/postfix and letters, or between
1172 # letters and prefix/postfix.
1173 # (PR | PO) × (AL | HL)
289ce9cc
KW
1174 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1175 = $lb_actions{'LB_NOBREAK'};
1176 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1177 = $lb_actions{'LB_NOBREAK'};
289ce9cc
KW
1178 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1179 = $lb_actions{'LB_NOBREAK'};
1180 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1181 = $lb_actions{'LB_NOBREAK'};
6b659339 1182
b0e24409
KW
1183 # (AL | HL) × (PR | PO)
1184 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
1185 = $lb_actions{'LB_NOBREAK'};
1186 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
1187 = $lb_actions{'LB_NOBREAK'};
1188 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
1189 = $lb_actions{'LB_NOBREAK'};
1190 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
1191 = $lb_actions{'LB_NOBREAK'};
1192
1193 # LB23a Do not break between numeric prefixes and ideographs, or between
1194 # ideographs and numeric postfixes.
1195 # PR × (ID | EB | EM)
1196 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1197 = $lb_actions{'LB_NOBREAK'};
1198 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
1199 = $lb_actions{'LB_NOBREAK'};
1200 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
1201 = $lb_actions{'LB_NOBREAK'};
1202
1203 # (ID | EB | EM) × PO
289ce9cc
KW
1204 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1205 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1206 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1207 = $lb_actions{'LB_NOBREAK'};
1208 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1209 = $lb_actions{'LB_NOBREAK'};
6b659339 1210
b0e24409 1211 # LB23 Do not break between digits and letters
6b659339 1212 # (AL | HL) × NU
289ce9cc
KW
1213 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1214 = $lb_actions{'LB_NOBREAK'};
1215 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1216 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1217
1218 # NU × (AL | HL)
289ce9cc
KW
1219 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1220 = $lb_actions{'LB_NOBREAK'};
1221 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1222 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1223
1224 # LB22 Do not break between two ellipses, or between letters, numbers or
1225 # exclamations and ellipsis.
1226 # (AL | HL) × IN
289ce9cc
KW
1227 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1228 = $lb_actions{'LB_NOBREAK'};
1229 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1230 = $lb_actions{'LB_NOBREAK'};
6b659339 1231
289ce9cc
KW
1232 # Exclamation × IN
1233 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1234 = $lb_actions{'LB_NOBREAK'};
6b659339 1235
b0e24409 1236 # (ID | EB | EM) × IN
289ce9cc
KW
1237 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1238 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1239 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1240 = $lb_actions{'LB_NOBREAK'};
1241 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1242 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1243
1244 # IN × IN
289ce9cc
KW
1245 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1246 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1247
1248 # NU × IN
289ce9cc
KW
1249 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1250 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1251
1252 # LB21b Don’t break between Solidus and Hebrew letters.
1253 # SY × HL
289ce9cc
KW
1254 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1255 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1256
1257 # LB21a Don't break after Hebrew + Hyphen.
1258 # HL (HY | BA) ×
1259 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1260 $lb_table[$lb_enums{'Hyphen'}][$i]
1261 += $lb_actions{'LB_HY_or_BA_then_foo'};
1262 $lb_table[$lb_enums{'Break_After'}][$i]
1263 += $lb_actions{'LB_HY_or_BA_then_foo'};
6b659339
KW
1264 }
1265
1266 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1267 # spaces, small kana, and other non-starters, or after acute accents.
1268 # × BA
1269 # × HY
1270 # × NS
1271 # BB ×
1272 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1273 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1274 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1275 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1276 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1277 }
1278
1279 # LB20 Break before and after unresolved CB.
1280 # ÷ CB
1281 # CB ÷
1282 # Conditional breaks should be resolved external to the line breaking
1283 # rules. However, the default action is to treat unresolved CB as breaking
1284 # before and after.
1285 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1286 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1287 = $lb_actions{'LB_BREAKABLE'};
1288 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1289 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1290 }
1291
1292 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1293 # × QU
1294 # QU ×
1295 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1296 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1297 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1298 }
1299
1300 # LB18 Break after spaces
1301 # SP ÷
1302 for my $i (0 .. @lb_table - 1) {
289ce9cc 1303 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1304 }
1305
1306 # LB17 Do not break within ‘——’, even with intervening spaces.
1307 # B2 SP* × B2
289ce9cc 1308 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
6b659339
KW
1309 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1310
1311 # LB16 Do not break between closing punctuation and a nonstarter even with
1312 # intervening spaces.
1313 # (CL | CP) SP* × NS
289ce9cc 1314 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
6b659339 1315 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1316 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
6b659339
KW
1317 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1318
1319
1320 # LB15 Do not break within ‘”[’, even with intervening spaces.
1321 # QU SP* × OP
289ce9cc 1322 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
6b659339
KW
1323 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1324
1325 # LB14 Do not break after ‘[’, even after spaces.
1326 # OP SP* ×
1327 for my $i (0 .. @lb_table - 1) {
289ce9cc 1328 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
6b659339
KW
1329 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1330 }
1331
1332 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1333 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1334 # [^NU] × CL
1335 # [^NU] × CP
1336 # × EX
1337 # [^NU] × IS
1338 # [^NU] × SY
1339 for my $i (0 .. @lb_table - 1) {
289ce9cc 1340 $lb_table[$i][$lb_enums{'Exclamation'}]
6b659339
KW
1341 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1342
289ce9cc 1343 next if $i == $lb_enums{'Numeric'};
6b659339 1344
289ce9cc 1345 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
6b659339 1346 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1347 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
6b659339 1348 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1349 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
6b659339 1350 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1351 $lb_table[$i][$lb_enums{'Break_Symbols'}]
6b659339
KW
1352 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1353 }
1354
1355 # LB12a Do not break before NBSP and related characters, except after
1356 # spaces and hyphens.
1357 # [^SP BA HY] × GL
1358 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1359 next if $i == $lb_enums{'Space'}
1360 || $i == $lb_enums{'Break_After'}
1361 || $i == $lb_enums{'Hyphen'};
6b659339
KW
1362
1363 # We don't break, but if a property above has said don't break even
1364 # with space between, don't override that (also in the next few rules)
289ce9cc 1365 next if $lb_table[$i][$lb_enums{'Glue'}]
6b659339 1366 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1367 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1368 }
1369
1370 # LB12 Do not break after NBSP and related characters.
1371 # GL ×
1372 for my $i (0 .. @lb_table - 1) {
289ce9cc 1373 next if $lb_table[$lb_enums{'Glue'}][$i]
6b659339 1374 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1375 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1376 }
1377
1378 # LB11 Do not break before or after Word joiner and related characters.
1379 # × WJ
1380 # WJ ×
1381 for my $i (0 .. @lb_table - 1) {
289ce9cc 1382 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
6b659339
KW
1383 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1384 {
289ce9cc 1385 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
6b659339 1386 }
289ce9cc 1387 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
6b659339
KW
1388 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1389 {
289ce9cc 1390 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1391 }
1392 }
1393
1394 # Special case this here to avoid having to do a special case in the code,
1395 # by making this the same as other things with a SP in front of them that
1396 # don't break, we avoid an extra test
289ce9cc 1397 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
6b659339
KW
1398 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1399
1400 # LB9 and LB10 are done in the same loop
1401 #
1402 # LB9 Do not break a combining character sequence; treat it as if it has
1403 # the line breaking class of the base character in all of the
b0e24409
KW
1404 # higher-numbered rules. Treat ZWJ as if it were CM
1405 # Treat X (CM|ZWJ)* as if it were X.
6b659339
KW
1406 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1407
b0e24409
KW
1408 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the
1409 # case where a CM or ZWJ is the first character on the line or follows SP,
1410 # BK, CR, LF, NL, or ZW.
6b659339
KW
1411 for my $i (0 .. @lb_table - 1) {
1412
b0e24409
KW
1413 # When the CM or ZWJ is the first in the pair, we don't know without
1414 # looking behind whether the CM or ZWJ is going to attach to an
1415 # earlier character, or not. So have to figure this out at runtime in
1416 # the code
1417 $lb_table[$lb_enums{'Combining_Mark'}][$i]
1418 = $lb_actions{'LB_CM_ZWJ_foo'};
1419 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
289ce9cc
KW
1420
1421 if ( $i == $lb_enums{'Mandatory_Break'}
1422 || $i == $lb_enums{'EDGE'}
1423 || $i == $lb_enums{'Carriage_Return'}
1424 || $i == $lb_enums{'Line_Feed'}
1425 || $i == $lb_enums{'Next_Line'}
1426 || $i == $lb_enums{'Space'}
1427 || $i == $lb_enums{'ZWSpace'})
6b659339
KW
1428 {
1429 # For these classes, a following CM doesn't combine, and should do
289ce9cc
KW
1430 # whatever 'Alphabetic' would do.
1431 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1432 = $lb_table[$i][$lb_enums{'Alphabetic'}];
b0e24409
KW
1433 $lb_table[$i][$lb_enums{'ZWJ'}]
1434 = $lb_table[$i][$lb_enums{'Alphabetic'}];
6b659339
KW
1435 }
1436 else {
b0e24409
KW
1437 # For these classes, the CM or ZWJ combines, so doesn't break,
1438 # inheriting the type of nobreak from the master character.
289ce9cc 1439 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
6b659339
KW
1440 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1441 {
289ce9cc
KW
1442 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1443 = $lb_actions{'LB_NOBREAK'};
6b659339 1444 }
b0e24409
KW
1445 if ($lb_table[$i][$lb_enums{'ZWJ'}]
1446 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1447 {
1448 $lb_table[$i][$lb_enums{'ZWJ'}]
1449 = $lb_actions{'LB_NOBREAK'};
1450 }
6b659339
KW
1451 }
1452 }
1453
b0e24409
KW
1454 # LB8a Do not break between a zero width joiner and an ideograph, emoji
1455 # base or emoji modifier. This rule prevents breaks within emoji joiner
1456 # sequences.
1457 # ZWJ × (ID | EB | EM)
1458 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1459 = $lb_actions{'LB_NOBREAK'};
1460 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1461 = $lb_actions{'LB_NOBREAK'};
1462 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1463 = $lb_actions{'LB_NOBREAK'};
1464
6b659339
KW
1465 # LB8 Break before any character following a zero-width space, even if one
1466 # or more spaces intervene.
1467 # ZW SP* ÷
1468 for my $i (0 .. @lb_table - 1) {
289ce9cc 1469 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1470 }
1471
1472 # Because of LB8-10, we need to look at context for "SP x", and this must
1473 # be done in the code. So override the existing rules for that, by adding
1474 # a constant to get new rules that tell the code it needs to look at
1475 # context. By adding this action instead of replacing the existing one,
1476 # we can get back to the original rule if necessary.
1477 for my $i (0 .. @lb_table - 1) {
289ce9cc 1478 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
6b659339
KW
1479 }
1480
1481 # LB7 Do not break before spaces or zero width space.
1482 # × SP
1483 # × ZW
1484 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1485 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1486 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1487 }
1488
1489 # LB6 Do not break before hard line breaks.
1490 # × ( BK | CR | LF | NL )
1491 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1492 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1493 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1494 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1495 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1496 }
1497
1498 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1499 # CR × LF
1500 # CR !
1501 # LF !
1502 # NL !
1503 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1504 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1505 = $lb_actions{'LB_BREAKABLE'};
1506 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1507 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339 1508 }
289ce9cc
KW
1509 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1510 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1511
1512 # LB4 Always break after hard line breaks.
1513 # BK !
1514 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1515 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1516 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1517 }
1518
6b659339
KW
1519 # LB3 Always break at the end of text.
1520 # ! eot
b0e24409
KW
1521 # LB2 Never break at the start of text.
1522 # sot ×
6b659339 1523 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1524 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1525 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1526 }
1527
1528 # LB1 Assign a line breaking class to each code point of the input.
1529 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1530 # depending on criteria outside the scope of this algorithm.
1531 #
1532 # In the absence of such criteria all characters with a specific
1533 # combination of original class and General_Category property value are
1534 # resolved as follows:
1535 # Original Resolved General_Category
1536 # AI, SG, XX AL Any
1537 # SA CM Only Mn or Mc
1538 # SA AL Any except Mn and Mc
1539 # CJ NS Any
1540 #
1541 # This is done in mktables, so we never see any of the remapped-from
1542 # classes.
1543
289ce9cc
KW
1544 output_table_common('LB', \%lb_actions,
1545 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
6b659339
KW
1546}
1547
7e54b87f
KW
1548sub output_WB_table() {
1549
1550 # Create and output the enums, #defines, and pair table for use in
1551 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1552
1553 # This uses the same mechanism in the other bounds tables generated by
1554 # this file. The actions that could override a 0 or 1 are added to those
1555 # numbers; the actions that clearly don't depend on the underlying rule
1556 # simply overwrite
1557 my %wb_actions = (
1558 WB_NOBREAK => 0,
1559 WB_BREAKABLE => 1,
1560 WB_hs_then_hs => 2,
b0e24409 1561 WB_Ex_or_FO_or_ZWJ_then_foo => 3,
7e54b87f
KW
1562 WB_DQ_then_HL => 4,
1563 WB_HL_then_DQ => 6,
1564 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1565 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1566 WB_MB_or_MN_or_SQ_then_NU => 12,
1567 WB_NU_then_MB_or_MN_or_SQ => 14,
b0e24409 1568 WB_RI_then_RI => 16,
7e54b87f
KW
1569 );
1570
7e54b87f
KW
1571 # Construct the WB pair table.
1572 # The table is constructed in reverse order of the rules, to make the
1573 # lower-numbered, higher priority ones override the later ones, as the
1574 # algorithm stops at the earliest matching rule
1575
1576 my @wb_table;
1577 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
39c4defe 1578 die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i;
7e54b87f
KW
1579
1580 # Otherwise, break everywhere (including around ideographs).
b0e24409 1581 # WB99 Any ÷ Any
7e54b87f
KW
1582 for my $i (0 .. $table_size - 1) {
1583 for my $j (0 .. $table_size - 1) {
1584 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1585 }
1586 }
1587
b0e24409
KW
1588 # Do not break within emoji flag sequences. That is, do not break between
1589 # regional indicator (RI) symbols if there is an odd number of RI
1590 # characters before the break point.
1591 # WB16 [^RI] (RI RI)* RI × RI
c492f156 1592 # WB15 sot (RI RI)* RI × RI
289ce9cc 1593 $wb_table[$wb_enums{'Regional_Indicator'}]
b0e24409
KW
1594 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};
1595
1596 # Do not break within emoji modifier sequences.
1597 # WB14 ( E_Base | EBG ) × E_Modifier
1598 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}]
1599 = $wb_actions{'WB_NOBREAK'};
1600 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}]
1601 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1602
1603 # Do not break from extenders.
1604 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
289ce9cc
KW
1605 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1606 = $wb_actions{'WB_NOBREAK'};
1607 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1608 = $wb_actions{'WB_NOBREAK'};
1609 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1610 = $wb_actions{'WB_NOBREAK'};
1611 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1612 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1613
1614 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1615 # × # ExtendNumLet
289ce9cc
KW
1616 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1617 = $wb_actions{'WB_NOBREAK'};
1618 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1619 = $wb_actions{'WB_NOBREAK'};
1620 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1621 = $wb_actions{'WB_NOBREAK'};
1622 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1623 = $wb_actions{'WB_NOBREAK'};
1624 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1625 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1626
1627 # Do not break between Katakana.
1628 # WB13 Katakana × Katakana
289ce9cc
KW
1629 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1630 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1631
1632 # Do not break within sequences, such as “3.2” or “3,456.789”.
1633 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
289ce9cc 1634 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
7e54b87f 1635 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1636 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
7e54b87f 1637 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1638 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1639 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1640
1641 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
289ce9cc 1642 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
7e54b87f 1643 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1644 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
7e54b87f 1645 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1646 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
7e54b87f
KW
1647 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1648
1649 # Do not break within sequences of digits, or digits adjacent to letters
1650 # (“3a”, or “A3”).
1651 # WB10 Numeric × (ALetter | Hebrew_Letter)
289ce9cc
KW
1652 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1653 = $wb_actions{'WB_NOBREAK'};
1654 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1655 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1656
1657 # WB9 (ALetter | Hebrew_Letter) × Numeric
289ce9cc
KW
1658 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1659 = $wb_actions{'WB_NOBREAK'};
1660 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1661 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1662
1663 # WB8 Numeric × Numeric
289ce9cc
KW
1664 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1665 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1666
1667 # Do not break letters across certain punctuation.
1668 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
289ce9cc
KW
1669 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1670 += $wb_actions{'WB_DQ_then_HL'};
7e54b87f
KW
1671
1672 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
289ce9cc
KW
1673 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1674 += $wb_actions{'WB_HL_then_DQ'};
7e54b87f
KW
1675
1676 # WB7a Hebrew_Letter × Single_Quote
289ce9cc
KW
1677 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1678 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1679
1680 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1681 # × (ALetter | Hebrew_Letter)
289ce9cc 1682 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
7e54b87f 1683 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1684 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1685 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1686 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
7e54b87f 1687 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1688 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1689 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1690 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
7e54b87f 1691 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1692 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f
KW
1693 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1694
1695 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1696 # | Single_Quote) (ALetter | Hebrew_Letter)
289ce9cc 1697 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1698 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1699 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1700 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1701 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
7e54b87f 1702 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1703 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
7e54b87f 1704 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1705 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
7e54b87f 1706 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1707 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1708 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1709
1710 # Do not break between most letters.
1711 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
289ce9cc
KW
1712 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1713 = $wb_actions{'WB_NOBREAK'};
1714 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1715 = $wb_actions{'WB_NOBREAK'};
1716 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1717 = $wb_actions{'WB_NOBREAK'};
1718 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1719 = $wb_actions{'WB_NOBREAK'};
7e54b87f 1720
b0e24409
KW
1721 # Ignore Format and Extend characters, except after sot, CR, LF, and
1722 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
1723 # WB4 X (Extend | Format | ZWJ)* → X
7e54b87f 1724 for my $i (0 .. @wb_table - 1) {
289ce9cc 1725 $wb_table[$wb_enums{'Extend'}][$i]
b0e24409 1726 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
289ce9cc 1727 $wb_table[$wb_enums{'Format'}][$i]
b0e24409
KW
1728 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1729 $wb_table[$wb_enums{'ZWJ'}][$i]
1730 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1731 }
1732 for my $i (0 .. @wb_table - 1) {
1733 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1734 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1735 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1736 }
1737
1738 # Implied is that these attach to the character before them, except for
1739 # the characters that mark the end of a region of text. The rules below
1740 # override the ones set up here, for all the characters that need
1741 # overriding.
1742 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1743 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1744 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1745 }
1746
b0e24409
KW
1747 # Do not break within emoji zwj sequences.
1748 # WB3c ZWJ × ( Glue_After_Zwj | EBG )
1749 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
1750 = $wb_actions{'WB_NOBREAK'};
1751 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
1752 = $wb_actions{'WB_NOBREAK'};
1753
7e54b87f
KW
1754 # Break before and after white space
1755 # WB3b ÷ (Newline | CR | LF)
1756 # WB3a (Newline | CR | LF) ÷
1757 # et. al.
289ce9cc 1758 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1759 for my $j (0 .. @wb_table - 1) {
1760 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1761 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1762 }
1763 }
1764
1765 # But do not break within white space.
1766 # WB3 CR × LF
1767 # et.al.
289ce9cc
KW
1768 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1769 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1770 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1771 }
1772 }
1773
b0e24409 1774 # And do not break horizontal space followed by Extend or Format or ZWJ
289ce9cc
KW
1775 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1776 = $wb_actions{'WB_NOBREAK'};
1777 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1778 = $wb_actions{'WB_NOBREAK'};
b0e24409
KW
1779 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}]
1780 = $wb_actions{'WB_NOBREAK'};
289ce9cc
KW
1781 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1782 [$wb_enums{'Perl_Tailored_HSpace'}]
1783 = $wb_actions{'WB_hs_then_hs'};
7e54b87f 1784
b0e24409
KW
1785 # Break at the start and end of text, unless the text is empty
1786 # WB2 Any ÷ eot
1787 # WB1 sot ÷ Any
7e54b87f 1788 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1789 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1790 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
7e54b87f 1791 }
289ce9cc 1792 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
7e54b87f 1793
289ce9cc
KW
1794 output_table_common('WB', \%wb_actions,
1795 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
7e54b87f
KW
1796}
1797
9d9177be
KW
1798output_invlist("Latin1", [ 0, 256 ]);
1799output_invlist("AboveLatin1", [ 256 ]);
1800
bffc0129 1801end_file_pound_if;
43b443dd 1802
3f427fd9
KW
1803# We construct lists for all the POSIX and backslash sequence character
1804# classes in two forms:
1805# 1) ones which match only in the ASCII range
1806# 2) ones which match either in the Latin1 range, or the entire Unicode range
1807#
1808# These get compiled in, and hence affect the memory footprint of every Perl
1809# program, even those not using Unicode. To minimize the size, currently
1810# the Latin1 version is generated for the beyond ASCII range except for those
1811# lists that are quite small for the entire range, such as for \s, which is 22
1812# UVs long plus 4 UVs (currently) for the header.
1813#
1814# To save even more memory, the ASCII versions could be derived from the
1815# larger ones at runtime, saving some memory (minus the expense of the machine
1816# instructions to do so), but these are all small anyway, so their total is
1817# about 100 UVs.
1818#
1819# In the list of properties below that get generated, the L1 prefix is a fake
1820# property that means just the Latin1 range of the full property (whose name
1821# has an X prefix instead of L1).
a02047bf
KW
1822#
1823# An initial & means to use the subroutine from this file instead of an
1824# official inversion list.
3f427fd9 1825
0c4ecf42
KW
1826for my $charset (get_supported_code_pages()) {
1827 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1828
99f21fb9 1829 @a2n = @{get_a2n($charset)};
226b74db
KW
1830 # Below is the list of property names to generate. '&' means to use the
1831 # subroutine to generate the inversion list instead of the generic code
1832 # below. Some properties have a comma-separated list after the name,
1833 # These are extra enums to add to those found in the Unicode tables.
99f21fb9
KW
1834 no warnings 'qw';
1835 # Ignore non-alpha in sort
1836 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
c0382778 1837 Assigned
1c8c3428
KW
1838 ASCII
1839 Cased
9abae7a5 1840 Currency_Symbol
1c8c3428
KW
1841 VertSpace
1842 XPerlSpace
1843 XPosixAlnum
1844 XPosixAlpha
1845 XPosixBlank
1846 XPosixCntrl
1847 XPosixDigit
1848 XPosixGraph
1849 XPosixLower
1850 XPosixPrint
1851 XPosixPunct
1852 XPosixSpace
1853 XPosixUpper
1854 XPosixWord
1855 XPosixXDigit
1856 _Perl_Any_Folds
1857 &NonL1_Perl_Non_Final_Folds
1858 _Perl_Folds_To_Multi_Char
1859 &UpperLatin1
1860 _Perl_IDStart
1861 _Perl_IDCont
226b74db
KW
1862 _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
1863 _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
1864 _Perl_SB,SContinue,CR,Extend,LF,EDGE
1865 _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
463b4a67 1866 _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID
1c8c3428 1867 )
226b74db
KW
1868 # NOTE that the convention is that extra enum
1869 # values come after the property name, separated by
1870 # commas, with the enums that aren't ever defined
1871 # by Unicode coming last, at least 4 all-uppercase
1872 # characters. The others are enum names that are
1873 # needed by perl, but aren't in all Unicode
1874 # releases.
0f5e3c71
KW
1875 ) {
1876
1877 # For the Latin1 properties, we change to use the eXtended version of the
1878 # base property, then go through the result and get rid of everything not
1879 # in Latin1 (above 255). Actually, we retain the element for the range
1880 # that crosses the 255/256 boundary if it is one that matches the
1881 # property. For example, in the Word property, there is a range of code
1882 # points that start at U+00F8 and goes through U+02C1. Instead of
1883 # artificially cutting that off at 256 because 256 is the first code point
1884 # above Latin1, we let the range go to its natural ending. That gives us
1885 # extra information with no added space taken. But if the range that
1886 # crosses the boundary is one that doesn't match the property, we don't
1887 # start a new range above 255, as that could be construed as going to
1888 # infinity. For example, the Upper property doesn't include the character
1889 # at 255, but does include the one at 256. We don't include the 256 one.
1890 my $prop_name = $prop;
1891 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
1892 my $extra_enums = "";
1893 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
1894 my $lookup_prop = $prop_name;
1895 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1896 or $lookup_prop =~ s/^L1//);
1897 my $nonl1_only = 0;
1898 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 1899 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
1900
1901 my @invlist;
99f21fb9
KW
1902 my @invmap;
1903 my $map_format;
1904 my $map_default;
1905 my $maps_to_code_point;
1906 my $to_adjust;
0f5e3c71
KW
1907 if ($is_local_sub) {
1908 @invlist = eval $lookup_prop;
289ce9cc 1909 die $@ if $@;
0f5e3c71
KW
1910 }
1911 else {
1912 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 1913 if (! @invlist) {
99f21fb9 1914
ad85f59a
KW
1915 # If couldn't find a non-empty inversion list, see if it is
1916 # instead an inversion map
1917 my ($list_ref, $map_ref, $format, $default)
99f21fb9 1918 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
1919 if (! $list_ref) {
1920 # An empty return here could mean an unknown property, or
1921 # merely that the original inversion list is empty. Call
1922 # in scalar context to differentiate
1923 my $count = prop_invlist($lookup_prop,
1924 '_perl_core_internal_ok');
1925 die "Could not find inversion list for '$lookup_prop'"
1926 unless defined $count;
1927 }
1928 else {
18b852b3
KW
1929 @invlist = @$list_ref;
1930 @invmap = @$map_ref;
1931 $map_format = $format;
1932 $map_default = $default;
1933 $maps_to_code_point = $map_format =~ /x/;
1934 $to_adjust = $map_format =~ /a/;
ad85f59a 1935 }
99f21fb9 1936 }
0f5e3c71 1937 }
ad85f59a
KW
1938
1939
1940 # Short-circuit an empty inversion list.
1941 if (! @invlist) {
1942 output_invlist($prop_name, \@invlist, $charset);
1943 next;
1944 }
ceb1de32 1945
99f21fb9
KW
1946 # Re-order the Unicode code points to native ones for this platform.
1947 # This is only needed for code points below 256, because native code
1948 # points are only in that range. For inversion maps of properties
1949 # where the mappings are adjusted (format =~ /a/), this reordering
1950 # could mess up the adjustment pattern that was in the input, so that
1951 # has to be dealt with.
1952 #
1953 # And inversion maps that map to code points need to eventually have
1954 # all those code points remapped to native, and it's better to do that
1955 # here, going through the whole list not just those below 256. This
1956 # is because some inversion maps have adjustments (format =~ /a/)
1957 # which may be affected by the reordering. This code needs to be done
1958 # both for when we are translating the inversion lists for < 256, and
1959 # for the inversion maps for everything. By doing both in this loop,
1960 # we can share that code.
1961 #
1962 # So, we go through everything for an inversion map to code points;
1963 # otherwise, we can skip any remapping at all if we are going to
1964 # output only the above-Latin1 values, or if the range spans the whole
1965 # of 0..256, as the remap will also include all of 0..256 (256 not
1966 # 255 because a re-ordering could cause 256 to need to be in the same
1967 # range as 255.)
1968 if ((@invmap && $maps_to_code_point)
1969 || (! $nonl1_only || ($invlist[0] < 256
1970 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 1971 {
fb4554ea 1972
99f21fb9 1973 if (! @invmap) { # Straight inversion list
fb4554ea
KW
1974 # Look at all the ranges that start before 257.
1975 my @latin1_list;
1976 while (@invlist) {
1977 last if $invlist[0] > 256;
1978 my $upper = @invlist > 1
1979 ? $invlist[1] - 1 # In range
8a6c81cf
KW
1980
1981 # To infinity. You may want to stop much much
1982 # earlier; going this high may expose perl
1983 # deficiencies with very large numbers.
1984 : $Unicode::UCD::MAX_CP;
fb4554ea 1985 for my $j ($invlist[0] .. $upper) {
99f21fb9 1986 push @latin1_list, a2n($j);
0f5e3c71 1987 }
fb4554ea
KW
1988
1989 shift @invlist; # Shift off the range that's in the list
1990 shift @invlist; # Shift off the range not in the list
0c4ecf42 1991 }
fb4554ea
KW
1992
1993 # Here @invlist contains all the ranges in the original that start
1994 # at code points above 256, and @latin1_list contains all the
1995 # native code points for ranges that start with a Unicode code
1996 # point below 257. We sort the latter and convert it to inversion
1997 # list format. Then simply prepend it to the list of the higher
1998 # code points.
1999 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 2000 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 2001 unshift @invlist, @latin1_list;
99f21fb9
KW
2002 }
2003 else { # Is an inversion map
2004
2005 # This is a similar procedure as plain inversion list, but has
2006 # multiple buckets. A plain inversion list just has two
2007 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
2008 # pretty much can ignore the 2nd bucket, as it is completely
2009 # defined by the 1st. But here, what we do is create buckets
2010 # which contain the code points that map to each, translated
2011 # to native and turned into an inversion list. Thus each
2012 # bucket is an inversion list of native code points that map
2013 # to it or don't map to it. We use these to create an
2014 # inversion map for the whole property.
2015
2016 # As mentioned earlier, we use this procedure to not just
2017 # remap the inversion list to native values, but also the maps
2018 # of code points to native ones. In the latter case we have
2019 # to look at the whole of the inversion map (or at least to
2020 # above Unicode; as the maps of code points above that should
2021 # all be to the default).
2022 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
2023
2024 my %mapped_lists; # A hash whose keys are the buckets.
2025 while (@invlist) {
2026 last if $invlist[0] > $upper_limit;
2027
2028 # This shouldn't actually happen, as prop_invmap() returns
2029 # an extra element at the end that is beyond $upper_limit
2030 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
2031
2032 my $bucket;
2033
2034 # A hash key can't be a ref (we are only expecting arrays
2035 # of scalars here), so convert any such to a string that
2036 # will be converted back later (using a vertical tab as
2037 # the separator). Even if the mapping is to code points,
2038 # we don't translate to native here because the code
d8049362 2039 # output_invmap() calls to output these arrays assumes the
99f21fb9
KW
2040 # input is Unicode, not native.
2041 if (ref $invmap[0]) {
2042 $bucket = join "\cK", @{$invmap[0]};
2043 }
2044 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
2045
2046 # Do convert to native for maps to single code points.
2047 # There are some properties that have a few outlier
2048 # maps that aren't code points, so the above test
2049 # skips those.
2050 $bucket = a2n($invmap[0]);
2051 } else {
2052 $bucket = $invmap[0];
2053 }
2054
2055 # We now have the bucket that all code points in the range
2056 # map to, though possibly they need to be adjusted. Go
2057 # through the range and put each translated code point in
2058 # it into its bucket.
2059 my $base_map = $invmap[0];
2060 for my $j ($invlist[0] .. $invlist[1] - 1) {
2061 if ($to_adjust
2062 # The 1st code point doesn't need adjusting
2063 && $j > $invlist[0]
2064
2065 # Skip any non-numeric maps: these are outliers
2066 # that aren't code points.
2067 && $base_map =~ $numeric_re
2068
2069 # 'ne' because the default can be a string
2070 && $base_map ne $map_default)
2071 {
2072 # We adjust, by incrementing each the bucket and
2073 # the map. For code point maps, translate to
2074 # native
2075 $base_map++;
2076 $bucket = ($maps_to_code_point)
2077 ? a2n($base_map)
2078 : $base_map;
2079 }
2080
2081 # Add the native code point to the bucket for the
2082 # current map
2083 push @{$mapped_lists{$bucket}}, a2n($j);
2084 } # End of loop through all code points in the range
2085
2086 # Get ready for the next range
2087 shift @invlist;
2088 shift @invmap;
2089 } # End of loop through all ranges in the map.
2090
2091 # Here, @invlist and @invmap retain all the ranges from the
2092 # originals that start with code points above $upper_limit.
2093 # Each bucket in %mapped_lists contains all the code points
2094 # that map to that bucket. If the bucket is for a map to a
2095 # single code point is a single code point, the bucket has
2096 # been converted to native. If something else (including
2097 # multiple code points), no conversion is done.
2098 #
2099 # Now we recreate the inversion map into %xlated, but this
2100 # time for the native character set.
2101 my %xlated;
2102 foreach my $bucket (keys %mapped_lists) {
2103
2104 # Sort and convert this bucket to an inversion list. The
2105 # result will be that ranges that start with even-numbered
2106 # indexes will be for code points that map to this bucket;
2107 # odd ones map to some other bucket, and are discarded
2108 # below.
2109 @{$mapped_lists{$bucket}}
2110 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
2111 @{$mapped_lists{$bucket}}
2112 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
2113
2114 # Add each even-numbered range in the bucket to %xlated;
2115 # so that the keys of %xlated become the range start code
2116 # points, and the values are their corresponding maps.
2117 while (@{$mapped_lists{$bucket}}) {
2118 my $range_start = $mapped_lists{$bucket}->[0];
2119 if ($bucket =~ /\cK/) {
2120 @{$xlated{$range_start}} = split /\cK/, $bucket;
2121 }
2122 else {
2123 $xlated{$range_start} = $bucket;
2124 }
2125 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
2126 shift @{$mapped_lists{$bucket}}; # Get ready for next
2127 # iteration
2128 }
2129 } # End of loop through all the buckets.
2130
2131 # Here %xlated's keys are the range starts of all the code
2132 # points in the inversion map. Construct an inversion list
2133 # from them.
2134 my @new_invlist = sort { $a <=> $b } keys %xlated;
2135
2136 # If the list is adjusted, we want to munge this list so that
2137 # we only have one entry for where consecutive code points map
2138 # to consecutive values. We just skip the subsequent entries
2139 # where this is the case.
2140 if ($to_adjust) {
2141 my @temp;
2142 for my $i (0 .. @new_invlist - 1) {
2143 next if $i > 0
2144 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
2145 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
2146 && $xlated{$new_invlist[$i]} =~ $numeric_re
2147 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
2148 push @temp, $new_invlist[$i];
2149 }
2150 @new_invlist = @temp;
2151 }
2152
2153 # The inversion map comes from %xlated's values. We can
2154 # unshift each onto the front of the untouched portion, in
2155 # reverse order of the portion we did process.
2156 foreach my $start (reverse @new_invlist) {
2157 unshift @invmap, $xlated{$start};
2158 }
2159
2160 # Finally prepend the inversion list we have just constructed to the
2161 # one that contains anything we didn't process.
2162 unshift @invlist, @new_invlist;
2163 }
2164 }
2165
2166 # prop_invmap() returns an extra final entry, which we can now
2167 # discard.
2168 if (@invmap) {
2169 pop @invlist;
2170 pop @invmap;
ceb1de32 2171 }
0f5e3c71
KW
2172
2173 if ($l1_only) {
99f21fb9 2174 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
2175 for my $i (0 .. @invlist - 1 - 1) {
2176 if ($invlist[$i] > 255) {
2177
2178 # In an inversion list, even-numbered elements give the code
2179 # points that begin ranges that match the property;
2180 # odd-numbered give ones that begin ranges that don't match.
2181 # If $i is odd, we are at the first code point above 255 that
2182 # doesn't match, which means the range it is ending does
2183 # match, and crosses the 255/256 boundary. We want to include
2184 # this ending point, so increment $i, so the splice below
2185 # includes it. Conversely, if $i is even, it is the first
2186 # code point above 255 that matches, which means there was no
2187 # matching range that crossed the boundary, and we don't want
2188 # to include this code point, so splice before it.
2189 $i++ if $i % 2 != 0;
2190
2191 # Remove everything past this.
2192 splice @invlist, $i;
99f21fb9 2193 splice @invmap, $i if @invmap;
0f5e3c71
KW
2194 last;
2195 }
0c4ecf42
KW
2196 }
2197 }
0f5e3c71
KW
2198 elsif ($nonl1_only) {
2199 my $found_nonl1 = 0;
2200 for my $i (0 .. @invlist - 1 - 1) {
2201 next if $invlist[$i] < 256;
2202
2203 # Here, we have the first element in the array that indicates an
2204 # element above Latin1. Get rid of all previous ones.
2205 splice @invlist, 0, $i;
99f21fb9 2206 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
2207
2208 # If this one's index is not divisible by 2, it means that this
2209 # element is inverting away from being in the list, which means
99f21fb9
KW
2210 # all code points from 256 to this one are in this list (or
2211 # map to the default for inversion maps)
2212 if ($i % 2 != 0) {
2213 unshift @invlist, 256;
2214 unshift @invmap, $map_default if @invmap;
2215 }
0f5e3c71 2216 $found_nonl1 = 1;
3f427fd9
KW
2217 last;
2218 }
0f5e3c71 2219 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 2220 }
3f427fd9 2221
0f5e3c71 2222 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 2223 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 2224 }
bffc0129 2225 end_file_pound_if;
0c4ecf42 2226 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
2227}
2228
973a28ed
KW
2229switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
2230
2231output_GCB_table();
6b659339 2232output_LB_table();
7e54b87f 2233output_WB_table();
6b659339 2234
973a28ed
KW
2235end_file_pound_if;
2236
2308ab83 2237my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
2238my @sources = ($0, qw(lib/unicore/mktables
2239 lib/Unicode/UCD.pm
2240 regen/charset_translations.pl
2241 ));
9a3da3ad
FC
2242{
2243 # Depend on mktables’ own sources. It’s a shorter list of files than
2244 # those that Unicode::UCD uses.
1ae6ead9 2245 if (! open my $mktables_list, '<', $sources_list) {
2308ab83
KW
2246
2247 # This should force a rebuild once $sources_list exists
2248 push @sources, $sources_list;
2249 }
2250 else {
2251 while(<$mktables_list>) {
2252 last if /===/;
2253 chomp;
2254 push @sources, "lib/unicore/$_" if /^[^#]/;
2255 }
9a3da3ad
FC
2256 }
2257}
6b659339
KW
2258
2259read_only_bottom_close_and_rename($out_fh, \@sources);