This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Use correct structure to look up data
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
9d9177be
KW
43print $out_fh "/* See the generating file for comments */\n\n";
44
bffc0129
KW
45# The symbols generated by this program are all currently defined only in a
46# single dot c each. The code knows where most of them go, but this hash
47# gives overrides for the exceptions to the typical place
48my %exceptions_to_where_to_define =
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
50 AboveLatin1 => 'PERL_IN_REGCOMP_C',
51 Latin1 => 'PERL_IN_REGCOMP_C',
52 UpperLatin1 => 'PERL_IN_REGCOMP_C',
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
55 _Perl_IDCont => 'PERL_IN_UTF8_C',
56 _Perl_IDStart => 'PERL_IN_UTF8_C',
57 );
015bb97c 58
f79a09fc
KW
59# This hash contains the properties with enums that have hard-coded references
60# to them in C code. Its only use is to make sure that if perl is compiled
61# with an older Unicode data set, that all the enum values the code is
62# expecting will still be in the enum typedef. Thus the code doesn't have to
63# change. The Unicode version won't have any code points that have these enum
64# values, so the code that handles them will not get exercised. This is far
65# better than having to #ifdef things.
66my %hard_coded_enums =
67 ( gcb => [
68 'Control',
69 'CR',
70 'Extend',
71 'L',
72 'LF',
73 'LV',
74 'LVT',
75 'Other',
76 'Prepend',
77 'Regional_Indicator',
78 'SpacingMark',
79 'T',
80 'V',
81 ],
ca8226cf
KW
82 lb => [
83 'Alphabetic',
84 'Break_After',
85 'Break_Before',
86 'Break_Both',
87 'Break_Symbols',
88 'Carriage_Return',
89 'Close_Parenthesis',
90 'Close_Punctuation',
91 'Combining_Mark',
92 'Contingent_Break',
93 'Exclamation',
94 'Glue',
95 'H2',
96 'H3',
97 'Hebrew_Letter',
98 'Hyphen',
99 'Ideographic',
100 'Infix_Numeric',
101 'Inseparable',
102 'JL',
103 'JT',
104 'JV',
105 'Line_Feed',
106 'Mandatory_Break',
107 'Next_Line',
108 'Nonstarter',
109 'Numeric',
110 'Open_Punctuation',
111 'Postfix_Numeric',
112 'Prefix_Numeric',
113 'Quotation',
114 'Regional_Indicator',
115 'Space',
116 'Word_Joiner',
117 'ZWSpace',
118 ],
f79a09fc
KW
119 sb => [
120 'ATerm',
121 'Close',
122 'CR',
123 'Extend',
124 'Format',
125 'LF',
126 'Lower',
127 'Numeric',
128 'OLetter',
129 'Other',
130 'SContinue',
131 'Sep',
132 'Sp',
133 'STerm',
134 'Upper',
135 ],
136 wb => [
137 'ALetter',
138 'CR',
139 'Double_Quote',
140 'Extend',
141 'ExtendNumLet',
142 'Format',
143 'Hebrew_Letter',
144 'Katakana',
145 'LF',
146 'MidLetter',
147 'MidNum',
148 'MidNumLet',
149 'Newline',
150 'Numeric',
151 'Other',
f1f6961f 152 'Perl_Tailored_HSpace',
f79a09fc
KW
153 'Regional_Indicator',
154 'Single_Quote',
f79a09fc
KW
155 ],
156);
157
973a28ed
KW
158my %gcb_enums;
159my @gcb_short_enums;
6b659339
KW
160my %lb_enums;
161my @lb_short_enums;
7e54b87f
KW
162my %wb_enums;
163my @wb_short_enums;
6b659339 164
99f21fb9
KW
165my @a2n;
166
167sub uniques {
168 # Returns non-duplicated input values. From "Perl Best Practices:
169 # Encapsulated Cleverness". p. 455 in first edition.
170
171 my %seen;
172 return grep { ! $seen{$_}++ } @_;
173}
174
175sub a2n($) {
176 my $cp = shift;
177
178 # Returns the input Unicode code point translated to native.
179
180 return $cp if $cp !~ $numeric_re || $cp > 255;
181 return $a2n[$cp];
182}
183
bffc0129
KW
184sub end_file_pound_if {
185 if ($in_file_pound_if) {
186 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
187 $in_file_pound_if = 0;
188 }
189}
190
191sub switch_pound_if ($$) {
192 my $name = shift;
193 my $new_pound_if = shift;
194
195 # Switch to new #if given by the 2nd argument. If there is an override
196 # for this, it instead switches to that. The 1st argument is the
197 # static's name, used to look up the overrides
198
199 if (exists $exceptions_to_where_to_define{$name}) {
200 $new_pound_if = $exceptions_to_where_to_define{$name};
201 }
202
203 # Exit current #if if the new one is different from the old
204 if ($in_file_pound_if
205 && $in_file_pound_if !~ /$new_pound_if/)
206 {
207 end_file_pound_if;
208 }
209
210 # Enter new #if, if not already in it.
211 if (! $in_file_pound_if) {
212 $in_file_pound_if = "defined($new_pound_if)";
213 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
214 }
215}
216
0c4ecf42 217sub output_invlist ($$;$) {
9d9177be
KW
218 my $name = shift;
219 my $invlist = shift; # Reference to inversion list array
0c4ecf42 220 my $charset = shift // ""; # name of character set for comment
9d9177be 221
76d3994c 222 die "No inversion list for $name" unless defined $invlist
ad85f59a 223 && ref $invlist eq 'ARRAY';
76d3994c 224
9d9177be
KW
225 # Output the inversion list $invlist using the name $name for it.
226 # It is output in the exact internal form for inversion lists.
227
a0316a6c
KW
228 # Is the last element of the header 0, or 1 ?
229 my $zero_or_one = 0;
ad85f59a 230 if (@$invlist && $invlist->[0] != 0) {
a0316a6c 231 unshift @$invlist, 0;
9d9177be
KW
232 $zero_or_one = 1;
233 }
0a07b44b 234 my $count = @$invlist;
9d9177be 235
bffc0129 236 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 237
0c4ecf42
KW
238 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
239 print $out_fh " /* for $charset */" if $charset;
240 print $out_fh "\n";
9d9177be 241
a0316a6c 242 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
243 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
244 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
245 ",\t/* 0 if the list starts at 0;",
246 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
247
248 # The main body are the UVs passed in to this routine. Do the final
249 # element separately
47d53124
KW
250 for my $i (0 .. @$invlist - 1) {
251 printf $out_fh "\t0x%X", $invlist->[$i];
252 print $out_fh "," if $i < @$invlist - 1;
253 print $out_fh "\n";
9d9177be
KW
254 }
255
9d9177be
KW
256 print $out_fh "};\n";
257}
258
99f21fb9
KW
259sub output_invmap ($$$$$$$) {
260 my $name = shift;
261 my $invmap = shift; # Reference to inversion map array
262 my $prop_name = shift;
263 my $input_format = shift; # The inversion map's format
264 my $default = shift; # The property value for code points who
265 # otherwise don't have a value specified.
266 my $extra_enums = shift; # comma-separated list of our additions to the
267 # property's standard possible values
268 my $charset = shift // ""; # name of character set for comment
269
270 # Output the inversion map $invmap for property $prop_name, but use $name
271 # as the actual data structure's name.
272
273 my $count = @$invmap;
274
275 my $output_format;
276 my $declaration_type;
277 my %enums;
278 my $name_prefix;
279
280 if ($input_format eq 's') {
b83e6484 281 my $orig_prop_name = $prop_name;
02f811dd
KW
282 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
283 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
b83e6484
KW
284 my @enums;
285 if ($orig_prop_name eq $prop_name) {
286 @enums = prop_values($prop_name);
287 }
288 else {
289 @enums = uniques(@$invmap);
290 }
99f21fb9
KW
291 if (! @enums) {
292 die "Only enum properties are currently handled; '$prop_name' isn't one";
293 }
294 else {
295
f79a09fc 296 # Convert short names to long
99f21fb9 297 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
f79a09fc
KW
298
299 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
300 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version'
301 if @expected_enums < @enums;
302
303 # Remove the enums found in the input from the ones we expect
304 for (my $i = @expected_enums - 1; $i >= 0; $i--) {
305 splice(@expected_enums, $i, 1)
306 if grep { $expected_enums[$i] eq $_ } @enums;
307 }
308
309 # The ones remaining must be because we're using an older
310 # Unicode version. Add them to the list.
311 push @enums, @expected_enums;
312
313 # Add in the extra values coded into this program, and sort.
99f21fb9
KW
314 @enums = sort @enums;
315
6dc80864
KW
316 # The internal enums comes last.
317 push @enums, split /,/, $extra_enums if $extra_enums ne "";
318
99f21fb9
KW
319 # Assign a value to each element of the enum. The default
320 # value always gets 0; the others are arbitrarily assigned.
321 my $enum_val = 0;
02f811dd
KW
322 my $canonical_default = prop_value_aliases($prop_name, $default);
323 $default = $canonical_default if defined $canonical_default;
99f21fb9
KW
324 $enums{$default} = $enum_val++;
325 for my $enum (@enums) {
326 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
327 }
6b659339 328
973a28ed
KW
329 # Calculate the enum values for properties _Perl_GCB and
330 # _Perl_LB because we output special tables for them
331 if ($name eq '_Perl_GCB' && ! %gcb_enums) {
332 while (my ($enum, $value) = each %enums) {
333 my ($short) = prop_value_aliases('GCB', $enum);
334 $short = lc $enum unless defined $short;
335 $gcb_enums{$short} = $value;
336 @gcb_short_enums[$value] = $short;
337 }
338 }
339 elsif ($name eq '_Perl_LB' && ! %lb_enums) {
6b659339
KW
340 while (my ($enum, $value) = each %enums) {
341 my ($short) = prop_value_aliases('LB', $enum);
342 $short = substr(lc $enum, 0, 2) unless defined $short;
343 $lb_enums{$short} = $value;
344 @lb_short_enums[$value] = $short;
345 }
346 }
7e54b87f
KW
347 elsif ($name eq '_Perl_WB' && ! %wb_enums) {
348 while (my ($enum, $value) = each %enums) {
349 my ($short) = prop_value_aliases('WB', $enum);
350 $short = lc $enum unless defined $short;
351 $short = substr($short, 0, 2);
352
353 # Special case a better name than the kludgy one
354 $short = 'hs' if $short eq 'pe';
355
356 $wb_enums{$short} = $value;
357 @wb_short_enums[$value] = $short;
358 }
359 }
99f21fb9
KW
360 }
361
bffc0129
KW
362 # Inversion map stuff is currently used only by regexec
363 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
99f21fb9
KW
364 {
365
99f21fb9
KW
366 # The short names tend to be two lower case letters, but it looks
367 # better for those if they are upper. XXX
368 $short_name = uc($short_name) if length($short_name) < 3
369 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
85e5f08b 370 $name_prefix = "${short_name}_";
99f21fb9
KW
371 my $enum_count = keys %enums;
372 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
373
374 print $out_fh "\ntypedef enum {\n";
6dc80864
KW
375 my @enum_list;
376 foreach my $enum (keys %enums) {
377 $enum_list[$enums{$enum}] = $enum;
378 }
379 foreach my $i (0 .. @enum_list - 1) {
380 my $name = $enum_list[$i];
381 print $out_fh "\t${name_prefix}$name = $i";
382 print $out_fh "," if $i < $enum_count - 1;
383 print $out_fh "\n";
99f21fb9
KW
384 }
385 $declaration_type = "${name_prefix}enum";
386 print $out_fh "} $declaration_type;\n";
387
388 $output_format = "${name_prefix}%s";
389 }
390 }
391 else {
392 die "'$input_format' invmap() format for '$prop_name' unimplemented";
393 }
394
395 die "No inversion map for $prop_name" unless defined $invmap
396 && ref $invmap eq 'ARRAY'
397 && $count;
398
399 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
400 print $out_fh " /* for $charset */" if $charset;
401 print $out_fh "\n";
402
403 # The main body are the scalars passed in to this routine.
404 for my $i (0 .. $count - 1) {
405 my $element = $invmap->[$i];
02f811dd
KW
406 my $full_element_name = prop_value_aliases($prop_name, $element);
407 $element = $full_element_name if defined $full_element_name;
408 $element = $name_prefix . $element;
99f21fb9
KW
409 print $out_fh "\t$element";
410 print $out_fh "," if $i < $count - 1;
411 print $out_fh "\n";
412 }
413 print $out_fh "};\n";
99f21fb9
KW
414}
415
5a7e5385 416sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
417
418 # Returns an inversion list constructed from the sorted input array of
419 # code points
420
421 my $list_ref = shift;
422
99f21fb9
KW
423 return unless @$list_ref;
424
a02047bf
KW
425 # Initialize to just the first element
426 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
427
428 # For each succeeding element, if it extends the previous range, adjust
429 # up, otherwise add it.
430 for my $i (1 .. @$list_ref - 1) {
431 if ($invlist[-1] == $list_ref->[$i]) {
432 $invlist[-1]++;
433 }
434 else {
435 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
436 }
437 }
438 return @invlist;
439}
440
441# Read in the Case Folding rules, and construct arrays of code points for the
442# properties we need.
443my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
444die "Could not find inversion map for Case_Folding" unless defined $format;
445die "Incorrect format '$format' for Case_Folding inversion map"
347b9066
KW
446 unless $format eq 'al'
447 || $format eq 'a';
a02047bf
KW
448my @has_multi_char_fold;
449my @is_non_final_fold;
450
451for my $i (0 .. @$folds_ref - 1) {
452 next unless ref $folds_ref->[$i]; # Skip single-char folds
453 push @has_multi_char_fold, $cp_ref->[$i];
454
b6a6e956 455 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
456 # position
457 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
458 push @is_non_final_fold, $folds_ref->[$i][$j]
459 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
460 }
461}
462
a02047bf
KW
463sub _Perl_Non_Final_Folds {
464 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 465 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
466}
467
99f21fb9
KW
468sub prop_name_for_cmp ($) { # Sort helper
469 my $name = shift;
470
471 # Returns the input lowercased, with non-alphas removed, as well as
472 # everything starting with a comma
473
474 $name =~ s/,.*//;
475 $name =~ s/[[:^alpha:]]//g;
476 return lc $name;
477}
478
892d8259 479sub UpperLatin1 {
5a7e5385 480 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
481}
482
973a28ed
KW
483sub output_GCB_table() {
484
485 # Create and output the pair table for use in determining Grapheme Cluster
486 # Breaks, given in http://www.unicode.org/reports/tr29/.
487
488 # The table is constructed in reverse order of the rules, to make the
489 # lower-numbered, higher priority ones override the later ones, as the
490 # algorithm stops at the earliest matching rule
491
492 my @gcb_table;
493 my $table_size = @gcb_short_enums;
494
495 # Otherwise, break everywhere.
496 # GB10 Any ÷ Any
497 for my $i (0 .. $table_size - 1) {
498 for my $j (0 .. $table_size - 1) {
499 $gcb_table[$i][$j] = 1;
500 }
501 }
502
503 # Do not break before extending characters.
504 # Do not break before SpacingMarks, or after Prepend characters.
505 # GB9 × Extend
506 # GB9a × SpacingMark
507 # GB9b Prepend ×
508 for my $i (0 .. @gcb_table - 1) {
509 $gcb_table[$i][$gcb_enums{'EX'}] = 0;
510 $gcb_table[$i][$gcb_enums{'SM'}] = 0;
511 $gcb_table[$gcb_enums{'PP'}][$i] = 0;
512 }
513
514 # Do not break between regional indicator symbols.
515 # GB8a Regional_Indicator × Regional_Indicator
516 $gcb_table[$gcb_enums{'RI'}][$gcb_enums{'RI'}] = 0;
517
518 # Do not break Hangul syllable sequences.
519 # GB8 ( LVT | T) × T
520 $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
521 $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
522
523 # GB7 ( LV | V ) × ( V | T )
524 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
525 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
526 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
527 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
528
529 # GB6 L × ( L | V | LV | LVT )
530 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
531 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
532 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
533 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
534
535 # Do not break between a CR and LF. Otherwise, break before and after controls.
536 # GB5 ÷ ( Control | CR | LF )
537 # GB4 ( Control | CR | LF ) ÷
538 for my $i (0 .. @gcb_table - 1) {
539 $gcb_table[$i][$gcb_enums{'CN'}] = 1;
540 $gcb_table[$i][$gcb_enums{'CR'}] = 1;
541 $gcb_table[$i][$gcb_enums{'LF'}] = 1;
542 $gcb_table[$gcb_enums{'CN'}][$i] = 1;
543 $gcb_table[$gcb_enums{'CR'}][$i] = 1;
544 $gcb_table[$gcb_enums{'LF'}][$i] = 1;
545 }
546
547 # GB3 CR × LF
548 $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
549
550 # Break at the start and end of text.
551 # GB1 sot ÷
552 # GB2 ÷ eot
553 for my $i (0 .. @gcb_table - 1) {
554 $gcb_table[$i][$gcb_enums{'edge'}] = 1;
555 $gcb_table[$gcb_enums{'edge'}][$i] = 1;
556 }
557
558 # But, unspecified by Unicode, we shouldn't break on an empty string.
559 $gcb_table[$gcb_enums{'edge'}][$gcb_enums{'edge'}] = 0;
560
561 print $out_fh "\nstatic const bool GCB_table[$table_size][$table_size] = {\n";
78e3ac85 562 print $out_fh "/* ";
973a28ed
KW
563 for my $i (0 .. @gcb_table - 1) {
564 printf $out_fh "%5s", $gcb_short_enums[$i];
565 }
78e3ac85 566 print $out_fh "*/\n";
973a28ed
KW
567
568 for my $i (0 .. @gcb_table - 1) {
78e3ac85
AC
569 printf $out_fh "/*%4s*/ ", $gcb_short_enums[$i];
570 print $out_fh "{";
571 print $out_fh join ", ", map sprintf("%3d", $_), @{ $gcb_table[$i] };
572 print $out_fh "}";
573 print $out_fh "," if $i < @gcb_table - 1;
973a28ed
KW
574 print $out_fh "\n";
575 }
576
577 print $out_fh "};\n";
578}
579
6b659339
KW
580sub output_LB_table() {
581
582 # Create and output the enums, #defines, and pair table for use in
583 # determining Line Breaks. This uses the default line break algorithm,
584 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
585 # in that page, as the Unicode-furnished tests assume that tailoring.
586
6b659339
KW
587 # The result is really just true or false. But we follow along with tr14,
588 # creating a rule which is false for something like X SP* X. That gets
589 # encoding 2. The rest of the actions are synthetic ones that indicate
590 # some context handling is required. These each are added to the
591 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
592 # value can be retrieved. Actually only rules from 7 through 18 (which
593 # are the ones where space matter) are possible to have 2 added to them.
594 # The others below add just 0 or 1. It might be possible for one
595 # synthetic rule to be added to another, yielding a larger value. This
596 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
597 # names of the middle grouping below, it is impossible for that to occur
598 # for them because they all start with mutually exclusive classes. That
599 # the final rule can't be added to any of the others isn't obvious from
600 # its name, so it is assigned a power of 2 higher than the others can get
601 # to so any addition would preserve all data. (And the code will reach an
602 # assert(0) on debugging builds should this happen.)
603 my %lb_actions = (
604 LB_NOBREAK => 0,
605 LB_BREAKABLE => 1,
606 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
607
608 LB_CM_foo => 3, # Rule 9
609 LB_SP_foo => 6, # Rule 18
610 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
611 LB_SY_or_IS_then_various => 11, # Rule 25
612 LB_HY_or_BA_then_foo => 13, # Rule 21
613
614 LB_various_then_PO_or_PR => (1<<4), # Rule 25
615 );
616
617 # Output the #define list, sorted by numeric value
618 my @defines;
619 while (my ($enum, $value) = each %lb_actions) {
620 $defines[$value] = $enum;
621 }
622
623 print $out_fh "\n";
624
625 foreach my $i (0 .. @defines - 1) {
626 next unless defined $defines[$i];
627 print $out_fh "#define $defines[$i]\t$i\n";
628 }
629
630 # Construct the LB pair table. This is based on the rules in
631 # http://www.unicode.org/reports/tr14/, but modified as those rules are
632 # designed for someone taking a string of text and sequentially going
633 # through it to find the break opportunities, whereas, Perl requires
634 # determining if a given random spot is a break opportunity, without
635 # knowing all the entire string before it.
636 #
637 # The table is constructed in reverse order of the rules, to make the
638 # lower-numbered, higher priority ones override the later ones, as the
639 # algorithm stops at the earliest matching rule
640
641 my @lb_table;
642 my $table_size = @lb_short_enums;
643
644 # LB31. Break everywhere else
645 for my $i (0 .. $table_size - 1) {
646 for my $j (0 .. $table_size - 1) {
647 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
648 }
649 }
650
651 # LB30a. Don't break between Regional Indicators
652 $lb_table[$lb_enums{'RI'}][$lb_enums{'RI'}] = $lb_actions{'LB_NOBREAK'};
653
654 # LB30 Do not break between letters, numbers, or ordinary symbols and
655 # opening or closing parentheses.
656 # (AL | HL | NU) × OP
657 $lb_table[$lb_enums{'AL'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'};
658 $lb_table[$lb_enums{'HL'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'};
659 $lb_table[$lb_enums{'NU'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'};
660
661 # CP × (AL | HL | NU)
662 $lb_table[$lb_enums{'CP'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
663 $lb_table[$lb_enums{'CP'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
664 $lb_table[$lb_enums{'CP'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
665
666 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
667 # IS × (AL | HL)
668 $lb_table[$lb_enums{'IS'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
669 $lb_table[$lb_enums{'IS'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
670
671 # LB28 Do not break between alphabetics (“at”).
672 # (AL | HL) × (AL | HL)
673 $lb_table[$lb_enums{'AL'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
674 $lb_table[$lb_enums{'HL'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
675 $lb_table[$lb_enums{'AL'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
676 $lb_table[$lb_enums{'HL'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
677
678 # LB27 Treat a Korean Syllable Block the same as ID.
679 # (JL | JV | JT | H2 | H3) × IN
680 $lb_table[$lb_enums{'JL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
681 $lb_table[$lb_enums{'JV'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
682 $lb_table[$lb_enums{'JT'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
683 $lb_table[$lb_enums{'H2'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
684 $lb_table[$lb_enums{'H3'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
685
686 # (JL | JV | JT | H2 | H3) × PO
687 $lb_table[$lb_enums{'JL'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
688 $lb_table[$lb_enums{'JV'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
689 $lb_table[$lb_enums{'JT'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
690 $lb_table[$lb_enums{'H2'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
691 $lb_table[$lb_enums{'H3'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
692
693 # PR × (JL | JV | JT | H2 | H3)
694 $lb_table[$lb_enums{'PR'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
695 $lb_table[$lb_enums{'PR'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
696 $lb_table[$lb_enums{'PR'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
697 $lb_table[$lb_enums{'PR'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
698 $lb_table[$lb_enums{'PR'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
699
700 # LB26 Do not break a Korean syllable.
701 # JL × (JL | JV | H2 | H3)
702 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
703 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
704 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
705 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
706
707 # (JV | H2) × (JV | JT)
708 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
709 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
710 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
711 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
712
713 # (JT | H3) × JT
714 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
715 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
716
717 # LB25 Do not break between the following pairs of classes relevant to
718 # numbers, as tailored by example 7 in
719 # http://www.unicode.org/reports/tr14/#Examples
720 # We follow that tailoring because Unicode's test cases expect it
721 # (PR | PO) × ( OP | HY )? NU
722 $lb_table[$lb_enums{'PR'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
723 $lb_table[$lb_enums{'PO'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
724
725 # Given that (OP | HY )? is optional, we have to test for it in code.
726 # We add in the action (instead of overriding) for this, so that in
727 # the code we can recover the underlying break value.
728 $lb_table[$lb_enums{'PR'}][$lb_enums{'OP'}]
729 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
730 $lb_table[$lb_enums{'PO'}][$lb_enums{'OP'}]
731 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
732 $lb_table[$lb_enums{'PR'}][$lb_enums{'HY'}]
733 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
734 $lb_table[$lb_enums{'PO'}][$lb_enums{'HY'}]
735 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
736
737 # ( OP | HY ) × NU
738 $lb_table[$lb_enums{'OP'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
739 $lb_table[$lb_enums{'HY'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
740
741 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
742 # which can be rewritten as:
743 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
744 $lb_table[$lb_enums{'NU'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
745 $lb_table[$lb_enums{'NU'}][$lb_enums{'SY'}] = $lb_actions{'LB_NOBREAK'};
746 $lb_table[$lb_enums{'NU'}][$lb_enums{'IS'}] = $lb_actions{'LB_NOBREAK'};
747 $lb_table[$lb_enums{'NU'}][$lb_enums{'CL'}] = $lb_actions{'LB_NOBREAK'};
748 $lb_table[$lb_enums{'NU'}][$lb_enums{'CP'}] = $lb_actions{'LB_NOBREAK'};
749
750 # Like earlier where we have to test in code, we add in the action so
751 # that we can recover the underlying values. This is done in rules
752 # below, as well. The code assumes that we haven't added 2 actions.
753 # Shoul a later Unicode release break that assumption, then tests
754 # should start failing.
755 $lb_table[$lb_enums{'SY'}][$lb_enums{'NU'}]
756 += $lb_actions{'LB_SY_or_IS_then_various'};
757 $lb_table[$lb_enums{'SY'}][$lb_enums{'SY'}]
758 += $lb_actions{'LB_SY_or_IS_then_various'};
759 $lb_table[$lb_enums{'SY'}][$lb_enums{'IS'}]
760 += $lb_actions{'LB_SY_or_IS_then_various'};
761 $lb_table[$lb_enums{'SY'}][$lb_enums{'CL'}]
762 += $lb_actions{'LB_SY_or_IS_then_various'};
763 $lb_table[$lb_enums{'SY'}][$lb_enums{'CP'}]
764 += $lb_actions{'LB_SY_or_IS_then_various'};
765 $lb_table[$lb_enums{'IS'}][$lb_enums{'NU'}]
766 += $lb_actions{'LB_SY_or_IS_then_various'};
767 $lb_table[$lb_enums{'IS'}][$lb_enums{'SY'}]
768 += $lb_actions{'LB_SY_or_IS_then_various'};
769 $lb_table[$lb_enums{'IS'}][$lb_enums{'IS'}]
770 += $lb_actions{'LB_SY_or_IS_then_various'};
771 $lb_table[$lb_enums{'IS'}][$lb_enums{'CL'}]
772 += $lb_actions{'LB_SY_or_IS_then_various'};
773 $lb_table[$lb_enums{'IS'}][$lb_enums{'CP'}]
774 += $lb_actions{'LB_SY_or_IS_then_various'};
775
776 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
777 # which can be rewritten as:
778 # NU (SY | IS)* (CL | CP)? × (PO | PR)
779 $lb_table[$lb_enums{'NU'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
780 $lb_table[$lb_enums{'NU'}][$lb_enums{'PR'}] = $lb_actions{'LB_NOBREAK'};
781
782 $lb_table[$lb_enums{'CP'}][$lb_enums{'PO'}]
783 += $lb_actions{'LB_various_then_PO_or_PR'};
784 $lb_table[$lb_enums{'CL'}][$lb_enums{'PO'}]
785 += $lb_actions{'LB_various_then_PO_or_PR'};
786 $lb_table[$lb_enums{'IS'}][$lb_enums{'PO'}]
787 += $lb_actions{'LB_various_then_PO_or_PR'};
788 $lb_table[$lb_enums{'SY'}][$lb_enums{'PO'}]
789 += $lb_actions{'LB_various_then_PO_or_PR'};
790
791 $lb_table[$lb_enums{'CP'}][$lb_enums{'PR'}]
792 += $lb_actions{'LB_various_then_PO_or_PR'};
793 $lb_table[$lb_enums{'CL'}][$lb_enums{'PR'}]
794 += $lb_actions{'LB_various_then_PO_or_PR'};
795 $lb_table[$lb_enums{'IS'}][$lb_enums{'PR'}]
796 += $lb_actions{'LB_various_then_PO_or_PR'};
797 $lb_table[$lb_enums{'SY'}][$lb_enums{'PR'}]
798 += $lb_actions{'LB_various_then_PO_or_PR'};
799
800 # LB24 Do not break between prefix and letters or ideographs.
801 # PR × ID
802 $lb_table[$lb_enums{'PR'}][$lb_enums{'ID'}] = $lb_actions{'LB_NOBREAK'};
803
804 # PR × (AL | HL)
805 $lb_table[$lb_enums{'PR'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
806 $lb_table[$lb_enums{'PR'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
807
808 # PO × (AL | HL)
809 $lb_table[$lb_enums{'PO'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
810 $lb_table[$lb_enums{'PO'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
811
812 # LB23 Do not break within ‘a9’, ‘3a’, or ‘H%’.
813 # ID × PO
814 $lb_table[$lb_enums{'ID'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
815
816 # (AL | HL) × NU
817 $lb_table[$lb_enums{'AL'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
818 $lb_table[$lb_enums{'HL'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
819
820 # NU × (AL | HL)
821 $lb_table[$lb_enums{'NU'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
822 $lb_table[$lb_enums{'NU'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
823
824 # LB22 Do not break between two ellipses, or between letters, numbers or
825 # exclamations and ellipsis.
826 # (AL | HL) × IN
827 $lb_table[$lb_enums{'AL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
828 $lb_table[$lb_enums{'HL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
829
830 # EX × IN
831 $lb_table[$lb_enums{'EX'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
832
833 # ID × IN
834 $lb_table[$lb_enums{'ID'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
835
836 # IN × IN
837 $lb_table[$lb_enums{'IN'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
838
839 # NU × IN
840 $lb_table[$lb_enums{'NU'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
841
842 # LB21b Don’t break between Solidus and Hebrew letters.
843 # SY × HL
844 $lb_table[$lb_enums{'SY'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
845
846 # LB21a Don't break after Hebrew + Hyphen.
847 # HL (HY | BA) ×
848 for my $i (0 .. @lb_table - 1) {
849 $lb_table[$lb_enums{'HY'}][$i] += $lb_actions{'LB_HY_or_BA_then_foo'};
850 $lb_table[$lb_enums{'BA'}][$i] += $lb_actions{'LB_HY_or_BA_then_foo'};
851 }
852
853 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
854 # spaces, small kana, and other non-starters, or after acute accents.
855 # × BA
856 # × HY
857 # × NS
858 # BB ×
859 for my $i (0 .. @lb_table - 1) {
860 $lb_table[$i][$lb_enums{'BA'}] = $lb_actions{'LB_NOBREAK'};
861 $lb_table[$i][$lb_enums{'HY'}] = $lb_actions{'LB_NOBREAK'};
862 $lb_table[$i][$lb_enums{'NS'}] = $lb_actions{'LB_NOBREAK'};
863 $lb_table[$lb_enums{'BB'}][$i] = $lb_actions{'LB_NOBREAK'};
864 }
865
866 # LB20 Break before and after unresolved CB.
867 # ÷ CB
868 # CB ÷
869 # Conditional breaks should be resolved external to the line breaking
870 # rules. However, the default action is to treat unresolved CB as breaking
871 # before and after.
872 for my $i (0 .. @lb_table - 1) {
873 $lb_table[$i][$lb_enums{'CB'}] = $lb_actions{'LB_BREAKABLE'};
874 $lb_table[$lb_enums{'CB'}][$i] = $lb_actions{'LB_BREAKABLE'};
875 }
876
877 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
878 # × QU
879 # QU ×
880 for my $i (0 .. @lb_table - 1) {
881 $lb_table[$i][$lb_enums{'QU'}] = $lb_actions{'LB_NOBREAK'};
882 $lb_table[$lb_enums{'QU'}][$i] = $lb_actions{'LB_NOBREAK'};
883 }
884
885 # LB18 Break after spaces
886 # SP ÷
887 for my $i (0 .. @lb_table - 1) {
888 $lb_table[$lb_enums{'SP'}][$i] = $lb_actions{'LB_BREAKABLE'};
889 }
890
891 # LB17 Do not break within ‘——’, even with intervening spaces.
892 # B2 SP* × B2
893 $lb_table[$lb_enums{'B2'}][$lb_enums{'B2'}]
894 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
895
896 # LB16 Do not break between closing punctuation and a nonstarter even with
897 # intervening spaces.
898 # (CL | CP) SP* × NS
899 $lb_table[$lb_enums{'CL'}][$lb_enums{'NS'}]
900 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
901 $lb_table[$lb_enums{'CP'}][$lb_enums{'NS'}]
902 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
903
904
905 # LB15 Do not break within ‘”[’, even with intervening spaces.
906 # QU SP* × OP
907 $lb_table[$lb_enums{'QU'}][$lb_enums{'OP'}]
908 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
909
910 # LB14 Do not break after ‘[’, even after spaces.
911 # OP SP* ×
912 for my $i (0 .. @lb_table - 1) {
913 $lb_table[$lb_enums{'OP'}][$i]
914 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
915 }
916
917 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
918 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
919 # [^NU] × CL
920 # [^NU] × CP
921 # × EX
922 # [^NU] × IS
923 # [^NU] × SY
924 for my $i (0 .. @lb_table - 1) {
925 $lb_table[$i][$lb_enums{'EX'}]
926 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
927
928 next if $i == $lb_enums{'NU'};
929
930 $lb_table[$i][$lb_enums{'CL'}]
931 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
932 $lb_table[$i][$lb_enums{'CP'}]
933 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
934 $lb_table[$i][$lb_enums{'IS'}]
935 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
936 $lb_table[$i][$lb_enums{'SY'}]
937 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
938 }
939
940 # LB12a Do not break before NBSP and related characters, except after
941 # spaces and hyphens.
942 # [^SP BA HY] × GL
943 for my $i (0 .. @lb_table - 1) {
944 next if $i == $lb_enums{'SP'}
945 || $i == $lb_enums{'BA'}
946 || $i == $lb_enums{'HY'};
947
948 # We don't break, but if a property above has said don't break even
949 # with space between, don't override that (also in the next few rules)
950 next if $lb_table[$i][$lb_enums{'GL'}]
951 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
952 $lb_table[$i][$lb_enums{'GL'}] = $lb_actions{'LB_NOBREAK'};
953 }
954
955 # LB12 Do not break after NBSP and related characters.
956 # GL ×
957 for my $i (0 .. @lb_table - 1) {
958 next if $lb_table[$lb_enums{'GL'}][$i]
959 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
960 $lb_table[$lb_enums{'GL'}][$i] = $lb_actions{'LB_NOBREAK'};
961 }
962
963 # LB11 Do not break before or after Word joiner and related characters.
964 # × WJ
965 # WJ ×
966 for my $i (0 .. @lb_table - 1) {
967 if ($lb_table[$i][$lb_enums{'WJ'}]
968 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
969 {
970 $lb_table[$i][$lb_enums{'WJ'}] = $lb_actions{'LB_NOBREAK'};
971 }
972 if ($lb_table[$lb_enums{'WJ'}][$i]
973 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
974 {
975 $lb_table[$lb_enums{'WJ'}][$i] = $lb_actions{'LB_NOBREAK'};
976 }
977 }
978
979 # Special case this here to avoid having to do a special case in the code,
980 # by making this the same as other things with a SP in front of them that
981 # don't break, we avoid an extra test
982 $lb_table[$lb_enums{'SP'}][$lb_enums{'WJ'}]
983 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
984
985 # LB9 and LB10 are done in the same loop
986 #
987 # LB9 Do not break a combining character sequence; treat it as if it has
988 # the line breaking class of the base character in all of the
989 # higher-numbered rules.
990 # Treat X CM* as if it were X.
991 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
992
993 # LB10 Treat any remaining combining mark as AL. This catches the case
994 # where a CM is the first character on the line or follows SP, BK, CR, LF,
995 # NL, or ZW.
996 for my $i (0 .. @lb_table - 1) {
997
998 # When the CM is the first in the pair, we don't know without looking
999 # behind whether the CM is going to inherit from an earlier character,
1000 # or not. So have to figure this out in the code
1001 $lb_table[$lb_enums{'CM'}][$i] = $lb_actions{'LB_CM_foo'};
1002
1003 if ( $i == $lb_enums{'BK'}
1004 || $i == $lb_enums{'ed'}
1005 || $i == $lb_enums{'CR'}
1006 || $i == $lb_enums{'LF'}
1007 || $i == $lb_enums{'NL'}
1008 || $i == $lb_enums{'SP'}
1009 || $i == $lb_enums{'ZW'})
1010 {
1011 # For these classes, a following CM doesn't combine, and should do
1012 # whatever 'AL' would do.
1013 $lb_table[$i][$lb_enums{'CM'}] = $lb_table[$i][$lb_enums{'AL'}];
1014 }
1015 else {
1016 # For these classes, the CM combines, so doesn't break, inheriting
1017 # the type of nobreak from the master character.
1018 if ($lb_table[$i][$lb_enums{'CM'}]
1019 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1020 {
1021 $lb_table[$i][$lb_enums{'CM'}] = $lb_actions{'LB_NOBREAK'};
1022 }
1023 }
1024 }
1025
1026 # LB8 Break before any character following a zero-width space, even if one
1027 # or more spaces intervene.
1028 # ZW SP* ÷
1029 for my $i (0 .. @lb_table - 1) {
1030 $lb_table[$lb_enums{'ZW'}][$i] = $lb_actions{'LB_BREAKABLE'};
1031 }
1032
1033 # Because of LB8-10, we need to look at context for "SP x", and this must
1034 # be done in the code. So override the existing rules for that, by adding
1035 # a constant to get new rules that tell the code it needs to look at
1036 # context. By adding this action instead of replacing the existing one,
1037 # we can get back to the original rule if necessary.
1038 for my $i (0 .. @lb_table - 1) {
1039 $lb_table[$lb_enums{'SP'}][$i] += $lb_actions{'LB_SP_foo'};
1040 }
1041
1042 # LB7 Do not break before spaces or zero width space.
1043 # × SP
1044 # × ZW
1045 for my $i (0 .. @lb_table - 1) {
1046 $lb_table[$i][$lb_enums{'SP'}] = $lb_actions{'LB_NOBREAK'};
1047 $lb_table[$i][$lb_enums{'ZW'}] = $lb_actions{'LB_NOBREAK'};
1048 }
1049
1050 # LB6 Do not break before hard line breaks.
1051 # × ( BK | CR | LF | NL )
1052 for my $i (0 .. @lb_table - 1) {
1053 $lb_table[$i][$lb_enums{'BK'}] = $lb_actions{'LB_NOBREAK'};
1054 $lb_table[$i][$lb_enums{'CR'}] = $lb_actions{'LB_NOBREAK'};
1055 $lb_table[$i][$lb_enums{'LF'}] = $lb_actions{'LB_NOBREAK'};
1056 $lb_table[$i][$lb_enums{'NL'}] = $lb_actions{'LB_NOBREAK'};
1057 }
1058
1059 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1060 # CR × LF
1061 # CR !
1062 # LF !
1063 # NL !
1064 for my $i (0 .. @lb_table - 1) {
1065 $lb_table[$lb_enums{'CR'}][$i] = $lb_actions{'LB_BREAKABLE'};
1066 $lb_table[$lb_enums{'LF'}][$i] = $lb_actions{'LB_BREAKABLE'};
1067 $lb_table[$lb_enums{'NL'}][$i] = $lb_actions{'LB_BREAKABLE'};
1068 }
1069 $lb_table[$lb_enums{'CR'}][$lb_enums{'LF'}] = $lb_actions{'LB_NOBREAK'};
1070
1071 # LB4 Always break after hard line breaks.
1072 # BK !
1073 for my $i (0 .. @lb_table - 1) {
1074 $lb_table[$lb_enums{'BK'}][$i] = $lb_actions{'LB_BREAKABLE'};
1075 }
1076
1077 # LB2 Never break at the start of text.
1078 # sot ×
1079 # LB3 Always break at the end of text.
1080 # ! eot
1081 # but these are reversed in the loop below, so that won't break if there
1082 # is no text
1083 for my $i (0 .. @lb_table - 1) {
1084 $lb_table[$i][$lb_enums{'ed'}] = $lb_actions{'LB_BREAKABLE'};
1085 $lb_table[$lb_enums{'ed'}][$i] = $lb_actions{'LB_NOBREAK'};
1086 }
1087
1088 # LB1 Assign a line breaking class to each code point of the input.
1089 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1090 # depending on criteria outside the scope of this algorithm.
1091 #
1092 # In the absence of such criteria all characters with a specific
1093 # combination of original class and General_Category property value are
1094 # resolved as follows:
1095 # Original Resolved General_Category
1096 # AI, SG, XX AL Any
1097 # SA CM Only Mn or Mc
1098 # SA AL Any except Mn and Mc
1099 # CJ NS Any
1100 #
1101 # This is done in mktables, so we never see any of the remapped-from
1102 # classes.
1103
1104 print $out_fh "\nstatic const U8 LB_table[$table_size][$table_size] = {\n";
1105 print $out_fh "\n/* 'ed' stands for 'edge' */\n";
78e3ac85 1106 print $out_fh "/* ";
6b659339
KW
1107 for my $i (0 .. @lb_table - 1) {
1108 print $out_fh " $lb_short_enums[$i]";
1109 }
1110 print $out_fh " */\n";
1111
1112 for my $i (0 .. @lb_table - 1) {
1113 print $out_fh "/* $lb_short_enums[$i] */ ";
78e3ac85
AC
1114 print $out_fh "{ ";
1115 print $out_fh join ", ", map sprintf("%2d", $_), @{ $lb_table[$i] };
1116 print $out_fh " }";
1117 print $out_fh "," if $i < @lb_table - 1;
6b659339
KW
1118 print $out_fh "\n";
1119 }
1120
1121 print $out_fh "};\n";
6b659339
KW
1122}
1123
7e54b87f
KW
1124sub output_WB_table() {
1125
1126 # Create and output the enums, #defines, and pair table for use in
1127 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1128
1129 # This uses the same mechanism in the other bounds tables generated by
1130 # this file. The actions that could override a 0 or 1 are added to those
1131 # numbers; the actions that clearly don't depend on the underlying rule
1132 # simply overwrite
1133 my %wb_actions = (
1134 WB_NOBREAK => 0,
1135 WB_BREAKABLE => 1,
1136 WB_hs_then_hs => 2,
1137 WB_Ex_or_FO_then_foo => 3,
1138 WB_DQ_then_HL => 4,
1139 WB_HL_then_DQ => 6,
1140 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1141 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1142 WB_MB_or_MN_or_SQ_then_NU => 12,
1143 WB_NU_then_MB_or_MN_or_SQ => 14,
1144 );
1145
1146 # Output the #define list, sorted by numeric value
1147 my @defines;
1148 while (my ($enum, $value) = each %wb_actions) {
1149 $defines[$value] = $enum;
1150 }
1151
1152 print $out_fh "\n";
1153
1154 foreach my $i (0 .. @defines - 1) {
1155 next unless defined $defines[$i];
1156 print $out_fh "#define $defines[$i]\t$i\n";
1157 }
1158
1159 # Construct the WB pair table.
1160 # The table is constructed in reverse order of the rules, to make the
1161 # lower-numbered, higher priority ones override the later ones, as the
1162 # algorithm stops at the earliest matching rule
1163
1164 my @wb_table;
1165 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
1166
1167 # Otherwise, break everywhere (including around ideographs).
1168 # WB14 Any ÷ Any
1169 for my $i (0 .. $table_size - 1) {
1170 for my $j (0 .. $table_size - 1) {
1171 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1172 }
1173 }
1174
1175 # Do not break between regional indicator symbols.
1176 # WB13c Regional_Indicator × Regional_Indicator
1177 $wb_table[$wb_enums{'RI'}][$wb_enums{'RI'}] = $wb_actions{'WB_NOBREAK'};
1178
1179 # Do not break from extenders.
1180 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
1181 $wb_table[$wb_enums{'EX'}][$wb_enums{'LE'}] = $wb_actions{'WB_NOBREAK'};
1182 $wb_table[$wb_enums{'EX'}][$wb_enums{'HL'}] = $wb_actions{'WB_NOBREAK'};
1183 $wb_table[$wb_enums{'EX'}][$wb_enums{'NU'}] = $wb_actions{'WB_NOBREAK'};
1184 $wb_table[$wb_enums{'EX'}][$wb_enums{'KA'}] = $wb_actions{'WB_NOBREAK'};
1185
1186 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1187 # × # ExtendNumLet
1188 $wb_table[$wb_enums{'LE'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'};
1189 $wb_table[$wb_enums{'HL'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'};
1190 $wb_table[$wb_enums{'NU'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'};
1191 $wb_table[$wb_enums{'KA'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'};
1192 $wb_table[$wb_enums{'EX'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'};
1193
1194 # Do not break between Katakana.
1195 # WB13 Katakana × Katakana
1196 $wb_table[$wb_enums{'KA'}][$wb_enums{'KA'}] = $wb_actions{'WB_NOBREAK'};
1197
1198 # Do not break within sequences, such as “3.2” or “3,456.789”.
1199 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
1200 $wb_table[$wb_enums{'NU'}][$wb_enums{'MB'}]
1201 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1202 $wb_table[$wb_enums{'NU'}][$wb_enums{'MN'}]
1203 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1204 $wb_table[$wb_enums{'NU'}][$wb_enums{'SQ'}]
1205 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1206
1207 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
1208 $wb_table[$wb_enums{'MB'}][$wb_enums{'NU'}]
1209 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1210 $wb_table[$wb_enums{'MN'}][$wb_enums{'NU'}]
1211 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1212 $wb_table[$wb_enums{'SQ'}][$wb_enums{'NU'}]
1213 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1214
1215 # Do not break within sequences of digits, or digits adjacent to letters
1216 # (“3a”, or “A3”).
1217 # WB10 Numeric × (ALetter | Hebrew_Letter)
1218 $wb_table[$wb_enums{'NU'}][$wb_enums{'LE'}] = $wb_actions{'WB_NOBREAK'};
1219 $wb_table[$wb_enums{'NU'}][$wb_enums{'HL'}] = $wb_actions{'WB_NOBREAK'};
1220
1221 # WB9 (ALetter | Hebrew_Letter) × Numeric
1222 $wb_table[$wb_enums{'LE'}][$wb_enums{'NU'}] = $wb_actions{'WB_NOBREAK'};
1223 $wb_table[$wb_enums{'HL'}][$wb_enums{'NU'}] = $wb_actions{'WB_NOBREAK'};
1224
1225 # WB8 Numeric × Numeric
1226 $wb_table[$wb_enums{'NU'}][$wb_enums{'NU'}] = $wb_actions{'WB_NOBREAK'};
1227
1228 # Do not break letters across certain punctuation.
1229 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
1230 $wb_table[$wb_enums{'DQ'}][$wb_enums{'HL'}] += $wb_actions{'WB_DQ_then_HL'};
1231
1232 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
1233 $wb_table[$wb_enums{'HL'}][$wb_enums{'DQ'}] += $wb_actions{'WB_HL_then_DQ'};
1234
1235 # WB7a Hebrew_Letter × Single_Quote
1236 $wb_table[$wb_enums{'HL'}][$wb_enums{'SQ'}] = $wb_actions{'WB_NOBREAK'};
1237
1238 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1239 # × (ALetter | Hebrew_Letter)
1240 $wb_table[$wb_enums{'MB'}][$wb_enums{'LE'}]
1241 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1242 $wb_table[$wb_enums{'MB'}][$wb_enums{'HL'}]
1243 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1244 $wb_table[$wb_enums{'ML'}][$wb_enums{'LE'}]
1245 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1246 $wb_table[$wb_enums{'ML'}][$wb_enums{'HL'}]
1247 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1248 $wb_table[$wb_enums{'SQ'}][$wb_enums{'LE'}]
1249 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1250 $wb_table[$wb_enums{'SQ'}][$wb_enums{'HL'}]
1251 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1252
1253 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1254 # | Single_Quote) (ALetter | Hebrew_Letter)
1255 $wb_table[$wb_enums{'LE'}][$wb_enums{'MB'}]
1256 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1257 $wb_table[$wb_enums{'HL'}][$wb_enums{'MB'}]
1258 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1259 $wb_table[$wb_enums{'LE'}][$wb_enums{'ML'}]
1260 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1261 $wb_table[$wb_enums{'HL'}][$wb_enums{'ML'}]
1262 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1263 $wb_table[$wb_enums{'LE'}][$wb_enums{'SQ'}]
1264 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1265 $wb_table[$wb_enums{'HL'}][$wb_enums{'SQ'}]
1266 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1267
1268 # Do not break between most letters.
1269 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
1270 $wb_table[$wb_enums{'LE'}][$wb_enums{'LE'}] = $wb_actions{'WB_NOBREAK'};
1271 $wb_table[$wb_enums{'LE'}][$wb_enums{'HL'}] = $wb_actions{'WB_NOBREAK'};
1272 $wb_table[$wb_enums{'HL'}][$wb_enums{'LE'}] = $wb_actions{'WB_NOBREAK'};
1273 $wb_table[$wb_enums{'HL'}][$wb_enums{'HL'}] = $wb_actions{'WB_NOBREAK'};
1274
1275 # Ignore Format and Extend characters, except when they appear at the
1276 # beginning of a region of text.
1277 # WB4 X (Extend | Format)* → X
1278 for my $i (0 .. @wb_table - 1) {
1279 $wb_table[$wb_enums{'Ex'}][$i] = $wb_actions{'WB_Ex_or_FO_then_foo'};
1280 $wb_table[$wb_enums{'FO'}][$i] = $wb_actions{'WB_Ex_or_FO_then_foo'};
1281 }
1282
1283 # Implied is that these attach to the character before them, except for
1284 # the characters that mark the end of a region of text. The rules below
1285 # override the ones set up here, for all the characters that need
1286 # overriding.
1287 for my $i (0 .. @wb_table - 1) {
1288 $wb_table[$i][$wb_enums{'Ex'}] = $wb_actions{'WB_NOBREAK'};
1289 $wb_table[$i][$wb_enums{'FO'}] = $wb_actions{'WB_NOBREAK'};
1290 }
1291
1292 # Break before and after white space
1293 # WB3b ÷ (Newline | CR | LF)
1294 # WB3a (Newline | CR | LF) ÷
1295 # et. al.
1296 for my $i ('CR', 'LF', 'NL', 'hs') {
1297 for my $j (0 .. @wb_table - 1) {
1298 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1299 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1300 }
1301 }
1302
1303 # But do not break within white space.
1304 # WB3 CR × LF
1305 # et.al.
1306 for my $i ('CR', 'LF', 'NL', 'hs') {
1307 for my $j ('CR', 'LF', 'NL', 'hs') {
1308 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1309 }
1310 }
1311
1312 # And do not break horizontal space followed by Extend or Format
1313 $wb_table[$wb_enums{'hs'}][$wb_enums{'Ex'}] = $wb_actions{'WB_NOBREAK'};
1314 $wb_table[$wb_enums{'hs'}][$wb_enums{'FO'}] = $wb_actions{'WB_NOBREAK'};
1315 $wb_table[$wb_enums{'hs'}][$wb_enums{'hs'}] = $wb_actions{'WB_hs_then_hs'};
1316
1317 # Break at the start and end of text.
1318 # WB2 ÷ eot
1319 # WB1 sot ÷
1320 for my $i (0 .. @wb_table - 1) {
1321 $wb_table[$i][$wb_enums{'ed'}] = $wb_actions{'WB_BREAKABLE'};
1322 $wb_table[$wb_enums{'ed'}][$i] = $wb_actions{'WB_BREAKABLE'};
1323 }
1324
1325 # But, unspecified by Unicode, we shouldn't break on an empty string.
1326 $wb_table[$wb_enums{'ed'}][$wb_enums{'ed'}] = 0;
1327
1328 print $out_fh "\nstatic const U8 WB_table[$table_size][$table_size] = {\n";
1329 print $out_fh "\n/* 'Ex' stands for 'Extend'; 'hs' for 'Perl_Tailored_HSpace'; 'ed' for 'edge' */\n";
1330 print $out_fh "/* ";
1331 for my $i (0 .. @wb_table - 1) {
1332 print $out_fh " $wb_short_enums[$i]";
1333 }
1334 print $out_fh " */\n";
1335
1336 for my $i (0 .. @wb_table - 1) {
1337 print $out_fh "/* $wb_short_enums[$i] */ ";
1338 print $out_fh "{";
1339 print $out_fh join ", ", map sprintf("%2d", $_), @{ $wb_table[$i] };
1340 print $out_fh " }";
1341 print $out_fh "," if $i < @wb_table - 1;
1342 print $out_fh "\n";
1343 }
1344
1345 print $out_fh "};\n";
1346}
1347
9d9177be
KW
1348output_invlist("Latin1", [ 0, 256 ]);
1349output_invlist("AboveLatin1", [ 256 ]);
1350
bffc0129 1351end_file_pound_if;
43b443dd 1352
3f427fd9
KW
1353# We construct lists for all the POSIX and backslash sequence character
1354# classes in two forms:
1355# 1) ones which match only in the ASCII range
1356# 2) ones which match either in the Latin1 range, or the entire Unicode range
1357#
1358# These get compiled in, and hence affect the memory footprint of every Perl
1359# program, even those not using Unicode. To minimize the size, currently
1360# the Latin1 version is generated for the beyond ASCII range except for those
1361# lists that are quite small for the entire range, such as for \s, which is 22
1362# UVs long plus 4 UVs (currently) for the header.
1363#
1364# To save even more memory, the ASCII versions could be derived from the
1365# larger ones at runtime, saving some memory (minus the expense of the machine
1366# instructions to do so), but these are all small anyway, so their total is
1367# about 100 UVs.
1368#
1369# In the list of properties below that get generated, the L1 prefix is a fake
1370# property that means just the Latin1 range of the full property (whose name
1371# has an X prefix instead of L1).
a02047bf
KW
1372#
1373# An initial & means to use the subroutine from this file instead of an
1374# official inversion list.
3f427fd9 1375
0c4ecf42
KW
1376for my $charset (get_supported_code_pages()) {
1377 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1378
99f21fb9
KW
1379 @a2n = @{get_a2n($charset)};
1380 no warnings 'qw';
1381 # Ignore non-alpha in sort
1382 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1c8c3428
KW
1383 ASCII
1384 Cased
1385 VertSpace
1386 XPerlSpace
1387 XPosixAlnum
1388 XPosixAlpha
1389 XPosixBlank
1390 XPosixCntrl
1391 XPosixDigit
1392 XPosixGraph
1393 XPosixLower
1394 XPosixPrint
1395 XPosixPunct
1396 XPosixSpace
1397 XPosixUpper
1398 XPosixWord
1399 XPosixXDigit
1400 _Perl_Any_Folds
1401 &NonL1_Perl_Non_Final_Folds
1402 _Perl_Folds_To_Multi_Char
1403 &UpperLatin1
1404 _Perl_IDStart
1405 _Perl_IDCont
02f811dd 1406 _Perl_GCB,EDGE
ca8226cf 1407 _Perl_LB,EDGE
bf4268fa 1408 _Perl_SB,EDGE
190d69bb 1409 _Perl_WB,EDGE,UNKNOWN
1c8c3428 1410 )
0f5e3c71
KW
1411 ) {
1412
1413 # For the Latin1 properties, we change to use the eXtended version of the
1414 # base property, then go through the result and get rid of everything not
1415 # in Latin1 (above 255). Actually, we retain the element for the range
1416 # that crosses the 255/256 boundary if it is one that matches the
1417 # property. For example, in the Word property, there is a range of code
1418 # points that start at U+00F8 and goes through U+02C1. Instead of
1419 # artificially cutting that off at 256 because 256 is the first code point
1420 # above Latin1, we let the range go to its natural ending. That gives us
1421 # extra information with no added space taken. But if the range that
1422 # crosses the boundary is one that doesn't match the property, we don't
1423 # start a new range above 255, as that could be construed as going to
1424 # infinity. For example, the Upper property doesn't include the character
1425 # at 255, but does include the one at 256. We don't include the 256 one.
1426 my $prop_name = $prop;
1427 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
1428 my $extra_enums = "";
1429 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
1430 my $lookup_prop = $prop_name;
1431 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1432 or $lookup_prop =~ s/^L1//);
1433 my $nonl1_only = 0;
1434 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 1435 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
1436
1437 my @invlist;
99f21fb9
KW
1438 my @invmap;
1439 my $map_format;
1440 my $map_default;
1441 my $maps_to_code_point;
1442 my $to_adjust;
0f5e3c71
KW
1443 if ($is_local_sub) {
1444 @invlist = eval $lookup_prop;
1445 }
1446 else {
1447 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 1448 if (! @invlist) {
99f21fb9 1449
ad85f59a
KW
1450 # If couldn't find a non-empty inversion list, see if it is
1451 # instead an inversion map
1452 my ($list_ref, $map_ref, $format, $default)
99f21fb9 1453 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
1454 if (! $list_ref) {
1455 # An empty return here could mean an unknown property, or
1456 # merely that the original inversion list is empty. Call
1457 # in scalar context to differentiate
1458 my $count = prop_invlist($lookup_prop,
1459 '_perl_core_internal_ok');
1460 die "Could not find inversion list for '$lookup_prop'"
1461 unless defined $count;
1462 }
1463 else {
18b852b3
KW
1464 @invlist = @$list_ref;
1465 @invmap = @$map_ref;
1466 $map_format = $format;
1467 $map_default = $default;
1468 $maps_to_code_point = $map_format =~ /x/;
1469 $to_adjust = $map_format =~ /a/;
ad85f59a 1470 }
99f21fb9 1471 }
0f5e3c71 1472 }
ad85f59a
KW
1473
1474
1475 # Short-circuit an empty inversion list.
1476 if (! @invlist) {
1477 output_invlist($prop_name, \@invlist, $charset);
1478 next;
1479 }
ceb1de32 1480
99f21fb9
KW
1481 # Re-order the Unicode code points to native ones for this platform.
1482 # This is only needed for code points below 256, because native code
1483 # points are only in that range. For inversion maps of properties
1484 # where the mappings are adjusted (format =~ /a/), this reordering
1485 # could mess up the adjustment pattern that was in the input, so that
1486 # has to be dealt with.
1487 #
1488 # And inversion maps that map to code points need to eventually have
1489 # all those code points remapped to native, and it's better to do that
1490 # here, going through the whole list not just those below 256. This
1491 # is because some inversion maps have adjustments (format =~ /a/)
1492 # which may be affected by the reordering. This code needs to be done
1493 # both for when we are translating the inversion lists for < 256, and
1494 # for the inversion maps for everything. By doing both in this loop,
1495 # we can share that code.
1496 #
1497 # So, we go through everything for an inversion map to code points;
1498 # otherwise, we can skip any remapping at all if we are going to
1499 # output only the above-Latin1 values, or if the range spans the whole
1500 # of 0..256, as the remap will also include all of 0..256 (256 not
1501 # 255 because a re-ordering could cause 256 to need to be in the same
1502 # range as 255.)
1503 if ((@invmap && $maps_to_code_point)
1504 || (! $nonl1_only || ($invlist[0] < 256
1505 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 1506 {
fb4554ea 1507
99f21fb9 1508 if (! @invmap) { # Straight inversion list
fb4554ea
KW
1509 # Look at all the ranges that start before 257.
1510 my @latin1_list;
1511 while (@invlist) {
1512 last if $invlist[0] > 256;
1513 my $upper = @invlist > 1
1514 ? $invlist[1] - 1 # In range
8a6c81cf
KW
1515
1516 # To infinity. You may want to stop much much
1517 # earlier; going this high may expose perl
1518 # deficiencies with very large numbers.
1519 : $Unicode::UCD::MAX_CP;
fb4554ea 1520 for my $j ($invlist[0] .. $upper) {
99f21fb9 1521 push @latin1_list, a2n($j);
0f5e3c71 1522 }
fb4554ea
KW
1523
1524 shift @invlist; # Shift off the range that's in the list
1525 shift @invlist; # Shift off the range not in the list
0c4ecf42 1526 }
fb4554ea
KW
1527
1528 # Here @invlist contains all the ranges in the original that start
1529 # at code points above 256, and @latin1_list contains all the
1530 # native code points for ranges that start with a Unicode code
1531 # point below 257. We sort the latter and convert it to inversion
1532 # list format. Then simply prepend it to the list of the higher
1533 # code points.
1534 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 1535 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 1536 unshift @invlist, @latin1_list;
99f21fb9
KW
1537 }
1538 else { # Is an inversion map
1539
1540 # This is a similar procedure as plain inversion list, but has
1541 # multiple buckets. A plain inversion list just has two
1542 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1543 # pretty much can ignore the 2nd bucket, as it is completely
1544 # defined by the 1st. But here, what we do is create buckets
1545 # which contain the code points that map to each, translated
1546 # to native and turned into an inversion list. Thus each
1547 # bucket is an inversion list of native code points that map
1548 # to it or don't map to it. We use these to create an
1549 # inversion map for the whole property.
1550
1551 # As mentioned earlier, we use this procedure to not just
1552 # remap the inversion list to native values, but also the maps
1553 # of code points to native ones. In the latter case we have
1554 # to look at the whole of the inversion map (or at least to
1555 # above Unicode; as the maps of code points above that should
1556 # all be to the default).
1557 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1558
1559 my %mapped_lists; # A hash whose keys are the buckets.
1560 while (@invlist) {
1561 last if $invlist[0] > $upper_limit;
1562
1563 # This shouldn't actually happen, as prop_invmap() returns
1564 # an extra element at the end that is beyond $upper_limit
1565 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1566
1567 my $bucket;
1568
1569 # A hash key can't be a ref (we are only expecting arrays
1570 # of scalars here), so convert any such to a string that
1571 # will be converted back later (using a vertical tab as
1572 # the separator). Even if the mapping is to code points,
1573 # we don't translate to native here because the code
1574 # output_map() calls to output these arrays assumes the
1575 # input is Unicode, not native.
1576 if (ref $invmap[0]) {
1577 $bucket = join "\cK", @{$invmap[0]};
1578 }
1579 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1580
1581 # Do convert to native for maps to single code points.
1582 # There are some properties that have a few outlier
1583 # maps that aren't code points, so the above test
1584 # skips those.
1585 $bucket = a2n($invmap[0]);
1586 } else {
1587 $bucket = $invmap[0];
1588 }
1589
1590 # We now have the bucket that all code points in the range
1591 # map to, though possibly they need to be adjusted. Go
1592 # through the range and put each translated code point in
1593 # it into its bucket.
1594 my $base_map = $invmap[0];
1595 for my $j ($invlist[0] .. $invlist[1] - 1) {
1596 if ($to_adjust
1597 # The 1st code point doesn't need adjusting
1598 && $j > $invlist[0]
1599
1600 # Skip any non-numeric maps: these are outliers
1601 # that aren't code points.
1602 && $base_map =~ $numeric_re
1603
1604 # 'ne' because the default can be a string
1605 && $base_map ne $map_default)
1606 {
1607 # We adjust, by incrementing each the bucket and
1608 # the map. For code point maps, translate to
1609 # native
1610 $base_map++;
1611 $bucket = ($maps_to_code_point)
1612 ? a2n($base_map)
1613 : $base_map;
1614 }
1615
1616 # Add the native code point to the bucket for the
1617 # current map
1618 push @{$mapped_lists{$bucket}}, a2n($j);
1619 } # End of loop through all code points in the range
1620
1621 # Get ready for the next range
1622 shift @invlist;
1623 shift @invmap;
1624 } # End of loop through all ranges in the map.
1625
1626 # Here, @invlist and @invmap retain all the ranges from the
1627 # originals that start with code points above $upper_limit.
1628 # Each bucket in %mapped_lists contains all the code points
1629 # that map to that bucket. If the bucket is for a map to a
1630 # single code point is a single code point, the bucket has
1631 # been converted to native. If something else (including
1632 # multiple code points), no conversion is done.
1633 #
1634 # Now we recreate the inversion map into %xlated, but this
1635 # time for the native character set.
1636 my %xlated;
1637 foreach my $bucket (keys %mapped_lists) {
1638
1639 # Sort and convert this bucket to an inversion list. The
1640 # result will be that ranges that start with even-numbered
1641 # indexes will be for code points that map to this bucket;
1642 # odd ones map to some other bucket, and are discarded
1643 # below.
1644 @{$mapped_lists{$bucket}}
1645 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
1646 @{$mapped_lists{$bucket}}
1647 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
1648
1649 # Add each even-numbered range in the bucket to %xlated;
1650 # so that the keys of %xlated become the range start code
1651 # points, and the values are their corresponding maps.
1652 while (@{$mapped_lists{$bucket}}) {
1653 my $range_start = $mapped_lists{$bucket}->[0];
1654 if ($bucket =~ /\cK/) {
1655 @{$xlated{$range_start}} = split /\cK/, $bucket;
1656 }
1657 else {
1658 $xlated{$range_start} = $bucket;
1659 }
1660 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
1661 shift @{$mapped_lists{$bucket}}; # Get ready for next
1662 # iteration
1663 }
1664 } # End of loop through all the buckets.
1665
1666 # Here %xlated's keys are the range starts of all the code
1667 # points in the inversion map. Construct an inversion list
1668 # from them.
1669 my @new_invlist = sort { $a <=> $b } keys %xlated;
1670
1671 # If the list is adjusted, we want to munge this list so that
1672 # we only have one entry for where consecutive code points map
1673 # to consecutive values. We just skip the subsequent entries
1674 # where this is the case.
1675 if ($to_adjust) {
1676 my @temp;
1677 for my $i (0 .. @new_invlist - 1) {
1678 next if $i > 0
1679 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
1680 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
1681 && $xlated{$new_invlist[$i]} =~ $numeric_re
1682 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
1683 push @temp, $new_invlist[$i];
1684 }
1685 @new_invlist = @temp;
1686 }
1687
1688 # The inversion map comes from %xlated's values. We can
1689 # unshift each onto the front of the untouched portion, in
1690 # reverse order of the portion we did process.
1691 foreach my $start (reverse @new_invlist) {
1692 unshift @invmap, $xlated{$start};
1693 }
1694
1695 # Finally prepend the inversion list we have just constructed to the
1696 # one that contains anything we didn't process.
1697 unshift @invlist, @new_invlist;
1698 }
1699 }
1700
1701 # prop_invmap() returns an extra final entry, which we can now
1702 # discard.
1703 if (@invmap) {
1704 pop @invlist;
1705 pop @invmap;
ceb1de32 1706 }
0f5e3c71
KW
1707
1708 if ($l1_only) {
99f21fb9 1709 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
1710 for my $i (0 .. @invlist - 1 - 1) {
1711 if ($invlist[$i] > 255) {
1712
1713 # In an inversion list, even-numbered elements give the code
1714 # points that begin ranges that match the property;
1715 # odd-numbered give ones that begin ranges that don't match.
1716 # If $i is odd, we are at the first code point above 255 that
1717 # doesn't match, which means the range it is ending does
1718 # match, and crosses the 255/256 boundary. We want to include
1719 # this ending point, so increment $i, so the splice below
1720 # includes it. Conversely, if $i is even, it is the first
1721 # code point above 255 that matches, which means there was no
1722 # matching range that crossed the boundary, and we don't want
1723 # to include this code point, so splice before it.
1724 $i++ if $i % 2 != 0;
1725
1726 # Remove everything past this.
1727 splice @invlist, $i;
99f21fb9 1728 splice @invmap, $i if @invmap;
0f5e3c71
KW
1729 last;
1730 }
0c4ecf42
KW
1731 }
1732 }
0f5e3c71
KW
1733 elsif ($nonl1_only) {
1734 my $found_nonl1 = 0;
1735 for my $i (0 .. @invlist - 1 - 1) {
1736 next if $invlist[$i] < 256;
1737
1738 # Here, we have the first element in the array that indicates an
1739 # element above Latin1. Get rid of all previous ones.
1740 splice @invlist, 0, $i;
99f21fb9 1741 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
1742
1743 # If this one's index is not divisible by 2, it means that this
1744 # element is inverting away from being in the list, which means
99f21fb9
KW
1745 # all code points from 256 to this one are in this list (or
1746 # map to the default for inversion maps)
1747 if ($i % 2 != 0) {
1748 unshift @invlist, 256;
1749 unshift @invmap, $map_default if @invmap;
1750 }
0f5e3c71 1751 $found_nonl1 = 1;
3f427fd9
KW
1752 last;
1753 }
0f5e3c71 1754 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 1755 }
3f427fd9 1756
0f5e3c71 1757 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 1758 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 1759 }
bffc0129 1760 end_file_pound_if;
0c4ecf42 1761 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
1762}
1763
973a28ed
KW
1764switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
1765
1766output_GCB_table();
6b659339 1767output_LB_table();
7e54b87f 1768output_WB_table();
6b659339 1769
973a28ed
KW
1770end_file_pound_if;
1771
2308ab83 1772my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
1773my @sources = ($0, qw(lib/unicore/mktables
1774 lib/Unicode/UCD.pm
1775 regen/charset_translations.pl
1776 ));
9a3da3ad
FC
1777{
1778 # Depend on mktables’ own sources. It’s a shorter list of files than
1779 # those that Unicode::UCD uses.
2308ab83
KW
1780 if (! open my $mktables_list, $sources_list) {
1781
1782 # This should force a rebuild once $sources_list exists
1783 push @sources, $sources_list;
1784 }
1785 else {
1786 while(<$mktables_list>) {
1787 last if /===/;
1788 chomp;
1789 push @sources, "lib/unicore/$_" if /^[^#]/;
1790 }
9a3da3ad
FC
1791 }
1792}
6b659339
KW
1793
1794read_only_bottom_close_and_rename($out_fh, \@sources);