This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
t/re/uniprops.t: Fix bug in diagnostic output
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
9d9177be
KW
43print $out_fh "/* See the generating file for comments */\n\n";
44
bffc0129
KW
45# The symbols generated by this program are all currently defined only in a
46# single dot c each. The code knows where most of them go, but this hash
47# gives overrides for the exceptions to the typical place
48my %exceptions_to_where_to_define =
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
50 AboveLatin1 => 'PERL_IN_REGCOMP_C',
51 Latin1 => 'PERL_IN_REGCOMP_C',
52 UpperLatin1 => 'PERL_IN_REGCOMP_C',
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
55 _Perl_IDCont => 'PERL_IN_UTF8_C',
56 _Perl_IDStart => 'PERL_IN_UTF8_C',
57 );
015bb97c 58
f79a09fc
KW
59# This hash contains the properties with enums that have hard-coded references
60# to them in C code. Its only use is to make sure that if perl is compiled
61# with an older Unicode data set, that all the enum values the code is
62# expecting will still be in the enum typedef. Thus the code doesn't have to
63# change. The Unicode version won't have any code points that have these enum
64# values, so the code that handles them will not get exercised. This is far
65# better than having to #ifdef things.
66my %hard_coded_enums =
67 ( gcb => [
68 'Control',
69 'CR',
70 'Extend',
71 'L',
72 'LF',
73 'LV',
74 'LVT',
75 'Other',
76 'Prepend',
77 'Regional_Indicator',
78 'SpacingMark',
79 'T',
80 'V',
81 ],
ca8226cf
KW
82 lb => [
83 'Alphabetic',
84 'Break_After',
85 'Break_Before',
86 'Break_Both',
87 'Break_Symbols',
88 'Carriage_Return',
89 'Close_Parenthesis',
90 'Close_Punctuation',
91 'Combining_Mark',
92 'Contingent_Break',
93 'Exclamation',
94 'Glue',
95 'H2',
96 'H3',
97 'Hebrew_Letter',
98 'Hyphen',
99 'Ideographic',
100 'Infix_Numeric',
101 'Inseparable',
102 'JL',
103 'JT',
104 'JV',
105 'Line_Feed',
106 'Mandatory_Break',
107 'Next_Line',
108 'Nonstarter',
109 'Numeric',
110 'Open_Punctuation',
111 'Postfix_Numeric',
112 'Prefix_Numeric',
113 'Quotation',
114 'Regional_Indicator',
115 'Space',
116 'Word_Joiner',
117 'ZWSpace',
118 ],
f79a09fc
KW
119 sb => [
120 'ATerm',
121 'Close',
122 'CR',
123 'Extend',
124 'Format',
125 'LF',
126 'Lower',
127 'Numeric',
128 'OLetter',
129 'Other',
130 'SContinue',
131 'Sep',
132 'Sp',
133 'STerm',
134 'Upper',
135 ],
136 wb => [
137 'ALetter',
138 'CR',
139 'Double_Quote',
140 'Extend',
141 'ExtendNumLet',
142 'Format',
143 'Hebrew_Letter',
144 'Katakana',
145 'LF',
146 'MidLetter',
147 'MidNum',
148 'MidNumLet',
149 'Newline',
150 'Numeric',
151 'Other',
f1f6961f 152 'Perl_Tailored_HSpace',
f79a09fc
KW
153 'Regional_Indicator',
154 'Single_Quote',
f79a09fc
KW
155 ],
156);
157
6b659339
KW
158my %lb_enums;
159my @lb_short_enums;
160
99f21fb9
KW
161my @a2n;
162
163sub uniques {
164 # Returns non-duplicated input values. From "Perl Best Practices:
165 # Encapsulated Cleverness". p. 455 in first edition.
166
167 my %seen;
168 return grep { ! $seen{$_}++ } @_;
169}
170
171sub a2n($) {
172 my $cp = shift;
173
174 # Returns the input Unicode code point translated to native.
175
176 return $cp if $cp !~ $numeric_re || $cp > 255;
177 return $a2n[$cp];
178}
179
bffc0129
KW
180sub end_file_pound_if {
181 if ($in_file_pound_if) {
182 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
183 $in_file_pound_if = 0;
184 }
185}
186
187sub switch_pound_if ($$) {
188 my $name = shift;
189 my $new_pound_if = shift;
190
191 # Switch to new #if given by the 2nd argument. If there is an override
192 # for this, it instead switches to that. The 1st argument is the
193 # static's name, used to look up the overrides
194
195 if (exists $exceptions_to_where_to_define{$name}) {
196 $new_pound_if = $exceptions_to_where_to_define{$name};
197 }
198
199 # Exit current #if if the new one is different from the old
200 if ($in_file_pound_if
201 && $in_file_pound_if !~ /$new_pound_if/)
202 {
203 end_file_pound_if;
204 }
205
206 # Enter new #if, if not already in it.
207 if (! $in_file_pound_if) {
208 $in_file_pound_if = "defined($new_pound_if)";
209 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
210 }
211}
212
0c4ecf42 213sub output_invlist ($$;$) {
9d9177be
KW
214 my $name = shift;
215 my $invlist = shift; # Reference to inversion list array
0c4ecf42 216 my $charset = shift // ""; # name of character set for comment
9d9177be 217
76d3994c 218 die "No inversion list for $name" unless defined $invlist
ad85f59a 219 && ref $invlist eq 'ARRAY';
76d3994c 220
9d9177be
KW
221 # Output the inversion list $invlist using the name $name for it.
222 # It is output in the exact internal form for inversion lists.
223
a0316a6c
KW
224 # Is the last element of the header 0, or 1 ?
225 my $zero_or_one = 0;
ad85f59a 226 if (@$invlist && $invlist->[0] != 0) {
a0316a6c 227 unshift @$invlist, 0;
9d9177be
KW
228 $zero_or_one = 1;
229 }
0a07b44b 230 my $count = @$invlist;
9d9177be 231
bffc0129 232 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 233
0c4ecf42
KW
234 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
235 print $out_fh " /* for $charset */" if $charset;
236 print $out_fh "\n";
9d9177be 237
a0316a6c 238 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
239 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
240 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
241 ",\t/* 0 if the list starts at 0;",
242 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
243
244 # The main body are the UVs passed in to this routine. Do the final
245 # element separately
47d53124
KW
246 for my $i (0 .. @$invlist - 1) {
247 printf $out_fh "\t0x%X", $invlist->[$i];
248 print $out_fh "," if $i < @$invlist - 1;
249 print $out_fh "\n";
9d9177be
KW
250 }
251
9d9177be
KW
252 print $out_fh "};\n";
253}
254
99f21fb9
KW
255sub output_invmap ($$$$$$$) {
256 my $name = shift;
257 my $invmap = shift; # Reference to inversion map array
258 my $prop_name = shift;
259 my $input_format = shift; # The inversion map's format
260 my $default = shift; # The property value for code points who
261 # otherwise don't have a value specified.
262 my $extra_enums = shift; # comma-separated list of our additions to the
263 # property's standard possible values
264 my $charset = shift // ""; # name of character set for comment
265
266 # Output the inversion map $invmap for property $prop_name, but use $name
267 # as the actual data structure's name.
268
269 my $count = @$invmap;
270
271 my $output_format;
272 my $declaration_type;
273 my %enums;
274 my $name_prefix;
275
276 if ($input_format eq 's') {
b83e6484 277 my $orig_prop_name = $prop_name;
02f811dd
KW
278 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
279 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
b83e6484
KW
280 my @enums;
281 if ($orig_prop_name eq $prop_name) {
282 @enums = prop_values($prop_name);
283 }
284 else {
285 @enums = uniques(@$invmap);
286 }
99f21fb9
KW
287 if (! @enums) {
288 die "Only enum properties are currently handled; '$prop_name' isn't one";
289 }
290 else {
291
f79a09fc 292 # Convert short names to long
99f21fb9 293 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
f79a09fc
KW
294
295 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
296 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version'
297 if @expected_enums < @enums;
298
299 # Remove the enums found in the input from the ones we expect
300 for (my $i = @expected_enums - 1; $i >= 0; $i--) {
301 splice(@expected_enums, $i, 1)
302 if grep { $expected_enums[$i] eq $_ } @enums;
303 }
304
305 # The ones remaining must be because we're using an older
306 # Unicode version. Add them to the list.
307 push @enums, @expected_enums;
308
309 # Add in the extra values coded into this program, and sort.
99f21fb9
KW
310 @enums = sort @enums;
311
6dc80864
KW
312 # The internal enums comes last.
313 push @enums, split /,/, $extra_enums if $extra_enums ne "";
314
99f21fb9
KW
315 # Assign a value to each element of the enum. The default
316 # value always gets 0; the others are arbitrarily assigned.
317 my $enum_val = 0;
02f811dd
KW
318 my $canonical_default = prop_value_aliases($prop_name, $default);
319 $default = $canonical_default if defined $canonical_default;
99f21fb9
KW
320 $enums{$default} = $enum_val++;
321 for my $enum (@enums) {
322 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
323 }
6b659339
KW
324
325 # Calculate the enum values for property _Perl_LB because we
326 # output a special table for that
327 if ($name eq '_Perl_LB' && ! %lb_enums) {
328 while (my ($enum, $value) = each %enums) {
329 my ($short) = prop_value_aliases('LB', $enum);
330 $short = substr(lc $enum, 0, 2) unless defined $short;
331 $lb_enums{$short} = $value;
332 @lb_short_enums[$value] = $short;
333 }
334 }
99f21fb9
KW
335 }
336
bffc0129
KW
337 # Inversion map stuff is currently used only by regexec
338 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
99f21fb9
KW
339 {
340
99f21fb9
KW
341 # The short names tend to be two lower case letters, but it looks
342 # better for those if they are upper. XXX
343 $short_name = uc($short_name) if length($short_name) < 3
344 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
85e5f08b 345 $name_prefix = "${short_name}_";
99f21fb9
KW
346 my $enum_count = keys %enums;
347 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
348
349 print $out_fh "\ntypedef enum {\n";
6dc80864
KW
350 my @enum_list;
351 foreach my $enum (keys %enums) {
352 $enum_list[$enums{$enum}] = $enum;
353 }
354 foreach my $i (0 .. @enum_list - 1) {
355 my $name = $enum_list[$i];
356 print $out_fh "\t${name_prefix}$name = $i";
357 print $out_fh "," if $i < $enum_count - 1;
358 print $out_fh "\n";
99f21fb9
KW
359 }
360 $declaration_type = "${name_prefix}enum";
361 print $out_fh "} $declaration_type;\n";
362
363 $output_format = "${name_prefix}%s";
364 }
365 }
366 else {
367 die "'$input_format' invmap() format for '$prop_name' unimplemented";
368 }
369
370 die "No inversion map for $prop_name" unless defined $invmap
371 && ref $invmap eq 'ARRAY'
372 && $count;
373
374 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
375 print $out_fh " /* for $charset */" if $charset;
376 print $out_fh "\n";
377
378 # The main body are the scalars passed in to this routine.
379 for my $i (0 .. $count - 1) {
380 my $element = $invmap->[$i];
02f811dd
KW
381 my $full_element_name = prop_value_aliases($prop_name, $element);
382 $element = $full_element_name if defined $full_element_name;
383 $element = $name_prefix . $element;
99f21fb9
KW
384 print $out_fh "\t$element";
385 print $out_fh "," if $i < $count - 1;
386 print $out_fh "\n";
387 }
388 print $out_fh "};\n";
99f21fb9
KW
389}
390
5a7e5385 391sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
392
393 # Returns an inversion list constructed from the sorted input array of
394 # code points
395
396 my $list_ref = shift;
397
99f21fb9
KW
398 return unless @$list_ref;
399
a02047bf
KW
400 # Initialize to just the first element
401 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
402
403 # For each succeeding element, if it extends the previous range, adjust
404 # up, otherwise add it.
405 for my $i (1 .. @$list_ref - 1) {
406 if ($invlist[-1] == $list_ref->[$i]) {
407 $invlist[-1]++;
408 }
409 else {
410 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
411 }
412 }
413 return @invlist;
414}
415
416# Read in the Case Folding rules, and construct arrays of code points for the
417# properties we need.
418my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
419die "Could not find inversion map for Case_Folding" unless defined $format;
420die "Incorrect format '$format' for Case_Folding inversion map"
347b9066
KW
421 unless $format eq 'al'
422 || $format eq 'a';
a02047bf
KW
423my @has_multi_char_fold;
424my @is_non_final_fold;
425
426for my $i (0 .. @$folds_ref - 1) {
427 next unless ref $folds_ref->[$i]; # Skip single-char folds
428 push @has_multi_char_fold, $cp_ref->[$i];
429
b6a6e956 430 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
431 # position
432 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
433 push @is_non_final_fold, $folds_ref->[$i][$j]
434 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
435 }
436}
437
a02047bf
KW
438sub _Perl_Non_Final_Folds {
439 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 440 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
441}
442
99f21fb9
KW
443sub prop_name_for_cmp ($) { # Sort helper
444 my $name = shift;
445
446 # Returns the input lowercased, with non-alphas removed, as well as
447 # everything starting with a comma
448
449 $name =~ s/,.*//;
450 $name =~ s/[[:^alpha:]]//g;
451 return lc $name;
452}
453
892d8259 454sub UpperLatin1 {
5a7e5385 455 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
456}
457
6b659339
KW
458sub output_LB_table() {
459
460 # Create and output the enums, #defines, and pair table for use in
461 # determining Line Breaks. This uses the default line break algorithm,
462 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
463 # in that page, as the Unicode-furnished tests assume that tailoring.
464
465 switch_pound_if('LB_table', 'PERL_IN_REGEXEC_C');
466
467 # The result is really just true or false. But we follow along with tr14,
468 # creating a rule which is false for something like X SP* X. That gets
469 # encoding 2. The rest of the actions are synthetic ones that indicate
470 # some context handling is required. These each are added to the
471 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
472 # value can be retrieved. Actually only rules from 7 through 18 (which
473 # are the ones where space matter) are possible to have 2 added to them.
474 # The others below add just 0 or 1. It might be possible for one
475 # synthetic rule to be added to another, yielding a larger value. This
476 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
477 # names of the middle grouping below, it is impossible for that to occur
478 # for them because they all start with mutually exclusive classes. That
479 # the final rule can't be added to any of the others isn't obvious from
480 # its name, so it is assigned a power of 2 higher than the others can get
481 # to so any addition would preserve all data. (And the code will reach an
482 # assert(0) on debugging builds should this happen.)
483 my %lb_actions = (
484 LB_NOBREAK => 0,
485 LB_BREAKABLE => 1,
486 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
487
488 LB_CM_foo => 3, # Rule 9
489 LB_SP_foo => 6, # Rule 18
490 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
491 LB_SY_or_IS_then_various => 11, # Rule 25
492 LB_HY_or_BA_then_foo => 13, # Rule 21
493
494 LB_various_then_PO_or_PR => (1<<4), # Rule 25
495 );
496
497 # Output the #define list, sorted by numeric value
498 my @defines;
499 while (my ($enum, $value) = each %lb_actions) {
500 $defines[$value] = $enum;
501 }
502
503 print $out_fh "\n";
504
505 foreach my $i (0 .. @defines - 1) {
506 next unless defined $defines[$i];
507 print $out_fh "#define $defines[$i]\t$i\n";
508 }
509
510 # Construct the LB pair table. This is based on the rules in
511 # http://www.unicode.org/reports/tr14/, but modified as those rules are
512 # designed for someone taking a string of text and sequentially going
513 # through it to find the break opportunities, whereas, Perl requires
514 # determining if a given random spot is a break opportunity, without
515 # knowing all the entire string before it.
516 #
517 # The table is constructed in reverse order of the rules, to make the
518 # lower-numbered, higher priority ones override the later ones, as the
519 # algorithm stops at the earliest matching rule
520
521 my @lb_table;
522 my $table_size = @lb_short_enums;
523
524 # LB31. Break everywhere else
525 for my $i (0 .. $table_size - 1) {
526 for my $j (0 .. $table_size - 1) {
527 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
528 }
529 }
530
531 # LB30a. Don't break between Regional Indicators
532 $lb_table[$lb_enums{'RI'}][$lb_enums{'RI'}] = $lb_actions{'LB_NOBREAK'};
533
534 # LB30 Do not break between letters, numbers, or ordinary symbols and
535 # opening or closing parentheses.
536 # (AL | HL | NU) × OP
537 $lb_table[$lb_enums{'AL'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'};
538 $lb_table[$lb_enums{'HL'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'};
539 $lb_table[$lb_enums{'NU'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'};
540
541 # CP × (AL | HL | NU)
542 $lb_table[$lb_enums{'CP'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
543 $lb_table[$lb_enums{'CP'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
544 $lb_table[$lb_enums{'CP'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
545
546 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
547 # IS × (AL | HL)
548 $lb_table[$lb_enums{'IS'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
549 $lb_table[$lb_enums{'IS'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
550
551 # LB28 Do not break between alphabetics (“at”).
552 # (AL | HL) × (AL | HL)
553 $lb_table[$lb_enums{'AL'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
554 $lb_table[$lb_enums{'HL'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
555 $lb_table[$lb_enums{'AL'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
556 $lb_table[$lb_enums{'HL'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
557
558 # LB27 Treat a Korean Syllable Block the same as ID.
559 # (JL | JV | JT | H2 | H3) × IN
560 $lb_table[$lb_enums{'JL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
561 $lb_table[$lb_enums{'JV'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
562 $lb_table[$lb_enums{'JT'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
563 $lb_table[$lb_enums{'H2'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
564 $lb_table[$lb_enums{'H3'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
565
566 # (JL | JV | JT | H2 | H3) × PO
567 $lb_table[$lb_enums{'JL'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
568 $lb_table[$lb_enums{'JV'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
569 $lb_table[$lb_enums{'JT'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
570 $lb_table[$lb_enums{'H2'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
571 $lb_table[$lb_enums{'H3'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
572
573 # PR × (JL | JV | JT | H2 | H3)
574 $lb_table[$lb_enums{'PR'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
575 $lb_table[$lb_enums{'PR'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
576 $lb_table[$lb_enums{'PR'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
577 $lb_table[$lb_enums{'PR'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
578 $lb_table[$lb_enums{'PR'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
579
580 # LB26 Do not break a Korean syllable.
581 # JL × (JL | JV | H2 | H3)
582 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
583 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
584 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
585 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
586
587 # (JV | H2) × (JV | JT)
588 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
589 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
590 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
591 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
592
593 # (JT | H3) × JT
594 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
595 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
596
597 # LB25 Do not break between the following pairs of classes relevant to
598 # numbers, as tailored by example 7 in
599 # http://www.unicode.org/reports/tr14/#Examples
600 # We follow that tailoring because Unicode's test cases expect it
601 # (PR | PO) × ( OP | HY )? NU
602 $lb_table[$lb_enums{'PR'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
603 $lb_table[$lb_enums{'PO'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
604
605 # Given that (OP | HY )? is optional, we have to test for it in code.
606 # We add in the action (instead of overriding) for this, so that in
607 # the code we can recover the underlying break value.
608 $lb_table[$lb_enums{'PR'}][$lb_enums{'OP'}]
609 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
610 $lb_table[$lb_enums{'PO'}][$lb_enums{'OP'}]
611 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
612 $lb_table[$lb_enums{'PR'}][$lb_enums{'HY'}]
613 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
614 $lb_table[$lb_enums{'PO'}][$lb_enums{'HY'}]
615 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
616
617 # ( OP | HY ) × NU
618 $lb_table[$lb_enums{'OP'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
619 $lb_table[$lb_enums{'HY'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
620
621 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
622 # which can be rewritten as:
623 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
624 $lb_table[$lb_enums{'NU'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
625 $lb_table[$lb_enums{'NU'}][$lb_enums{'SY'}] = $lb_actions{'LB_NOBREAK'};
626 $lb_table[$lb_enums{'NU'}][$lb_enums{'IS'}] = $lb_actions{'LB_NOBREAK'};
627 $lb_table[$lb_enums{'NU'}][$lb_enums{'CL'}] = $lb_actions{'LB_NOBREAK'};
628 $lb_table[$lb_enums{'NU'}][$lb_enums{'CP'}] = $lb_actions{'LB_NOBREAK'};
629
630 # Like earlier where we have to test in code, we add in the action so
631 # that we can recover the underlying values. This is done in rules
632 # below, as well. The code assumes that we haven't added 2 actions.
633 # Shoul a later Unicode release break that assumption, then tests
634 # should start failing.
635 $lb_table[$lb_enums{'SY'}][$lb_enums{'NU'}]
636 += $lb_actions{'LB_SY_or_IS_then_various'};
637 $lb_table[$lb_enums{'SY'}][$lb_enums{'SY'}]
638 += $lb_actions{'LB_SY_or_IS_then_various'};
639 $lb_table[$lb_enums{'SY'}][$lb_enums{'IS'}]
640 += $lb_actions{'LB_SY_or_IS_then_various'};
641 $lb_table[$lb_enums{'SY'}][$lb_enums{'CL'}]
642 += $lb_actions{'LB_SY_or_IS_then_various'};
643 $lb_table[$lb_enums{'SY'}][$lb_enums{'CP'}]
644 += $lb_actions{'LB_SY_or_IS_then_various'};
645 $lb_table[$lb_enums{'IS'}][$lb_enums{'NU'}]
646 += $lb_actions{'LB_SY_or_IS_then_various'};
647 $lb_table[$lb_enums{'IS'}][$lb_enums{'SY'}]
648 += $lb_actions{'LB_SY_or_IS_then_various'};
649 $lb_table[$lb_enums{'IS'}][$lb_enums{'IS'}]
650 += $lb_actions{'LB_SY_or_IS_then_various'};
651 $lb_table[$lb_enums{'IS'}][$lb_enums{'CL'}]
652 += $lb_actions{'LB_SY_or_IS_then_various'};
653 $lb_table[$lb_enums{'IS'}][$lb_enums{'CP'}]
654 += $lb_actions{'LB_SY_or_IS_then_various'};
655
656 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
657 # which can be rewritten as:
658 # NU (SY | IS)* (CL | CP)? × (PO | PR)
659 $lb_table[$lb_enums{'NU'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
660 $lb_table[$lb_enums{'NU'}][$lb_enums{'PR'}] = $lb_actions{'LB_NOBREAK'};
661
662 $lb_table[$lb_enums{'CP'}][$lb_enums{'PO'}]
663 += $lb_actions{'LB_various_then_PO_or_PR'};
664 $lb_table[$lb_enums{'CL'}][$lb_enums{'PO'}]
665 += $lb_actions{'LB_various_then_PO_or_PR'};
666 $lb_table[$lb_enums{'IS'}][$lb_enums{'PO'}]
667 += $lb_actions{'LB_various_then_PO_or_PR'};
668 $lb_table[$lb_enums{'SY'}][$lb_enums{'PO'}]
669 += $lb_actions{'LB_various_then_PO_or_PR'};
670
671 $lb_table[$lb_enums{'CP'}][$lb_enums{'PR'}]
672 += $lb_actions{'LB_various_then_PO_or_PR'};
673 $lb_table[$lb_enums{'CL'}][$lb_enums{'PR'}]
674 += $lb_actions{'LB_various_then_PO_or_PR'};
675 $lb_table[$lb_enums{'IS'}][$lb_enums{'PR'}]
676 += $lb_actions{'LB_various_then_PO_or_PR'};
677 $lb_table[$lb_enums{'SY'}][$lb_enums{'PR'}]
678 += $lb_actions{'LB_various_then_PO_or_PR'};
679
680 # LB24 Do not break between prefix and letters or ideographs.
681 # PR × ID
682 $lb_table[$lb_enums{'PR'}][$lb_enums{'ID'}] = $lb_actions{'LB_NOBREAK'};
683
684 # PR × (AL | HL)
685 $lb_table[$lb_enums{'PR'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
686 $lb_table[$lb_enums{'PR'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
687
688 # PO × (AL | HL)
689 $lb_table[$lb_enums{'PO'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
690 $lb_table[$lb_enums{'PO'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
691
692 # LB23 Do not break within ‘a9’, ‘3a’, or ‘H%’.
693 # ID × PO
694 $lb_table[$lb_enums{'ID'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'};
695
696 # (AL | HL) × NU
697 $lb_table[$lb_enums{'AL'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
698 $lb_table[$lb_enums{'HL'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'};
699
700 # NU × (AL | HL)
701 $lb_table[$lb_enums{'NU'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'};
702 $lb_table[$lb_enums{'NU'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
703
704 # LB22 Do not break between two ellipses, or between letters, numbers or
705 # exclamations and ellipsis.
706 # (AL | HL) × IN
707 $lb_table[$lb_enums{'AL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
708 $lb_table[$lb_enums{'HL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
709
710 # EX × IN
711 $lb_table[$lb_enums{'EX'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
712
713 # ID × IN
714 $lb_table[$lb_enums{'ID'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
715
716 # IN × IN
717 $lb_table[$lb_enums{'IN'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
718
719 # NU × IN
720 $lb_table[$lb_enums{'NU'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'};
721
722 # LB21b Don’t break between Solidus and Hebrew letters.
723 # SY × HL
724 $lb_table[$lb_enums{'SY'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'};
725
726 # LB21a Don't break after Hebrew + Hyphen.
727 # HL (HY | BA) ×
728 for my $i (0 .. @lb_table - 1) {
729 $lb_table[$lb_enums{'HY'}][$i] += $lb_actions{'LB_HY_or_BA_then_foo'};
730 $lb_table[$lb_enums{'BA'}][$i] += $lb_actions{'LB_HY_or_BA_then_foo'};
731 }
732
733 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
734 # spaces, small kana, and other non-starters, or after acute accents.
735 # × BA
736 # × HY
737 # × NS
738 # BB ×
739 for my $i (0 .. @lb_table - 1) {
740 $lb_table[$i][$lb_enums{'BA'}] = $lb_actions{'LB_NOBREAK'};
741 $lb_table[$i][$lb_enums{'HY'}] = $lb_actions{'LB_NOBREAK'};
742 $lb_table[$i][$lb_enums{'NS'}] = $lb_actions{'LB_NOBREAK'};
743 $lb_table[$lb_enums{'BB'}][$i] = $lb_actions{'LB_NOBREAK'};
744 }
745
746 # LB20 Break before and after unresolved CB.
747 # ÷ CB
748 # CB ÷
749 # Conditional breaks should be resolved external to the line breaking
750 # rules. However, the default action is to treat unresolved CB as breaking
751 # before and after.
752 for my $i (0 .. @lb_table - 1) {
753 $lb_table[$i][$lb_enums{'CB'}] = $lb_actions{'LB_BREAKABLE'};
754 $lb_table[$lb_enums{'CB'}][$i] = $lb_actions{'LB_BREAKABLE'};
755 }
756
757 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
758 # × QU
759 # QU ×
760 for my $i (0 .. @lb_table - 1) {
761 $lb_table[$i][$lb_enums{'QU'}] = $lb_actions{'LB_NOBREAK'};
762 $lb_table[$lb_enums{'QU'}][$i] = $lb_actions{'LB_NOBREAK'};
763 }
764
765 # LB18 Break after spaces
766 # SP ÷
767 for my $i (0 .. @lb_table - 1) {
768 $lb_table[$lb_enums{'SP'}][$i] = $lb_actions{'LB_BREAKABLE'};
769 }
770
771 # LB17 Do not break within ‘——’, even with intervening spaces.
772 # B2 SP* × B2
773 $lb_table[$lb_enums{'B2'}][$lb_enums{'B2'}]
774 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
775
776 # LB16 Do not break between closing punctuation and a nonstarter even with
777 # intervening spaces.
778 # (CL | CP) SP* × NS
779 $lb_table[$lb_enums{'CL'}][$lb_enums{'NS'}]
780 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
781 $lb_table[$lb_enums{'CP'}][$lb_enums{'NS'}]
782 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
783
784
785 # LB15 Do not break within ‘”[’, even with intervening spaces.
786 # QU SP* × OP
787 $lb_table[$lb_enums{'QU'}][$lb_enums{'OP'}]
788 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
789
790 # LB14 Do not break after ‘[’, even after spaces.
791 # OP SP* ×
792 for my $i (0 .. @lb_table - 1) {
793 $lb_table[$lb_enums{'OP'}][$i]
794 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
795 }
796
797 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
798 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
799 # [^NU] × CL
800 # [^NU] × CP
801 # × EX
802 # [^NU] × IS
803 # [^NU] × SY
804 for my $i (0 .. @lb_table - 1) {
805 $lb_table[$i][$lb_enums{'EX'}]
806 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
807
808 next if $i == $lb_enums{'NU'};
809
810 $lb_table[$i][$lb_enums{'CL'}]
811 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
812 $lb_table[$i][$lb_enums{'CP'}]
813 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
814 $lb_table[$i][$lb_enums{'IS'}]
815 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
816 $lb_table[$i][$lb_enums{'SY'}]
817 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
818 }
819
820 # LB12a Do not break before NBSP and related characters, except after
821 # spaces and hyphens.
822 # [^SP BA HY] × GL
823 for my $i (0 .. @lb_table - 1) {
824 next if $i == $lb_enums{'SP'}
825 || $i == $lb_enums{'BA'}
826 || $i == $lb_enums{'HY'};
827
828 # We don't break, but if a property above has said don't break even
829 # with space between, don't override that (also in the next few rules)
830 next if $lb_table[$i][$lb_enums{'GL'}]
831 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
832 $lb_table[$i][$lb_enums{'GL'}] = $lb_actions{'LB_NOBREAK'};
833 }
834
835 # LB12 Do not break after NBSP and related characters.
836 # GL ×
837 for my $i (0 .. @lb_table - 1) {
838 next if $lb_table[$lb_enums{'GL'}][$i]
839 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
840 $lb_table[$lb_enums{'GL'}][$i] = $lb_actions{'LB_NOBREAK'};
841 }
842
843 # LB11 Do not break before or after Word joiner and related characters.
844 # × WJ
845 # WJ ×
846 for my $i (0 .. @lb_table - 1) {
847 if ($lb_table[$i][$lb_enums{'WJ'}]
848 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
849 {
850 $lb_table[$i][$lb_enums{'WJ'}] = $lb_actions{'LB_NOBREAK'};
851 }
852 if ($lb_table[$lb_enums{'WJ'}][$i]
853 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
854 {
855 $lb_table[$lb_enums{'WJ'}][$i] = $lb_actions{'LB_NOBREAK'};
856 }
857 }
858
859 # Special case this here to avoid having to do a special case in the code,
860 # by making this the same as other things with a SP in front of them that
861 # don't break, we avoid an extra test
862 $lb_table[$lb_enums{'SP'}][$lb_enums{'WJ'}]
863 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
864
865 # LB9 and LB10 are done in the same loop
866 #
867 # LB9 Do not break a combining character sequence; treat it as if it has
868 # the line breaking class of the base character in all of the
869 # higher-numbered rules.
870 # Treat X CM* as if it were X.
871 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
872
873 # LB10 Treat any remaining combining mark as AL. This catches the case
874 # where a CM is the first character on the line or follows SP, BK, CR, LF,
875 # NL, or ZW.
876 for my $i (0 .. @lb_table - 1) {
877
878 # When the CM is the first in the pair, we don't know without looking
879 # behind whether the CM is going to inherit from an earlier character,
880 # or not. So have to figure this out in the code
881 $lb_table[$lb_enums{'CM'}][$i] = $lb_actions{'LB_CM_foo'};
882
883 if ( $i == $lb_enums{'BK'}
884 || $i == $lb_enums{'ed'}
885 || $i == $lb_enums{'CR'}
886 || $i == $lb_enums{'LF'}
887 || $i == $lb_enums{'NL'}
888 || $i == $lb_enums{'SP'}
889 || $i == $lb_enums{'ZW'})
890 {
891 # For these classes, a following CM doesn't combine, and should do
892 # whatever 'AL' would do.
893 $lb_table[$i][$lb_enums{'CM'}] = $lb_table[$i][$lb_enums{'AL'}];
894 }
895 else {
896 # For these classes, the CM combines, so doesn't break, inheriting
897 # the type of nobreak from the master character.
898 if ($lb_table[$i][$lb_enums{'CM'}]
899 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
900 {
901 $lb_table[$i][$lb_enums{'CM'}] = $lb_actions{'LB_NOBREAK'};
902 }
903 }
904 }
905
906 # LB8 Break before any character following a zero-width space, even if one
907 # or more spaces intervene.
908 # ZW SP* ÷
909 for my $i (0 .. @lb_table - 1) {
910 $lb_table[$lb_enums{'ZW'}][$i] = $lb_actions{'LB_BREAKABLE'};
911 }
912
913 # Because of LB8-10, we need to look at context for "SP x", and this must
914 # be done in the code. So override the existing rules for that, by adding
915 # a constant to get new rules that tell the code it needs to look at
916 # context. By adding this action instead of replacing the existing one,
917 # we can get back to the original rule if necessary.
918 for my $i (0 .. @lb_table - 1) {
919 $lb_table[$lb_enums{'SP'}][$i] += $lb_actions{'LB_SP_foo'};
920 }
921
922 # LB7 Do not break before spaces or zero width space.
923 # × SP
924 # × ZW
925 for my $i (0 .. @lb_table - 1) {
926 $lb_table[$i][$lb_enums{'SP'}] = $lb_actions{'LB_NOBREAK'};
927 $lb_table[$i][$lb_enums{'ZW'}] = $lb_actions{'LB_NOBREAK'};
928 }
929
930 # LB6 Do not break before hard line breaks.
931 # × ( BK | CR | LF | NL )
932 for my $i (0 .. @lb_table - 1) {
933 $lb_table[$i][$lb_enums{'BK'}] = $lb_actions{'LB_NOBREAK'};
934 $lb_table[$i][$lb_enums{'CR'}] = $lb_actions{'LB_NOBREAK'};
935 $lb_table[$i][$lb_enums{'LF'}] = $lb_actions{'LB_NOBREAK'};
936 $lb_table[$i][$lb_enums{'NL'}] = $lb_actions{'LB_NOBREAK'};
937 }
938
939 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
940 # CR × LF
941 # CR !
942 # LF !
943 # NL !
944 for my $i (0 .. @lb_table - 1) {
945 $lb_table[$lb_enums{'CR'}][$i] = $lb_actions{'LB_BREAKABLE'};
946 $lb_table[$lb_enums{'LF'}][$i] = $lb_actions{'LB_BREAKABLE'};
947 $lb_table[$lb_enums{'NL'}][$i] = $lb_actions{'LB_BREAKABLE'};
948 }
949 $lb_table[$lb_enums{'CR'}][$lb_enums{'LF'}] = $lb_actions{'LB_NOBREAK'};
950
951 # LB4 Always break after hard line breaks.
952 # BK !
953 for my $i (0 .. @lb_table - 1) {
954 $lb_table[$lb_enums{'BK'}][$i] = $lb_actions{'LB_BREAKABLE'};
955 }
956
957 # LB2 Never break at the start of text.
958 # sot ×
959 # LB3 Always break at the end of text.
960 # ! eot
961 # but these are reversed in the loop below, so that won't break if there
962 # is no text
963 for my $i (0 .. @lb_table - 1) {
964 $lb_table[$i][$lb_enums{'ed'}] = $lb_actions{'LB_BREAKABLE'};
965 $lb_table[$lb_enums{'ed'}][$i] = $lb_actions{'LB_NOBREAK'};
966 }
967
968 # LB1 Assign a line breaking class to each code point of the input.
969 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
970 # depending on criteria outside the scope of this algorithm.
971 #
972 # In the absence of such criteria all characters with a specific
973 # combination of original class and General_Category property value are
974 # resolved as follows:
975 # Original Resolved General_Category
976 # AI, SG, XX AL Any
977 # SA CM Only Mn or Mc
978 # SA AL Any except Mn and Mc
979 # CJ NS Any
980 #
981 # This is done in mktables, so we never see any of the remapped-from
982 # classes.
983
984 print $out_fh "\nstatic const U8 LB_table[$table_size][$table_size] = {\n";
985 print $out_fh "\n/* 'ed' stands for 'edge' */\n";
986 print $out_fh "/* ";
987 for my $i (0 .. @lb_table - 1) {
988 print $out_fh " $lb_short_enums[$i]";
989 }
990 print $out_fh " */\n";
991
992 for my $i (0 .. @lb_table - 1) {
993 print $out_fh "/* $lb_short_enums[$i] */ ";
994 for my $j (0 .. @lb_table - 1) {
995 printf $out_fh "%2d", $lb_table[$i][$j];
996 print $out_fh "," if $i < @lb_table - 1 || $j < @lb_table - 1;
997 print $out_fh " " if $j < @lb_table - 1;
998 }
999 print $out_fh "\n";
1000 }
1001
1002 print $out_fh "};\n";
1003
1004 end_file_pound_if;
1005}
1006
9d9177be
KW
1007output_invlist("Latin1", [ 0, 256 ]);
1008output_invlist("AboveLatin1", [ 256 ]);
1009
bffc0129 1010end_file_pound_if;
43b443dd 1011
3f427fd9
KW
1012# We construct lists for all the POSIX and backslash sequence character
1013# classes in two forms:
1014# 1) ones which match only in the ASCII range
1015# 2) ones which match either in the Latin1 range, or the entire Unicode range
1016#
1017# These get compiled in, and hence affect the memory footprint of every Perl
1018# program, even those not using Unicode. To minimize the size, currently
1019# the Latin1 version is generated for the beyond ASCII range except for those
1020# lists that are quite small for the entire range, such as for \s, which is 22
1021# UVs long plus 4 UVs (currently) for the header.
1022#
1023# To save even more memory, the ASCII versions could be derived from the
1024# larger ones at runtime, saving some memory (minus the expense of the machine
1025# instructions to do so), but these are all small anyway, so their total is
1026# about 100 UVs.
1027#
1028# In the list of properties below that get generated, the L1 prefix is a fake
1029# property that means just the Latin1 range of the full property (whose name
1030# has an X prefix instead of L1).
a02047bf
KW
1031#
1032# An initial & means to use the subroutine from this file instead of an
1033# official inversion list.
3f427fd9 1034
0c4ecf42
KW
1035for my $charset (get_supported_code_pages()) {
1036 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1037
99f21fb9
KW
1038 @a2n = @{get_a2n($charset)};
1039 no warnings 'qw';
1040 # Ignore non-alpha in sort
1041 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1c8c3428
KW
1042 ASCII
1043 Cased
1044 VertSpace
1045 XPerlSpace
1046 XPosixAlnum
1047 XPosixAlpha
1048 XPosixBlank
1049 XPosixCntrl
1050 XPosixDigit
1051 XPosixGraph
1052 XPosixLower
1053 XPosixPrint
1054 XPosixPunct
1055 XPosixSpace
1056 XPosixUpper
1057 XPosixWord
1058 XPosixXDigit
1059 _Perl_Any_Folds
1060 &NonL1_Perl_Non_Final_Folds
1061 _Perl_Folds_To_Multi_Char
1062 &UpperLatin1
1063 _Perl_IDStart
1064 _Perl_IDCont
02f811dd 1065 _Perl_GCB,EDGE
ca8226cf 1066 _Perl_LB,EDGE
bf4268fa 1067 _Perl_SB,EDGE
190d69bb 1068 _Perl_WB,EDGE,UNKNOWN
1c8c3428 1069 )
0f5e3c71
KW
1070 ) {
1071
1072 # For the Latin1 properties, we change to use the eXtended version of the
1073 # base property, then go through the result and get rid of everything not
1074 # in Latin1 (above 255). Actually, we retain the element for the range
1075 # that crosses the 255/256 boundary if it is one that matches the
1076 # property. For example, in the Word property, there is a range of code
1077 # points that start at U+00F8 and goes through U+02C1. Instead of
1078 # artificially cutting that off at 256 because 256 is the first code point
1079 # above Latin1, we let the range go to its natural ending. That gives us
1080 # extra information with no added space taken. But if the range that
1081 # crosses the boundary is one that doesn't match the property, we don't
1082 # start a new range above 255, as that could be construed as going to
1083 # infinity. For example, the Upper property doesn't include the character
1084 # at 255, but does include the one at 256. We don't include the 256 one.
1085 my $prop_name = $prop;
1086 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
1087 my $extra_enums = "";
1088 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
1089 my $lookup_prop = $prop_name;
1090 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1091 or $lookup_prop =~ s/^L1//);
1092 my $nonl1_only = 0;
1093 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 1094 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
1095
1096 my @invlist;
99f21fb9
KW
1097 my @invmap;
1098 my $map_format;
1099 my $map_default;
1100 my $maps_to_code_point;
1101 my $to_adjust;
0f5e3c71
KW
1102 if ($is_local_sub) {
1103 @invlist = eval $lookup_prop;
1104 }
1105 else {
1106 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 1107 if (! @invlist) {
99f21fb9 1108
ad85f59a
KW
1109 # If couldn't find a non-empty inversion list, see if it is
1110 # instead an inversion map
1111 my ($list_ref, $map_ref, $format, $default)
99f21fb9 1112 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
1113 if (! $list_ref) {
1114 # An empty return here could mean an unknown property, or
1115 # merely that the original inversion list is empty. Call
1116 # in scalar context to differentiate
1117 my $count = prop_invlist($lookup_prop,
1118 '_perl_core_internal_ok');
1119 die "Could not find inversion list for '$lookup_prop'"
1120 unless defined $count;
1121 }
1122 else {
18b852b3
KW
1123 @invlist = @$list_ref;
1124 @invmap = @$map_ref;
1125 $map_format = $format;
1126 $map_default = $default;
1127 $maps_to_code_point = $map_format =~ /x/;
1128 $to_adjust = $map_format =~ /a/;
ad85f59a 1129 }
99f21fb9 1130 }
0f5e3c71 1131 }
ad85f59a
KW
1132
1133
1134 # Short-circuit an empty inversion list.
1135 if (! @invlist) {
1136 output_invlist($prop_name, \@invlist, $charset);
1137 next;
1138 }
ceb1de32 1139
99f21fb9
KW
1140 # Re-order the Unicode code points to native ones for this platform.
1141 # This is only needed for code points below 256, because native code
1142 # points are only in that range. For inversion maps of properties
1143 # where the mappings are adjusted (format =~ /a/), this reordering
1144 # could mess up the adjustment pattern that was in the input, so that
1145 # has to be dealt with.
1146 #
1147 # And inversion maps that map to code points need to eventually have
1148 # all those code points remapped to native, and it's better to do that
1149 # here, going through the whole list not just those below 256. This
1150 # is because some inversion maps have adjustments (format =~ /a/)
1151 # which may be affected by the reordering. This code needs to be done
1152 # both for when we are translating the inversion lists for < 256, and
1153 # for the inversion maps for everything. By doing both in this loop,
1154 # we can share that code.
1155 #
1156 # So, we go through everything for an inversion map to code points;
1157 # otherwise, we can skip any remapping at all if we are going to
1158 # output only the above-Latin1 values, or if the range spans the whole
1159 # of 0..256, as the remap will also include all of 0..256 (256 not
1160 # 255 because a re-ordering could cause 256 to need to be in the same
1161 # range as 255.)
1162 if ((@invmap && $maps_to_code_point)
1163 || (! $nonl1_only || ($invlist[0] < 256
1164 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 1165 {
fb4554ea 1166
99f21fb9 1167 if (! @invmap) { # Straight inversion list
fb4554ea
KW
1168 # Look at all the ranges that start before 257.
1169 my @latin1_list;
1170 while (@invlist) {
1171 last if $invlist[0] > 256;
1172 my $upper = @invlist > 1
1173 ? $invlist[1] - 1 # In range
8a6c81cf
KW
1174
1175 # To infinity. You may want to stop much much
1176 # earlier; going this high may expose perl
1177 # deficiencies with very large numbers.
1178 : $Unicode::UCD::MAX_CP;
fb4554ea 1179 for my $j ($invlist[0] .. $upper) {
99f21fb9 1180 push @latin1_list, a2n($j);
0f5e3c71 1181 }
fb4554ea
KW
1182
1183 shift @invlist; # Shift off the range that's in the list
1184 shift @invlist; # Shift off the range not in the list
0c4ecf42 1185 }
fb4554ea
KW
1186
1187 # Here @invlist contains all the ranges in the original that start
1188 # at code points above 256, and @latin1_list contains all the
1189 # native code points for ranges that start with a Unicode code
1190 # point below 257. We sort the latter and convert it to inversion
1191 # list format. Then simply prepend it to the list of the higher
1192 # code points.
1193 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 1194 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 1195 unshift @invlist, @latin1_list;
99f21fb9
KW
1196 }
1197 else { # Is an inversion map
1198
1199 # This is a similar procedure as plain inversion list, but has
1200 # multiple buckets. A plain inversion list just has two
1201 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1202 # pretty much can ignore the 2nd bucket, as it is completely
1203 # defined by the 1st. But here, what we do is create buckets
1204 # which contain the code points that map to each, translated
1205 # to native and turned into an inversion list. Thus each
1206 # bucket is an inversion list of native code points that map
1207 # to it or don't map to it. We use these to create an
1208 # inversion map for the whole property.
1209
1210 # As mentioned earlier, we use this procedure to not just
1211 # remap the inversion list to native values, but also the maps
1212 # of code points to native ones. In the latter case we have
1213 # to look at the whole of the inversion map (or at least to
1214 # above Unicode; as the maps of code points above that should
1215 # all be to the default).
1216 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1217
1218 my %mapped_lists; # A hash whose keys are the buckets.
1219 while (@invlist) {
1220 last if $invlist[0] > $upper_limit;
1221
1222 # This shouldn't actually happen, as prop_invmap() returns
1223 # an extra element at the end that is beyond $upper_limit
1224 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1225
1226 my $bucket;
1227
1228 # A hash key can't be a ref (we are only expecting arrays
1229 # of scalars here), so convert any such to a string that
1230 # will be converted back later (using a vertical tab as
1231 # the separator). Even if the mapping is to code points,
1232 # we don't translate to native here because the code
1233 # output_map() calls to output these arrays assumes the
1234 # input is Unicode, not native.
1235 if (ref $invmap[0]) {
1236 $bucket = join "\cK", @{$invmap[0]};
1237 }
1238 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1239
1240 # Do convert to native for maps to single code points.
1241 # There are some properties that have a few outlier
1242 # maps that aren't code points, so the above test
1243 # skips those.
1244 $bucket = a2n($invmap[0]);
1245 } else {
1246 $bucket = $invmap[0];
1247 }
1248
1249 # We now have the bucket that all code points in the range
1250 # map to, though possibly they need to be adjusted. Go
1251 # through the range and put each translated code point in
1252 # it into its bucket.
1253 my $base_map = $invmap[0];
1254 for my $j ($invlist[0] .. $invlist[1] - 1) {
1255 if ($to_adjust
1256 # The 1st code point doesn't need adjusting
1257 && $j > $invlist[0]
1258
1259 # Skip any non-numeric maps: these are outliers
1260 # that aren't code points.
1261 && $base_map =~ $numeric_re
1262
1263 # 'ne' because the default can be a string
1264 && $base_map ne $map_default)
1265 {
1266 # We adjust, by incrementing each the bucket and
1267 # the map. For code point maps, translate to
1268 # native
1269 $base_map++;
1270 $bucket = ($maps_to_code_point)
1271 ? a2n($base_map)
1272 : $base_map;
1273 }
1274
1275 # Add the native code point to the bucket for the
1276 # current map
1277 push @{$mapped_lists{$bucket}}, a2n($j);
1278 } # End of loop through all code points in the range
1279
1280 # Get ready for the next range
1281 shift @invlist;
1282 shift @invmap;
1283 } # End of loop through all ranges in the map.
1284
1285 # Here, @invlist and @invmap retain all the ranges from the
1286 # originals that start with code points above $upper_limit.
1287 # Each bucket in %mapped_lists contains all the code points
1288 # that map to that bucket. If the bucket is for a map to a
1289 # single code point is a single code point, the bucket has
1290 # been converted to native. If something else (including
1291 # multiple code points), no conversion is done.
1292 #
1293 # Now we recreate the inversion map into %xlated, but this
1294 # time for the native character set.
1295 my %xlated;
1296 foreach my $bucket (keys %mapped_lists) {
1297
1298 # Sort and convert this bucket to an inversion list. The
1299 # result will be that ranges that start with even-numbered
1300 # indexes will be for code points that map to this bucket;
1301 # odd ones map to some other bucket, and are discarded
1302 # below.
1303 @{$mapped_lists{$bucket}}
1304 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
1305 @{$mapped_lists{$bucket}}
1306 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
1307
1308 # Add each even-numbered range in the bucket to %xlated;
1309 # so that the keys of %xlated become the range start code
1310 # points, and the values are their corresponding maps.
1311 while (@{$mapped_lists{$bucket}}) {
1312 my $range_start = $mapped_lists{$bucket}->[0];
1313 if ($bucket =~ /\cK/) {
1314 @{$xlated{$range_start}} = split /\cK/, $bucket;
1315 }
1316 else {
1317 $xlated{$range_start} = $bucket;
1318 }
1319 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
1320 shift @{$mapped_lists{$bucket}}; # Get ready for next
1321 # iteration
1322 }
1323 } # End of loop through all the buckets.
1324
1325 # Here %xlated's keys are the range starts of all the code
1326 # points in the inversion map. Construct an inversion list
1327 # from them.
1328 my @new_invlist = sort { $a <=> $b } keys %xlated;
1329
1330 # If the list is adjusted, we want to munge this list so that
1331 # we only have one entry for where consecutive code points map
1332 # to consecutive values. We just skip the subsequent entries
1333 # where this is the case.
1334 if ($to_adjust) {
1335 my @temp;
1336 for my $i (0 .. @new_invlist - 1) {
1337 next if $i > 0
1338 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
1339 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
1340 && $xlated{$new_invlist[$i]} =~ $numeric_re
1341 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
1342 push @temp, $new_invlist[$i];
1343 }
1344 @new_invlist = @temp;
1345 }
1346
1347 # The inversion map comes from %xlated's values. We can
1348 # unshift each onto the front of the untouched portion, in
1349 # reverse order of the portion we did process.
1350 foreach my $start (reverse @new_invlist) {
1351 unshift @invmap, $xlated{$start};
1352 }
1353
1354 # Finally prepend the inversion list we have just constructed to the
1355 # one that contains anything we didn't process.
1356 unshift @invlist, @new_invlist;
1357 }
1358 }
1359
1360 # prop_invmap() returns an extra final entry, which we can now
1361 # discard.
1362 if (@invmap) {
1363 pop @invlist;
1364 pop @invmap;
ceb1de32 1365 }
0f5e3c71
KW
1366
1367 if ($l1_only) {
99f21fb9 1368 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
1369 for my $i (0 .. @invlist - 1 - 1) {
1370 if ($invlist[$i] > 255) {
1371
1372 # In an inversion list, even-numbered elements give the code
1373 # points that begin ranges that match the property;
1374 # odd-numbered give ones that begin ranges that don't match.
1375 # If $i is odd, we are at the first code point above 255 that
1376 # doesn't match, which means the range it is ending does
1377 # match, and crosses the 255/256 boundary. We want to include
1378 # this ending point, so increment $i, so the splice below
1379 # includes it. Conversely, if $i is even, it is the first
1380 # code point above 255 that matches, which means there was no
1381 # matching range that crossed the boundary, and we don't want
1382 # to include this code point, so splice before it.
1383 $i++ if $i % 2 != 0;
1384
1385 # Remove everything past this.
1386 splice @invlist, $i;
99f21fb9 1387 splice @invmap, $i if @invmap;
0f5e3c71
KW
1388 last;
1389 }
0c4ecf42
KW
1390 }
1391 }
0f5e3c71
KW
1392 elsif ($nonl1_only) {
1393 my $found_nonl1 = 0;
1394 for my $i (0 .. @invlist - 1 - 1) {
1395 next if $invlist[$i] < 256;
1396
1397 # Here, we have the first element in the array that indicates an
1398 # element above Latin1. Get rid of all previous ones.
1399 splice @invlist, 0, $i;
99f21fb9 1400 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
1401
1402 # If this one's index is not divisible by 2, it means that this
1403 # element is inverting away from being in the list, which means
99f21fb9
KW
1404 # all code points from 256 to this one are in this list (or
1405 # map to the default for inversion maps)
1406 if ($i % 2 != 0) {
1407 unshift @invlist, 256;
1408 unshift @invmap, $map_default if @invmap;
1409 }
0f5e3c71 1410 $found_nonl1 = 1;
3f427fd9
KW
1411 last;
1412 }
0f5e3c71 1413 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 1414 }
3f427fd9 1415
0f5e3c71 1416 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 1417 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 1418 }
bffc0129 1419 end_file_pound_if;
0c4ecf42 1420 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
1421}
1422
6b659339
KW
1423output_LB_table();
1424
2308ab83 1425my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
1426my @sources = ($0, qw(lib/unicore/mktables
1427 lib/Unicode/UCD.pm
1428 regen/charset_translations.pl
1429 ));
9a3da3ad
FC
1430{
1431 # Depend on mktables’ own sources. It’s a shorter list of files than
1432 # those that Unicode::UCD uses.
2308ab83
KW
1433 if (! open my $mktables_list, $sources_list) {
1434
1435 # This should force a rebuild once $sources_list exists
1436 push @sources, $sources_list;
1437 }
1438 else {
1439 while(<$mktables_list>) {
1440 last if /===/;
1441 chomp;
1442 push @sources, "lib/unicore/$_" if /^[^#]/;
1443 }
9a3da3ad
FC
1444 }
1445}
6b659339
KW
1446
1447read_only_bottom_close_and_rename($out_fh, \@sources);