Commit | Line | Data |
---|---|---|
4bc3dcfa KW |
1 | use v5.16.0; |
2 | use strict; | |
3 | use warnings; | |
94e72741 | 4 | use integer; |
3d7c117d MB |
5 | |
6 | BEGIN { unshift @INC, '.' } | |
7 | ||
8 | require './regen/regen_lib.pl'; | |
9 | require './regen/charset_translations.pl'; | |
4bc3dcfa KW |
10 | |
11 | # Generates the EBCDIC translation tables that were formerly hard-coded into | |
12 | # utfebcdic.h | |
13 | ||
14 | my $out_fh = open_new('ebcdic_tables.h', '>', | |
15 | {style => '*', by => $0, }); | |
16 | ||
94e72741 KW |
17 | sub get_column_headers ($$;$) { |
18 | my ($row_hdr_len, $field_width, $dfa_columns) = @_; | |
19 | my $format; | |
20 | my $final_column_format; | |
21 | my $num_columns; | |
22 | ||
23 | if (defined $dfa_columns) { | |
24 | $num_columns = $dfa_columns; | |
25 | ||
26 | # Trailing blank to correspond with commas in the rows below | |
27 | $format = "%${field_width}d "; | |
28 | } | |
29 | else { # Is a regular table | |
30 | $num_columns = 16; | |
31 | ||
32 | # Use blanks to separate the fields | |
33 | $format = " " x ( $field_width | |
34 | - 2); # For the '_X' | |
35 | $format .= "_%X "; # Again, trailing blank over the commas below | |
36 | } | |
37 | ||
38 | my $header = "/*" . " " x ($row_hdr_len - length "/*"); | |
39 | ||
40 | # All but the final column | |
41 | $header .= sprintf($format, $_) for 0 .. $num_columns - 2; | |
42 | ||
43 | # Get rid of trailing blank, so that the final column takes up one less | |
44 | # space so that the "*/" doesn't extend past the commas in the rows below | |
45 | chop $header; | |
46 | $header .= sprintf $format, $num_columns - 1; | |
47 | ||
48 | # Again, remove trailing blank | |
49 | chop $header; | |
50 | ||
51 | return $header . "*/\n"; | |
52 | } | |
53 | ||
8c3c6496 KW |
54 | sub output_table_start($$$;$) { |
55 | my ($out_fh, $TYPE, $name, $size) = @_; | |
635ff1f9 | 56 | |
8c3c6496 KW |
57 | $size = "" unless defined $size; |
58 | my $declaration = "EXTCONST $TYPE $name\[$size\]"; | |
635ff1f9 KW |
59 | print $out_fh <<EOF; |
60 | # ifndef DOINIT | |
0a142f46 | 61 | $declaration; |
635ff1f9 | 62 | # else |
0a142f46 | 63 | $declaration = { |
635ff1f9 KW |
64 | EOF |
65 | } | |
66 | ||
67 | sub output_table_end($) { | |
68 | print $out_fh "};\n# endif\n\n"; | |
69 | } | |
70 | ||
702cfe48 | 71 | sub output_table ($$;$) { |
4bc3dcfa KW |
72 | my $table_ref = shift; |
73 | my $name = shift; | |
74 | ||
94e72741 KW |
75 | # 0 => print in decimal |
76 | # 1 => print in hex (translates code point to code point) | |
f6521f7c | 77 | # >= 2 => is a dfa table, like https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ |
94e72741 KW |
78 | # The number is how many columns in the part after the code point |
79 | # portion. | |
80 | # | |
81 | # code point tables in hex areasier to debug, but don't fit into 80 | |
82 | # columns | |
83 | my $type = shift // 1; | |
84 | ||
85 | my $print_in_hex = $type == 1; | |
86 | my $is_dfa = ($type >= 2) ? $type : 0; | |
87 | my $columns_after_256 = 16; | |
88 | ||
89 | die "Requres 256 entries in table $name, got @$table_ref" | |
90 | if ! $is_dfa && @$table_ref != 256; | |
91 | if (! $is_dfa) { | |
92 | die "Requres 256 entries in table $name, got @$table_ref" | |
93 | if @$table_ref != 256; | |
94 | } | |
95 | else { | |
96 | $columns_after_256 = $is_dfa; | |
97 | ||
98 | print $out_fh <<'EOF'; | |
99 | ||
100 | /* The table below is adapted from | |
f6521f7c | 101 | * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ |
94e72741 KW |
102 | * See copyright notice at the beginning of this file. |
103 | */ | |
104 | ||
105 | EOF | |
106 | } | |
107 | ||
108 | # Highest number in the table | |
109 | my $max_entry = 0; | |
110 | $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref; | |
43bf0d50 | 111 | |
94e72741 KW |
112 | # We assume that every table has at least one two digit entry, and none |
113 | # are more than three digit. | |
114 | my $field_width = ($print_in_hex) | |
115 | ? 4 | |
116 | : (($max_entry) > 99 ? 3 : 2); | |
4bc3dcfa | 117 | |
94e72741 KW |
118 | my $row_hdr_length; |
119 | my $node_number_field_width; | |
120 | my $node_value_field_width; | |
121 | ||
122 | # dfa tables have a special header for the rows in the transitions part of | |
123 | # the table. It is longer than the regular one. | |
124 | if ($is_dfa) { | |
125 | my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1; | |
126 | $node_number_field_width = ($max_node_number > 9) ? 2 : 1; | |
127 | $node_value_field_width = ($max_node_number * $columns_after_256 > 99) | |
128 | ? 3 : 2; | |
129 | # The header starts with this template, and adds in the number of | |
130 | # digits needed to represent the maximum node number and its value | |
131 | $row_hdr_length = length("/*N=*/") | |
132 | + $node_number_field_width | |
133 | + $node_value_field_width; | |
134 | } | |
135 | else { | |
136 | $row_hdr_length = length "/*_X*/"; # Template for what the header | |
137 | # looks like | |
138 | } | |
139 | ||
140 | # The table may not be representable in 8 bits. | |
141 | my $TYPE = 'U8'; | |
142 | $TYPE = 'U16' if grep { $_ > 255 } @$table_ref; | |
143 | ||
635ff1f9 | 144 | output_table_start $out_fh, $TYPE, $name; |
4bc3dcfa | 145 | |
94e72741 KW |
146 | # First the headers for the columns |
147 | print $out_fh get_column_headers($row_hdr_length, $field_width); | |
148 | ||
149 | # Now the table body | |
150 | my $count = @$table_ref; | |
151 | my $last_was_nl = 1; | |
152 | ||
153 | # Print each element individually, arranged in rows of columns | |
154 | for my $i (0 .. $count - 1) { | |
155 | ||
156 | # Node number for here is -1 until get into the dfa state transitions | |
157 | my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256; | |
158 | ||
159 | # Print row header at beginning of each row | |
160 | if ($last_was_nl) { | |
161 | if ($node >= 0) { | |
162 | printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node, | |
163 | $node_value_field_width, $i - 256; | |
164 | } | |
165 | else { # Otherwise is regular row; print its number | |
166 | printf $out_fh "/*%X_", $i / 16; | |
167 | ||
168 | # These rows in a dfa table require extra space so columns | |
169 | # will align vertically (because the Ndd=ddd requires extra | |
170 | # space) | |
171 | if ($is_dfa) { | |
172 | print $out_fh " " x ( $node_number_field_width | |
173 | + $node_value_field_width); | |
174 | } | |
175 | print $out_fh "*/"; | |
176 | } | |
177 | } | |
178 | ||
43bf0d50 | 179 | if ($print_in_hex) { |
702cfe48 | 180 | printf $out_fh "0x%02X", $table_ref->[$i]; |
43bf0d50 KW |
181 | } |
182 | else { | |
94e72741 KW |
183 | printf $out_fh "%${field_width}d", $table_ref->[$i]; |
184 | } | |
185 | ||
186 | print $out_fh ",", if $i < $count -1; # No comma on final entry | |
187 | ||
188 | # Add \n if at end of row, which is 16 columns until we get to the | |
189 | # transitions part | |
190 | if ( ($node < 0 && $i % 16 == 15) | |
191 | || ($node >= 0 && ($i -256) % $columns_after_256 | |
192 | == $columns_after_256 - 1)) | |
193 | { | |
194 | print $out_fh "\n"; | |
195 | $last_was_nl = 1; | |
196 | } | |
197 | else { | |
198 | $last_was_nl = 0; | |
43bf0d50 | 199 | } |
4bc3dcfa | 200 | } |
94e72741 KW |
201 | |
202 | # Print column footer | |
203 | print $out_fh get_column_headers($row_hdr_length, $field_width, | |
204 | ($is_dfa) ? $columns_after_256 : undef); | |
205 | ||
635ff1f9 | 206 | output_table_end($out_fh); |
4bc3dcfa KW |
207 | } |
208 | ||
3de6d141 | 209 | print $out_fh <<'END'; |
4bc3dcfa | 210 | |
6a5bc5ac KW |
211 | #ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */ |
212 | #define PERL_EBCDIC_TABLES_H_ 1 | |
4bc3dcfa KW |
213 | |
214 | /* This file contains definitions for various tables used in EBCDIC handling. | |
3de6d141 KW |
215 | * More info is in utfebcdic.h |
216 | * | |
217 | * Some of the tables are adapted from | |
f6521f7c | 218 | * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ |
3de6d141 KW |
219 | * which requires this copyright notice: |
220 | ||
221 | Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> | |
222 | ||
223 | Permission is hereby granted, free of charge, to any person obtaining a copy of | |
224 | this software and associated documentation files (the "Software"), to deal in | |
225 | the Software without restriction, including without limitation the rights to | |
226 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
227 | of the Software, and to permit persons to whom the Software is furnished to do | |
228 | so, subject to the following conditions: | |
229 | ||
230 | The above copyright notice and this permission notice shall be included in all | |
231 | copies or substantial portions of the Software. | |
232 | ||
233 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
234 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
235 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
236 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
237 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
238 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
239 | SOFTWARE. | |
240 | ||
241 | */ | |
4bc3dcfa KW |
242 | END |
243 | ||
244 | my @charsets = get_supported_code_pages(); | |
245 | shift @charsets; # ASCII is the 0th, and we don't deal with that here. | |
246 | foreach my $charset (@charsets) { | |
c30a0cf2 TC |
247 | # we process the whole array several times, make a copy |
248 | my @a2e = @{get_a2n($charset)}; | |
b37fc6e8 | 249 | my @e2a; |
4bc3dcfa KW |
250 | |
251 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
252 | print $out_fh "\n"; | |
253 | ||
254 | print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; | |
255 | output_table(\@a2e, "PL_a2e"); | |
256 | ||
257 | { # Construct the inverse | |
4bc3dcfa KW |
258 | for my $i (0 .. 255) { |
259 | $e2a[$a2e[$i]] = $i; | |
260 | } | |
261 | print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; | |
262 | output_table(\@e2a, "PL_e2a"); | |
263 | } | |
264 | ||
e0dcdb0a | 265 | my @i82utf = @{get_I8_2_utf($charset)}; |
4bc3dcfa KW |
266 | print $out_fh <<END; |
267 | /* (Confusingly named) Index is $charset I8 byte; value is | |
268 | * $charset UTF-EBCDIC equivalent */ | |
269 | END | |
270 | output_table(\@i82utf, "PL_utf2e"); | |
271 | ||
272 | { #Construct the inverse | |
273 | my @utf2i8; | |
274 | for my $i (0 .. 255) { | |
275 | $utf2i8[$i82utf[$i]] = $i; | |
276 | } | |
277 | print $out_fh <<END; | |
278 | /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is | |
279 | * $charset I8 equivalent */ | |
280 | END | |
281 | output_table(\@utf2i8, "PL_e2utf"); | |
282 | } | |
283 | ||
284 | { | |
285 | my @utf8skip; | |
286 | ||
287 | # These are invariants or continuation bytes. | |
288 | for my $i (0 .. 0xBF) { | |
289 | $utf8skip[$i82utf[$i]] = 1; | |
290 | } | |
291 | ||
292 | # These are start bytes; The skip is the number of consecutive highest | |
293 | # order 1-bits (up to 7) | |
294 | for my $i (0xC0 .. 255) { | |
295 | my $count; | |
c0236afe KW |
296 | if ($i == 0b11111111) { |
297 | no warnings 'once'; | |
298 | $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES; | |
299 | } | |
300 | elsif (($i & 0b11111110) == 0b11111110) { | |
4bc3dcfa KW |
301 | $count= 7; |
302 | } | |
303 | elsif (($i & 0b11111100) == 0b11111100) { | |
304 | $count= 6; | |
305 | } | |
306 | elsif (($i & 0b11111000) == 0b11111000) { | |
307 | $count= 5; | |
308 | } | |
309 | elsif (($i & 0b11110000) == 0b11110000) { | |
310 | $count= 4; | |
311 | } | |
312 | elsif (($i & 0b11100000) == 0b11100000) { | |
313 | $count= 3; | |
314 | } | |
315 | elsif (($i & 0b11000000) == 0b11000000) { | |
316 | $count= 2; | |
317 | } | |
318 | else { | |
319 | die "Something wrong for UTF8SKIP calculation for $i"; | |
320 | } | |
321 | $utf8skip[$i82utf[$i]] = $count; | |
322 | } | |
323 | ||
324 | print $out_fh <<END; | |
4719093e KW |
325 | /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes |
326 | * (including for overlongs); 1 for continuation. Adapted from the shadow | |
327 | * flags table in tr16. The entries marked 9 in tr16 are continuation bytes | |
328 | * and are marked as length 1 here so that we can recover. */ | |
4bc3dcfa | 329 | END |
702cfe48 KW |
330 | output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print |
331 | # in hex | |
4bc3dcfa KW |
332 | } |
333 | ||
334 | use feature 'unicode_strings'; | |
335 | ||
336 | { | |
337 | my @lc; | |
338 | for my $i (0 .. 255) { | |
339 | $lc[$a2e[$i]] = $a2e[ord lc chr $i]; | |
340 | } | |
94e72741 KW |
341 | print $out_fh |
342 | "/* Index is $charset code point; value is its lowercase equivalent */\n"; | |
4bc3dcfa KW |
343 | output_table(\@lc, "PL_latin1_lc"); |
344 | } | |
345 | ||
346 | { | |
347 | my @uc; | |
348 | for my $i (0 .. 255) { | |
349 | my $uc = uc chr $i; | |
350 | if (length $uc > 1 || ord $uc > 255) { | |
351 | $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; | |
352 | } | |
353 | $uc[$a2e[$i]] = $a2e[ord $uc]; | |
354 | } | |
355 | print $out_fh <<END; | |
356 | /* Index is $charset code point; value is its uppercase equivalent. | |
357 | * The 'mod' in the name means that codepoints whose uppercase is above 255 or | |
358 | * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ | |
359 | END | |
360 | output_table(\@uc, "PL_mod_latin1_uc"); | |
361 | } | |
362 | ||
363 | { # PL_fold | |
364 | my @ascii_fold; | |
365 | for my $i (0 .. 255) { # Initialise to identity map | |
366 | $ascii_fold[$i] = $i; | |
367 | } | |
368 | ||
369 | # Overwrite the entries that aren't identity | |
370 | for my $chr ('A' .. 'Z') { | |
371 | $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; | |
372 | } | |
373 | for my $chr ('a' .. 'z') { | |
374 | $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; | |
375 | } | |
376 | print $out_fh <<END; | |
377 | /* Index is $charset code point; For A-Z, value is a-z; for a-z, value | |
378 | * is A-Z; all other code points map to themselves */ | |
379 | END | |
380 | output_table(\@ascii_fold, "PL_fold"); | |
381 | } | |
382 | ||
383 | { | |
384 | my @latin1_fold; | |
385 | for my $i (0 .. 255) { | |
386 | my $char = chr $i; | |
387 | my $lc = lc $char; | |
388 | ||
389 | # lc and uc adequately proxy for fold-case pairs in this 0-255 | |
390 | # range | |
391 | my $uc = uc $char; | |
392 | $uc = $char if length $uc > 1 || ord $uc > 255; | |
393 | if ($lc ne $char) { | |
394 | $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; | |
395 | } | |
396 | elsif ($uc ne $char) { | |
397 | $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; | |
398 | } | |
399 | else { | |
400 | $latin1_fold[$a2e[$i]] = $a2e[$i]; | |
401 | } | |
402 | } | |
403 | print $out_fh <<END; | |
404 | /* Index is $charset code point; value is its other fold-pair equivalent | |
405 | * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is | |
406 | * the code point itself */ | |
407 | END | |
408 | output_table(\@latin1_fold, "PL_fold_latin1"); | |
409 | } | |
410 | ||
3de6d141 KW |
411 | { |
412 | # This generates the dfa table for perl extended UTF-8, which accepts | |
413 | # surrogates, non-characters, and accepts start bytes up through FE | |
414 | # (start byte FF has to be handled outside this dfa). The class numbers | |
415 | # for start bytes are constrained so that they can be used as a shift | |
416 | # count for masking off the leading one bits | |
417 | # | |
418 | # The classes are | |
419 | # 00-9F 0 | |
420 | # A0-A1 7 Not legal immediately after start bytes F0 F8 FC | |
421 | # FE | |
422 | # A2-A3 8 Not legal immediately after start bytes F0 F8 FC | |
423 | # A4-A7 9 Not legal immediately after start bytes F0 F8 | |
424 | # A8-AF 10 Not legal immediately after start bytes F0 | |
425 | # B0-BF 11 | |
426 | # C0-C4 1 | |
427 | # C5-DF 2 | |
428 | # E0 1 | |
429 | # E1-EF 3 | |
430 | # F0 12 | |
431 | # F1-F7 4 | |
432 | # F8 13 | |
433 | # F9-FB 5 | |
434 | # FC 14 | |
435 | # FD 6 | |
436 | # FE 15 | |
437 | # FF 1 | |
438 | # | |
439 | # Here's the I8 for the code points before which overlongs occur: | |
440 | # U+4000: \xF0\xB0\xA0\xA0 | |
441 | # U+40000: \xF8\xA8\xA0\xA0\xA0 | |
442 | # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0 | |
443 | # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0 | |
444 | # | |
445 | # The first part of the table maps bytes to character classes to reduce | |
446 | # the size of the transition table and create bitmasks. | |
447 | # | |
448 | # The second part is a transition table that maps a combination of a | |
449 | # state of the automaton and a character class to a new state. The | |
450 | # numbering of the original nodes is retained, but some have been split | |
451 | # so that there are new nodes. They mean: | |
452 | # N0 The initial state, and final accepting one. | |
453 | # N1 One continuation byte (A0-BF) left. This is transitioned to | |
454 | # immediately when the start byte indicates a two-byte sequence | |
455 | # N2 Two continuation bytes left. | |
456 | # N3 Three continuation bytes left. | |
457 | # N4 Four continuation bytes left. | |
458 | # N5 Five continuation bytes left. | |
459 | # N6 Start byte is F0. Continuation bytes A[0-F] are illegal | |
460 | # (overlong); the other continuations transition to N2 | |
461 | # N7 Start byte is F8. Continuation bytes A[0-7] are illegal | |
462 | # (overlong); the other continuations transition to N3 | |
463 | # N8 Start byte is FC. Continuation bytes A[0-3] are illegal | |
464 | # (overlong); the other continuations transition to N4 | |
465 | # N9 Start byte is FE. Continuation bytes A[01] are illegal | |
466 | # (overlong); the other continuations transition to N5 | |
467 | # 1 Reject. All transitions not mentioned above (except the single | |
468 | # byte ones (as they are always legal) are to this state. | |
469 | ||
470 | my $NUM_CLASSES = 16; | |
471 | my $N0 = 0; | |
472 | my $N1 = $N0 + $NUM_CLASSES; | |
473 | my $N2 = $N1 + $NUM_CLASSES; | |
474 | my $N3 = $N2 + $NUM_CLASSES; | |
475 | my $N4 = $N3 + $NUM_CLASSES; | |
476 | my $N5 = $N4 + $NUM_CLASSES; | |
477 | my $N6 = $N5 + $NUM_CLASSES; | |
478 | my $N7 = $N6 + $NUM_CLASSES; | |
479 | my $N8 = $N7 + $NUM_CLASSES; | |
480 | my $N9 = $N8 + $NUM_CLASSES; | |
481 | my $N10 = $N9 + $NUM_CLASSES; | |
482 | ||
483 | my @perl_extended_utf8_dfa; | |
484 | my @i8 = ( | |
485 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F | |
486 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F | |
487 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F | |
488 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F | |
489 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F | |
490 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F | |
491 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F | |
492 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F | |
493 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F | |
494 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F | |
495 | 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF | |
496 | 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF | |
497 | 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF | |
498 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF | |
499 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF | |
500 | 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF | |
501 | ); | |
502 | $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); | |
503 | push @perl_extended_utf8_dfa, ( | |
504 | # Class: | |
505 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
506 | 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0 | |
507 | 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1 | |
508 | 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2 | |
509 | 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3 | |
510 | 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4 | |
511 | 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5 | |
512 | ||
513 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6 | |
514 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7 | |
515 | 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8 | |
516 | 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9 | |
517 | ); | |
71525f77 | 518 | output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab", |
3de6d141 KW |
519 | $NUM_CLASSES); |
520 | } | |
521 | ||
123deead KW |
522 | { |
523 | # This generates the dfa table for strict UTF-8, which rejects | |
524 | # surrogates, non-characters, and above Unicode. | |
525 | # | |
526 | # The classes are | |
527 | # 00-9F 0 Always legal at start | |
528 | # A0 10 Not legal immediately after start bytes F0 F8 | |
529 | # A1 11 Not legal immediately after start bytes F0 F8, | |
530 | # A2-A7 12 Not legal immediately after start bytes F0 F8 F9 | |
531 | # A8,AA,AC 13 Not legal immediately after start bytes F0 F9 | |
532 | # A9,AB,AD 14 Not legal immediately after start byte F0 | |
533 | # AE 15 Not legal immediately after start byte F0 | |
534 | # AF 16 Not legal immediately after start bytes F0 | |
535 | # B[0248AC] 17 Not legal immediately after start byte F9 | |
536 | # B[1359D] 18 Not legal immediately after start byte F9 | |
537 | # B6 19 Not legal immediately after start byte F9 | |
538 | # B7 20 Not legal immediately after start byte F9 | |
539 | # BE 21 Not legal immediately after start byte F9 | |
540 | # BF 22 Not legal immediately after start byte F9 | |
541 | # C0-C4 1 (reject, all are overlong) | |
542 | # C5-DF 2 Accepts any legal continuation | |
543 | # E0 1 (reject, all are overlong) | |
544 | # E1-EF 3 Accepts any legal continuation | |
545 | # F0 8 (has overlongs) | |
546 | # F1 6 (has surrogates, non-chars) | |
547 | # F2,F4,F6 4 Accepts any legal continuation | |
548 | # F3,F5,F7 5 (has non-chars) | |
549 | # F8 9 (has overlongs, non-chars) | |
550 | # F9 7 (has non-chars, non-Unicode) | |
551 | # FA-FF 1 (reject, all are non-Unicode) | |
552 | # | |
553 | # Here's the I8 for enough code points so that you can figure out what's | |
554 | # going on: | |
555 | # | |
556 | # U+D800: \xF1\xB6\xA0\xA0 | |
557 | # U+DFFF: \xF1\xB7\xBF\xBF | |
558 | # U+FDD0: \xF1\xBF\xAE\xB0 | |
559 | # U+FDEF: \xF1\xBF\xAF\xAF | |
560 | # U+FFFE: \xF1\xBF\xBF\xBE | |
561 | # U+1FFFE: \xF3\xBF\xBF\xBE | |
562 | # U+2FFFE: \xF5\xBF\xBF\xBE | |
563 | # U+3FFFE: \xF7\xBF\xBF\xBE | |
564 | # U+4FFFE: \xF8\xA9\xBF\xBF\xBE | |
565 | # U+5FFFE: \xF8\xAB\xBF\xBF\xBE | |
566 | # U+6FFFE: \xF8\xAD\xBF\xBF\xBE | |
567 | # U+7FFFE: \xF8\xAF\xBF\xBF\xBE | |
568 | # U+8FFFE: \xF8\xB1\xBF\xBF\xBE | |
569 | # U+9FFFE: \xF8\xB3\xBF\xBF\xBE | |
570 | # U+AFFFE: \xF8\xB5\xBF\xBF\xBE | |
571 | # U+BFFFE: \xF8\xB7\xBF\xBF\xBE | |
572 | # U+CFFFE: \xF8\xB9\xBF\xBF\xBE | |
573 | # U+DFFFE: \xF8\xBB\xBF\xBF\xBE | |
574 | # U+EFFFE: \xF8\xBD\xBF\xBF\xBE | |
575 | # U+FFFFE: \xF8\xBF\xBF\xBF\xBE | |
576 | # U+10FFFE: \xF9\xA1\xBF\xBF\xBE | |
577 | # | |
578 | # The first part of the table maps bytes to character classes to reduce | |
579 | # the size of the transition table and create bitmasks. | |
580 | # | |
581 | # The second part is a transition table that maps a combination of a | |
582 | # state of the automaton and a character class to a new state. The | |
583 | # numbering of the original nodes is retained, but some have been split | |
584 | # so that there are new nodes. They mean: | |
585 | # N0 The initial state, and final accepting one. | |
586 | # N1 One continuation byte (A0-BF) left. This is transitioned to | |
587 | # immediately when the start byte indicates a two-byte sequence | |
588 | # N2 Two continuation bytes left. | |
589 | # N3 Three continuation bytes left. | |
590 | # N4 Start byte is F0. Continuation bytes A[0-F] are illegal | |
591 | # (overlong); the other continuations transition to N2 | |
592 | # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal | |
593 | # (surrogates); BF transitions to N9; the other continuations to | |
594 | # N2 | |
595 | # N6 Start byte is F[357]. Continuation byte BF transitions to N12; | |
596 | # other continuations to N2 | |
597 | # N5 Start byte is F8. Continuation bytes A[0-7] are illegal | |
598 | # (overlong); continuations A[9BDF] and B[13579BDF] transition to | |
599 | # N14; the other continuations to N3 | |
600 | # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1 | |
601 | # to N14; the other continuation bytes are illegal. | |
602 | # N9 Initial sequence is F1 BF. Continuation byte AE transitions to | |
603 | # state N10; AF to N11; BF to N13; the other continuations to N1. | |
604 | # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are | |
605 | # illegal (non-chars); the other continuations are legal | |
606 | # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are | |
607 | # illegal (non-chars); the other continuations are legal | |
608 | # N12 Initial sequence is F[357] BF. Continuation bytes BF | |
609 | # transitions to N13; the other continuations to N1 | |
610 | # N13 Initial sequence is F[1357] BF BF or F8 x BF (where x is | |
611 | # something that can lead to a non-char. Continuation bytes BE | |
612 | # and BF are illegal (non-chars); the other continuations are | |
613 | # legal | |
614 | # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1. | |
615 | # Continuation byte BF transitions to N13; the other | |
616 | # continuations to N2 | |
617 | # 1 Reject. All transitions not mentioned above (except the single | |
618 | # byte ones (as they are always legal) are to this state. | |
619 | ||
620 | my $NUM_CLASSES = 23; | |
621 | my $N0 = 0; | |
622 | my $N1 = $N0 + $NUM_CLASSES; | |
623 | my $N2 = $N1 + $NUM_CLASSES; | |
624 | my $N3 = $N2 + $NUM_CLASSES; | |
625 | my $N4 = $N3 + $NUM_CLASSES; | |
626 | my $N5 = $N4 + $NUM_CLASSES; | |
627 | my $N6 = $N5 + $NUM_CLASSES; | |
628 | my $N7 = $N6 + $NUM_CLASSES; | |
629 | my $N8 = $N7 + $NUM_CLASSES; | |
630 | my $N9 = $N8 + $NUM_CLASSES; | |
631 | my $N10 = $N9 + $NUM_CLASSES; | |
632 | my $N11 = $N10 + $NUM_CLASSES; | |
633 | my $N12 = $N11 + $NUM_CLASSES; | |
634 | my $N13 = $N12 + $NUM_CLASSES; | |
635 | my $N14 = $N13 + $NUM_CLASSES; | |
636 | ||
637 | my @strict_utf8_dfa; | |
638 | my @i8 = ( | |
639 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F | |
640 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F | |
641 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F | |
642 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F | |
643 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F | |
644 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F | |
645 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F | |
646 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F | |
647 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F | |
648 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F | |
649 | 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF | |
650 | 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF | |
651 | 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF | |
652 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF | |
653 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF | |
654 | 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF | |
655 | ); | |
656 | $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); | |
657 | push @strict_utf8_dfa, ( | |
658 | # Class: | |
659 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | |
660 | 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0 | |
661 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1 | |
662 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2 | |
663 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3 | |
664 | ||
665 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4 | |
666 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5 | |
667 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6 | |
668 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7 | |
669 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8 | |
670 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9 | |
671 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10 | |
672 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11 | |
673 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12 | |
674 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13 | |
675 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14 | |
676 | ); | |
71525f77 | 677 | output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES); |
123deead KW |
678 | } |
679 | ||
c5bfbb64 KW |
680 | { |
681 | # This generates the dfa table for C9 strict UTF-8, which rejects | |
682 | # surrogates and above Unicode, but allows non-characters,. | |
683 | # | |
684 | # The classes are | |
685 | # 00-9F 0 Always legal at start | |
686 | # A0-A1 9 Not legal immediately after start bytes F0 F8 | |
687 | # A2-A7 10 Not legal immediately after start bytes F0 F8 F9 | |
688 | # A8-AF 11 Not legal immediately after start bytes F0 F9 | |
689 | # B0-B5,B8-BF 12 Not legal immediately after start byte F9 | |
690 | # B6,B7 13 | |
691 | # C0-C4 1 (reject, all are overlong) | |
692 | # C5-DF 2 Accepts any legal continuation | |
693 | # E0 1 (reject, all are overlong) | |
694 | # E1-EF 3 Accepts any legal continuation | |
695 | # F0 6 (has overlongs) | |
696 | # F1 5 (has surrogates) | |
697 | # F2-F7 4 Accepts any legal continuation | |
698 | # F8 8 (has overlongs) | |
699 | # F9 7 (has non-Unicode) | |
700 | # FA-FF 1 (reject, all are non-Unicode) | |
701 | # | |
702 | # The first part of the table maps bytes to character classes to reduce | |
703 | # the size of the transition table and create bitmasks. | |
704 | # | |
705 | # The second part is a transition table that maps a combination of a | |
706 | # state of the automaton and a character class to a new state. The | |
707 | # numbering of the original nodes is retained, but some have been split | |
708 | # so that there are new nodes. They mean: | |
709 | # N0 The initial state, and final accepting one. | |
710 | # N1 One continuation byte (A0-BF) left. This is transitioned to | |
711 | # immediately when the start byte indicates a two-byte sequence | |
712 | # N2 Two continuation bytes left. | |
713 | # N3 Three continuation bytes left. | |
714 | # N4 Start byte is F0. Continuation bytes A[0-F] are illegal | |
715 | # (overlong); the other continuations transition to N2 | |
716 | # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the | |
717 | # other continuations transition to N2 | |
718 | # N6 Start byte is F8. Continuation bytes A[0-7] are illegal | |
719 | # (overlong); the other continuations transition to N3 | |
720 | # N7 Start byte is F9. Continuation bytes A0 and A1 transition to | |
721 | # N3; the other continuation bytes are illegal (non-Unicode) | |
722 | # 1 Reject. All transitions not mentioned above (except the single | |
723 | # byte ones (as they are always legal) are to this state. | |
724 | ||
725 | my $NUM_CLASSES = 14; | |
726 | my $N0 = 0; | |
727 | my $N1 = $N0 + $NUM_CLASSES; | |
728 | my $N2 = $N1 + $NUM_CLASSES; | |
729 | my $N3 = $N2 + $NUM_CLASSES; | |
730 | my $N4 = $N3 + $NUM_CLASSES; | |
731 | my $N5 = $N4 + $NUM_CLASSES; | |
732 | my $N6 = $N5 + $NUM_CLASSES; | |
733 | my $N7 = $N6 + $NUM_CLASSES; | |
734 | ||
735 | my @C9_utf8_dfa; | |
736 | my @i8 = ( | |
737 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F | |
738 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F | |
739 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F | |
740 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F | |
741 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F | |
742 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F | |
743 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F | |
744 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F | |
745 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F | |
746 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F | |
747 | 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF | |
748 | 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF | |
749 | 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF | |
750 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF | |
751 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF | |
752 | 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF | |
753 | ); | |
754 | $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); | |
755 | push @C9_utf8_dfa, ( | |
756 | # Class: | |
757 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 | |
758 | 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0 | |
759 | 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1 | |
760 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2 | |
761 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3 | |
762 | ||
763 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4 | |
764 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5 | |
765 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6 | |
766 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7 | |
767 | ); | |
71525f77 | 768 | output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES); |
c5bfbb64 KW |
769 | } |
770 | ||
b37fc6e8 KW |
771 | { |
772 | print $out_fh <<EOF; | |
773 | /* This table partitions all the code points of the platform into ranges which | |
774 | * have the property that all the code points in each range have the same | |
775 | * number of bytes in their UTF-EBCDIC representations, and the adjacent | |
776 | * ranges have a different number of bytes. | |
777 | * | |
778 | * Each number in the table begins such a range, which extends up to just | |
779 | * before the following table entry, except the final entry is understood to | |
780 | * extend to the platform's infinity | |
781 | */ | |
782 | EOF | |
b37fc6e8 KW |
783 | # The lengths of the characters between 0 and 255 are either 1 or 2, |
784 | # with those whose ASCII platform equivalents below 160 being 1, and | |
785 | # the rest being 2. | |
786 | my @list; | |
787 | push @list, 0; | |
788 | my $pushed_range_is_length_1 = 1; | |
789 | ||
790 | for my $i (1 .. 0xFF) { | |
791 | my $this_code_point_is_length_1 = ($e2a[$i] < 160); | |
792 | if ($pushed_range_is_length_1 != $this_code_point_is_length_1) { | |
793 | push @list, $i; | |
794 | $pushed_range_is_length_1 = $this_code_point_is_length_1; | |
795 | } | |
796 | } | |
797 | ||
798 | # Starting at 256, the length is 2. | |
799 | push @list, 0x100 if $pushed_range_is_length_1; | |
800 | ||
801 | # These are based on the fundamental properties of UTF-EBCDIC. Each | |
802 | # continuation byte has 5 bits of information. Comments in utf8.h | |
803 | # explain the rest. | |
804 | my $UTF_ACCUMULATION_SHIFT = 5; | |
805 | push @list, (32 * (1 << ( $UTF_ACCUMULATION_SHIFT))); | |
806 | push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT))); | |
807 | push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT))); | |
808 | push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT))); | |
809 | push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT))); | |
810 | push @list, ( (1 << (6 * $UTF_ACCUMULATION_SHIFT))); | |
811 | ||
8c3c6496 KW |
812 | output_table_start($out_fh, "UV", "PL_partition_by_byte_length", scalar @list); |
813 | print $out_fh "\t"; | |
814 | ||
b37fc6e8 KW |
815 | print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list; |
816 | print $out_fh "\n"; | |
817 | ||
818 | output_table_end($out_fh); | |
819 | } | |
820 | ||
4bc3dcfa KW |
821 | print $out_fh get_conditional_compile_line_end(); |
822 | } | |
823 | ||
6a5bc5ac | 824 | print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n"; |
4bc3dcfa KW |
825 | |
826 | read_only_bottom_close_and_rename($out_fh); |