Commit | Line | Data |
---|---|---|
4bc3dcfa KW |
1 | use v5.16.0; |
2 | use strict; | |
3 | use warnings; | |
94e72741 | 4 | use integer; |
3d7c117d MB |
5 | |
6 | BEGIN { unshift @INC, '.' } | |
7 | ||
8 | require './regen/regen_lib.pl'; | |
9 | require './regen/charset_translations.pl'; | |
4bc3dcfa KW |
10 | |
11 | # Generates the EBCDIC translation tables that were formerly hard-coded into | |
12 | # utfebcdic.h | |
13 | ||
14 | my $out_fh = open_new('ebcdic_tables.h', '>', | |
15 | {style => '*', by => $0, }); | |
16 | ||
94e72741 KW |
17 | sub get_column_headers ($$;$) { |
18 | my ($row_hdr_len, $field_width, $dfa_columns) = @_; | |
19 | my $format; | |
20 | my $final_column_format; | |
21 | my $num_columns; | |
22 | ||
23 | if (defined $dfa_columns) { | |
24 | $num_columns = $dfa_columns; | |
25 | ||
26 | # Trailing blank to correspond with commas in the rows below | |
27 | $format = "%${field_width}d "; | |
28 | } | |
29 | else { # Is a regular table | |
30 | $num_columns = 16; | |
31 | ||
32 | # Use blanks to separate the fields | |
33 | $format = " " x ( $field_width | |
34 | - 2); # For the '_X' | |
35 | $format .= "_%X "; # Again, trailing blank over the commas below | |
36 | } | |
37 | ||
38 | my $header = "/*" . " " x ($row_hdr_len - length "/*"); | |
39 | ||
40 | # All but the final column | |
41 | $header .= sprintf($format, $_) for 0 .. $num_columns - 2; | |
42 | ||
43 | # Get rid of trailing blank, so that the final column takes up one less | |
44 | # space so that the "*/" doesn't extend past the commas in the rows below | |
45 | chop $header; | |
46 | $header .= sprintf $format, $num_columns - 1; | |
47 | ||
48 | # Again, remove trailing blank | |
49 | chop $header; | |
50 | ||
51 | return $header . "*/\n"; | |
52 | } | |
53 | ||
635ff1f9 KW |
54 | sub output_table_start($$$) { |
55 | my ($out_fh, $TYPE, $name) = @_; | |
56 | ||
57 | my $declaration = "EXTCONST $TYPE $name\[\]"; | |
58 | print $out_fh <<EOF; | |
59 | # ifndef DOINIT | |
0a142f46 | 60 | $declaration; |
635ff1f9 | 61 | # else |
0a142f46 | 62 | $declaration = { |
635ff1f9 KW |
63 | EOF |
64 | } | |
65 | ||
66 | sub output_table_end($) { | |
67 | print $out_fh "};\n# endif\n\n"; | |
68 | } | |
69 | ||
702cfe48 | 70 | sub output_table ($$;$) { |
4bc3dcfa KW |
71 | my $table_ref = shift; |
72 | my $name = shift; | |
73 | ||
94e72741 KW |
74 | # 0 => print in decimal |
75 | # 1 => print in hex (translates code point to code point) | |
76 | # >= 2 => is a dfa table, like http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ | |
77 | # The number is how many columns in the part after the code point | |
78 | # portion. | |
79 | # | |
80 | # code point tables in hex areasier to debug, but don't fit into 80 | |
81 | # columns | |
82 | my $type = shift // 1; | |
83 | ||
84 | my $print_in_hex = $type == 1; | |
85 | my $is_dfa = ($type >= 2) ? $type : 0; | |
86 | my $columns_after_256 = 16; | |
87 | ||
88 | die "Requres 256 entries in table $name, got @$table_ref" | |
89 | if ! $is_dfa && @$table_ref != 256; | |
90 | if (! $is_dfa) { | |
91 | die "Requres 256 entries in table $name, got @$table_ref" | |
92 | if @$table_ref != 256; | |
93 | } | |
94 | else { | |
95 | $columns_after_256 = $is_dfa; | |
96 | ||
97 | print $out_fh <<'EOF'; | |
98 | ||
99 | /* The table below is adapted from | |
100 | * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ | |
101 | * See copyright notice at the beginning of this file. | |
102 | */ | |
103 | ||
104 | EOF | |
105 | } | |
106 | ||
107 | # Highest number in the table | |
108 | my $max_entry = 0; | |
109 | $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref; | |
43bf0d50 | 110 | |
94e72741 KW |
111 | # We assume that every table has at least one two digit entry, and none |
112 | # are more than three digit. | |
113 | my $field_width = ($print_in_hex) | |
114 | ? 4 | |
115 | : (($max_entry) > 99 ? 3 : 2); | |
4bc3dcfa | 116 | |
94e72741 KW |
117 | my $row_hdr_length; |
118 | my $node_number_field_width; | |
119 | my $node_value_field_width; | |
120 | ||
121 | # dfa tables have a special header for the rows in the transitions part of | |
122 | # the table. It is longer than the regular one. | |
123 | if ($is_dfa) { | |
124 | my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1; | |
125 | $node_number_field_width = ($max_node_number > 9) ? 2 : 1; | |
126 | $node_value_field_width = ($max_node_number * $columns_after_256 > 99) | |
127 | ? 3 : 2; | |
128 | # The header starts with this template, and adds in the number of | |
129 | # digits needed to represent the maximum node number and its value | |
130 | $row_hdr_length = length("/*N=*/") | |
131 | + $node_number_field_width | |
132 | + $node_value_field_width; | |
133 | } | |
134 | else { | |
135 | $row_hdr_length = length "/*_X*/"; # Template for what the header | |
136 | # looks like | |
137 | } | |
138 | ||
139 | # The table may not be representable in 8 bits. | |
140 | my $TYPE = 'U8'; | |
141 | $TYPE = 'U16' if grep { $_ > 255 } @$table_ref; | |
142 | ||
635ff1f9 | 143 | output_table_start $out_fh, $TYPE, $name; |
4bc3dcfa | 144 | |
94e72741 KW |
145 | # First the headers for the columns |
146 | print $out_fh get_column_headers($row_hdr_length, $field_width); | |
147 | ||
148 | # Now the table body | |
149 | my $count = @$table_ref; | |
150 | my $last_was_nl = 1; | |
151 | ||
152 | # Print each element individually, arranged in rows of columns | |
153 | for my $i (0 .. $count - 1) { | |
154 | ||
155 | # Node number for here is -1 until get into the dfa state transitions | |
156 | my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256; | |
157 | ||
158 | # Print row header at beginning of each row | |
159 | if ($last_was_nl) { | |
160 | if ($node >= 0) { | |
161 | printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node, | |
162 | $node_value_field_width, $i - 256; | |
163 | } | |
164 | else { # Otherwise is regular row; print its number | |
165 | printf $out_fh "/*%X_", $i / 16; | |
166 | ||
167 | # These rows in a dfa table require extra space so columns | |
168 | # will align vertically (because the Ndd=ddd requires extra | |
169 | # space) | |
170 | if ($is_dfa) { | |
171 | print $out_fh " " x ( $node_number_field_width | |
172 | + $node_value_field_width); | |
173 | } | |
174 | print $out_fh "*/"; | |
175 | } | |
176 | } | |
177 | ||
43bf0d50 | 178 | if ($print_in_hex) { |
702cfe48 | 179 | printf $out_fh "0x%02X", $table_ref->[$i]; |
43bf0d50 KW |
180 | } |
181 | else { | |
94e72741 KW |
182 | printf $out_fh "%${field_width}d", $table_ref->[$i]; |
183 | } | |
184 | ||
185 | print $out_fh ",", if $i < $count -1; # No comma on final entry | |
186 | ||
187 | # Add \n if at end of row, which is 16 columns until we get to the | |
188 | # transitions part | |
189 | if ( ($node < 0 && $i % 16 == 15) | |
190 | || ($node >= 0 && ($i -256) % $columns_after_256 | |
191 | == $columns_after_256 - 1)) | |
192 | { | |
193 | print $out_fh "\n"; | |
194 | $last_was_nl = 1; | |
195 | } | |
196 | else { | |
197 | $last_was_nl = 0; | |
43bf0d50 | 198 | } |
4bc3dcfa | 199 | } |
94e72741 KW |
200 | |
201 | # Print column footer | |
202 | print $out_fh get_column_headers($row_hdr_length, $field_width, | |
203 | ($is_dfa) ? $columns_after_256 : undef); | |
204 | ||
635ff1f9 | 205 | output_table_end($out_fh); |
4bc3dcfa KW |
206 | } |
207 | ||
3de6d141 | 208 | print $out_fh <<'END'; |
4bc3dcfa | 209 | |
6a5bc5ac KW |
210 | #ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */ |
211 | #define PERL_EBCDIC_TABLES_H_ 1 | |
4bc3dcfa KW |
212 | |
213 | /* This file contains definitions for various tables used in EBCDIC handling. | |
3de6d141 KW |
214 | * More info is in utfebcdic.h |
215 | * | |
216 | * Some of the tables are adapted from | |
217 | * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ | |
218 | * which requires this copyright notice: | |
219 | ||
220 | Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> | |
221 | ||
222 | Permission is hereby granted, free of charge, to any person obtaining a copy of | |
223 | this software and associated documentation files (the "Software"), to deal in | |
224 | the Software without restriction, including without limitation the rights to | |
225 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
226 | of the Software, and to permit persons to whom the Software is furnished to do | |
227 | so, subject to the following conditions: | |
228 | ||
229 | The above copyright notice and this permission notice shall be included in all | |
230 | copies or substantial portions of the Software. | |
231 | ||
232 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
233 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
234 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
235 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
236 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
237 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
238 | SOFTWARE. | |
239 | ||
240 | */ | |
4bc3dcfa KW |
241 | END |
242 | ||
243 | my @charsets = get_supported_code_pages(); | |
244 | shift @charsets; # ASCII is the 0th, and we don't deal with that here. | |
245 | foreach my $charset (@charsets) { | |
c30a0cf2 TC |
246 | # we process the whole array several times, make a copy |
247 | my @a2e = @{get_a2n($charset)}; | |
4bc3dcfa KW |
248 | |
249 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
250 | print $out_fh "\n"; | |
251 | ||
252 | print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; | |
253 | output_table(\@a2e, "PL_a2e"); | |
254 | ||
255 | { # Construct the inverse | |
256 | my @e2a; | |
257 | for my $i (0 .. 255) { | |
258 | $e2a[$a2e[$i]] = $i; | |
259 | } | |
260 | print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; | |
261 | output_table(\@e2a, "PL_e2a"); | |
262 | } | |
263 | ||
e0dcdb0a | 264 | my @i82utf = @{get_I8_2_utf($charset)}; |
4bc3dcfa KW |
265 | print $out_fh <<END; |
266 | /* (Confusingly named) Index is $charset I8 byte; value is | |
267 | * $charset UTF-EBCDIC equivalent */ | |
268 | END | |
269 | output_table(\@i82utf, "PL_utf2e"); | |
270 | ||
271 | { #Construct the inverse | |
272 | my @utf2i8; | |
273 | for my $i (0 .. 255) { | |
274 | $utf2i8[$i82utf[$i]] = $i; | |
275 | } | |
276 | print $out_fh <<END; | |
277 | /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is | |
278 | * $charset I8 equivalent */ | |
279 | END | |
280 | output_table(\@utf2i8, "PL_e2utf"); | |
281 | } | |
282 | ||
283 | { | |
284 | my @utf8skip; | |
285 | ||
286 | # These are invariants or continuation bytes. | |
287 | for my $i (0 .. 0xBF) { | |
288 | $utf8skip[$i82utf[$i]] = 1; | |
289 | } | |
290 | ||
291 | # These are start bytes; The skip is the number of consecutive highest | |
292 | # order 1-bits (up to 7) | |
293 | for my $i (0xC0 .. 255) { | |
294 | my $count; | |
c0236afe KW |
295 | if ($i == 0b11111111) { |
296 | no warnings 'once'; | |
297 | $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES; | |
298 | } | |
299 | elsif (($i & 0b11111110) == 0b11111110) { | |
4bc3dcfa KW |
300 | $count= 7; |
301 | } | |
302 | elsif (($i & 0b11111100) == 0b11111100) { | |
303 | $count= 6; | |
304 | } | |
305 | elsif (($i & 0b11111000) == 0b11111000) { | |
306 | $count= 5; | |
307 | } | |
308 | elsif (($i & 0b11110000) == 0b11110000) { | |
309 | $count= 4; | |
310 | } | |
311 | elsif (($i & 0b11100000) == 0b11100000) { | |
312 | $count= 3; | |
313 | } | |
314 | elsif (($i & 0b11000000) == 0b11000000) { | |
315 | $count= 2; | |
316 | } | |
317 | else { | |
318 | die "Something wrong for UTF8SKIP calculation for $i"; | |
319 | } | |
320 | $utf8skip[$i82utf[$i]] = $count; | |
321 | } | |
322 | ||
323 | print $out_fh <<END; | |
4719093e KW |
324 | /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes |
325 | * (including for overlongs); 1 for continuation. Adapted from the shadow | |
326 | * flags table in tr16. The entries marked 9 in tr16 are continuation bytes | |
327 | * and are marked as length 1 here so that we can recover. */ | |
4bc3dcfa | 328 | END |
702cfe48 KW |
329 | output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print |
330 | # in hex | |
4bc3dcfa KW |
331 | } |
332 | ||
333 | use feature 'unicode_strings'; | |
334 | ||
335 | { | |
336 | my @lc; | |
337 | for my $i (0 .. 255) { | |
338 | $lc[$a2e[$i]] = $a2e[ord lc chr $i]; | |
339 | } | |
94e72741 KW |
340 | print $out_fh |
341 | "/* Index is $charset code point; value is its lowercase equivalent */\n"; | |
4bc3dcfa KW |
342 | output_table(\@lc, "PL_latin1_lc"); |
343 | } | |
344 | ||
345 | { | |
346 | my @uc; | |
347 | for my $i (0 .. 255) { | |
348 | my $uc = uc chr $i; | |
349 | if (length $uc > 1 || ord $uc > 255) { | |
350 | $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; | |
351 | } | |
352 | $uc[$a2e[$i]] = $a2e[ord $uc]; | |
353 | } | |
354 | print $out_fh <<END; | |
355 | /* Index is $charset code point; value is its uppercase equivalent. | |
356 | * The 'mod' in the name means that codepoints whose uppercase is above 255 or | |
357 | * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ | |
358 | END | |
359 | output_table(\@uc, "PL_mod_latin1_uc"); | |
360 | } | |
361 | ||
362 | { # PL_fold | |
363 | my @ascii_fold; | |
364 | for my $i (0 .. 255) { # Initialise to identity map | |
365 | $ascii_fold[$i] = $i; | |
366 | } | |
367 | ||
368 | # Overwrite the entries that aren't identity | |
369 | for my $chr ('A' .. 'Z') { | |
370 | $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; | |
371 | } | |
372 | for my $chr ('a' .. 'z') { | |
373 | $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; | |
374 | } | |
375 | print $out_fh <<END; | |
376 | /* Index is $charset code point; For A-Z, value is a-z; for a-z, value | |
377 | * is A-Z; all other code points map to themselves */ | |
378 | END | |
379 | output_table(\@ascii_fold, "PL_fold"); | |
380 | } | |
381 | ||
382 | { | |
383 | my @latin1_fold; | |
384 | for my $i (0 .. 255) { | |
385 | my $char = chr $i; | |
386 | my $lc = lc $char; | |
387 | ||
388 | # lc and uc adequately proxy for fold-case pairs in this 0-255 | |
389 | # range | |
390 | my $uc = uc $char; | |
391 | $uc = $char if length $uc > 1 || ord $uc > 255; | |
392 | if ($lc ne $char) { | |
393 | $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; | |
394 | } | |
395 | elsif ($uc ne $char) { | |
396 | $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; | |
397 | } | |
398 | else { | |
399 | $latin1_fold[$a2e[$i]] = $a2e[$i]; | |
400 | } | |
401 | } | |
402 | print $out_fh <<END; | |
403 | /* Index is $charset code point; value is its other fold-pair equivalent | |
404 | * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is | |
405 | * the code point itself */ | |
406 | END | |
407 | output_table(\@latin1_fold, "PL_fold_latin1"); | |
408 | } | |
409 | ||
3de6d141 KW |
410 | { |
411 | # This generates the dfa table for perl extended UTF-8, which accepts | |
412 | # surrogates, non-characters, and accepts start bytes up through FE | |
413 | # (start byte FF has to be handled outside this dfa). The class numbers | |
414 | # for start bytes are constrained so that they can be used as a shift | |
415 | # count for masking off the leading one bits | |
416 | # | |
417 | # The classes are | |
418 | # 00-9F 0 | |
419 | # A0-A1 7 Not legal immediately after start bytes F0 F8 FC | |
420 | # FE | |
421 | # A2-A3 8 Not legal immediately after start bytes F0 F8 FC | |
422 | # A4-A7 9 Not legal immediately after start bytes F0 F8 | |
423 | # A8-AF 10 Not legal immediately after start bytes F0 | |
424 | # B0-BF 11 | |
425 | # C0-C4 1 | |
426 | # C5-DF 2 | |
427 | # E0 1 | |
428 | # E1-EF 3 | |
429 | # F0 12 | |
430 | # F1-F7 4 | |
431 | # F8 13 | |
432 | # F9-FB 5 | |
433 | # FC 14 | |
434 | # FD 6 | |
435 | # FE 15 | |
436 | # FF 1 | |
437 | # | |
438 | # Here's the I8 for the code points before which overlongs occur: | |
439 | # U+4000: \xF0\xB0\xA0\xA0 | |
440 | # U+40000: \xF8\xA8\xA0\xA0\xA0 | |
441 | # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0 | |
442 | # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0 | |
443 | # | |
444 | # The first part of the table maps bytes to character classes to reduce | |
445 | # the size of the transition table and create bitmasks. | |
446 | # | |
447 | # The second part is a transition table that maps a combination of a | |
448 | # state of the automaton and a character class to a new state. The | |
449 | # numbering of the original nodes is retained, but some have been split | |
450 | # so that there are new nodes. They mean: | |
451 | # N0 The initial state, and final accepting one. | |
452 | # N1 One continuation byte (A0-BF) left. This is transitioned to | |
453 | # immediately when the start byte indicates a two-byte sequence | |
454 | # N2 Two continuation bytes left. | |
455 | # N3 Three continuation bytes left. | |
456 | # N4 Four continuation bytes left. | |
457 | # N5 Five continuation bytes left. | |
458 | # N6 Start byte is F0. Continuation bytes A[0-F] are illegal | |
459 | # (overlong); the other continuations transition to N2 | |
460 | # N7 Start byte is F8. Continuation bytes A[0-7] are illegal | |
461 | # (overlong); the other continuations transition to N3 | |
462 | # N8 Start byte is FC. Continuation bytes A[0-3] are illegal | |
463 | # (overlong); the other continuations transition to N4 | |
464 | # N9 Start byte is FE. Continuation bytes A[01] are illegal | |
465 | # (overlong); the other continuations transition to N5 | |
466 | # 1 Reject. All transitions not mentioned above (except the single | |
467 | # byte ones (as they are always legal) are to this state. | |
468 | ||
469 | my $NUM_CLASSES = 16; | |
470 | my $N0 = 0; | |
471 | my $N1 = $N0 + $NUM_CLASSES; | |
472 | my $N2 = $N1 + $NUM_CLASSES; | |
473 | my $N3 = $N2 + $NUM_CLASSES; | |
474 | my $N4 = $N3 + $NUM_CLASSES; | |
475 | my $N5 = $N4 + $NUM_CLASSES; | |
476 | my $N6 = $N5 + $NUM_CLASSES; | |
477 | my $N7 = $N6 + $NUM_CLASSES; | |
478 | my $N8 = $N7 + $NUM_CLASSES; | |
479 | my $N9 = $N8 + $NUM_CLASSES; | |
480 | my $N10 = $N9 + $NUM_CLASSES; | |
481 | ||
482 | my @perl_extended_utf8_dfa; | |
483 | my @i8 = ( | |
484 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F | |
485 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F | |
486 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F | |
487 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F | |
488 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F | |
489 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F | |
490 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F | |
491 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F | |
492 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F | |
493 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F | |
494 | 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF | |
495 | 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF | |
496 | 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF | |
497 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF | |
498 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF | |
499 | 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF | |
500 | ); | |
501 | $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); | |
502 | push @perl_extended_utf8_dfa, ( | |
503 | # Class: | |
504 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
505 | 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0 | |
506 | 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1 | |
507 | 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2 | |
508 | 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3 | |
509 | 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4 | |
510 | 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5 | |
511 | ||
512 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6 | |
513 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7 | |
514 | 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8 | |
515 | 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9 | |
516 | ); | |
71525f77 | 517 | output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab", |
3de6d141 KW |
518 | $NUM_CLASSES); |
519 | } | |
520 | ||
123deead KW |
521 | { |
522 | # This generates the dfa table for strict UTF-8, which rejects | |
523 | # surrogates, non-characters, and above Unicode. | |
524 | # | |
525 | # The classes are | |
526 | # 00-9F 0 Always legal at start | |
527 | # A0 10 Not legal immediately after start bytes F0 F8 | |
528 | # A1 11 Not legal immediately after start bytes F0 F8, | |
529 | # A2-A7 12 Not legal immediately after start bytes F0 F8 F9 | |
530 | # A8,AA,AC 13 Not legal immediately after start bytes F0 F9 | |
531 | # A9,AB,AD 14 Not legal immediately after start byte F0 | |
532 | # AE 15 Not legal immediately after start byte F0 | |
533 | # AF 16 Not legal immediately after start bytes F0 | |
534 | # B[0248AC] 17 Not legal immediately after start byte F9 | |
535 | # B[1359D] 18 Not legal immediately after start byte F9 | |
536 | # B6 19 Not legal immediately after start byte F9 | |
537 | # B7 20 Not legal immediately after start byte F9 | |
538 | # BE 21 Not legal immediately after start byte F9 | |
539 | # BF 22 Not legal immediately after start byte F9 | |
540 | # C0-C4 1 (reject, all are overlong) | |
541 | # C5-DF 2 Accepts any legal continuation | |
542 | # E0 1 (reject, all are overlong) | |
543 | # E1-EF 3 Accepts any legal continuation | |
544 | # F0 8 (has overlongs) | |
545 | # F1 6 (has surrogates, non-chars) | |
546 | # F2,F4,F6 4 Accepts any legal continuation | |
547 | # F3,F5,F7 5 (has non-chars) | |
548 | # F8 9 (has overlongs, non-chars) | |
549 | # F9 7 (has non-chars, non-Unicode) | |
550 | # FA-FF 1 (reject, all are non-Unicode) | |
551 | # | |
552 | # Here's the I8 for enough code points so that you can figure out what's | |
553 | # going on: | |
554 | # | |
555 | # U+D800: \xF1\xB6\xA0\xA0 | |
556 | # U+DFFF: \xF1\xB7\xBF\xBF | |
557 | # U+FDD0: \xF1\xBF\xAE\xB0 | |
558 | # U+FDEF: \xF1\xBF\xAF\xAF | |
559 | # U+FFFE: \xF1\xBF\xBF\xBE | |
560 | # U+1FFFE: \xF3\xBF\xBF\xBE | |
561 | # U+2FFFE: \xF5\xBF\xBF\xBE | |
562 | # U+3FFFE: \xF7\xBF\xBF\xBE | |
563 | # U+4FFFE: \xF8\xA9\xBF\xBF\xBE | |
564 | # U+5FFFE: \xF8\xAB\xBF\xBF\xBE | |
565 | # U+6FFFE: \xF8\xAD\xBF\xBF\xBE | |
566 | # U+7FFFE: \xF8\xAF\xBF\xBF\xBE | |
567 | # U+8FFFE: \xF8\xB1\xBF\xBF\xBE | |
568 | # U+9FFFE: \xF8\xB3\xBF\xBF\xBE | |
569 | # U+AFFFE: \xF8\xB5\xBF\xBF\xBE | |
570 | # U+BFFFE: \xF8\xB7\xBF\xBF\xBE | |
571 | # U+CFFFE: \xF8\xB9\xBF\xBF\xBE | |
572 | # U+DFFFE: \xF8\xBB\xBF\xBF\xBE | |
573 | # U+EFFFE: \xF8\xBD\xBF\xBF\xBE | |
574 | # U+FFFFE: \xF8\xBF\xBF\xBF\xBE | |
575 | # U+10FFFE: \xF9\xA1\xBF\xBF\xBE | |
576 | # | |
577 | # The first part of the table maps bytes to character classes to reduce | |
578 | # the size of the transition table and create bitmasks. | |
579 | # | |
580 | # The second part is a transition table that maps a combination of a | |
581 | # state of the automaton and a character class to a new state. The | |
582 | # numbering of the original nodes is retained, but some have been split | |
583 | # so that there are new nodes. They mean: | |
584 | # N0 The initial state, and final accepting one. | |
585 | # N1 One continuation byte (A0-BF) left. This is transitioned to | |
586 | # immediately when the start byte indicates a two-byte sequence | |
587 | # N2 Two continuation bytes left. | |
588 | # N3 Three continuation bytes left. | |
589 | # N4 Start byte is F0. Continuation bytes A[0-F] are illegal | |
590 | # (overlong); the other continuations transition to N2 | |
591 | # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal | |
592 | # (surrogates); BF transitions to N9; the other continuations to | |
593 | # N2 | |
594 | # N6 Start byte is F[357]. Continuation byte BF transitions to N12; | |
595 | # other continuations to N2 | |
596 | # N5 Start byte is F8. Continuation bytes A[0-7] are illegal | |
597 | # (overlong); continuations A[9BDF] and B[13579BDF] transition to | |
598 | # N14; the other continuations to N3 | |
599 | # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1 | |
600 | # to N14; the other continuation bytes are illegal. | |
601 | # N9 Initial sequence is F1 BF. Continuation byte AE transitions to | |
602 | # state N10; AF to N11; BF to N13; the other continuations to N1. | |
603 | # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are | |
604 | # illegal (non-chars); the other continuations are legal | |
605 | # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are | |
606 | # illegal (non-chars); the other continuations are legal | |
607 | # N12 Initial sequence is F[357] BF. Continuation bytes BF | |
608 | # transitions to N13; the other continuations to N1 | |
609 | # N13 Initial sequence is F[1357] BF BF or F8 x BF (where x is | |
610 | # something that can lead to a non-char. Continuation bytes BE | |
611 | # and BF are illegal (non-chars); the other continuations are | |
612 | # legal | |
613 | # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1. | |
614 | # Continuation byte BF transitions to N13; the other | |
615 | # continuations to N2 | |
616 | # 1 Reject. All transitions not mentioned above (except the single | |
617 | # byte ones (as they are always legal) are to this state. | |
618 | ||
619 | my $NUM_CLASSES = 23; | |
620 | my $N0 = 0; | |
621 | my $N1 = $N0 + $NUM_CLASSES; | |
622 | my $N2 = $N1 + $NUM_CLASSES; | |
623 | my $N3 = $N2 + $NUM_CLASSES; | |
624 | my $N4 = $N3 + $NUM_CLASSES; | |
625 | my $N5 = $N4 + $NUM_CLASSES; | |
626 | my $N6 = $N5 + $NUM_CLASSES; | |
627 | my $N7 = $N6 + $NUM_CLASSES; | |
628 | my $N8 = $N7 + $NUM_CLASSES; | |
629 | my $N9 = $N8 + $NUM_CLASSES; | |
630 | my $N10 = $N9 + $NUM_CLASSES; | |
631 | my $N11 = $N10 + $NUM_CLASSES; | |
632 | my $N12 = $N11 + $NUM_CLASSES; | |
633 | my $N13 = $N12 + $NUM_CLASSES; | |
634 | my $N14 = $N13 + $NUM_CLASSES; | |
635 | ||
636 | my @strict_utf8_dfa; | |
637 | my @i8 = ( | |
638 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F | |
639 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F | |
640 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F | |
641 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F | |
642 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F | |
643 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F | |
644 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F | |
645 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F | |
646 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F | |
647 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F | |
648 | 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF | |
649 | 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF | |
650 | 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF | |
651 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF | |
652 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF | |
653 | 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF | |
654 | ); | |
655 | $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); | |
656 | push @strict_utf8_dfa, ( | |
657 | # Class: | |
658 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | |
659 | 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0 | |
660 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1 | |
661 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2 | |
662 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3 | |
663 | ||
664 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4 | |
665 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5 | |
666 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6 | |
667 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7 | |
668 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8 | |
669 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9 | |
670 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10 | |
671 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11 | |
672 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12 | |
673 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13 | |
674 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14 | |
675 | ); | |
71525f77 | 676 | output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES); |
123deead KW |
677 | } |
678 | ||
c5bfbb64 KW |
679 | { |
680 | # This generates the dfa table for C9 strict UTF-8, which rejects | |
681 | # surrogates and above Unicode, but allows non-characters,. | |
682 | # | |
683 | # The classes are | |
684 | # 00-9F 0 Always legal at start | |
685 | # A0-A1 9 Not legal immediately after start bytes F0 F8 | |
686 | # A2-A7 10 Not legal immediately after start bytes F0 F8 F9 | |
687 | # A8-AF 11 Not legal immediately after start bytes F0 F9 | |
688 | # B0-B5,B8-BF 12 Not legal immediately after start byte F9 | |
689 | # B6,B7 13 | |
690 | # C0-C4 1 (reject, all are overlong) | |
691 | # C5-DF 2 Accepts any legal continuation | |
692 | # E0 1 (reject, all are overlong) | |
693 | # E1-EF 3 Accepts any legal continuation | |
694 | # F0 6 (has overlongs) | |
695 | # F1 5 (has surrogates) | |
696 | # F2-F7 4 Accepts any legal continuation | |
697 | # F8 8 (has overlongs) | |
698 | # F9 7 (has non-Unicode) | |
699 | # FA-FF 1 (reject, all are non-Unicode) | |
700 | # | |
701 | # The first part of the table maps bytes to character classes to reduce | |
702 | # the size of the transition table and create bitmasks. | |
703 | # | |
704 | # The second part is a transition table that maps a combination of a | |
705 | # state of the automaton and a character class to a new state. The | |
706 | # numbering of the original nodes is retained, but some have been split | |
707 | # so that there are new nodes. They mean: | |
708 | # N0 The initial state, and final accepting one. | |
709 | # N1 One continuation byte (A0-BF) left. This is transitioned to | |
710 | # immediately when the start byte indicates a two-byte sequence | |
711 | # N2 Two continuation bytes left. | |
712 | # N3 Three continuation bytes left. | |
713 | # N4 Start byte is F0. Continuation bytes A[0-F] are illegal | |
714 | # (overlong); the other continuations transition to N2 | |
715 | # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the | |
716 | # other continuations transition to N2 | |
717 | # N6 Start byte is F8. Continuation bytes A[0-7] are illegal | |
718 | # (overlong); the other continuations transition to N3 | |
719 | # N7 Start byte is F9. Continuation bytes A0 and A1 transition to | |
720 | # N3; the other continuation bytes are illegal (non-Unicode) | |
721 | # 1 Reject. All transitions not mentioned above (except the single | |
722 | # byte ones (as they are always legal) are to this state. | |
723 | ||
724 | my $NUM_CLASSES = 14; | |
725 | my $N0 = 0; | |
726 | my $N1 = $N0 + $NUM_CLASSES; | |
727 | my $N2 = $N1 + $NUM_CLASSES; | |
728 | my $N3 = $N2 + $NUM_CLASSES; | |
729 | my $N4 = $N3 + $NUM_CLASSES; | |
730 | my $N5 = $N4 + $NUM_CLASSES; | |
731 | my $N6 = $N5 + $NUM_CLASSES; | |
732 | my $N7 = $N6 + $NUM_CLASSES; | |
733 | ||
734 | my @C9_utf8_dfa; | |
735 | my @i8 = ( | |
736 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F | |
737 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F | |
738 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F | |
739 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F | |
740 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F | |
741 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F | |
742 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F | |
743 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F | |
744 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F | |
745 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F | |
746 | 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF | |
747 | 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF | |
748 | 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF | |
749 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF | |
750 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF | |
751 | 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF | |
752 | ); | |
753 | $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); | |
754 | push @C9_utf8_dfa, ( | |
755 | # Class: | |
756 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 | |
757 | 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0 | |
758 | 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1 | |
759 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2 | |
760 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3 | |
761 | ||
762 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4 | |
763 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5 | |
764 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6 | |
765 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7 | |
766 | ); | |
71525f77 | 767 | output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES); |
c5bfbb64 KW |
768 | } |
769 | ||
4bc3dcfa KW |
770 | print $out_fh get_conditional_compile_line_end(); |
771 | } | |
772 | ||
6a5bc5ac | 773 | print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n"; |
4bc3dcfa KW |
774 | |
775 | read_only_bottom_close_and_rename($out_fh); |