Commit | Line | Data |
---|---|---|
4bc3dcfa KW |
1 | use v5.16.0; |
2 | use strict; | |
3 | use warnings; | |
94e72741 | 4 | use integer; |
3d7c117d MB |
5 | |
6 | BEGIN { unshift @INC, '.' } | |
7 | ||
8 | require './regen/regen_lib.pl'; | |
9 | require './regen/charset_translations.pl'; | |
4bc3dcfa KW |
10 | |
11 | # Generates the EBCDIC translation tables that were formerly hard-coded into | |
12 | # utfebcdic.h | |
13 | ||
14 | my $out_fh = open_new('ebcdic_tables.h', '>', | |
15 | {style => '*', by => $0, }); | |
16 | ||
94e72741 KW |
17 | sub get_column_headers ($$;$) { |
18 | my ($row_hdr_len, $field_width, $dfa_columns) = @_; | |
19 | my $format; | |
20 | my $final_column_format; | |
21 | my $num_columns; | |
22 | ||
23 | if (defined $dfa_columns) { | |
24 | $num_columns = $dfa_columns; | |
25 | ||
26 | # Trailing blank to correspond with commas in the rows below | |
27 | $format = "%${field_width}d "; | |
28 | } | |
29 | else { # Is a regular table | |
30 | $num_columns = 16; | |
31 | ||
32 | # Use blanks to separate the fields | |
33 | $format = " " x ( $field_width | |
34 | - 2); # For the '_X' | |
35 | $format .= "_%X "; # Again, trailing blank over the commas below | |
36 | } | |
37 | ||
38 | my $header = "/*" . " " x ($row_hdr_len - length "/*"); | |
39 | ||
40 | # All but the final column | |
41 | $header .= sprintf($format, $_) for 0 .. $num_columns - 2; | |
42 | ||
43 | # Get rid of trailing blank, so that the final column takes up one less | |
44 | # space so that the "*/" doesn't extend past the commas in the rows below | |
45 | chop $header; | |
46 | $header .= sprintf $format, $num_columns - 1; | |
47 | ||
48 | # Again, remove trailing blank | |
49 | chop $header; | |
50 | ||
51 | return $header . "*/\n"; | |
52 | } | |
53 | ||
702cfe48 | 54 | sub output_table ($$;$) { |
4bc3dcfa KW |
55 | my $table_ref = shift; |
56 | my $name = shift; | |
57 | ||
94e72741 KW |
58 | # 0 => print in decimal |
59 | # 1 => print in hex (translates code point to code point) | |
60 | # >= 2 => is a dfa table, like http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ | |
61 | # The number is how many columns in the part after the code point | |
62 | # portion. | |
63 | # | |
64 | # code point tables in hex areasier to debug, but don't fit into 80 | |
65 | # columns | |
66 | my $type = shift // 1; | |
67 | ||
68 | my $print_in_hex = $type == 1; | |
69 | my $is_dfa = ($type >= 2) ? $type : 0; | |
70 | my $columns_after_256 = 16; | |
71 | ||
72 | die "Requres 256 entries in table $name, got @$table_ref" | |
73 | if ! $is_dfa && @$table_ref != 256; | |
74 | if (! $is_dfa) { | |
75 | die "Requres 256 entries in table $name, got @$table_ref" | |
76 | if @$table_ref != 256; | |
77 | } | |
78 | else { | |
79 | $columns_after_256 = $is_dfa; | |
80 | ||
81 | print $out_fh <<'EOF'; | |
82 | ||
83 | /* The table below is adapted from | |
84 | * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ | |
85 | * See copyright notice at the beginning of this file. | |
86 | */ | |
87 | ||
88 | EOF | |
89 | } | |
90 | ||
91 | # Highest number in the table | |
92 | my $max_entry = 0; | |
93 | $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref; | |
43bf0d50 | 94 | |
94e72741 KW |
95 | # We assume that every table has at least one two digit entry, and none |
96 | # are more than three digit. | |
97 | my $field_width = ($print_in_hex) | |
98 | ? 4 | |
99 | : (($max_entry) > 99 ? 3 : 2); | |
4bc3dcfa | 100 | |
94e72741 KW |
101 | my $row_hdr_length; |
102 | my $node_number_field_width; | |
103 | my $node_value_field_width; | |
104 | ||
105 | # dfa tables have a special header for the rows in the transitions part of | |
106 | # the table. It is longer than the regular one. | |
107 | if ($is_dfa) { | |
108 | my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1; | |
109 | $node_number_field_width = ($max_node_number > 9) ? 2 : 1; | |
110 | $node_value_field_width = ($max_node_number * $columns_after_256 > 99) | |
111 | ? 3 : 2; | |
112 | # The header starts with this template, and adds in the number of | |
113 | # digits needed to represent the maximum node number and its value | |
114 | $row_hdr_length = length("/*N=*/") | |
115 | + $node_number_field_width | |
116 | + $node_value_field_width; | |
117 | } | |
118 | else { | |
119 | $row_hdr_length = length "/*_X*/"; # Template for what the header | |
120 | # looks like | |
121 | } | |
122 | ||
123 | # The table may not be representable in 8 bits. | |
124 | my $TYPE = 'U8'; | |
125 | $TYPE = 'U16' if grep { $_ > 255 } @$table_ref; | |
126 | ||
127 | my $declaration = "EXTCONST $TYPE $name\[\]"; | |
c05125c5 KW |
128 | print $out_fh <<EOF; |
129 | # ifndef DOINIT | |
130 | # $declaration; | |
131 | # else | |
132 | # $declaration = { | |
133 | EOF | |
4bc3dcfa | 134 | |
94e72741 KW |
135 | # First the headers for the columns |
136 | print $out_fh get_column_headers($row_hdr_length, $field_width); | |
137 | ||
138 | # Now the table body | |
139 | my $count = @$table_ref; | |
140 | my $last_was_nl = 1; | |
141 | ||
142 | # Print each element individually, arranged in rows of columns | |
143 | for my $i (0 .. $count - 1) { | |
144 | ||
145 | # Node number for here is -1 until get into the dfa state transitions | |
146 | my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256; | |
147 | ||
148 | # Print row header at beginning of each row | |
149 | if ($last_was_nl) { | |
150 | if ($node >= 0) { | |
151 | printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node, | |
152 | $node_value_field_width, $i - 256; | |
153 | } | |
154 | else { # Otherwise is regular row; print its number | |
155 | printf $out_fh "/*%X_", $i / 16; | |
156 | ||
157 | # These rows in a dfa table require extra space so columns | |
158 | # will align vertically (because the Ndd=ddd requires extra | |
159 | # space) | |
160 | if ($is_dfa) { | |
161 | print $out_fh " " x ( $node_number_field_width | |
162 | + $node_value_field_width); | |
163 | } | |
164 | print $out_fh "*/"; | |
165 | } | |
166 | } | |
167 | ||
43bf0d50 | 168 | if ($print_in_hex) { |
702cfe48 | 169 | printf $out_fh "0x%02X", $table_ref->[$i]; |
43bf0d50 KW |
170 | } |
171 | else { | |
94e72741 KW |
172 | printf $out_fh "%${field_width}d", $table_ref->[$i]; |
173 | } | |
174 | ||
175 | print $out_fh ",", if $i < $count -1; # No comma on final entry | |
176 | ||
177 | # Add \n if at end of row, which is 16 columns until we get to the | |
178 | # transitions part | |
179 | if ( ($node < 0 && $i % 16 == 15) | |
180 | || ($node >= 0 && ($i -256) % $columns_after_256 | |
181 | == $columns_after_256 - 1)) | |
182 | { | |
183 | print $out_fh "\n"; | |
184 | $last_was_nl = 1; | |
185 | } | |
186 | else { | |
187 | $last_was_nl = 0; | |
43bf0d50 | 188 | } |
4bc3dcfa | 189 | } |
94e72741 KW |
190 | |
191 | # Print column footer | |
192 | print $out_fh get_column_headers($row_hdr_length, $field_width, | |
193 | ($is_dfa) ? $columns_after_256 : undef); | |
194 | ||
c05125c5 | 195 | print $out_fh "};\n# endif\n\n"; |
4bc3dcfa KW |
196 | } |
197 | ||
3de6d141 | 198 | print $out_fh <<'END'; |
4bc3dcfa | 199 | |
6a5bc5ac KW |
200 | #ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */ |
201 | #define PERL_EBCDIC_TABLES_H_ 1 | |
4bc3dcfa KW |
202 | |
203 | /* This file contains definitions for various tables used in EBCDIC handling. | |
3de6d141 KW |
204 | * More info is in utfebcdic.h |
205 | * | |
206 | * Some of the tables are adapted from | |
207 | * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ | |
208 | * which requires this copyright notice: | |
209 | ||
210 | Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> | |
211 | ||
212 | Permission is hereby granted, free of charge, to any person obtaining a copy of | |
213 | this software and associated documentation files (the "Software"), to deal in | |
214 | the Software without restriction, including without limitation the rights to | |
215 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
216 | of the Software, and to permit persons to whom the Software is furnished to do | |
217 | so, subject to the following conditions: | |
218 | ||
219 | The above copyright notice and this permission notice shall be included in all | |
220 | copies or substantial portions of the Software. | |
221 | ||
222 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
223 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
224 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
225 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
226 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
227 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
228 | SOFTWARE. | |
229 | ||
230 | */ | |
4bc3dcfa KW |
231 | END |
232 | ||
233 | my @charsets = get_supported_code_pages(); | |
234 | shift @charsets; # ASCII is the 0th, and we don't deal with that here. | |
235 | foreach my $charset (@charsets) { | |
c30a0cf2 TC |
236 | # we process the whole array several times, make a copy |
237 | my @a2e = @{get_a2n($charset)}; | |
4bc3dcfa KW |
238 | |
239 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
240 | print $out_fh "\n"; | |
241 | ||
242 | print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; | |
243 | output_table(\@a2e, "PL_a2e"); | |
244 | ||
245 | { # Construct the inverse | |
246 | my @e2a; | |
247 | for my $i (0 .. 255) { | |
248 | $e2a[$a2e[$i]] = $i; | |
249 | } | |
250 | print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; | |
251 | output_table(\@e2a, "PL_e2a"); | |
252 | } | |
253 | ||
e0dcdb0a | 254 | my @i82utf = @{get_I8_2_utf($charset)}; |
4bc3dcfa KW |
255 | print $out_fh <<END; |
256 | /* (Confusingly named) Index is $charset I8 byte; value is | |
257 | * $charset UTF-EBCDIC equivalent */ | |
258 | END | |
259 | output_table(\@i82utf, "PL_utf2e"); | |
260 | ||
261 | { #Construct the inverse | |
262 | my @utf2i8; | |
263 | for my $i (0 .. 255) { | |
264 | $utf2i8[$i82utf[$i]] = $i; | |
265 | } | |
266 | print $out_fh <<END; | |
267 | /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is | |
268 | * $charset I8 equivalent */ | |
269 | END | |
270 | output_table(\@utf2i8, "PL_e2utf"); | |
271 | } | |
272 | ||
273 | { | |
274 | my @utf8skip; | |
275 | ||
276 | # These are invariants or continuation bytes. | |
277 | for my $i (0 .. 0xBF) { | |
278 | $utf8skip[$i82utf[$i]] = 1; | |
279 | } | |
280 | ||
281 | # These are start bytes; The skip is the number of consecutive highest | |
282 | # order 1-bits (up to 7) | |
283 | for my $i (0xC0 .. 255) { | |
284 | my $count; | |
c0236afe KW |
285 | if ($i == 0b11111111) { |
286 | no warnings 'once'; | |
287 | $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES; | |
288 | } | |
289 | elsif (($i & 0b11111110) == 0b11111110) { | |
4bc3dcfa KW |
290 | $count= 7; |
291 | } | |
292 | elsif (($i & 0b11111100) == 0b11111100) { | |
293 | $count= 6; | |
294 | } | |
295 | elsif (($i & 0b11111000) == 0b11111000) { | |
296 | $count= 5; | |
297 | } | |
298 | elsif (($i & 0b11110000) == 0b11110000) { | |
299 | $count= 4; | |
300 | } | |
301 | elsif (($i & 0b11100000) == 0b11100000) { | |
302 | $count= 3; | |
303 | } | |
304 | elsif (($i & 0b11000000) == 0b11000000) { | |
305 | $count= 2; | |
306 | } | |
307 | else { | |
308 | die "Something wrong for UTF8SKIP calculation for $i"; | |
309 | } | |
310 | $utf8skip[$i82utf[$i]] = $count; | |
311 | } | |
312 | ||
313 | print $out_fh <<END; | |
4719093e KW |
314 | /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes |
315 | * (including for overlongs); 1 for continuation. Adapted from the shadow | |
316 | * flags table in tr16. The entries marked 9 in tr16 are continuation bytes | |
317 | * and are marked as length 1 here so that we can recover. */ | |
4bc3dcfa | 318 | END |
702cfe48 KW |
319 | output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print |
320 | # in hex | |
4bc3dcfa KW |
321 | } |
322 | ||
323 | use feature 'unicode_strings'; | |
324 | ||
325 | { | |
326 | my @lc; | |
327 | for my $i (0 .. 255) { | |
328 | $lc[$a2e[$i]] = $a2e[ord lc chr $i]; | |
329 | } | |
94e72741 KW |
330 | print $out_fh |
331 | "/* Index is $charset code point; value is its lowercase equivalent */\n"; | |
4bc3dcfa KW |
332 | output_table(\@lc, "PL_latin1_lc"); |
333 | } | |
334 | ||
335 | { | |
336 | my @uc; | |
337 | for my $i (0 .. 255) { | |
338 | my $uc = uc chr $i; | |
339 | if (length $uc > 1 || ord $uc > 255) { | |
340 | $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; | |
341 | } | |
342 | $uc[$a2e[$i]] = $a2e[ord $uc]; | |
343 | } | |
344 | print $out_fh <<END; | |
345 | /* Index is $charset code point; value is its uppercase equivalent. | |
346 | * The 'mod' in the name means that codepoints whose uppercase is above 255 or | |
347 | * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ | |
348 | END | |
349 | output_table(\@uc, "PL_mod_latin1_uc"); | |
350 | } | |
351 | ||
352 | { # PL_fold | |
353 | my @ascii_fold; | |
354 | for my $i (0 .. 255) { # Initialise to identity map | |
355 | $ascii_fold[$i] = $i; | |
356 | } | |
357 | ||
358 | # Overwrite the entries that aren't identity | |
359 | for my $chr ('A' .. 'Z') { | |
360 | $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; | |
361 | } | |
362 | for my $chr ('a' .. 'z') { | |
363 | $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; | |
364 | } | |
365 | print $out_fh <<END; | |
366 | /* Index is $charset code point; For A-Z, value is a-z; for a-z, value | |
367 | * is A-Z; all other code points map to themselves */ | |
368 | END | |
369 | output_table(\@ascii_fold, "PL_fold"); | |
370 | } | |
371 | ||
372 | { | |
373 | my @latin1_fold; | |
374 | for my $i (0 .. 255) { | |
375 | my $char = chr $i; | |
376 | my $lc = lc $char; | |
377 | ||
378 | # lc and uc adequately proxy for fold-case pairs in this 0-255 | |
379 | # range | |
380 | my $uc = uc $char; | |
381 | $uc = $char if length $uc > 1 || ord $uc > 255; | |
382 | if ($lc ne $char) { | |
383 | $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; | |
384 | } | |
385 | elsif ($uc ne $char) { | |
386 | $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; | |
387 | } | |
388 | else { | |
389 | $latin1_fold[$a2e[$i]] = $a2e[$i]; | |
390 | } | |
391 | } | |
392 | print $out_fh <<END; | |
393 | /* Index is $charset code point; value is its other fold-pair equivalent | |
394 | * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is | |
395 | * the code point itself */ | |
396 | END | |
397 | output_table(\@latin1_fold, "PL_fold_latin1"); | |
398 | } | |
399 | ||
3de6d141 KW |
400 | { |
401 | # This generates the dfa table for perl extended UTF-8, which accepts | |
402 | # surrogates, non-characters, and accepts start bytes up through FE | |
403 | # (start byte FF has to be handled outside this dfa). The class numbers | |
404 | # for start bytes are constrained so that they can be used as a shift | |
405 | # count for masking off the leading one bits | |
406 | # | |
407 | # The classes are | |
408 | # 00-9F 0 | |
409 | # A0-A1 7 Not legal immediately after start bytes F0 F8 FC | |
410 | # FE | |
411 | # A2-A3 8 Not legal immediately after start bytes F0 F8 FC | |
412 | # A4-A7 9 Not legal immediately after start bytes F0 F8 | |
413 | # A8-AF 10 Not legal immediately after start bytes F0 | |
414 | # B0-BF 11 | |
415 | # C0-C4 1 | |
416 | # C5-DF 2 | |
417 | # E0 1 | |
418 | # E1-EF 3 | |
419 | # F0 12 | |
420 | # F1-F7 4 | |
421 | # F8 13 | |
422 | # F9-FB 5 | |
423 | # FC 14 | |
424 | # FD 6 | |
425 | # FE 15 | |
426 | # FF 1 | |
427 | # | |
428 | # Here's the I8 for the code points before which overlongs occur: | |
429 | # U+4000: \xF0\xB0\xA0\xA0 | |
430 | # U+40000: \xF8\xA8\xA0\xA0\xA0 | |
431 | # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0 | |
432 | # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0 | |
433 | # | |
434 | # The first part of the table maps bytes to character classes to reduce | |
435 | # the size of the transition table and create bitmasks. | |
436 | # | |
437 | # The second part is a transition table that maps a combination of a | |
438 | # state of the automaton and a character class to a new state. The | |
439 | # numbering of the original nodes is retained, but some have been split | |
440 | # so that there are new nodes. They mean: | |
441 | # N0 The initial state, and final accepting one. | |
442 | # N1 One continuation byte (A0-BF) left. This is transitioned to | |
443 | # immediately when the start byte indicates a two-byte sequence | |
444 | # N2 Two continuation bytes left. | |
445 | # N3 Three continuation bytes left. | |
446 | # N4 Four continuation bytes left. | |
447 | # N5 Five continuation bytes left. | |
448 | # N6 Start byte is F0. Continuation bytes A[0-F] are illegal | |
449 | # (overlong); the other continuations transition to N2 | |
450 | # N7 Start byte is F8. Continuation bytes A[0-7] are illegal | |
451 | # (overlong); the other continuations transition to N3 | |
452 | # N8 Start byte is FC. Continuation bytes A[0-3] are illegal | |
453 | # (overlong); the other continuations transition to N4 | |
454 | # N9 Start byte is FE. Continuation bytes A[01] are illegal | |
455 | # (overlong); the other continuations transition to N5 | |
456 | # 1 Reject. All transitions not mentioned above (except the single | |
457 | # byte ones (as they are always legal) are to this state. | |
458 | ||
459 | my $NUM_CLASSES = 16; | |
460 | my $N0 = 0; | |
461 | my $N1 = $N0 + $NUM_CLASSES; | |
462 | my $N2 = $N1 + $NUM_CLASSES; | |
463 | my $N3 = $N2 + $NUM_CLASSES; | |
464 | my $N4 = $N3 + $NUM_CLASSES; | |
465 | my $N5 = $N4 + $NUM_CLASSES; | |
466 | my $N6 = $N5 + $NUM_CLASSES; | |
467 | my $N7 = $N6 + $NUM_CLASSES; | |
468 | my $N8 = $N7 + $NUM_CLASSES; | |
469 | my $N9 = $N8 + $NUM_CLASSES; | |
470 | my $N10 = $N9 + $NUM_CLASSES; | |
471 | ||
472 | my @perl_extended_utf8_dfa; | |
473 | my @i8 = ( | |
474 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F | |
475 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F | |
476 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F | |
477 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F | |
478 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F | |
479 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F | |
480 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F | |
481 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F | |
482 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F | |
483 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F | |
484 | 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF | |
485 | 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF | |
486 | 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF | |
487 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF | |
488 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF | |
489 | 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF | |
490 | ); | |
491 | $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); | |
492 | push @perl_extended_utf8_dfa, ( | |
493 | # Class: | |
494 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
495 | 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0 | |
496 | 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1 | |
497 | 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2 | |
498 | 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3 | |
499 | 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4 | |
500 | 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5 | |
501 | ||
502 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6 | |
503 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7 | |
504 | 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8 | |
505 | 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9 | |
506 | ); | |
71525f77 | 507 | output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab", |
3de6d141 KW |
508 | $NUM_CLASSES); |
509 | } | |
510 | ||
123deead KW |
511 | { |
512 | # This generates the dfa table for strict UTF-8, which rejects | |
513 | # surrogates, non-characters, and above Unicode. | |
514 | # | |
515 | # The classes are | |
516 | # 00-9F 0 Always legal at start | |
517 | # A0 10 Not legal immediately after start bytes F0 F8 | |
518 | # A1 11 Not legal immediately after start bytes F0 F8, | |
519 | # A2-A7 12 Not legal immediately after start bytes F0 F8 F9 | |
520 | # A8,AA,AC 13 Not legal immediately after start bytes F0 F9 | |
521 | # A9,AB,AD 14 Not legal immediately after start byte F0 | |
522 | # AE 15 Not legal immediately after start byte F0 | |
523 | # AF 16 Not legal immediately after start bytes F0 | |
524 | # B[0248AC] 17 Not legal immediately after start byte F9 | |
525 | # B[1359D] 18 Not legal immediately after start byte F9 | |
526 | # B6 19 Not legal immediately after start byte F9 | |
527 | # B7 20 Not legal immediately after start byte F9 | |
528 | # BE 21 Not legal immediately after start byte F9 | |
529 | # BF 22 Not legal immediately after start byte F9 | |
530 | # C0-C4 1 (reject, all are overlong) | |
531 | # C5-DF 2 Accepts any legal continuation | |
532 | # E0 1 (reject, all are overlong) | |
533 | # E1-EF 3 Accepts any legal continuation | |
534 | # F0 8 (has overlongs) | |
535 | # F1 6 (has surrogates, non-chars) | |
536 | # F2,F4,F6 4 Accepts any legal continuation | |
537 | # F3,F5,F7 5 (has non-chars) | |
538 | # F8 9 (has overlongs, non-chars) | |
539 | # F9 7 (has non-chars, non-Unicode) | |
540 | # FA-FF 1 (reject, all are non-Unicode) | |
541 | # | |
542 | # Here's the I8 for enough code points so that you can figure out what's | |
543 | # going on: | |
544 | # | |
545 | # U+D800: \xF1\xB6\xA0\xA0 | |
546 | # U+DFFF: \xF1\xB7\xBF\xBF | |
547 | # U+FDD0: \xF1\xBF\xAE\xB0 | |
548 | # U+FDEF: \xF1\xBF\xAF\xAF | |
549 | # U+FFFE: \xF1\xBF\xBF\xBE | |
550 | # U+1FFFE: \xF3\xBF\xBF\xBE | |
551 | # U+2FFFE: \xF5\xBF\xBF\xBE | |
552 | # U+3FFFE: \xF7\xBF\xBF\xBE | |
553 | # U+4FFFE: \xF8\xA9\xBF\xBF\xBE | |
554 | # U+5FFFE: \xF8\xAB\xBF\xBF\xBE | |
555 | # U+6FFFE: \xF8\xAD\xBF\xBF\xBE | |
556 | # U+7FFFE: \xF8\xAF\xBF\xBF\xBE | |
557 | # U+8FFFE: \xF8\xB1\xBF\xBF\xBE | |
558 | # U+9FFFE: \xF8\xB3\xBF\xBF\xBE | |
559 | # U+AFFFE: \xF8\xB5\xBF\xBF\xBE | |
560 | # U+BFFFE: \xF8\xB7\xBF\xBF\xBE | |
561 | # U+CFFFE: \xF8\xB9\xBF\xBF\xBE | |
562 | # U+DFFFE: \xF8\xBB\xBF\xBF\xBE | |
563 | # U+EFFFE: \xF8\xBD\xBF\xBF\xBE | |
564 | # U+FFFFE: \xF8\xBF\xBF\xBF\xBE | |
565 | # U+10FFFE: \xF9\xA1\xBF\xBF\xBE | |
566 | # | |
567 | # The first part of the table maps bytes to character classes to reduce | |
568 | # the size of the transition table and create bitmasks. | |
569 | # | |
570 | # The second part is a transition table that maps a combination of a | |
571 | # state of the automaton and a character class to a new state. The | |
572 | # numbering of the original nodes is retained, but some have been split | |
573 | # so that there are new nodes. They mean: | |
574 | # N0 The initial state, and final accepting one. | |
575 | # N1 One continuation byte (A0-BF) left. This is transitioned to | |
576 | # immediately when the start byte indicates a two-byte sequence | |
577 | # N2 Two continuation bytes left. | |
578 | # N3 Three continuation bytes left. | |
579 | # N4 Start byte is F0. Continuation bytes A[0-F] are illegal | |
580 | # (overlong); the other continuations transition to N2 | |
581 | # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal | |
582 | # (surrogates); BF transitions to N9; the other continuations to | |
583 | # N2 | |
584 | # N6 Start byte is F[357]. Continuation byte BF transitions to N12; | |
585 | # other continuations to N2 | |
586 | # N5 Start byte is F8. Continuation bytes A[0-7] are illegal | |
587 | # (overlong); continuations A[9BDF] and B[13579BDF] transition to | |
588 | # N14; the other continuations to N3 | |
589 | # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1 | |
590 | # to N14; the other continuation bytes are illegal. | |
591 | # N9 Initial sequence is F1 BF. Continuation byte AE transitions to | |
592 | # state N10; AF to N11; BF to N13; the other continuations to N1. | |
593 | # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are | |
594 | # illegal (non-chars); the other continuations are legal | |
595 | # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are | |
596 | # illegal (non-chars); the other continuations are legal | |
597 | # N12 Initial sequence is F[357] BF. Continuation bytes BF | |
598 | # transitions to N13; the other continuations to N1 | |
599 | # N13 Initial sequence is F[1357] BF BF or F8 x BF (where x is | |
600 | # something that can lead to a non-char. Continuation bytes BE | |
601 | # and BF are illegal (non-chars); the other continuations are | |
602 | # legal | |
603 | # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1. | |
604 | # Continuation byte BF transitions to N13; the other | |
605 | # continuations to N2 | |
606 | # 1 Reject. All transitions not mentioned above (except the single | |
607 | # byte ones (as they are always legal) are to this state. | |
608 | ||
609 | my $NUM_CLASSES = 23; | |
610 | my $N0 = 0; | |
611 | my $N1 = $N0 + $NUM_CLASSES; | |
612 | my $N2 = $N1 + $NUM_CLASSES; | |
613 | my $N3 = $N2 + $NUM_CLASSES; | |
614 | my $N4 = $N3 + $NUM_CLASSES; | |
615 | my $N5 = $N4 + $NUM_CLASSES; | |
616 | my $N6 = $N5 + $NUM_CLASSES; | |
617 | my $N7 = $N6 + $NUM_CLASSES; | |
618 | my $N8 = $N7 + $NUM_CLASSES; | |
619 | my $N9 = $N8 + $NUM_CLASSES; | |
620 | my $N10 = $N9 + $NUM_CLASSES; | |
621 | my $N11 = $N10 + $NUM_CLASSES; | |
622 | my $N12 = $N11 + $NUM_CLASSES; | |
623 | my $N13 = $N12 + $NUM_CLASSES; | |
624 | my $N14 = $N13 + $NUM_CLASSES; | |
625 | ||
626 | my @strict_utf8_dfa; | |
627 | my @i8 = ( | |
628 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F | |
629 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F | |
630 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F | |
631 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F | |
632 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F | |
633 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F | |
634 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F | |
635 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F | |
636 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F | |
637 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F | |
638 | 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF | |
639 | 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF | |
640 | 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF | |
641 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF | |
642 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF | |
643 | 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF | |
644 | ); | |
645 | $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); | |
646 | push @strict_utf8_dfa, ( | |
647 | # Class: | |
648 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | |
649 | 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0 | |
650 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1 | |
651 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2 | |
652 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3 | |
653 | ||
654 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4 | |
655 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5 | |
656 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6 | |
657 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7 | |
658 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8 | |
659 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9 | |
660 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10 | |
661 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11 | |
662 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12 | |
663 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13 | |
664 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14 | |
665 | ); | |
71525f77 | 666 | output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES); |
123deead KW |
667 | } |
668 | ||
c5bfbb64 KW |
669 | { |
670 | # This generates the dfa table for C9 strict UTF-8, which rejects | |
671 | # surrogates and above Unicode, but allows non-characters,. | |
672 | # | |
673 | # The classes are | |
674 | # 00-9F 0 Always legal at start | |
675 | # A0-A1 9 Not legal immediately after start bytes F0 F8 | |
676 | # A2-A7 10 Not legal immediately after start bytes F0 F8 F9 | |
677 | # A8-AF 11 Not legal immediately after start bytes F0 F9 | |
678 | # B0-B5,B8-BF 12 Not legal immediately after start byte F9 | |
679 | # B6,B7 13 | |
680 | # C0-C4 1 (reject, all are overlong) | |
681 | # C5-DF 2 Accepts any legal continuation | |
682 | # E0 1 (reject, all are overlong) | |
683 | # E1-EF 3 Accepts any legal continuation | |
684 | # F0 6 (has overlongs) | |
685 | # F1 5 (has surrogates) | |
686 | # F2-F7 4 Accepts any legal continuation | |
687 | # F8 8 (has overlongs) | |
688 | # F9 7 (has non-Unicode) | |
689 | # FA-FF 1 (reject, all are non-Unicode) | |
690 | # | |
691 | # The first part of the table maps bytes to character classes to reduce | |
692 | # the size of the transition table and create bitmasks. | |
693 | # | |
694 | # The second part is a transition table that maps a combination of a | |
695 | # state of the automaton and a character class to a new state. The | |
696 | # numbering of the original nodes is retained, but some have been split | |
697 | # so that there are new nodes. They mean: | |
698 | # N0 The initial state, and final accepting one. | |
699 | # N1 One continuation byte (A0-BF) left. This is transitioned to | |
700 | # immediately when the start byte indicates a two-byte sequence | |
701 | # N2 Two continuation bytes left. | |
702 | # N3 Three continuation bytes left. | |
703 | # N4 Start byte is F0. Continuation bytes A[0-F] are illegal | |
704 | # (overlong); the other continuations transition to N2 | |
705 | # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the | |
706 | # other continuations transition to N2 | |
707 | # N6 Start byte is F8. Continuation bytes A[0-7] are illegal | |
708 | # (overlong); the other continuations transition to N3 | |
709 | # N7 Start byte is F9. Continuation bytes A0 and A1 transition to | |
710 | # N3; the other continuation bytes are illegal (non-Unicode) | |
711 | # 1 Reject. All transitions not mentioned above (except the single | |
712 | # byte ones (as they are always legal) are to this state. | |
713 | ||
714 | my $NUM_CLASSES = 14; | |
715 | my $N0 = 0; | |
716 | my $N1 = $N0 + $NUM_CLASSES; | |
717 | my $N2 = $N1 + $NUM_CLASSES; | |
718 | my $N3 = $N2 + $NUM_CLASSES; | |
719 | my $N4 = $N3 + $NUM_CLASSES; | |
720 | my $N5 = $N4 + $NUM_CLASSES; | |
721 | my $N6 = $N5 + $NUM_CLASSES; | |
722 | my $N7 = $N6 + $NUM_CLASSES; | |
723 | ||
724 | my @C9_utf8_dfa; | |
725 | my @i8 = ( | |
726 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F | |
727 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F | |
728 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F | |
729 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F | |
730 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F | |
731 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F | |
732 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F | |
733 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F | |
734 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F | |
735 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F | |
736 | 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF | |
737 | 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF | |
738 | 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF | |
739 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF | |
740 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF | |
741 | 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF | |
742 | ); | |
743 | $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); | |
744 | push @C9_utf8_dfa, ( | |
745 | # Class: | |
746 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 | |
747 | 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0 | |
748 | 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1 | |
749 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2 | |
750 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3 | |
751 | ||
752 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4 | |
753 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5 | |
754 | 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6 | |
755 | 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7 | |
756 | ); | |
71525f77 | 757 | output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES); |
c5bfbb64 KW |
758 | } |
759 | ||
4bc3dcfa KW |
760 | print $out_fh get_conditional_compile_line_end(); |
761 | } | |
762 | ||
6a5bc5ac | 763 | print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n"; |
4bc3dcfa KW |
764 | |
765 | read_only_bottom_close_and_rename($out_fh); |