This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/ebcdic.pl: Add tables that partition by UTF-8 length
author Karl Williamson <khw@cpan.org>
Wed, 2 Oct 2019 21:36:19 +0000 (15:36 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 7 Nov 2019 04:22:24 +0000 (21:22 -0700)
These will be used in a future commit.  This creates equivalence classes
of ranges of code points whose UTF-8 representations are the same length.

ebcdic_tables.h
regen/ebcdic.pl

index 4e9c281..214b984 100644 (file)
@@ -412,6 +412,60 @@ SOFTWARE.
 };
 #  endif
 
+/* This table partitions all the code points of the platform into ranges which
+ * have the property that all the code points in each range have the same
+ * number of bytes in their UTF-EBCDIC representations, and the adjacent
+ * ranges have a different number of bytes.
+ *
+ * Each number in the table begins such a range, which extends up to just
+ * before the following table entry, except the final entry is understood to
+ * extend to the platform's infinity
+ */
+#  ifndef DOINIT
+    EXTCONST UV PL_partition_by_byte_length[];
+#  else
+    EXTCONST UV PL_partition_by_byte_length[] = {
+       0x00,
+       0x41,
+       0x4b,
+       0x51,
+       0x5a,
+       0x62,
+       0x6b,
+       0x70,
+       0x79,
+       0x80,
+       0x81,
+       0x8a,
+       0x91,
+       0x9a,
+       0xa1,
+       0xaa,
+       0xad,
+       0xae,
+       0xbd,
+       0xbe,
+       0xc0,
+       0xca,
+       0xd0,
+       0xda,
+       0xe0,
+       0xe1,
+       0xe2,
+       0xea,
+       0xf0,
+       0xfa,
+       0xff,
+       0x100,
+       0x400,
+       0x4000,
+       0x40000,
+       0x400000,
+       0x4000000,
+       0x40000000
+};
+#  endif
+
 #endif /* EBCDIC 1047 */
 
 #if 'A' == 193 /* EBCDIC 037 */ \
@@ -789,6 +843,62 @@ SOFTWARE.
 };
 #  endif
 
+/* This table partitions all the code points of the platform into ranges which
+ * have the property that all the code points in each range have the same
+ * number of bytes in their UTF-EBCDIC representations, and the adjacent
+ * ranges have a different number of bytes.
+ *
+ * Each number in the table begins such a range, which extends up to just
+ * before the following table entry, except the final entry is understood to
+ * extend to the platform's infinity
+ */
+#  ifndef DOINIT
+    EXTCONST UV PL_partition_by_byte_length[];
+#  else
+    EXTCONST UV PL_partition_by_byte_length[] = {
+       0x00,
+       0x41,
+       0x4b,
+       0x51,
+       0x5a,
+       0x5f,
+       0x60,
+       0x62,
+       0x6b,
+       0x70,
+       0x79,
+       0x80,
+       0x81,
+       0x8a,
+       0x91,
+       0x9a,
+       0xa1,
+       0xaa,
+       0xb0,
+       0xb1,
+       0xba,
+       0xbc,
+       0xc0,
+       0xca,
+       0xd0,
+       0xda,
+       0xe0,
+       0xe1,
+       0xe2,
+       0xea,
+       0xf0,
+       0xfa,
+       0xff,
+       0x100,
+       0x400,
+       0x4000,
+       0x40000,
+       0x400000,
+       0x4000000,
+       0x40000000
+};
+#  endif
+
 #endif /* EBCDIC 037 */
 
 #endif /* PERL_EBCDIC_TABLES_H_ */
index 9d6b87e..c0b4512 100644 (file)
@@ -245,6 +245,7 @@ shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
 foreach my $charset (@charsets) {
     # we process the whole array several times, make a copy
     my @a2e = @{get_a2n($charset)};
+    my @e2a;
 
     print $out_fh "\n" . get_conditional_compile_line_start($charset);
     print $out_fh "\n";
@@ -253,7 +254,6 @@ foreach my $charset (@charsets) {
     output_table(\@a2e, "PL_a2e");
 
     { # Construct the inverse
-        my @e2a;
         for my $i (0 .. 255) {
             $e2a[$a2e[$i]] = $i;
         }
@@ -767,6 +767,56 @@ END
         output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
     }
 
+    {   # Output the PL_partition_by_byte_length table for this $charset
+        print $out_fh <<EOF;
+/* This table partitions all the code points of the platform into ranges which
+ * have the property that all the code points in each range have the same
+ * number of bytes in their UTF-EBCDIC representations, and the adjacent
+ * ranges have a different number of bytes.
+ *
+ * Each number in the table begins such a range, which extends up to just
+ * before the following table entry, except the final entry is understood to
+ * extend to the platform's infinity
+ */
+EOF
+        output_table_start($out_fh, "UV", "PL_partition_by_byte_length");
+        print $out_fh "\t";     # indents the first entry; join() does the rest
+
+        # Each native code point in 0 .. 255 is represented by either 1 or 2
+        # bytes: 1 if its ASCII-platform equivalent ($e2a[]) is below 160,
+        # and 2 otherwise.
+        my @list;               # first code point of each range, ascending
+        push @list, 0;          # the first range starts at code point 0
+        my $pushed_range_is_length_1 = 1;   # code point 0 is taken as 1 byte
+
+        for my $i (1 .. 0xFF) {
+            my $this_code_point_is_length_1 = ($e2a[$i] < 160);
+            if ($pushed_range_is_length_1 != $this_code_point_is_length_1) {
+                push @list, $i;     # length changed, so $i begins a new range
+                $pushed_range_is_length_1 = $this_code_point_is_length_1;
+            }
+        }
+
+        # At 256 the length is 2, which begins a new range unless 255 is also 2.
+        push @list, 0x100 if $pushed_range_is_length_1;
+
+        # The remaining entries are the code points above 256 at which the
+        # length changes again.  They follow from the design of UTF-EBCDIC:
+        # each continuation byte has 5 bits of information; see utf8.h.
+        my $UTF_ACCUMULATION_SHIFT = 5;
+        push @list, (32 * (1 << (    $UTF_ACCUMULATION_SHIFT)));
+        push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT)));
+        push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT)));
+        push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT)));
+        push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT)));
+        push @list, (     (1 << (6 * $UTF_ACCUMULATION_SHIFT)));
+
+        print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list;
+        print $out_fh "\n";
+
+        output_table_end($out_fh);
+    }
+
     print $out_fh get_conditional_compile_line_end();
 }