| 1 | #!perl -w |
| 2 | use 5.015; |
| 3 | use strict; |
| 4 | use warnings; |
| 5 | use Unicode::UCD "prop_invlist"; |
| 6 | require 'regen/regen_lib.pl'; |
| 7 | |
| 8 | # This program outputs charclass_invlists.h, which contains various inversion |
| 9 | # lists in the form of C arrays that are to be used as-is for inversion lists. |
| 10 | # Thus, the lists it contains are essentially pre-compiled, and need only a |
| 11 | # light-weight fast wrapper to make them usable at run-time. |
| 12 | |
| 13 | # As such, this code knows about the internal structure of these lists, and |
| 14 | # any change made to that has to be done here as well. A random number stored |
| 15 | # in the headers is used to minimize the possibility of things getting |
| 16 | # out-of-sync, or the wrong data structure being passed. Currently that |
| 17 | # random number is: |
| 18 | my $VERSION_DATA_STRUCTURE_TYPE = 1064334010; |
| 19 | |
| 20 | my $out_fh = open_new('charclass_invlists.h', '>', |
| 21 | {style => '*', by => $0, |
| 22 | from => "Unicode::UCD"}); |
| 23 | |
| 24 | print $out_fh "/* See the generating file for comments */\n\n"; |
| 25 | |
| 26 | sub output_invlist ($$) { |
| 27 | my $name = shift; |
| 28 | my $invlist = shift; # Reference to inversion list array |
| 29 | |
| 30 | # Output the inversion list $invlist using the name $name for it. |
| 31 | # It is output in the exact internal form for inversion lists. |
| 32 | |
| 33 | my $zero_or_one; # Is the last element of the header 0, or 1 ? |
| 34 | |
| 35 | # If the first element is 0, it goes in the header, instead of the body |
| 36 | if ($invlist->[0] == 0) { |
| 37 | shift @$invlist; |
| 38 | |
| 39 | $zero_or_one = 0; |
| 40 | |
| 41 | # Add a dummy 0 at the end so that the length is constant. inversion |
| 42 | # lists are always stored with enough room so that if they change from |
| 43 | # beginning with 0, they don't have to grow. |
| 44 | push @$invlist, 0; |
| 45 | } |
| 46 | else { |
| 47 | $zero_or_one = 1; |
| 48 | } |
| 49 | |
| 50 | print $out_fh "\nUV ${name}_invlist[] = {\n"; |
| 51 | |
| 52 | print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n"; |
| 53 | print $out_fh "\t0,\t/* Current iteration position */\n"; |
| 54 | print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n"; |
| 55 | print $out_fh "\t", $zero_or_one, |
| 56 | ",\t/* 0 if this is the first element of the list proper;", |
| 57 | "\n\t\t 1 if the next element is the first */\n"; |
| 58 | |
| 59 | # The main body are the UVs passed in to this routine. Do the final |
| 60 | # element separately |
| 61 | for my $i (0 .. @$invlist - 1 - 1) { |
| 62 | print $out_fh "\t$invlist->[$i],\n"; |
| 63 | } |
| 64 | |
| 65 | # The final element does not have a trailing comma, as C can't handle it. |
| 66 | print $out_fh "\t$invlist->[-1]\n"; |
| 67 | |
| 68 | print $out_fh "};\n"; |
| 69 | } |
| 70 | |
| 71 | output_invlist("Latin1", [ 0, 256 ]); |
| 72 | output_invlist("AboveLatin1", [ 256 ]); |
| 73 | |
| 74 | # We construct lists for all the POSIX and backslash sequence character |
| 75 | # classes in two forms: |
| 76 | # 1) ones which match only in the ASCII range |
| 77 | # 2) ones which match either in the Latin1 range, or the entire Unicode range |
| 78 | # |
| 79 | # These get compiled in, and hence affect the memory footprint of every Perl |
| 80 | # program, even those not using Unicode. To minimize the size, currently |
| 81 | # the Latin1 version is generated for the beyond ASCII range except for those |
| 82 | # lists that are quite small for the entire range, such as for \s, which is 22 |
| 83 | # UVs long plus 4 UVs (currently) for the header. |
| 84 | # |
| 85 | # To save even more memory, the ASCII versions could be derived from the |
| 86 | # larger ones at runtime, saving some memory (minus the expense of the machine |
| 87 | # instructions to do so), but these are all small anyway, so their total is |
| 88 | # about 100 UVs. |
| 89 | # |
| 90 | # In the list of properties below that get generated, the L1 prefix is a fake |
| 91 | # property that means just the Latin1 range of the full property (whose name |
| 92 | # has an X prefix instead of L1). |
| 93 | |
| 94 | for my $prop (qw( |
| 95 | ASCII |
| 96 | HorizSpace |
| 97 | VertSpace |
| 98 | PerlSpace |
| 99 | XPerlSpace |
| 100 | PosixAlnum |
| 101 | L1PosixAlnum |
| 102 | PosixAlpha |
| 103 | L1PosixAlpha |
| 104 | PosixBlank |
| 105 | XPosixBlank |
| 106 | PosixCntrl |
| 107 | XPosixCntrl |
| 108 | PosixDigit |
| 109 | PosixGraph |
| 110 | L1PosixGraph |
| 111 | PosixLower |
| 112 | L1PosixLower |
| 113 | PosixPrint |
| 114 | L1PosixPrint |
| 115 | PosixPunct |
| 116 | L1PosixPunct |
| 117 | PosixSpace |
| 118 | XPosixSpace |
| 119 | PosixUpper |
| 120 | L1PosixUpper |
| 121 | PosixWord |
| 122 | L1PosixWord |
| 123 | PosixXDigit |
| 124 | XPosixXDigit |
| 125 | ) |
| 126 | ) { |
| 127 | |
| 128 | # For the Latin1 properties, we change to use the eXtended version of the |
| 129 | # base property, then go through the result and get rid of everything not |
| 130 | # in Latin1 (above 255). Actually, we retain the element that crosses the |
| 131 | # 255/256 boundary. For example, in the Word property, there is a range |
| 132 | # of code points that start at U+00F8 and goes through U+02C1. Instead of |
| 133 | # artifically cutting that off at 256 because 256 is the first code point |
| 134 | # above Latin1, we let the range go to its natural ending. That gives us |
| 135 | # extra information with no added space taken. |
| 136 | my $lookup_prop = $prop =~ s/^L1/X/r; |
| 137 | my @invlist = prop_invlist($lookup_prop); |
| 138 | |
| 139 | if ($lookup_prop ne $prop) { |
| 140 | for my $i (0 .. @invlist - 1 - 1) { |
| 141 | if ($invlist[$i] > 255) { |
| 142 | splice @invlist, $i+1; |
| 143 | last; |
| 144 | } |
| 145 | } |
| 146 | } |
| 147 | |
| 148 | output_invlist($prop, \@invlist); |
| 149 | } |
| 150 | |
| 151 | read_only_bottom_close_and_rename($out_fh) |