This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
basic.t: Provide descriptions for all unit tests
[perl5.git] / regen / mph.pl
CommitLineData
1a4d3814
YO
1package MinimalPerfectHash;
2use strict;
3use warnings;
4use Data::Dumper;
5use Carp;
6use Text::Wrap;
00d503c4 7use bignum; # Otherwise fails on 32-bit systems
1a4d3814
YO
8
9my $DEBUG= 0;
10my $RSHIFT= 8;
11my $FNV_CONST= 16777619;
12
db027c8a
KW
13# The basic idea is that you have a two level structure, and effectively
14# hash the key twice.
15#
16# The first hash finds a bucket in the array which contains a seed which
17# is used for the second hash, which then leads to a bucket with key
18# data which is compared against to determine if the key is a match.
19#
20# If the first hash finds no seed, then the key cannot match.
21#
22# In our case we cheat a bit, and hash the key only once, but use the
23# low bits for the first lookup and the high-bits for the second.
24#
25# So for instance:
26#
27# h= (h >> RSHIFT) ^ s;
28#
29# is how the second hash is computed. We right shift the original hash
30# value and then xor in the seed2, which will be non-zero.
31#
32# That then gives us the bucket which contains the key data we need to
33# match for a valid key.
34
1a4d3814
YO
35sub _fnv {
36 my ($key, $seed)= @_;
1a4d3814
YO
37 my $hash = 0+$seed;
38 foreach my $char (split //, $key) {
39 $hash = $hash ^ ord($char);
40 $hash = ($hash * $FNV_CONST) & 0xFFFFFFFF;
41 }
42 return $hash;
43}
44
45sub build_perfect_hash {
46 my ($hash, $split_pos)= @_;
47
48 my $n= 0+keys %$hash;
49 my $max_h= 0xFFFFFFFF;
50 $max_h -= $max_h % $n; # this just avoids a tiny bit bias
51 my $seed1= unpack("N", "Perl") - 1;
52 my $hash_to_key;
53 my $key_to_hash;
54 my $length_all_keys;
55 my $key_buckets;
56 SEED1:
57 for ($seed1++;1;$seed1++) {
58 my %hash_to_key;
59 my %key_to_hash;
60 my %key_buckets;
61 my %high;
62 $length_all_keys= 0;
63 foreach my $key (sort keys %$hash) {
64 $length_all_keys += length $key;
65 my $h= _fnv($key,$seed1);
66 next SEED1 if $h >= $max_h; # check if this hash would bias, and if so find a new seed
67 next SEED1 if exists $hash_to_key{$h};
68 next SEED1 if $high{$h >> $RSHIFT}++;
69 $hash_to_key{$h}= $key;
70 $key_to_hash{$key}= $h;
71 push @{$key_buckets{$h % $n}}, $key;
72 }
73 $hash_to_key= \%hash_to_key;
74 $key_to_hash= \%key_to_hash;
75 $key_buckets= \%key_buckets;
76 last SEED1;
77 }
78
79 my %token;
80 my @first_level;
81 my @second_level;
82 foreach my $first_idx (sort { @{$key_buckets->{$b}} <=> @{$key_buckets->{$a}} || $a <=> $b } keys %$key_buckets) {
83 my $keys= $key_buckets->{$first_idx};
84 #printf "got %d keys in bucket %d\n", 0+@$keys, $first_idx;
85 my $seed2;
86 SEED2:
87 for ($seed2=1;1;$seed2++) {
88 goto FIND_SEED if $seed2 > 0xFFFF;
89 my @idx= map {
90 ( ( ( $key_to_hash->{$_} >> $RSHIFT ) ^ $seed2 ) & 0xFFFFFFFF ) % $n
91 } @$keys;
92 my %seen;
93 next SEED2 if grep { $second_level[$_] || $seen{$_}++ } @idx;
94 $first_level[$first_idx]= $seed2;
95 @second_level[@idx]= map {
96 my $sp= $split_pos->{$_} // die "no split pos for '$_':$!";
97 my ($prefix,$suffix)= unpack "A${sp}A*", $_;
98
99 +{
100 key => $_,
101 prefix => $prefix,
102 suffix => $suffix,
103 hash => $key_to_hash->{$_},
104 value => $hash->{$_},
105 seed2 => 0,
106 }
107 } @$keys;
108 last;
109 }
110
111 }
112 $second_level[$_]{seed2}= $first_level[$_]||0, $second_level[$_]{idx}= $_ for 0 .. $#second_level;
113
114 return $seed1, \@second_level, $length_all_keys;
115}
116
117sub build_split_words {
118 my ($hash, $preprocess, $blob, $old_res)= @_;
119 my %appended;
120 $blob //= "";
121 if ($preprocess) {
122 my %parts;
123 foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %$hash) {
124 my ($prefix,$suffix);
125 if ($key=~/^([^=]+=)([^=]+)\z/) {
126 ($prefix,$suffix)= ($1, $2);
127 $parts{$suffix}++;
128 #$parts{$prefix}++;
129 } else {
130 $prefix= $key;
131 $parts{$prefix}++;
132 }
133
134 }
135 foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %parts) {
136 $blob .= $key . "\0";
137 }
138 printf "Using preprocessing, initial blob size %d\n", length($blob);
139 } else {
140 printf "No preprocessing, initial blob size %d\n", length($blob);
141 }
142 my %res;
143
144 REDO:
145 %res= ();
146 KEY:
147 foreach my $key (
148 sort {
149 (length($b) <=> length($a)) ||
150 ($a cmp $b)
151 }
152 keys %$hash
153 ) {
154 next if exists $res{$key};
155 if (index($blob,$key) >= 0 ) {
156 my $idx= length($key);
157 if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
158 print "changing: $key => $old_res->{$key} : $idx\n";
159 }
160 $res{$key}= $idx;
161 next KEY;
162 }
163 my $best= length($key);
164 my $append= $key;
165 my $min= 0; #length $key >= 4 ? 4 : 0;
166 my $best_prefix;
167 my $best_suffix;
168 foreach my $idx (reverse $min .. length($key)) {
169 my $prefix= substr($key,0,$idx);
170 my $suffix= substr($key,$idx);
171 my $i1= index($blob,$prefix)>=0;
172 my $i2= index($blob,$suffix)>=0;
173 if ($i1 and $i2) {
174 if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
175 print "changing: $key => $old_res->{$key} : $idx\n";
176 }
177 $res{$key}= $idx;
178 $appended{$prefix}++;
179 $appended{$suffix}++;
180 next KEY;
181 } elsif ($i1) {
182 if (length $suffix <= length $append) {
183 $best= $idx;
184 $append= $suffix;
185 $best_prefix= $prefix;
186 $best_suffix= $suffix;
187 }
188 } elsif ($i2) {
189 if (length $prefix <= length $append) {
190 $best= $idx;
191 $append= $prefix;
192 $best_prefix= $prefix;
193 $best_suffix= $suffix;
194 }
195 }
196 }
197 if ($DEBUG and $old_res and $old_res->{$key} != $best) {
198 print "changing: $key => $old_res->{$key} : $best\n";
199 }
200 #print "$best_prefix|$best_suffix => $best => $append\n";
201 $res{$key}= $best;
202 $blob .= $append;
203 $appended{$best_prefix}++;
204 $appended{$best_suffix}++;
205 }
206 my $b2 = "";
207 foreach my $key (sort { length($b) <=> length($a) || $a cmp $b } keys %appended) {
208 $b2 .= $key unless index($b2,$key)>=0;
209 }
210 if (length($b2)<length($blob)) {
211 printf "Length old blob: %d length new blob: %d, recomputing using new blob\n", length($blob),length($b2);
212 $blob= $b2;
213 %appended=();
214 goto REDO;
215 } else {
216 printf "Length old blob: %d length new blob: %d, keeping old blob\n", length($blob),length($b2);
217 }
218 die sprintf "not same size? %d != %d", 0+keys %res, 0+keys %$hash unless keys %res == keys %$hash;
219 return ($blob,\%res);
220}
221
222
223sub blob_as_code {
224 my ($blob,$blob_name)= @_;
225
226 $blob_name ||= "mph_blob";
227
228 # output the blob as C code.
9923c0b5 229 my @code= (sprintf "STATIC const unsigned char %s[] =\n",$blob_name);
1a4d3814
YO
230 my $blob_len= length $blob;
231 while (length($blob)) {
232 push @code, sprintf qq( "%s"), substr($blob,0,65,"");
233 push @code, length $blob ? "\n" : ";\n";
234 }
235 push @code, "/* $blob_name length: $blob_len */\n";
236 return join "",@code;
237}
238
239sub print_includes {
240 my $ofh= shift;
241 print $ofh "#include <stdio.h>\n";
242 print $ofh "#include <string.h>\n";
243 print $ofh "#include <stdint.h>\n";
244 print $ofh "\n";
245}
246
247sub print_defines {
248 my ($ofh,$defines)= @_;
249
250 my $key_len;
251 foreach my $def (keys %$defines) {
252 $key_len //= length $def;
253 $key_len= length $def if $key_len < length $def;
254 }
255 foreach my $def (sort keys %$defines) {
256 printf $ofh "#define %*s %5d\n", -$key_len, $def, $defines->{$def};
257 }
258 print $ofh "\n";
259}
260
261
262sub build_array_of_struct {
263 my ($second_level,$blob)= @_;
264
265 my %defines;
266 my %tests;
267 my @rows;
268 foreach my $row (@$second_level) {
269 $defines{$row->{value}}= $row->{idx}+1;
270 $tests{$row->{key}}= $defines{$row->{value}};
271 my @u16= (
272 $row->{seed2},
273 index($blob,$row->{prefix}//0),
274 index($blob,$row->{suffix}//0),
275 );
9923c0b5 276 $_ > 0xFFFF and die "panic: value exceeds range of U16"
1a4d3814
YO
277 for @u16;
278 my @u8= (
279 length($row->{prefix}),
280 length($row->{suffix}),
281 );
9923c0b5 282 $_ > 0xFF and die "panic: value exceeds range of U8"
1a4d3814 283 for @u8;
c2300ef8
KW
284 push @rows, sprintf(" { %5d, %5d, %5d, %3d, %3d, %s } /* %s%s */",
285 @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix});
1a4d3814
YO
286 }
287 return \@rows,\%defines,\%tests;
288}
289
64affa0c
KW
290sub make_algo {
291 my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows,
1a4d3814
YO
292 $blob_name, $struct_name, $table_name, $match_name, $prefix) = @_;
293
294 $blob_name ||= "mph_blob";
295 $struct_name ||= "mph_struct";
296 $table_name ||= "mph_table";
297 $prefix ||= "MPH";
298
1a4d3814
YO
299 my $n= 0+@$second_level;
300 my $data_size= 0+@$second_level * 8 + length $smart_blob;
301
64affa0c
KW
302 my @code = "#define ${prefix}_VALt I16\n\n";
303 push @code, "/*\n";
304 push @code, sprintf "rows: %s\n", $n;
305 push @code, sprintf "seed: %s\n", $seed1;
306 push @code, sprintf "full length of keys: %d\n", $length_all_keys;
307 push @code, sprintf "blob length: %d\n", length $smart_blob;
308 push @code, sprintf "ref length: %d\n", 0+@$second_level * 8;
309 push @code, sprintf "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100;
310 push @code, "*/\n\n";
311
312 push @code, blob_as_code($smart_blob, $blob_name);
313 push @code, <<"EOF_CODE";
1a4d3814
YO
314
315struct $struct_name {
9923c0b5
KW
316 U16 seed2;
317 U16 pfx;
318 U16 sfx;
319 U8 pfx_len;
320 U8 sfx_len;
1a4d3814
YO
321 ${prefix}_VALt value;
322};
323
324EOF_CODE
325
64affa0c
KW
326 push @code, "#define ${prefix}_RSHIFT $RSHIFT\n";
327 push @code, "#define ${prefix}_BUCKETS $n\n\n";
328 push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1;
329 push @code, sprintf "STATIC const U32 ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST;
1a4d3814 330
db55a61c 331 push @code, "/* The comments give the input key for the row it is in */\n";
64affa0c
KW
332 push @code, "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n";
333 push @code, <<"EOF_CODE";
9923c0b5 334${prefix}_VALt $match_name( const unsigned char * const key, const U16 key_len ) {
1a4d3814
YO
335 const unsigned char * ptr= key;
336 const unsigned char * ptr_end= key + key_len;
9923c0b5
KW
337 U32 h= ${prefix}_SEED1;
338 U32 s;
339 U32 n;
1a4d3814
YO
340 do {
341 h ^= *ptr;
342 h *= ${prefix}_FNV_CONST;
343 } while ( ++ptr < ptr_end );
344 n= h % ${prefix}_BUCKETS;
345 s = $table_name\[n].seed2;
346 if (s) {
347 h= (h >> ${prefix}_RSHIFT) ^ s;
348 n = h % ${prefix}_BUCKETS;
349 if (
350 ( $table_name\[n].pfx_len + $table_name\[n].sfx_len == key_len ) &&
351 ( memcmp($blob_name + $table_name\[n].pfx, key, $table_name\[n].pfx_len) == 0 ) &&
352 ( !$table_name\[n].sfx_len || memcmp($blob_name + $table_name\[n].sfx,
353 key + $table_name\[n].pfx_len, $table_name\[n].sfx_len) == 0 )
354 ) {
355 return $table_name\[n].value;
356 }
357 }
358 return 0;
359}
360EOF_CODE
64affa0c
KW
361
362 return join "", @code;
363}
364
365sub print_algo {
366 my ($ofh, $second_level, $seed1, $long_blob, $smart_blob, $rows,
367 $blob_name, $struct_name, $table_name, $match_name ) = @_;
368
369 if (!ref $ofh) {
370 my $file= $ofh;
371 undef $ofh;
372 open $ofh, ">", $file
373 or die "Failed to open '$file': $!";
374 }
375
376 my $code = make_algo(
377 $second_level, $seed1, $long_blob, $smart_blob, $rows,
378 $blob_name, $struct_name, $table_name, $match_name );
379 print $ofh $code;
1a4d3814
YO
380}
381
382sub print_main {
383 my ($ofh,$h_file,$match_name,$prefix)=@_;
384 print $ofh <<"EOF_CODE";
1a4d3814
YO
385#include "$h_file"
386
387int main(int argc, char *argv[]){
388 int i;
389 for (i=1; i<argc; i++) {
390 unsigned char *key = (unsigned char *)argv[i];
391 int key_len = strlen(argv[i]);
392 printf("key: %s got: %d\\n", key, $match_name((unsigned char *)key,key_len));
393 }
394 return 0;
395}
396EOF_CODE
397}
398
399# output the test Perl code.
400sub print_tests {
401 my ($file, $tests_hash)= @_;
402 open my $ofh, ">", $file
403 or die "Failed to open '$file' for writing: $!";
404 my $num_tests= 2 + keys %$tests_hash;
405 print $ofh "use strict;\nuse warnings;\nuse Test::More tests => $num_tests;\nmy \@res;";
406 my $bytes= 0;
407 my @tests= sort keys %$tests_hash;
408 print $ofh "\@res=`./mph_test '$tests[0]/should-not-match' 'should-not-match/$tests[0]'`;\n";
409 print $ofh "ok( \$res[0] =~ /got: 0/,'proper prefix does not match');\n";
410 print $ofh "ok( \$res[1] =~ /got: 0/,'proper suffix does not match');\n";
411 while (@tests) {
412 my @batch= splice @tests,0,10;
413 my $batch_args= join " ", map { "'$_'" } @batch;
414 print $ofh "\@res=`./mph_test $batch_args`;\n";
415 foreach my $i (0..$#batch) {
416 my $key= $batch[$i];
417 my $want= $tests_hash->{$key};
418 print $ofh "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n";
419 }
420 }
421 close $ofh;
422}
423
424sub print_test_binary {
425 my ($file,$h_file, $second_level, $seed1, $length_all_keys,
426 $smart_blob, $rows, $defines, $match_name, $prefix)= @_;
427 open my $ofh, ">", $file
428 or die "Failed to open '$file': $!";
429 print_includes($ofh);
430 print_defines($ofh, $defines);
431 print_main($ofh,$h_file,$match_name,$prefix);
432 close $ofh;
433}
434
435sub make_mph_from_hash {
436 my $hash= shift;
437
438 # we do this twice because often we can find longer prefixes on the second pass.
439 my @keys= sort {length($b) <=> length($a) || $a cmp $b } keys %$hash;
440
441 my ($smart_blob, $res_to_split)= build_split_words($hash,0);
442 {
443 my ($smart_blob2, $res_to_split2)= build_split_words($hash,1);
444 if (length($smart_blob) > length($smart_blob2)) {
445 printf "Using preprocess-smart blob, length: %d (vs %d)\n", length $smart_blob2, length $smart_blob;
446 $smart_blob= $smart_blob2;
447 $res_to_split= $res_to_split2;
448 } else {
449 printf "Using greedy-smart blob, length: %d (vs %d)\n", length $smart_blob, length $smart_blob2;
450 }
451 }
452 my ($seed1, $second_level, $length_all_keys)= build_perfect_hash($hash, $res_to_split);
453 my ($rows, $defines, $tests)= build_array_of_struct($second_level, $smart_blob);
454 return ($second_level, $seed1, $length_all_keys, $smart_blob, $rows, $defines, $tests);
455}
456
457sub make_files {
458 my ($hash,$base_name)= @_;
459
460 my $h_name= $base_name . "_algo.h";
461 my $c_name= $base_name . "_test.c";
462 my $p_name= $base_name . "_test.pl";
463 my $blob_name= $base_name . "_blob";
464 my $struct_name= $base_name . "_bucket_info";
465 my $table_name= $base_name . "_table";
466 my $match_name= $base_name . "_match";
467 my $prefix= uc($base_name);
468
469 my ($second_level, $seed1, $length_all_keys,
470 $smart_blob, $rows, $defines, $tests)= make_mph_from_hash( $hash );
471 print_algo( $h_name,
472 $second_level, $seed1, $length_all_keys, $smart_blob, $rows,
473 $blob_name, $struct_name, $table_name, $match_name, $prefix );
474 print_test_binary( $c_name, $h_name, $second_level, $seed1, $length_all_keys,
475 $smart_blob, $rows, $defines,
476 $match_name, $prefix );
477 print_tests( $p_name, $tests );
478}
479
480unless (caller) {
481 my %hash;
482 {
483 no warnings;
484 do "../perl/lib/unicore/Heavy.pl";
485 %hash= %utf8::loose_to_file_of;
486 }
487 if ($ENV{MERGE_KEYS}) {
488 my @keys= keys %hash;
489 foreach my $loose (keys %utf8::loose_property_name_of) {
490 my $to= $utf8::loose_property_name_of{$loose};
491 next if $to eq $loose;
492 foreach my $key (@keys) {
493 my $copy= $key;
494 if ($copy=~s/^\Q$to\E(=|\z)/$loose$1/) {
495 #print "$key => $copy\n";
496 $hash{$copy}= $key;
497 }
498 }
499 }
500 }
501 foreach my $key (keys %hash) {
502 my $munged= uc($key);
503 $munged=~s/\W/__/g;
504 $hash{$key} = $munged;
505 }
506
507 my $name= shift @ARGV;
508 $name ||= "mph";
509 make_files(\%hash,$name);
510}
511
5121;
513__END__