Commit | Line | Data |
---|---|---|
1a4d3814 YO |
1 | package MinimalPerfectHash; |
2 | use strict; | |
3 | use warnings; | |
4 | use Data::Dumper; | |
5 | use Carp; | |
6 | use Text::Wrap; | |
00d503c4 | 7 | use bignum; # Otherwise fails on 32-bit systems |
1a4d3814 YO |
8 | |
9 | my $DEBUG= 0; | |
10 | my $RSHIFT= 8; | |
11 | my $FNV_CONST= 16777619; | |
12 | ||
db027c8a KW |
13 | # The basic idea is that you have a two level structure, and effectively |
14 | # hash the key twice. | |
15 | # | |
16 | # The first hash finds a bucket in the array which contains a seed which | |
17 | # is used for the second hash, which then leads to a bucket with key | |
18 | # data which is compared against to determine if the key is a match. | |
19 | # | |
20 | # If the first hash finds no seed, then the key cannot match. | |
21 | # | |
22 | # In our case we cheat a bit, and hash the key only once, but use the | |
23 | # low bits for the first lookup and the high-bits for the second. | |
24 | # | |
25 | # So for instance: | |
26 | # | |
27 | # h= (h >> RSHIFT) ^ s; | |
28 | # | |
29 | # is how the second hash is computed. We right shift the original hash | |
30 | # value and then xor in the seed2, which will be non-zero. | |
31 | # | |
32 | # That then gives us the bucket which contains the key data we need to | |
33 | # match for a valid key. | |
34 | ||
1a4d3814 YO |
35 | sub _fnv { |
36 | my ($key, $seed)= @_; | |
1a4d3814 YO |
37 | my $hash = 0+$seed; |
38 | foreach my $char (split //, $key) { | |
39 | $hash = $hash ^ ord($char); | |
40 | $hash = ($hash * $FNV_CONST) & 0xFFFFFFFF; | |
41 | } | |
42 | return $hash; | |
43 | } | |
44 | ||
45 | sub build_perfect_hash { | |
46 | my ($hash, $split_pos)= @_; | |
47 | ||
48 | my $n= 0+keys %$hash; | |
49 | my $max_h= 0xFFFFFFFF; | |
50 | $max_h -= $max_h % $n; # this just avoids a tiny bit bias | |
51 | my $seed1= unpack("N", "Perl") - 1; | |
52 | my $hash_to_key; | |
53 | my $key_to_hash; | |
54 | my $length_all_keys; | |
55 | my $key_buckets; | |
56 | SEED1: | |
57 | for ($seed1++;1;$seed1++) { | |
58 | my %hash_to_key; | |
59 | my %key_to_hash; | |
60 | my %key_buckets; | |
61 | my %high; | |
62 | $length_all_keys= 0; | |
63 | foreach my $key (sort keys %$hash) { | |
64 | $length_all_keys += length $key; | |
65 | my $h= _fnv($key,$seed1); | |
66 | next SEED1 if $h >= $max_h; # check if this hash would bias, and if so find a new seed | |
67 | next SEED1 if exists $hash_to_key{$h}; | |
68 | next SEED1 if $high{$h >> $RSHIFT}++; | |
69 | $hash_to_key{$h}= $key; | |
70 | $key_to_hash{$key}= $h; | |
71 | push @{$key_buckets{$h % $n}}, $key; | |
72 | } | |
73 | $hash_to_key= \%hash_to_key; | |
74 | $key_to_hash= \%key_to_hash; | |
75 | $key_buckets= \%key_buckets; | |
76 | last SEED1; | |
77 | } | |
78 | ||
79 | my %token; | |
80 | my @first_level; | |
81 | my @second_level; | |
82 | foreach my $first_idx (sort { @{$key_buckets->{$b}} <=> @{$key_buckets->{$a}} || $a <=> $b } keys %$key_buckets) { | |
83 | my $keys= $key_buckets->{$first_idx}; | |
84 | #printf "got %d keys in bucket %d\n", 0+@$keys, $first_idx; | |
85 | my $seed2; | |
86 | SEED2: | |
87 | for ($seed2=1;1;$seed2++) { | |
88 | goto FIND_SEED if $seed2 > 0xFFFF; | |
89 | my @idx= map { | |
90 | ( ( ( $key_to_hash->{$_} >> $RSHIFT ) ^ $seed2 ) & 0xFFFFFFFF ) % $n | |
91 | } @$keys; | |
92 | my %seen; | |
93 | next SEED2 if grep { $second_level[$_] || $seen{$_}++ } @idx; | |
94 | $first_level[$first_idx]= $seed2; | |
95 | @second_level[@idx]= map { | |
96 | my $sp= $split_pos->{$_} // die "no split pos for '$_':$!"; | |
97 | my ($prefix,$suffix)= unpack "A${sp}A*", $_; | |
98 | ||
99 | +{ | |
100 | key => $_, | |
101 | prefix => $prefix, | |
102 | suffix => $suffix, | |
103 | hash => $key_to_hash->{$_}, | |
104 | value => $hash->{$_}, | |
105 | seed2 => 0, | |
106 | } | |
107 | } @$keys; | |
108 | last; | |
109 | } | |
110 | ||
111 | } | |
112 | $second_level[$_]{seed2}= $first_level[$_]||0, $second_level[$_]{idx}= $_ for 0 .. $#second_level; | |
113 | ||
114 | return $seed1, \@second_level, $length_all_keys; | |
115 | } | |
116 | ||
117 | sub build_split_words { | |
118 | my ($hash, $preprocess, $blob, $old_res)= @_; | |
119 | my %appended; | |
120 | $blob //= ""; | |
121 | if ($preprocess) { | |
122 | my %parts; | |
123 | foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %$hash) { | |
124 | my ($prefix,$suffix); | |
125 | if ($key=~/^([^=]+=)([^=]+)\z/) { | |
126 | ($prefix,$suffix)= ($1, $2); | |
127 | $parts{$suffix}++; | |
128 | #$parts{$prefix}++; | |
129 | } else { | |
130 | $prefix= $key; | |
131 | $parts{$prefix}++; | |
132 | } | |
133 | ||
134 | } | |
135 | foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %parts) { | |
136 | $blob .= $key . "\0"; | |
137 | } | |
138 | printf "Using preprocessing, initial blob size %d\n", length($blob); | |
139 | } else { | |
140 | printf "No preprocessing, initial blob size %d\n", length($blob); | |
141 | } | |
142 | my %res; | |
143 | ||
144 | REDO: | |
145 | %res= (); | |
146 | KEY: | |
147 | foreach my $key ( | |
148 | sort { | |
149 | (length($b) <=> length($a)) || | |
150 | ($a cmp $b) | |
151 | } | |
152 | keys %$hash | |
153 | ) { | |
154 | next if exists $res{$key}; | |
155 | if (index($blob,$key) >= 0 ) { | |
156 | my $idx= length($key); | |
157 | if ($DEBUG and $old_res and $old_res->{$key} != $idx) { | |
158 | print "changing: $key => $old_res->{$key} : $idx\n"; | |
159 | } | |
160 | $res{$key}= $idx; | |
161 | next KEY; | |
162 | } | |
163 | my $best= length($key); | |
164 | my $append= $key; | |
165 | my $min= 0; #length $key >= 4 ? 4 : 0; | |
166 | my $best_prefix; | |
167 | my $best_suffix; | |
168 | foreach my $idx (reverse $min .. length($key)) { | |
169 | my $prefix= substr($key,0,$idx); | |
170 | my $suffix= substr($key,$idx); | |
171 | my $i1= index($blob,$prefix)>=0; | |
172 | my $i2= index($blob,$suffix)>=0; | |
173 | if ($i1 and $i2) { | |
174 | if ($DEBUG and $old_res and $old_res->{$key} != $idx) { | |
175 | print "changing: $key => $old_res->{$key} : $idx\n"; | |
176 | } | |
177 | $res{$key}= $idx; | |
178 | $appended{$prefix}++; | |
179 | $appended{$suffix}++; | |
180 | next KEY; | |
181 | } elsif ($i1) { | |
182 | if (length $suffix <= length $append) { | |
183 | $best= $idx; | |
184 | $append= $suffix; | |
185 | $best_prefix= $prefix; | |
186 | $best_suffix= $suffix; | |
187 | } | |
188 | } elsif ($i2) { | |
189 | if (length $prefix <= length $append) { | |
190 | $best= $idx; | |
191 | $append= $prefix; | |
192 | $best_prefix= $prefix; | |
193 | $best_suffix= $suffix; | |
194 | } | |
195 | } | |
196 | } | |
197 | if ($DEBUG and $old_res and $old_res->{$key} != $best) { | |
198 | print "changing: $key => $old_res->{$key} : $best\n"; | |
199 | } | |
200 | #print "$best_prefix|$best_suffix => $best => $append\n"; | |
201 | $res{$key}= $best; | |
202 | $blob .= $append; | |
203 | $appended{$best_prefix}++; | |
204 | $appended{$best_suffix}++; | |
205 | } | |
206 | my $b2 = ""; | |
207 | foreach my $key (sort { length($b) <=> length($a) || $a cmp $b } keys %appended) { | |
208 | $b2 .= $key unless index($b2,$key)>=0; | |
209 | } | |
210 | if (length($b2)<length($blob)) { | |
211 | printf "Length old blob: %d length new blob: %d, recomputing using new blob\n", length($blob),length($b2); | |
212 | $blob= $b2; | |
213 | %appended=(); | |
214 | goto REDO; | |
215 | } else { | |
216 | printf "Length old blob: %d length new blob: %d, keeping old blob\n", length($blob),length($b2); | |
217 | } | |
218 | die sprintf "not same size? %d != %d", 0+keys %res, 0+keys %$hash unless keys %res == keys %$hash; | |
219 | return ($blob,\%res); | |
220 | } | |
221 | ||
222 | ||
223 | sub blob_as_code { | |
224 | my ($blob,$blob_name)= @_; | |
225 | ||
226 | $blob_name ||= "mph_blob"; | |
227 | ||
228 | # output the blob as C code. | |
9923c0b5 | 229 | my @code= (sprintf "STATIC const unsigned char %s[] =\n",$blob_name); |
1a4d3814 YO |
230 | my $blob_len= length $blob; |
231 | while (length($blob)) { | |
232 | push @code, sprintf qq( "%s"), substr($blob,0,65,""); | |
233 | push @code, length $blob ? "\n" : ";\n"; | |
234 | } | |
235 | push @code, "/* $blob_name length: $blob_len */\n"; | |
236 | return join "",@code; | |
237 | } | |
238 | ||
239 | sub print_includes { | |
240 | my $ofh= shift; | |
241 | print $ofh "#include <stdio.h>\n"; | |
242 | print $ofh "#include <string.h>\n"; | |
243 | print $ofh "#include <stdint.h>\n"; | |
244 | print $ofh "\n"; | |
245 | } | |
246 | ||
247 | sub print_defines { | |
248 | my ($ofh,$defines)= @_; | |
249 | ||
250 | my $key_len; | |
251 | foreach my $def (keys %$defines) { | |
252 | $key_len //= length $def; | |
253 | $key_len= length $def if $key_len < length $def; | |
254 | } | |
255 | foreach my $def (sort keys %$defines) { | |
256 | printf $ofh "#define %*s %5d\n", -$key_len, $def, $defines->{$def}; | |
257 | } | |
258 | print $ofh "\n"; | |
259 | } | |
260 | ||
261 | ||
262 | sub build_array_of_struct { | |
263 | my ($second_level,$blob)= @_; | |
264 | ||
265 | my %defines; | |
266 | my %tests; | |
267 | my @rows; | |
268 | foreach my $row (@$second_level) { | |
269 | $defines{$row->{value}}= $row->{idx}+1; | |
270 | $tests{$row->{key}}= $defines{$row->{value}}; | |
271 | my @u16= ( | |
272 | $row->{seed2}, | |
273 | index($blob,$row->{prefix}//0), | |
274 | index($blob,$row->{suffix}//0), | |
275 | ); | |
9923c0b5 | 276 | $_ > 0xFFFF and die "panic: value exceeds range of U16" |
1a4d3814 YO |
277 | for @u16; |
278 | my @u8= ( | |
279 | length($row->{prefix}), | |
280 | length($row->{suffix}), | |
281 | ); | |
9923c0b5 | 282 | $_ > 0xFF and die "panic: value exceeds range of U8" |
1a4d3814 | 283 | for @u8; |
c2300ef8 KW |
284 | push @rows, sprintf(" { %5d, %5d, %5d, %3d, %3d, %s } /* %s%s */", |
285 | @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix}); | |
1a4d3814 YO |
286 | } |
287 | return \@rows,\%defines,\%tests; | |
288 | } | |
289 | ||
64affa0c KW |
290 | sub make_algo { |
291 | my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows, | |
1a4d3814 YO |
292 | $blob_name, $struct_name, $table_name, $match_name, $prefix) = @_; |
293 | ||
294 | $blob_name ||= "mph_blob"; | |
295 | $struct_name ||= "mph_struct"; | |
296 | $table_name ||= "mph_table"; | |
297 | $prefix ||= "MPH"; | |
298 | ||
1a4d3814 YO |
299 | my $n= 0+@$second_level; |
300 | my $data_size= 0+@$second_level * 8 + length $smart_blob; | |
301 | ||
64affa0c KW |
302 | my @code = "#define ${prefix}_VALt I16\n\n"; |
303 | push @code, "/*\n"; | |
304 | push @code, sprintf "rows: %s\n", $n; | |
305 | push @code, sprintf "seed: %s\n", $seed1; | |
306 | push @code, sprintf "full length of keys: %d\n", $length_all_keys; | |
307 | push @code, sprintf "blob length: %d\n", length $smart_blob; | |
308 | push @code, sprintf "ref length: %d\n", 0+@$second_level * 8; | |
309 | push @code, sprintf "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100; | |
310 | push @code, "*/\n\n"; | |
311 | ||
312 | push @code, blob_as_code($smart_blob, $blob_name); | |
313 | push @code, <<"EOF_CODE"; | |
1a4d3814 YO |
314 | |
315 | struct $struct_name { | |
9923c0b5 KW |
316 | U16 seed2; |
317 | U16 pfx; | |
318 | U16 sfx; | |
319 | U8 pfx_len; | |
320 | U8 sfx_len; | |
1a4d3814 YO |
321 | ${prefix}_VALt value; |
322 | }; | |
323 | ||
324 | EOF_CODE | |
325 | ||
64affa0c KW |
326 | push @code, "#define ${prefix}_RSHIFT $RSHIFT\n"; |
327 | push @code, "#define ${prefix}_BUCKETS $n\n\n"; | |
328 | push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1; | |
329 | push @code, sprintf "STATIC const U32 ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST; | |
1a4d3814 | 330 | |
db55a61c | 331 | push @code, "/* The comments give the input key for the row it is in */\n"; |
64affa0c KW |
332 | push @code, "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n"; |
333 | push @code, <<"EOF_CODE"; | |
9923c0b5 | 334 | ${prefix}_VALt $match_name( const unsigned char * const key, const U16 key_len ) { |
1a4d3814 YO |
335 | const unsigned char * ptr= key; |
336 | const unsigned char * ptr_end= key + key_len; | |
9923c0b5 KW |
337 | U32 h= ${prefix}_SEED1; |
338 | U32 s; | |
339 | U32 n; | |
1a4d3814 | 340 | do { |
5fea3644 | 341 | h ^= NATIVE_TO_LATIN1(*ptr); /* table collated in Latin1 */ |
1a4d3814 YO |
342 | h *= ${prefix}_FNV_CONST; |
343 | } while ( ++ptr < ptr_end ); | |
344 | n= h % ${prefix}_BUCKETS; | |
345 | s = $table_name\[n].seed2; | |
346 | if (s) { | |
347 | h= (h >> ${prefix}_RSHIFT) ^ s; | |
348 | n = h % ${prefix}_BUCKETS; | |
349 | if ( | |
350 | ( $table_name\[n].pfx_len + $table_name\[n].sfx_len == key_len ) && | |
351 | ( memcmp($blob_name + $table_name\[n].pfx, key, $table_name\[n].pfx_len) == 0 ) && | |
352 | ( !$table_name\[n].sfx_len || memcmp($blob_name + $table_name\[n].sfx, | |
353 | key + $table_name\[n].pfx_len, $table_name\[n].sfx_len) == 0 ) | |
354 | ) { | |
355 | return $table_name\[n].value; | |
356 | } | |
357 | } | |
358 | return 0; | |
359 | } | |
360 | EOF_CODE | |
64affa0c KW |
361 | |
362 | return join "", @code; | |
363 | } | |
364 | ||
365 | sub print_algo { | |
366 | my ($ofh, $second_level, $seed1, $long_blob, $smart_blob, $rows, | |
367 | $blob_name, $struct_name, $table_name, $match_name ) = @_; | |
368 | ||
369 | if (!ref $ofh) { | |
370 | my $file= $ofh; | |
371 | undef $ofh; | |
372 | open $ofh, ">", $file | |
373 | or die "Failed to open '$file': $!"; | |
374 | } | |
375 | ||
376 | my $code = make_algo( | |
377 | $second_level, $seed1, $long_blob, $smart_blob, $rows, | |
378 | $blob_name, $struct_name, $table_name, $match_name ); | |
379 | print $ofh $code; | |
1a4d3814 YO |
380 | } |
381 | ||
382 | sub print_main { | |
383 | my ($ofh,$h_file,$match_name,$prefix)=@_; | |
384 | print $ofh <<"EOF_CODE"; | |
1a4d3814 YO |
385 | #include "$h_file" |
386 | ||
387 | int main(int argc, char *argv[]){ | |
388 | int i; | |
389 | for (i=1; i<argc; i++) { | |
390 | unsigned char *key = (unsigned char *)argv[i]; | |
391 | int key_len = strlen(argv[i]); | |
392 | printf("key: %s got: %d\\n", key, $match_name((unsigned char *)key,key_len)); | |
393 | } | |
394 | return 0; | |
395 | } | |
396 | EOF_CODE | |
397 | } | |
398 | ||
399 | # output the test Perl code. | |
400 | sub print_tests { | |
401 | my ($file, $tests_hash)= @_; | |
402 | open my $ofh, ">", $file | |
403 | or die "Failed to open '$file' for writing: $!"; | |
404 | my $num_tests= 2 + keys %$tests_hash; | |
405 | print $ofh "use strict;\nuse warnings;\nuse Test::More tests => $num_tests;\nmy \@res;"; | |
406 | my $bytes= 0; | |
407 | my @tests= sort keys %$tests_hash; | |
408 | print $ofh "\@res=`./mph_test '$tests[0]/should-not-match' 'should-not-match/$tests[0]'`;\n"; | |
409 | print $ofh "ok( \$res[0] =~ /got: 0/,'proper prefix does not match');\n"; | |
410 | print $ofh "ok( \$res[1] =~ /got: 0/,'proper suffix does not match');\n"; | |
411 | while (@tests) { | |
412 | my @batch= splice @tests,0,10; | |
413 | my $batch_args= join " ", map { "'$_'" } @batch; | |
414 | print $ofh "\@res=`./mph_test $batch_args`;\n"; | |
415 | foreach my $i (0..$#batch) { | |
416 | my $key= $batch[$i]; | |
417 | my $want= $tests_hash->{$key}; | |
418 | print $ofh "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n"; | |
419 | } | |
420 | } | |
421 | close $ofh; | |
422 | } | |
423 | ||
424 | sub print_test_binary { | |
425 | my ($file,$h_file, $second_level, $seed1, $length_all_keys, | |
426 | $smart_blob, $rows, $defines, $match_name, $prefix)= @_; | |
427 | open my $ofh, ">", $file | |
428 | or die "Failed to open '$file': $!"; | |
429 | print_includes($ofh); | |
430 | print_defines($ofh, $defines); | |
431 | print_main($ofh,$h_file,$match_name,$prefix); | |
432 | close $ofh; | |
433 | } | |
434 | ||
435 | sub make_mph_from_hash { | |
436 | my $hash= shift; | |
437 | ||
438 | # we do this twice because often we can find longer prefixes on the second pass. | |
1a4d3814 YO |
439 | my ($smart_blob, $res_to_split)= build_split_words($hash,0); |
440 | { | |
441 | my ($smart_blob2, $res_to_split2)= build_split_words($hash,1); | |
442 | if (length($smart_blob) > length($smart_blob2)) { | |
443 | printf "Using preprocess-smart blob, length: %d (vs %d)\n", length $smart_blob2, length $smart_blob; | |
444 | $smart_blob= $smart_blob2; | |
445 | $res_to_split= $res_to_split2; | |
446 | } else { | |
447 | printf "Using greedy-smart blob, length: %d (vs %d)\n", length $smart_blob, length $smart_blob2; | |
448 | } | |
449 | } | |
450 | my ($seed1, $second_level, $length_all_keys)= build_perfect_hash($hash, $res_to_split); | |
451 | my ($rows, $defines, $tests)= build_array_of_struct($second_level, $smart_blob); | |
452 | return ($second_level, $seed1, $length_all_keys, $smart_blob, $rows, $defines, $tests); | |
453 | } | |
454 | ||
455 | sub make_files { | |
456 | my ($hash,$base_name)= @_; | |
457 | ||
458 | my $h_name= $base_name . "_algo.h"; | |
459 | my $c_name= $base_name . "_test.c"; | |
460 | my $p_name= $base_name . "_test.pl"; | |
461 | my $blob_name= $base_name . "_blob"; | |
462 | my $struct_name= $base_name . "_bucket_info"; | |
463 | my $table_name= $base_name . "_table"; | |
464 | my $match_name= $base_name . "_match"; | |
465 | my $prefix= uc($base_name); | |
466 | ||
467 | my ($second_level, $seed1, $length_all_keys, | |
468 | $smart_blob, $rows, $defines, $tests)= make_mph_from_hash( $hash ); | |
469 | print_algo( $h_name, | |
470 | $second_level, $seed1, $length_all_keys, $smart_blob, $rows, | |
471 | $blob_name, $struct_name, $table_name, $match_name, $prefix ); | |
472 | print_test_binary( $c_name, $h_name, $second_level, $seed1, $length_all_keys, | |
473 | $smart_blob, $rows, $defines, | |
474 | $match_name, $prefix ); | |
475 | print_tests( $p_name, $tests ); | |
476 | } | |
477 | ||
478 | unless (caller) { | |
479 | my %hash; | |
480 | { | |
481 | no warnings; | |
048bdb72 | 482 | do "../perl/lib/unicore/UCD.pl"; |
1a4d3814 YO |
483 | %hash= %utf8::loose_to_file_of; |
484 | } | |
485 | if ($ENV{MERGE_KEYS}) { | |
486 | my @keys= keys %hash; | |
487 | foreach my $loose (keys %utf8::loose_property_name_of) { | |
488 | my $to= $utf8::loose_property_name_of{$loose}; | |
489 | next if $to eq $loose; | |
490 | foreach my $key (@keys) { | |
491 | my $copy= $key; | |
492 | if ($copy=~s/^\Q$to\E(=|\z)/$loose$1/) { | |
493 | #print "$key => $copy\n"; | |
494 | $hash{$copy}= $key; | |
495 | } | |
496 | } | |
497 | } | |
498 | } | |
499 | foreach my $key (keys %hash) { | |
500 | my $munged= uc($key); | |
501 | $munged=~s/\W/__/g; | |
502 | $hash{$key} = $munged; | |
503 | } | |
504 | ||
505 | my $name= shift @ARGV; | |
506 | $name ||= "mph"; | |
507 | make_files(\%hash,$name); | |
508 | } | |
509 | ||
510 | 1; | |
511 | __END__ |