regen/mph.pl

   1 package MinimalPerfectHash;
   2 use strict;
   3 use warnings;
   4 use Data::Dumper;
   5 use Carp;
   6 use Text::Wrap;
   7 use bignum;     # Otherwise fails on 32-bit systems
   8
   9 my $DEBUG= 0;
  10 my $RSHIFT= 8;
  11 my $FNV_CONST= 16777619;
  12
  13 # The basic idea is that you have a two level structure, and effectively
  14 # hash the key twice.
  15 #
  16 # The first hash finds a bucket in the array which contains a seed which
  17 # is used for the second hash, which then leads to a bucket with key
  18 # data which is compared against to determine if the key is a match.
  19 #
  20 # If the first hash finds no seed, then the key cannot match.
  21 #
  22 # In our case we cheat a bit, and hash the key only once, but use the
  23 # low bits for the first lookup and the high-bits for the second.
  24 #
  25 # So for instance:
  26 #
  27 #           h= (h >> RSHIFT) ^ s;
  28 #
  29 # is how the second hash is computed. We right shift the original hash
  30 # value  and then xor in the seed2, which will be non-zero.
  31 #
  32 # That then gives us the bucket which contains the key data we need to
  33 # match for a valid key.
  34
  35 sub _fnv {
  36     my ($key, $seed)= @_;
  37     my $hash = 0+$seed;
  38     foreach my $char (split //, $key) {
  39         $hash = $hash ^ ord($char);
  40         $hash = ($hash * $FNV_CONST) & 0xFFFFFFFF;
  41     }
  42     return $hash;
  43 }
  44
  45 sub build_perfect_hash {
  46     my ($hash, $split_pos)= @_;
  47
  48     my $n= 0+keys %$hash;
  49     my $max_h= 0xFFFFFFFF;
  50     $max_h -= $max_h % $n; # this just avoids a tiny bit bias
  51     my $seed1= unpack("N", "Perl") - 1;
  52     my $hash_to_key;
  53     my $key_to_hash;
  54     my $length_all_keys;
  55     my $key_buckets;
  56     SEED1:
  57     for ($seed1++;1;$seed1++) {
  58         my %hash_to_key;
  59         my %key_to_hash;
  60         my %key_buckets;
  61         my %high;
  62         $length_all_keys= 0;
  63         foreach my $key (sort keys %$hash) {
  64             $length_all_keys += length $key;
  65             my $h= _fnv($key,$seed1);
  66             next SEED1 if $h >= $max_h; # check if this hash would bias, and if so find a new seed
  67             next SEED1 if exists $hash_to_key{$h};
  68             next SEED1 if $high{$h >> $RSHIFT}++;
  69             $hash_to_key{$h}= $key;
  70             $key_to_hash{$key}= $h;
  71             push @{$key_buckets{$h % $n}}, $key;
  72         }
  73         $hash_to_key= \%hash_to_key;
  74         $key_to_hash= \%key_to_hash;
  75         $key_buckets= \%key_buckets;
  76         last SEED1;
  77     }
  78
  79     my %token;
  80     my @first_level;
  81     my @second_level;
  82     foreach my $first_idx (sort { @{$key_buckets->{$b}} <=> @{$key_buckets->{$a}} || $a <=> $b } keys %$key_buckets) {
  83         my $keys= $key_buckets->{$first_idx};
  84         #printf "got %d keys in bucket %d\n", 0+@$keys, $first_idx;
  85         my $seed2;
  86         SEED2:
  87         for ($seed2=1;1;$seed2++) {
  88             goto FIND_SEED if $seed2 > 0xFFFF;
  89             my @idx= map {
  90                 ( ( ( $key_to_hash->{$_} >> $RSHIFT ) ^ $seed2 ) & 0xFFFFFFFF ) % $n
  91             } @$keys;
  92             my %seen;
  93             next SEED2 if grep { $second_level[$_] || $seen{$_}++ } @idx;
  94             $first_level[$first_idx]= $seed2;
  95             @second_level[@idx]= map {
  96                 my $sp= $split_pos->{$_} // die "no split pos for '$_':$!";
  97                 my ($prefix,$suffix)= unpack "A${sp}A*", $_;
  98
  99                 +{
 100                     key     => $_,
 101                     prefix  => $prefix,
 102                     suffix  => $suffix,
 103                     hash    => $key_to_hash->{$_},
 104                     value   => $hash->{$_},
 105                     seed2   => 0,
 106                 }
 107             } @$keys;
 108             last;
 109         }
 110
 111     }
 112     $second_level[$_]{seed2}= $first_level[$_]||0, $second_level[$_]{idx}= $_ for 0 .. $#second_level;
 113
 114     return $seed1, \@second_level, $length_all_keys;
 115 }
 116
 117 sub build_split_words {
 118     my ($hash, $preprocess, $blob, $old_res)= @_;
 119     my %appended;
 120     $blob //= "";
 121     if ($preprocess) {
 122         my %parts;
 123         foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %$hash) {
 124             my ($prefix,$suffix);
 125             if ($key=~/^([^=]+=)([^=]+)\z/) {
 126                 ($prefix,$suffix)= ($1, $2);
 127                 $parts{$suffix}++;
 128                 #$parts{$prefix}++;
 129             } else {
 130                 $prefix= $key;
 131                 $parts{$prefix}++;
 132             }
 133
 134         }
 135         foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %parts) {
 136             $blob .= $key . "\0";
 137         }
 138         printf "Using preprocessing, initial blob size %d\n", length($blob);
 139     } else {
 140         printf "No preprocessing, initial blob size %d\n", length($blob);
 141     }
 142     my %res;
 143
 144     REDO:
 145     %res= ();
 146     KEY:
 147     foreach my $key (
 148         sort {
 149             (length($b) <=> length($a)) ||
 150             ($a cmp $b)
 151         }
 152         keys %$hash
 153     ) {
 154         next if exists $res{$key};
 155         if (index($blob,$key) >= 0 ) {
 156             my $idx= length($key);
 157             if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
 158                 print "changing: $key => $old_res->{$key} : $idx\n";
 159             }
 160             $res{$key}= $idx;
 161             next KEY;
 162         }
 163         my $best= length($key);
 164         my $append= $key;
 165         my $min= 0; #length $key >= 4 ? 4 : 0;
 166         my $best_prefix;
 167         my $best_suffix;
 168         foreach my $idx (reverse $min .. length($key)) {
 169             my $prefix= substr($key,0,$idx);
 170             my $suffix= substr($key,$idx);
 171             my $i1= index($blob,$prefix)>=0;
 172             my $i2= index($blob,$suffix)>=0;
 173             if ($i1 and $i2) {
 174                 if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
 175                     print "changing: $key => $old_res->{$key} : $idx\n";
 176                 }
 177                 $res{$key}= $idx;
 178                 $appended{$prefix}++;
 179                 $appended{$suffix}++;
 180                 next KEY;
 181             } elsif ($i1) {
 182                 if (length $suffix <= length $append) {
 183                     $best= $idx;
 184                     $append= $suffix;
 185                     $best_prefix= $prefix;
 186                     $best_suffix= $suffix;
 187                 }
 188             } elsif ($i2) {
 189                 if (length $prefix <= length $append) {
 190                     $best= $idx;
 191                     $append= $prefix;
 192                     $best_prefix= $prefix;
 193                     $best_suffix= $suffix;
 194                 }
 195             }
 196         }
 197         if ($DEBUG and $old_res and $old_res->{$key} != $best) {
 198             print "changing: $key => $old_res->{$key} : $best\n";
 199         }
 200         #print "$best_prefix|$best_suffix => $best => $append\n";
 201         $res{$key}= $best;
 202         $blob .= $append;
 203         $appended{$best_prefix}++;
 204         $appended{$best_suffix}++;
 205     }
 206     my $b2 = "";
 207     foreach my $key (sort { length($b) <=> length($a) || $a cmp $b } keys %appended) {
 208         $b2 .= $key unless index($b2,$key)>=0;
 209     }
 210     if (length($b2)<length($blob)) {
 211         printf "Length old blob: %d length new blob: %d, recomputing using new blob\n", length($blob),length($b2);
 212         $blob= $b2;
 213         %appended=();
 214         goto REDO;
 215     } else {
 216         printf "Length old blob: %d length new blob: %d, keeping old blob\n", length($blob),length($b2);
 217     }
 218     die sprintf "not same size? %d != %d", 0+keys %res, 0+keys %$hash unless keys %res == keys %$hash;
 219     return ($blob,\%res);
 220 }
 221
 222
 223 sub blob_as_code {
 224     my ($blob,$blob_name)= @_;
 225
 226     $blob_name ||= "mph_blob";
 227
 228     # output the blob as C code.
 229     my @code= (sprintf "STATIC const unsigned char %s[] =\n",$blob_name);
 230     my $blob_len= length $blob;
 231     while (length($blob)) {
 232         push @code, sprintf qq(    "%s"), substr($blob,0,65,"");
 233         push @code, length $blob ? "\n" : ";\n";
 234     }
 235     push @code, "/* $blob_name length: $blob_len */\n";
 236     return join "",@code;
 237 }
 238
 239 sub print_includes {
 240     my $ofh= shift;
 241     print $ofh "#include <stdio.h>\n";
 242     print $ofh "#include <string.h>\n";
 243     print $ofh "#include <stdint.h>\n";
 244     print $ofh "\n";
 245 }
 246
 247 sub print_defines {
 248     my ($ofh,$defines)= @_;
 249
 250     my $key_len;
 251     foreach my $def (keys %$defines) {
 252         $key_len //= length $def;
 253         $key_len= length $def if $key_len < length $def;
 254     }
 255     foreach my $def (sort keys %$defines) {
 256         printf $ofh "#define %*s %5d\n", -$key_len, $def, $defines->{$def};
 257     }
 258     print $ofh "\n";
 259 }
 260
 261
 262 sub build_array_of_struct {
 263     my ($second_level,$blob)= @_;
 264
 265     my %defines;
 266     my %tests;
 267     my @rows;
 268     foreach my $row (@$second_level) {
 269         $defines{$row->{value}}= $row->{idx}+1;
 270         $tests{$row->{key}}= $defines{$row->{value}};
 271         my @u16= (
 272             $row->{seed2},
 273             index($blob,$row->{prefix}//0),
 274             index($blob,$row->{suffix}//0),
 275         );
 276         $_ > 0xFFFF and die "panic: value exceeds range of U16"
 277             for @u16;
 278         my @u8= (
 279             length($row->{prefix}),
 280             length($row->{suffix}),
 281         );
 282         $_ > 0xFF and die "panic: value exceeds range of U8"
 283             for @u8;
 284         push @rows, sprintf("  { %5d, %5d, %5d, %3d, %3d, %s }   /* %s%s */",
 285             @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix});
 286     }
 287     return \@rows,\%defines,\%tests;
 288 }
 289
 290 sub make_algo {
 291     my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows,
 292         $blob_name, $struct_name, $table_name, $match_name, $prefix) = @_;
 293
 294     $blob_name ||= "mph_blob";
 295     $struct_name ||= "mph_struct";
 296     $table_name ||= "mph_table";
 297     $prefix ||= "MPH";
 298
 299     my $n= 0+@$second_level;
 300     my $data_size= 0+@$second_level * 8 + length $smart_blob;
 301
 302     my @code = "#define ${prefix}_VALt I16\n\n";
 303     push @code, "/*\n";
 304     push @code, sprintf "rows: %s\n", $n;
 305     push @code, sprintf "seed: %s\n", $seed1;
 306     push @code, sprintf "full length of keys: %d\n", $length_all_keys;
 307     push @code, sprintf "blob length: %d\n", length $smart_blob;
 308     push @code, sprintf "ref length: %d\n", 0+@$second_level * 8;
 309     push @code, sprintf "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100;
 310     push @code, "*/\n\n";
 311
 312     push @code, blob_as_code($smart_blob, $blob_name);
 313     push @code, <<"EOF_CODE";
 314
 315 struct $struct_name {
 316     U16 seed2;
 317     U16 pfx;
 318     U16 sfx;
 319     U8  pfx_len;
 320     U8  sfx_len;
 321     ${prefix}_VALt value;
 322 };
 323
 324 EOF_CODE
 325
 326     push @code, "#define ${prefix}_RSHIFT $RSHIFT\n";
 327     push @code, "#define ${prefix}_BUCKETS $n\n\n";
 328     push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1;
 329     push @code, sprintf "STATIC const U32 ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST;
 330
 331     push @code, "/* The comments give the input key for the row it is in */\n";
 332     push @code, "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n";
 333     push @code, <<"EOF_CODE";
 334 ${prefix}_VALt $match_name( const unsigned char * const key, const U16 key_len ) {
 335     const unsigned char * ptr= key;
 336     const unsigned char * ptr_end= key + key_len;
 337     U32 h= ${prefix}_SEED1;
 338     U32 s;
 339     U32 n;
 340     do {
 341         h ^= NATIVE_TO_LATIN1(*ptr);    /* table collated in Latin1 */
 342         h *= ${prefix}_FNV_CONST;
 343     } while ( ++ptr < ptr_end );
 344     n= h % ${prefix}_BUCKETS;
 345     s = $table_name\[n].seed2;
 346     if (s) {
 347         h= (h >> ${prefix}_RSHIFT) ^ s;
 348         n = h % ${prefix}_BUCKETS;
 349         if (
 350             ( $table_name\[n].pfx_len + $table_name\[n].sfx_len == key_len ) &&
 351             ( memcmp($blob_name + $table_name\[n].pfx, key, $table_name\[n].pfx_len) == 0 ) &&
 352             ( !$table_name\[n].sfx_len || memcmp($blob_name + $table_name\[n].sfx,
 353                 key + $table_name\[n].pfx_len, $table_name\[n].sfx_len) == 0 )
 354         ) {
 355             return $table_name\[n].value;
 356         }
 357     }
 358     return 0;
 359 }
 360 EOF_CODE
 361
 362     return join "", @code;
 363 }
 364
 365 sub print_algo {
 366     my ($ofh, $second_level, $seed1, $long_blob, $smart_blob, $rows,
 367         $blob_name, $struct_name, $table_name, $match_name ) = @_;
 368
 369     if (!ref $ofh) {
 370         my $file= $ofh;
 371         undef $ofh;
 372         open $ofh, ">", $file
 373             or die "Failed to open '$file': $!";
 374     }
 375
 376     my $code = make_algo(
 377         $second_level, $seed1, $long_blob, $smart_blob, $rows,
 378         $blob_name, $struct_name, $table_name, $match_name );
 379     print $ofh $code;
 380 }
 381
 382 sub print_main {
 383     my ($ofh,$h_file,$match_name,$prefix)=@_;
 384     print $ofh <<"EOF_CODE";
 385 #include "$h_file"
 386
 387 int main(int argc, char *argv[]){
 388     int i;
 389     for (i=1; i<argc; i++) {
 390         unsigned char *key = (unsigned char *)argv[i];
 391         int key_len = strlen(argv[i]);
 392         printf("key: %s got: %d\\n", key, $match_name((unsigned char *)key,key_len));
 393     }
 394     return 0;
 395 }
 396 EOF_CODE
 397 }
 398
 399 # output the test Perl code.
 400 sub print_tests {
 401     my ($file, $tests_hash)= @_;
 402     open my $ofh, ">", $file
 403         or die "Failed to open '$file' for writing: $!";
 404     my $num_tests= 2 + keys %$tests_hash;
 405     print $ofh "use strict;\nuse warnings;\nuse Test::More tests => $num_tests;\nmy \@res;";
 406     my $bytes= 0;
 407     my @tests= sort keys %$tests_hash;
 408     print $ofh "\@res=`./mph_test '$tests[0]/should-not-match' 'should-not-match/$tests[0]'`;\n";
 409     print $ofh "ok( \$res[0] =~ /got: 0/,'proper prefix does not match');\n";
 410     print $ofh "ok( \$res[1] =~ /got: 0/,'proper suffix does not match');\n";
 411     while (@tests) {
 412         my @batch= splice @tests,0,10;
 413         my $batch_args= join " ", map { "'$_'" } @batch;
 414         print $ofh "\@res=`./mph_test $batch_args`;\n";
 415         foreach my $i (0..$#batch) {
 416             my $key= $batch[$i];
 417             my $want= $tests_hash->{$key};
 418             print $ofh "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n";
 419         }
 420     }
 421     close $ofh;
 422 }
 423
 424 sub print_test_binary {
 425     my ($file,$h_file, $second_level, $seed1, $length_all_keys,
 426         $smart_blob, $rows, $defines, $match_name, $prefix)= @_;
 427     open my $ofh, ">", $file
 428         or die "Failed to open '$file': $!";
 429     print_includes($ofh);
 430     print_defines($ofh, $defines);
 431     print_main($ofh,$h_file,$match_name,$prefix);
 432     close $ofh;
 433 }
 434
 435 sub make_mph_from_hash {
 436     my $hash= shift;
 437
 438     # we do this twice because often we can find longer prefixes on the second pass.
 439     my ($smart_blob, $res_to_split)= build_split_words($hash,0);
 440     {
 441         my ($smart_blob2, $res_to_split2)= build_split_words($hash,1);
 442         if (length($smart_blob) > length($smart_blob2)) {
 443             printf "Using preprocess-smart blob, length: %d (vs %d)\n", length $smart_blob2, length $smart_blob;
 444             $smart_blob= $smart_blob2;
 445             $res_to_split= $res_to_split2;
 446         } else {
 447             printf "Using greedy-smart blob, length: %d (vs %d)\n", length $smart_blob, length $smart_blob2;
 448         }
 449     }
 450     my ($seed1, $second_level, $length_all_keys)= build_perfect_hash($hash, $res_to_split);
 451     my ($rows, $defines, $tests)= build_array_of_struct($second_level, $smart_blob);
 452     return ($second_level, $seed1, $length_all_keys, $smart_blob, $rows, $defines, $tests);
 453 }
 454
 455 sub make_files {
 456     my ($hash,$base_name)= @_;
 457
 458     my $h_name= $base_name . "_algo.h";
 459     my $c_name= $base_name . "_test.c";
 460     my $p_name= $base_name . "_test.pl";
 461     my $blob_name= $base_name . "_blob";
 462     my $struct_name= $base_name . "_bucket_info";
 463     my $table_name= $base_name . "_table";
 464     my $match_name= $base_name . "_match";
 465     my $prefix= uc($base_name);
 466
 467     my ($second_level, $seed1, $length_all_keys,
 468         $smart_blob, $rows, $defines, $tests)= make_mph_from_hash( $hash );
 469     print_algo( $h_name,
 470         $second_level, $seed1, $length_all_keys, $smart_blob, $rows,
 471         $blob_name, $struct_name, $table_name, $match_name, $prefix );
 472     print_test_binary( $c_name, $h_name, $second_level, $seed1, $length_all_keys,
 473         $smart_blob, $rows, $defines,
 474         $match_name, $prefix );
 475     print_tests( $p_name, $tests );
 476 }
 477
 478 unless (caller) {
 479     my %hash;
 480     {
 481         no warnings;
 482         do "../perl/lib/unicore/UCD.pl";
 483         %hash= %utf8::loose_to_file_of;
 484     }
 485     if ($ENV{MERGE_KEYS}) {
 486         my @keys= keys %hash;
 487         foreach my $loose (keys %utf8::loose_property_name_of) {
 488             my $to= $utf8::loose_property_name_of{$loose};
 489             next if $to eq $loose;
 490             foreach my $key (@keys) {
 491                 my $copy= $key;
 492                 if ($copy=~s/^\Q$to\E(=|\z)/$loose$1/) {
 493                     #print "$key => $copy\n";
 494                     $hash{$copy}= $key;
 495                 }
 496             }
 497         }
 498     }
 499     foreach my $key (keys %hash) {
 500         my $munged= uc($key);
 501         $munged=~s/\W/__/g;
 502         $hash{$key} = $munged;
 503     }
 504
 505     my $name= shift @ARGV;
 506     $name ||= "mph";
 507     make_files(\%hash,$name);
 508 }
 509
 510 1;
 511 __END__