use Data::Dumper;
use Carp;
use Text::Wrap;
+use bignum; # Otherwise fails on 32-bit systems
my $DEBUG= 0;
my $RSHIFT= 8;
my $FNV_CONST= 16777619;
+# The basic idea is that you have a two level structure, and effectively
+# hash the key twice.
+#
+# The first hash finds a bucket in the array which contains a seed which
+# is used for the second hash, which then leads to a bucket with key
+# data which is compared against to determine if the key is a match.
+#
+# If the first hash finds no seed, then the key cannot match.
+#
+# In our case we cheat a bit, and hash the key only once, but use the
+# low bits for the first lookup and the high-bits for the second.
+#
+# So for instance:
+#
+# h= (h >> RSHIFT) ^ s;
+#
+# is how the second hash is computed. We right shift the original hash
+# value and then xor in the seed2, which will be non-zero.
+#
+# That then gives us the bucket which contains the key data we need to
+# match for a valid key.
+
sub _fnv {
my ($key, $seed)= @_;
-
my $hash = 0+$seed;
foreach my $char (split //, $key) {
$hash = $hash ^ ord($char);
$blob_name ||= "mph_blob";
# output the blob as C code.
- my @code= (sprintf "const unsigned char %s[] =\n",$blob_name);
+ my @code= (sprintf "STATIC const unsigned char %s[] =\n",$blob_name);
my $blob_len= length $blob;
while (length($blob)) {
push @code, sprintf qq( "%s"), substr($blob,0,65,"");
index($blob,$row->{prefix}//0),
index($blob,$row->{suffix}//0),
);
- $_ > 0xFFFF and die "panic: value exceeds range of uint16_t"
+ $_ > 0xFFFF and die "panic: value exceeds range of U16"
for @u16;
my @u8= (
length($row->{prefix}),
length($row->{suffix}),
);
- $_ > 0xFF and die "panic: value exceeds range of uint8_t"
+ $_ > 0xFF and die "panic: value exceeds range of U8"
for @u8;
- push @rows, sprintf(" { %5d, %5d, %5d, %3d, %3d, %s }",
- @u16, @u8, $row->{value} );
+ push @rows, sprintf(" { %5d, %5d, %5d, %3d, %3d, %s } /* %s%s */",
+ @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix});
}
return \@rows,\%defines,\%tests;
}
-sub print_algo {
- my ($ofh, $second_level, $seed1, $length_all_keys, $smart_blob, $rows,
+sub make_algo {
+ my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows,
$blob_name, $struct_name, $table_name, $match_name, $prefix) = @_;
$blob_name ||= "mph_blob";
$table_name ||= "mph_table";
$prefix ||= "MPH";
- if (!ref $ofh) {
- my $file= $ofh;
- undef $ofh;
- open $ofh, ">", $file
- or die "Failed to open '$file': $!";
- }
-
my $n= 0+@$second_level;
my $data_size= 0+@$second_level * 8 + length $smart_blob;
- print $ofh "/*\n";
- printf $ofh "rows: %s\n", $n;
- printf $ofh "seed: %s\n", $seed1;
- printf $ofh "full length of keys: %d\n", $length_all_keys;
- printf $ofh "blob length: %d\n", length $smart_blob;
- printf $ofh "ref length: %d\n", 0+@$second_level * 8;
- printf $ofh "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100;
- print $ofh "*/\n\n";
+ my @code = "#define ${prefix}_VALt I16\n\n";
+ push @code, "/*\n";
+ push @code, sprintf "rows: %s\n", $n;
+ push @code, sprintf "seed: %s\n", $seed1;
+ push @code, sprintf "full length of keys: %d\n", $length_all_keys;
+ push @code, sprintf "blob length: %d\n", length $smart_blob;
+ push @code, sprintf "ref length: %d\n", 0+@$second_level * 8;
+ push @code, sprintf "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100;
+ push @code, "*/\n\n";
- print $ofh blob_as_code($smart_blob, $blob_name);
- print $ofh <<"EOF_CODE";
+ push @code, blob_as_code($smart_blob, $blob_name);
+ push @code, <<"EOF_CODE";
struct $struct_name {
- uint16_t seed2;
- uint16_t pfx;
- uint16_t sfx;
- uint8_t pfx_len;
- uint8_t sfx_len;
+ U16 seed2;
+ U16 pfx;
+ U16 sfx;
+ U8 pfx_len;
+ U8 sfx_len;
${prefix}_VALt value;
};
EOF_CODE
- print $ofh "#define ${prefix}_RSHIFT $RSHIFT\n";
- print $ofh "#define ${prefix}_BUCKETS $n\n\n";
- printf $ofh "const uint32_t ${prefix}_SEED1 = 0x%08x;\n", $seed1;
- printf $ofh "const uint32_t ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST;
+ push @code, "#define ${prefix}_RSHIFT $RSHIFT\n";
+ push @code, "#define ${prefix}_BUCKETS $n\n\n";
+ push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1;
+ push @code, sprintf "STATIC const U32 ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST;
- print $ofh "\n";
- print $ofh "const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n";
- print $ofh <<"EOF_CODE";
-${prefix}_VALt $match_name( const unsigned char * const key, const uint16_t key_len ) {
+ push @code, "/* The comments give the input key for the row it is in */\n";
+ push @code, "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n";
+ push @code, <<"EOF_CODE";
+${prefix}_VALt $match_name( const unsigned char * const key, const U16 key_len ) {
const unsigned char * ptr= key;
const unsigned char * ptr_end= key + key_len;
- uint32_t h= ${prefix}_SEED1;
- uint32_t s;
- uint32_t n;
+ U32 h= ${prefix}_SEED1;
+ U32 s;
+ U32 n;
do {
- h ^= *ptr;
+ h ^= NATIVE_TO_LATIN1(*ptr); /* table collated in Latin1 */
h *= ${prefix}_FNV_CONST;
} while ( ++ptr < ptr_end );
n= h % ${prefix}_BUCKETS;
return 0;
}
EOF_CODE
+
+ return join "", @code;
+}
+
+sub print_algo {
+ my ($ofh, $second_level, $seed1, $long_blob, $smart_blob, $rows,
+ $blob_name, $struct_name, $table_name, $match_name ) = @_;
+
+ if (!ref $ofh) {
+ my $file= $ofh;
+ undef $ofh;
+ open $ofh, ">", $file
+ or die "Failed to open '$file': $!";
+ }
+
+ my $code = make_algo(
+ $second_level, $seed1, $long_blob, $smart_blob, $rows,
+ $blob_name, $struct_name, $table_name, $match_name );
+ print $ofh $code;
}
sub print_main {
my ($ofh,$h_file,$match_name,$prefix)=@_;
print $ofh <<"EOF_CODE";
-#define ${prefix}_VALt int16_t
#include "$h_file"
int main(int argc, char *argv[]){
my $hash= shift;
# we do this twice because often we can find longer prefixes on the second pass.
- my @keys= sort {length($b) <=> length($a) || $a cmp $b } keys %$hash;
-
my ($smart_blob, $res_to_split)= build_split_words($hash,0);
{
my ($smart_blob2, $res_to_split2)= build_split_words($hash,1);
my %hash;
{
no warnings;
- do "../perl/lib/unicore/Heavy.pl";
+ do "../perl/lib/unicore/UCD.pl";
%hash= %utf8::loose_to_file_of;
}
if ($ENV{MERGE_KEYS}) {