[perl5.git] / regen / mph.pl

package MinimalPerfectHash;
use strict;
use warnings;
use Data::Dumper;
use Carp;
use Text::Wrap;
use bignum;     # Otherwise fails on 32-bit systems

my $DEBUG= 0;
my $RSHIFT= 8;
my $FNV_CONST= 16777619;

# The basic idea is that you have a two level structure, and effectively
# hash the key twice.
#
# The first hash finds a bucket in the array which contains a seed which
# is used for the second hash, which then leads to a bucket with key
# data which is compared against to determine if the key is a match.
#
# If the first hash finds no seed, then the key cannot match.
#
# In our case we cheat a bit, and hash the key only once, but use the
# low bits for the first lookup and the high-bits for the second.
#
# So for instance:
#
#           h= (h >> RSHIFT) ^ s;
#
# is how the second hash is computed. We right shift the original hash
# value  and then xor in the seed2, which will be non-zero.
#
# That then gives us the bucket which contains the key data we need to
# match for a valid key.

sub _fnv {
    my ($key, $seed)= @_;
    my $hash = 0+$seed;
    foreach my $char (split //, $key) {
        $hash = $hash ^ ord($char);
        $hash = ($hash * $FNV_CONST) & 0xFFFFFFFF;
    }
    return $hash;
}

sub build_perfect_hash {
    my ($hash, $split_pos)= @_;

    my $n= 0+keys %$hash;
    my $max_h= 0xFFFFFFFF;
    $max_h -= $max_h % $n; # this just avoids a tiny bit bias
    my $seed1= unpack("N", "Perl") - 1;
    my $hash_to_key;
    my $key_to_hash;
    my $length_all_keys;
    my $key_buckets;
    SEED1:
    for ($seed1++;1;$seed1++) {
        my %hash_to_key;
        my %key_to_hash;
        my %key_buckets;
        my %high;
        $length_all_keys= 0;
        foreach my $key (sort keys %$hash) {
            $length_all_keys += length $key;
            my $h= _fnv($key,$seed1);
            next SEED1 if $h >= $max_h; # check if this hash would bias, and if so find a new seed
            next SEED1 if exists $hash_to_key{$h};
            next SEED1 if $high{$h >> $RSHIFT}++;
            $hash_to_key{$h}= $key;
            $key_to_hash{$key}= $h;
            push @{$key_buckets{$h % $n}}, $key;
        }
        $hash_to_key= \%hash_to_key;
        $key_to_hash= \%key_to_hash;
        $key_buckets= \%key_buckets;
        last SEED1;
    }

    my %token;
    my @first_level;
    my @second_level;
    foreach my $first_idx (sort { @{$key_buckets->{$b}} <=> @{$key_buckets->{$a}} || $a <=> $b } keys %$key_buckets) {
        my $keys= $key_buckets->{$first_idx};
        #printf "got %d keys in bucket %d\n", 0+@$keys, $first_idx;
        my $seed2;
        SEED2:
        for ($seed2=1;1;$seed2++) {
            goto FIND_SEED if $seed2 > 0xFFFF;
            my @idx= map {
                ( ( ( $key_to_hash->{$_} >> $RSHIFT ) ^ $seed2 ) & 0xFFFFFFFF ) % $n
            } @$keys;
            my %seen;
            next SEED2 if grep { $second_level[$_] || $seen{$_}++ } @idx;
            $first_level[$first_idx]= $seed2;
            @second_level[@idx]= map {
                my $sp= $split_pos->{$_} // die "no split pos for '$_':$!";
                my ($prefix,$suffix)= unpack "A${sp}A*", $_;

                +{
                    key     => $_,
                    prefix  => $prefix,
                    suffix  => $suffix,
                    hash    => $key_to_hash->{$_},
                    value   => $hash->{$_},
                    seed2   => 0,
                }
            } @$keys;
            last;
        }

    }
    $second_level[$_]{seed2}= $first_level[$_]||0, $second_level[$_]{idx}= $_ for 0 .. $#second_level;

    return $seed1, \@second_level, $length_all_keys;
}

sub build_split_words {
    my ($hash, $preprocess, $blob, $old_res)= @_;
    my %appended;
    $blob //= "";
    if ($preprocess) {
        my %parts;
        foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %$hash) {
            my ($prefix,$suffix);
            if ($key=~/^([^=]+=)([^=]+)\z/) {
                ($prefix,$suffix)= ($1, $2);
                $parts{$suffix}++;
                #$parts{$prefix}++;
            } else {
                $prefix= $key;
                $parts{$prefix}++;
            }

        }
        foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %parts) {
            $blob .= $key . "\0";
        }
        printf "Using preprocessing, initial blob size %d\n", length($blob);
    } else {
        printf "No preprocessing, initial blob size %d\n", length($blob);
    }
    my %res;

    REDO:
    %res= ();
    KEY:
    foreach my $key (
        sort {
            (length($b) <=> length($a)) ||
            ($a cmp $b)
        }
        keys %$hash
    ) {
        next if exists $res{$key};
        if (index($blob,$key) >= 0 ) {
            my $idx= length($key);
            if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
                print "changing: $key => $old_res->{$key} : $idx\n";
            }
            $res{$key}= $idx;
            next KEY;
        }
        my $best= length($key);
        my $append= $key;
        my $min= 0; #length $key >= 4 ? 4 : 0;
        my $best_prefix;
        my $best_suffix;
        foreach my $idx (reverse $min .. length($key)) {
            my $prefix= substr($key,0,$idx);
            my $suffix= substr($key,$idx);
            my $i1= index($blob,$prefix)>=0;
            my $i2= index($blob,$suffix)>=0;
            if ($i1 and $i2) {
                if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
                    print "changing: $key => $old_res->{$key} : $idx\n";
                }
                $res{$key}= $idx;
                $appended{$prefix}++;
                $appended{$suffix}++;
                next KEY;
            } elsif ($i1) {
                if (length $suffix <= length $append) {
                    $best= $idx;
                    $append= $suffix;
                    $best_prefix= $prefix;
                    $best_suffix= $suffix;
                }
            } elsif ($i2) {
                if (length $prefix <= length $append) {
                    $best= $idx;
                    $append= $prefix;
                    $best_prefix= $prefix;
                    $best_suffix= $suffix;
                }
            }
        }
        if ($DEBUG and $old_res and $old_res->{$key} != $best) {
            print "changing: $key => $old_res->{$key} : $best\n";
        }
        #print "$best_prefix|$best_suffix => $best => $append\n";
        $res{$key}= $best;
        $blob .= $append;
        $appended{$best_prefix}++;
        $appended{$best_suffix}++;
    }
    my $b2 = "";
    foreach my $key (sort { length($b) <=> length($a) || $a cmp $b } keys %appended) {
        $b2 .= $key unless index($b2,$key)>=0;
    }
    if (length($b2)<length($blob)) {
        printf "Length old blob: %d length new blob: %d, recomputing using new blob\n", length($blob),length($b2);
        $blob= $b2;
        %appended=();
        goto REDO;
    } else {
        printf "Length old blob: %d length new blob: %d, keeping old blob\n", length($blob),length($b2);
    }
    die sprintf "not same size? %d != %d", 0+keys %res, 0+keys %$hash unless keys %res == keys %$hash;
    return ($blob,\%res);
}


sub blob_as_code {
    my ($blob,$blob_name)= @_;

    $blob_name ||= "mph_blob";

    # output the blob as C code.
    my @code= (sprintf "STATIC const unsigned char %s[] =\n",$blob_name);
    my $blob_len= length $blob;
    while (length($blob)) {
        push @code, sprintf qq(    "%s"), substr($blob,0,65,"");
        push @code, length $blob ? "\n" : ";\n";
    }
    push @code, "/* $blob_name length: $blob_len */\n";
    return join "",@code;
}

sub print_includes {
    my $ofh= shift;
    print $ofh "#include <stdio.h>\n";
    print $ofh "#include <string.h>\n";
    print $ofh "#include <stdint.h>\n";
    print $ofh "\n";
}

sub print_defines {
    my ($ofh,$defines)= @_;

    my $key_len;
    foreach my $def (keys %$defines) {
        $key_len //= length $def;
        $key_len= length $def if $key_len < length $def;
    }
    foreach my $def (sort keys %$defines) {
        printf $ofh "#define %*s %5d\n", -$key_len, $def, $defines->{$def};
    }
    print $ofh "\n";
}


sub build_array_of_struct {
    my ($second_level,$blob)= @_;

    my %defines;
    my %tests;
    my @rows;
    foreach my $row (@$second_level) {
        $defines{$row->{value}}= $row->{idx}+1;
        $tests{$row->{key}}= $defines{$row->{value}};
        my @u16= (
            $row->{seed2},
            index($blob,$row->{prefix}//0),
            index($blob,$row->{suffix}//0),
        );
        $_ > 0xFFFF and die "panic: value exceeds range of U16"
            for @u16;
        my @u8= (
            length($row->{prefix}),
            length($row->{suffix}),
        );
        $_ > 0xFF and die "panic: value exceeds range of U8"
            for @u8;
        push @rows, sprintf("  { %5d, %5d, %5d, %3d, %3d, %s }   /* %s%s */",
            @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix});
    }
    return \@rows,\%defines,\%tests;
}

sub make_algo {
    my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows,
        $blob_name, $struct_name, $table_name, $match_name, $prefix) = @_;

    $blob_name ||= "mph_blob";
    $struct_name ||= "mph_struct";
    $table_name ||= "mph_table";
    $prefix ||= "MPH";

    my $n= 0+@$second_level;
    my $data_size= 0+@$second_level * 8 + length $smart_blob;

    my @code = "#define ${prefix}_VALt I16\n\n";
    push @code, "/*\n";
    push @code, sprintf "rows: %s\n", $n;
    push @code, sprintf "seed: %s\n", $seed1;
    push @code, sprintf "full length of keys: %d\n", $length_all_keys;
    push @code, sprintf "blob length: %d\n", length $smart_blob;
    push @code, sprintf "ref length: %d\n", 0+@$second_level * 8;
    push @code, sprintf "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100;
    push @code, "*/\n\n";

    push @code, blob_as_code($smart_blob, $blob_name);
    push @code, <<"EOF_CODE";

struct $struct_name {
    U16 seed2;
    U16 pfx;
    U16 sfx;
    U8  pfx_len;
    U8  sfx_len;
    ${prefix}_VALt value;
};

EOF_CODE

    push @code, "#define ${prefix}_RSHIFT $RSHIFT\n";
    push @code, "#define ${prefix}_BUCKETS $n\n\n";
    push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1;
    push @code, sprintf "STATIC const U32 ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST;

    push @code, "/* The comments give the input key for the row it is in */\n";
    push @code, "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n";
    push @code, <<"EOF_CODE";
${prefix}_VALt $match_name( const unsigned char * const key, const U16 key_len ) {
    const unsigned char * ptr= key;
    const unsigned char * ptr_end= key + key_len;
    U32 h= ${prefix}_SEED1;
    U32 s;
    U32 n;
    do {
        h ^= NATIVE_TO_LATIN1(*ptr);    /* table collated in Latin1 */
        h *= ${prefix}_FNV_CONST;
    } while ( ++ptr < ptr_end );
    n= h % ${prefix}_BUCKETS;
    s = $table_name\[n].seed2;
    if (s) {
        h= (h >> ${prefix}_RSHIFT) ^ s;
        n = h % ${prefix}_BUCKETS;
        if (
            ( $table_name\[n].pfx_len + $table_name\[n].sfx_len == key_len ) &&
            ( memcmp($blob_name + $table_name\[n].pfx, key, $table_name\[n].pfx_len) == 0 ) &&
            ( !$table_name\[n].sfx_len || memcmp($blob_name + $table_name\[n].sfx,
                key + $table_name\[n].pfx_len, $table_name\[n].sfx_len) == 0 )
        ) {
            return $table_name\[n].value;
        }
    }
    return 0;
}
EOF_CODE

    return join "", @code;
}

sub print_algo {
    my ($ofh, $second_level, $seed1, $long_blob, $smart_blob, $rows,
        $blob_name, $struct_name, $table_name, $match_name ) = @_;

    if (!ref $ofh) {
        my $file= $ofh;
        undef $ofh;
        open $ofh, ">", $file
            or die "Failed to open '$file': $!";
    }

    my $code = make_algo(
        $second_level, $seed1, $long_blob, $smart_blob, $rows,
        $blob_name, $struct_name, $table_name, $match_name );
    print $ofh $code;
}

sub print_main {
    my ($ofh,$h_file,$match_name,$prefix)=@_;
    print $ofh <<"EOF_CODE";
#include "$h_file"

int main(int argc, char *argv[]){
    int i;
    for (i=1; i<argc; i++) {
        unsigned char *key = (unsigned char *)argv[i];
        int key_len = strlen(argv[i]);
        printf("key: %s got: %d\\n", key, $match_name((unsigned char *)key,key_len));
    }
    return 0;
}
EOF_CODE
}

# output the test Perl code.
sub print_tests {
    my ($file, $tests_hash)= @_;
    open my $ofh, ">", $file
        or die "Failed to open '$file' for writing: $!";
    my $num_tests= 2 + keys %$tests_hash;
    print $ofh "use strict;\nuse warnings;\nuse Test::More tests => $num_tests;\nmy \@res;";
    my $bytes= 0;
    my @tests= sort keys %$tests_hash;
    print $ofh "\@res=`./mph_test '$tests[0]/should-not-match' 'should-not-match/$tests[0]'`;\n";
    print $ofh "ok( \$res[0] =~ /got: 0/,'proper prefix does not match');\n";
    print $ofh "ok( \$res[1] =~ /got: 0/,'proper suffix does not match');\n";
    while (@tests) {
        my @batch= splice @tests,0,10;
        my $batch_args= join " ", map { "'$_'" } @batch;
        print $ofh "\@res=`./mph_test $batch_args`;\n";
        foreach my $i (0..$#batch) {
            my $key= $batch[$i];
            my $want= $tests_hash->{$key};
            print $ofh "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n";
        }
    }
    close $ofh;
}

sub print_test_binary {
    my ($file,$h_file, $second_level, $seed1, $length_all_keys,
        $smart_blob, $rows, $defines, $match_name, $prefix)= @_;
    open my $ofh, ">", $file
        or die "Failed to open '$file': $!";
    print_includes($ofh);
    print_defines($ofh, $defines);
    print_main($ofh,$h_file,$match_name,$prefix);
    close $ofh;
}

sub make_mph_from_hash {
    my $hash= shift;

    # we do this twice because often we can find longer prefixes on the second pass.
    my ($smart_blob, $res_to_split)= build_split_words($hash,0);
    {
        my ($smart_blob2, $res_to_split2)= build_split_words($hash,1);
        if (length($smart_blob) > length($smart_blob2)) {
            printf "Using preprocess-smart blob, length: %d (vs %d)\n", length $smart_blob2, length $smart_blob;
            $smart_blob= $smart_blob2;
            $res_to_split= $res_to_split2;
        } else {
            printf "Using greedy-smart blob, length: %d (vs %d)\n", length $smart_blob, length $smart_blob2;
        }
    }
    my ($seed1, $second_level, $length_all_keys)= build_perfect_hash($hash, $res_to_split);
    my ($rows, $defines, $tests)= build_array_of_struct($second_level, $smart_blob);
    return ($second_level, $seed1, $length_all_keys, $smart_blob, $rows, $defines, $tests);
}

sub make_files {
    my ($hash,$base_name)= @_;

    my $h_name= $base_name . "_algo.h";
    my $c_name= $base_name . "_test.c";
    my $p_name= $base_name . "_test.pl";
    my $blob_name= $base_name . "_blob";
    my $struct_name= $base_name . "_bucket_info";
    my $table_name= $base_name . "_table";
    my $match_name= $base_name . "_match";
    my $prefix= uc($base_name);

    my ($second_level, $seed1, $length_all_keys,
        $smart_blob, $rows, $defines, $tests)= make_mph_from_hash( $hash );
    print_algo( $h_name,
        $second_level, $seed1, $length_all_keys, $smart_blob, $rows,
        $blob_name, $struct_name, $table_name, $match_name, $prefix );
    print_test_binary( $c_name, $h_name, $second_level, $seed1, $length_all_keys,
        $smart_blob, $rows, $defines,
        $match_name, $prefix );
    print_tests( $p_name, $tests );
}

unless (caller) {
    my %hash;
    {
        no warnings;
        do "../perl/lib/unicore/UCD.pl";
        %hash= %utf8::loose_to_file_of;
    }
    if ($ENV{MERGE_KEYS}) {
        my @keys= keys %hash;
        foreach my $loose (keys %utf8::loose_property_name_of) {
            my $to= $utf8::loose_property_name_of{$loose};
            next if $to eq $loose;
            foreach my $key (@keys) {
                my $copy= $key;
                if ($copy=~s/^\Q$to\E(=|\z)/$loose$1/) {
                    #print "$key => $copy\n";
                    $hash{$copy}= $key;
                }
            }
        }
    }
    foreach my $key (keys %hash) {
        my $munged= uc($key);
        $munged=~s/\W/__/g;
        $hash{$key} = $munged;
    }

    my $name= shift @ARGV;
    $name ||= "mph";
    make_files(\%hash,$name);
}

1;
__END__
Commit	Line	Data
1a4d3814 YO	1	package MinimalPerfectHash;
	2	use strict;
	3	use warnings;
	4	use Data::Dumper;
	5	use Carp;
	6	use Text::Wrap;
00d503c4	7	use bignum; # Otherwise fails on 32-bit systems
1a4d3814 YO	8
	9	my $DEBUG= 0;
	10	my $RSHIFT= 8;
	11	my $FNV_CONST= 16777619;
	12
db027c8a KW	13	# The basic idea is that you have a two level structure, and effectively
	14	# hash the key twice.
	15	#
	16	# The first hash finds a bucket in the array which contains a seed which
	17	# is used for the second hash, which then leads to a bucket with key
	18	# data which is compared against to determine if the key is a match.
	19	#
	20	# If the first hash finds no seed, then the key cannot match.
	21	#
	22	# In our case we cheat a bit, and hash the key only once, but use the
	23	# low bits for the first lookup and the high-bits for the second.
	24	#
	25	# So for instance:
	26	#
	27	# h= (h >> RSHIFT) ^ s;
	28	#
	29	# is how the second hash is computed. We right shift the original hash
	30	# value and then xor in the seed2, which will be non-zero.
	31	#
	32	# That then gives us the bucket which contains the key data we need to
	33	# match for a valid key.
	34
1a4d3814 YO	35	sub _fnv {
1a4d3814 YO	36	my ($key, $seed)= @_;
1a4d3814 YO	37	my $hash = 0+$seed;
	38	foreach my $char (split //, $key) {
	39	$hash = $hash ^ ord($char);
	40	$hash = ($hash * $FNV_CONST) & 0xFFFFFFFF;
	41	}
	42	return $hash;
	43	}
	44
	45	sub build_perfect_hash {
	46	my ($hash, $split_pos)= @_;
	47
	48	my $n= 0+keys %$hash;
	49	my $max_h= 0xFFFFFFFF;
	50	$max_h -= $max_h % $n; # this just avoids a tiny bit bias
	51	my $seed1= unpack("N", "Perl") - 1;
	52	my $hash_to_key;
	53	my $key_to_hash;
	54	my $length_all_keys;
	55	my $key_buckets;
	56	SEED1:
	57	for ($seed1++;1;$seed1++) {
	58	my %hash_to_key;
	59	my %key_to_hash;
	60	my %key_buckets;
	61	my %high;
	62	$length_all_keys= 0;
	63	foreach my $key (sort keys %$hash) {
	64	$length_all_keys += length $key;
	65	my $h= _fnv($key,$seed1);
	66	next SEED1 if $h >= $max_h; # check if this hash would bias, and if so find a new seed
	67	next SEED1 if exists $hash_to_key{$h};
	68	next SEED1 if $high{$h >> $RSHIFT}++;
	69	$hash_to_key{$h}= $key;
	70	$key_to_hash{$key}= $h;
	71	push @{$key_buckets{$h % $n}}, $key;
	72	}
	73	$hash_to_key= \%hash_to_key;
	74	$key_to_hash= \%key_to_hash;
	75	$key_buckets= \%key_buckets;
	76	last SEED1;
	77	}
	78
	79	my %token;
	80	my @first_level;
	81	my @second_level;
	82	foreach my $first_idx (sort { @{$key_buckets->{$b}} <=> @{$key_buckets->{$a}} \|\| $a <=> $b } keys %$key_buckets) {
	83	my $keys= $key_buckets->{$first_idx};
	84	#printf "got %d keys in bucket %d\n", 0+@$keys, $first_idx;
	85	my $seed2;
	86	SEED2:
	87	for ($seed2=1;1;$seed2++) {
	88	goto FIND_SEED if $seed2 > 0xFFFF;
	89	my @idx= map {
	90	( ( ( $key_to_hash->{$_} >> $RSHIFT ) ^ $seed2 ) & 0xFFFFFFFF ) % $n
	91	} @$keys;
	92	my %seen;
	93	next SEED2 if grep { $second_level[$_] \|\| $seen{$_}++ } @idx;
	94	$first_level[$first_idx]= $seed2;
	95	@second_level[@idx]= map {
	96	my $sp= $split_pos->{$_} // die "no split pos for '$_':$!";
	97	my ($prefix,$suffix)= unpack "A${sp}A*", $_;
	98
	99	+{
	100	key => $_,
101	prefix => $prefix,
102	suffix => $suffix,
103	hash => $key_to_hash->{$_},
104	value => $hash->{$_},
105	seed2 => 0,
106	}
107	} @$keys;
108	last;
109	}
110
111	}
112	$second_level[$_]{seed2}= $first_level[$_]\|\|0, $second_level[$_]{idx}= $_ for 0 .. $#second_level;
113
114	return $seed1, \@second_level, $length_all_keys;
115	}
116
117	sub build_split_words {
118	my ($hash, $preprocess, $blob, $old_res)= @_;
119	my %appended;
120	$blob //= "";
121	if ($preprocess) {
122	my %parts;
123	foreach my $key (sort {length($b) <=> length($a) \|\| $a cmp $b } keys %$hash) {
124	my ($prefix,$suffix);
125	if ($key=~/^([^=]+=)([^=]+)\z/) {
126	($prefix,$suffix)= ($1, $2);
127	$parts{$suffix}++;
128	#$parts{$prefix}++;
129	} else {
130	$prefix= $key;
131	$parts{$prefix}++;
132	}
133
134	}
135	foreach my $key (sort {length($b) <=> length($a) \|\| $a cmp $b } keys %parts) {
136	$blob .= $key . "\0";
137	}
138	printf "Using preprocessing, initial blob size %d\n", length($blob);
139	} else {
140	printf "No preprocessing, initial blob size %d\n", length($blob);
141	}
142	my %res;
143
144	REDO:
145	%res= ();
146	KEY:
147	foreach my $key (
148	sort {
149	(length($b) <=> length($a)) \|\|
150	($a cmp $b)
151	}
152	keys %$hash
153	) {
154	next if exists $res{$key};
155	if (index($blob,$key) >= 0 ) {
156	my $idx= length($key);
157	if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
158	print "changing: $key => $old_res->{$key} : $idx\n";
159	}
160	$res{$key}= $idx;
161	next KEY;
162	}
163	my $best= length($key);
164	my $append= $key;
165	my $min= 0; #length $key >= 4 ? 4 : 0;
166	my $best_prefix;
167	my $best_suffix;
168	foreach my $idx (reverse $min .. length($key)) {
169	my $prefix= substr($key,0,$idx);
170	my $suffix= substr($key,$idx);
171	my $i1= index($blob,$prefix)>=0;
172	my $i2= index($blob,$suffix)>=0;
173	if ($i1 and $i2) {
174	if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
175	print "changing: $key => $old_res->{$key} : $idx\n";
176	}
177	$res{$key}= $idx;
178	$appended{$prefix}++;
179	$appended{$suffix}++;
180	next KEY;
181	} elsif ($i1) {
182	if (length $suffix <= length $append) {
183	$best= $idx;
184	$append= $suffix;
185	$best_prefix= $prefix;
186	$best_suffix= $suffix;
187	}
188	} elsif ($i2) {
189	if (length $prefix <= length $append) {
190	$best= $idx;
191	$append= $prefix;
192	$best_prefix= $prefix;
193	$best_suffix= $suffix;
194	}
195	}
196	}
197	if ($DEBUG and $old_res and $old_res->{$key} != $best) {
198	print "changing: $key => $old_res->{$key} : $best\n";
199	}
200	#print "$best_prefix\|$best_suffix => $best => $append\n";
201	$res{$key}= $best;
202	$blob .= $append;
203	$appended{$best_prefix}++;
204	$appended{$best_suffix}++;
205	}
206	my $b2 = "";
207	foreach my $key (sort { length($b) <=> length($a) \|\| $a cmp $b } keys %appended) {
208	$b2 .= $key unless index($b2,$key)>=0;
209	}
210	if (length($b2)<length($blob)) {
211	printf "Length old blob: %d length new blob: %d, recomputing using new blob\n", length($blob),length($b2);
212	$blob= $b2;
213	%appended=();
214	goto REDO;
215	} else {
216	printf "Length old blob: %d length new blob: %d, keeping old blob\n", length($blob),length($b2);
217	}
218	die sprintf "not same size? %d != %d", 0+keys %res, 0+keys %$hash unless keys %res == keys %$hash;
219	return ($blob,\%res);
220	}
221
222
223	sub blob_as_code {
224	my ($blob,$blob_name)= @_;
225
226	$blob_name \|\|= "mph_blob";
227
228	# output the blob as C code.
9923c0b5	229	my @code= (sprintf "STATIC const unsigned char %s[] =\n",$blob_name);
1a4d3814 YO	230	my $blob_len= length $blob;
	231	while (length($blob)) {
	232	push @code, sprintf qq( "%s"), substr($blob,0,65,"");
	233	push @code, length $blob ? "\n" : ";\n";
	234	}
	235	push @code, "/* $blob_name length: $blob_len */\n";
	236	return join "",@code;
	237	}
	238
	239	sub print_includes {
	240	my $ofh= shift;
	241	print $ofh "#include <stdio.h>\n";
	242	print $ofh "#include <string.h>\n";
	243	print $ofh "#include <stdint.h>\n";
	244	print $ofh "\n";
	245	}
	246
	247	sub print_defines {
	248	my ($ofh,$defines)= @_;
	249
	250	my $key_len;
	251	foreach my $def (keys %$defines) {
	252	$key_len //= length $def;
	253	$key_len= length $def if $key_len < length $def;
	254	}
	255	foreach my $def (sort keys %$defines) {
	256	printf $ofh "#define %*s %5d\n", -$key_len, $def, $defines->{$def};
	257	}
	258	print $ofh "\n";
	259	}
	260
	261
	262	sub build_array_of_struct {
	263	my ($second_level,$blob)= @_;
	264
	265	my %defines;
	266	my %tests;
	267	my @rows;
	268	foreach my $row (@$second_level) {
	269	$defines{$row->{value}}= $row->{idx}+1;
	270	$tests{$row->{key}}= $defines{$row->{value}};
	271	my @u16= (
	272	$row->{seed2},
	273	index($blob,$row->{prefix}//0),
	274	index($blob,$row->{suffix}//0),
	275	);
9923c0b5	276	$_ > 0xFFFF and die "panic: value exceeds range of U16"
1a4d3814 YO	277	for @u16;
	278	my @u8= (
	279	length($row->{prefix}),
	280	length($row->{suffix}),
	281	);
9923c0b5	282	$_ > 0xFF and die "panic: value exceeds range of U8"
1a4d3814	283	for @u8;
c2300ef8 KW	284	push @rows, sprintf(" { %5d, %5d, %5d, %3d, %3d, %s } /* %s%s */",
c2300ef8 KW	285	@u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix});
1a4d3814 YO	286	}
	287	return \@rows,\%defines,\%tests;
	288	}
	289
64affa0c KW	290	sub make_algo {
64affa0c KW	291	my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows,
1a4d3814 YO	292	$blob_name, $struct_name, $table_name, $match_name, $prefix) = @_;
	293
	294	$blob_name \|\|= "mph_blob";
	295	$struct_name \|\|= "mph_struct";
	296	$table_name \|\|= "mph_table";
	297	$prefix \|\|= "MPH";
	298
1a4d3814 YO	299	my $n= 0+@$second_level;
	300	my $data_size= 0+@$second_level * 8 + length $smart_blob;
	301
64affa0c KW	302	my @code = "#define ${prefix}_VALt I16\n\n";
	303	push @code, "/*\n";
	304	push @code, sprintf "rows: %s\n", $n;
	305	push @code, sprintf "seed: %s\n", $seed1;
	306	push @code, sprintf "full length of keys: %d\n", $length_all_keys;
	307	push @code, sprintf "blob length: %d\n", length $smart_blob;
	308	push @code, sprintf "ref length: %d\n", 0+@$second_level * 8;
	309	push @code, sprintf "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100;
	310	push @code, "*/\n\n";
	311
	312	push @code, blob_as_code($smart_blob, $blob_name);
	313	push @code, <<"EOF_CODE";
1a4d3814 YO	314
1a4d3814 YO	315	struct $struct_name {
9923c0b5 KW	316	U16 seed2;
	317	U16 pfx;
	318	U16 sfx;
	319	U8 pfx_len;
	320	U8 sfx_len;
1a4d3814 YO	321	${prefix}_VALt value;
	322	};
	323
	324	EOF_CODE
	325
64affa0c KW	326	push @code, "#define ${prefix}_RSHIFT $RSHIFT\n";
	327	push @code, "#define ${prefix}_BUCKETS $n\n\n";
	328	push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1;
	329	push @code, sprintf "STATIC const U32 ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST;
1a4d3814	330
db55a61c	331	push @code, "/* The comments give the input key for the row it is in */\n";
64affa0c KW	332	push @code, "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n";
64affa0c KW	333	push @code, <<"EOF_CODE";
9923c0b5	334	${prefix}_VALt $match_name( const unsigned char * const key, const U16 key_len ) {
1a4d3814 YO	335	const unsigned char * ptr= key;
1a4d3814 YO	336	const unsigned char * ptr_end= key + key_len;
9923c0b5 KW	337	U32 h= ${prefix}_SEED1;
	338	U32 s;
	339	U32 n;
1a4d3814	340	do {
5fea3644	341	h ^= NATIVE_TO_LATIN1(ptr); / table collated in Latin1 */
1a4d3814 YO	342	h *= ${prefix}_FNV_CONST;
	343	} while ( ++ptr < ptr_end );
	344	n= h % ${prefix}_BUCKETS;
	345	s = $table_name\[n].seed2;
	346	if (s) {
	347	h= (h >> ${prefix}_RSHIFT) ^ s;
	348	n = h % ${prefix}_BUCKETS;
	349	if (
	350	( $table_name\[n].pfx_len + $table_name\[n].sfx_len == key_len ) &&
	351	( memcmp($blob_name + $table_name\[n].pfx, key, $table_name\[n].pfx_len) == 0 ) &&
	352	( !$table_name\[n].sfx_len \|\| memcmp($blob_name + $table_name\[n].sfx,
	353	key + $table_name\[n].pfx_len, $table_name\[n].sfx_len) == 0 )
	354	) {
	355	return $table_name\[n].value;
	356	}
	357	}
	358	return 0;
	359	}
	360	EOF_CODE
64affa0c KW	361
	362	return join "", @code;
	363	}
	364
	365	sub print_algo {
	366	my ($ofh, $second_level, $seed1, $long_blob, $smart_blob, $rows,
	367	$blob_name, $struct_name, $table_name, $match_name ) = @_;
	368
	369	if (!ref $ofh) {
	370	my $file= $ofh;
	371	undef $ofh;
	372	open $ofh, ">", $file
	373	or die "Failed to open '$file': $!";
	374	}
	375
	376	my $code = make_algo(
	377	$second_level, $seed1, $long_blob, $smart_blob, $rows,
	378	$blob_name, $struct_name, $table_name, $match_name );
	379	print $ofh $code;
1a4d3814 YO	380	}
	381
	382	sub print_main {
	383	my ($ofh,$h_file,$match_name,$prefix)=@_;
	384	print $ofh <<"EOF_CODE";
1a4d3814 YO	385	#include "$h_file"
	386
	387	int main(int argc, char *argv[]){
	388	int i;
	389	for (i=1; i<argc; i++) {
	390	unsigned char key = (unsigned char )argv[i];
	391	int key_len = strlen(argv[i]);
	392	printf("key: %s got: %d\\n", key, $match_name((unsigned char *)key,key_len));
	393	}
	394	return 0;
	395	}
	396	EOF_CODE
	397	}
	398
	399	# output the test Perl code.
	400	sub print_tests {
	401	my ($file, $tests_hash)= @_;
	402	open my $ofh, ">", $file
	403	or die "Failed to open '$file' for writing: $!";
	404	my $num_tests= 2 + keys %$tests_hash;
	405	print $ofh "use strict;\nuse warnings;\nuse Test::More tests => $num_tests;\nmy \@res;";
	406	my $bytes= 0;
	407	my @tests= sort keys %$tests_hash;
	408	print $ofh "\@res=`./mph_test '$tests[0]/should-not-match' 'should-not-match/$tests[0]'`;\n";
	409	print $ofh "ok( \$res[0] =~ /got: 0/,'proper prefix does not match');\n";
	410	print $ofh "ok( \$res[1] =~ /got: 0/,'proper suffix does not match');\n";
	411	while (@tests) {
	412	my @batch= splice @tests,0,10;
	413	my $batch_args= join " ", map { "'$_'" } @batch;
	414	print $ofh "\@res=`./mph_test $batch_args`;\n";
	415	foreach my $i (0..$#batch) {
	416	my $key= $batch[$i];
	417	my $want= $tests_hash->{$key};
	418	print $ofh "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n";
	419	}
	420	}
	421	close $ofh;
	422	}
	423
	424	sub print_test_binary {
	425	my ($file,$h_file, $second_level, $seed1, $length_all_keys,
	426	$smart_blob, $rows, $defines, $match_name, $prefix)= @_;
	427	open my $ofh, ">", $file
	428	or die "Failed to open '$file': $!";
	429	print_includes($ofh);
	430	print_defines($ofh, $defines);
	431	print_main($ofh,$h_file,$match_name,$prefix);
	432	close $ofh;
	433	}
	434
	435	sub make_mph_from_hash {
	436	my $hash= shift;
	437
	438	# we do this twice because often we can find longer prefixes on the second pass.
1a4d3814 YO	439	my ($smart_blob, $res_to_split)= build_split_words($hash,0);
	440	{
	441	my ($smart_blob2, $res_to_split2)= build_split_words($hash,1);
	442	if (length($smart_blob) > length($smart_blob2)) {
	443	printf "Using preprocess-smart blob, length: %d (vs %d)\n", length $smart_blob2, length $smart_blob;
	444	$smart_blob= $smart_blob2;
	445	$res_to_split= $res_to_split2;
	446	} else {
	447	printf "Using greedy-smart blob, length: %d (vs %d)\n", length $smart_blob, length $smart_blob2;
	448	}
	449	}
	450	my ($seed1, $second_level, $length_all_keys)= build_perfect_hash($hash, $res_to_split);
	451	my ($rows, $defines, $tests)= build_array_of_struct($second_level, $smart_blob);
	452	return ($second_level, $seed1, $length_all_keys, $smart_blob, $rows, $defines, $tests);
	453	}
	454
	455	sub make_files {
	456	my ($hash,$base_name)= @_;
	457
	458	my $h_name= $base_name . "_algo.h";
	459	my $c_name= $base_name . "_test.c";
	460	my $p_name= $base_name . "_test.pl";
	461	my $blob_name= $base_name . "_blob";
	462	my $struct_name= $base_name . "_bucket_info";
	463	my $table_name= $base_name . "_table";
	464	my $match_name= $base_name . "_match";
	465	my $prefix= uc($base_name);
	466
	467	my ($second_level, $seed1, $length_all_keys,
	468	$smart_blob, $rows, $defines, $tests)= make_mph_from_hash( $hash );
	469	print_algo( $h_name,
	470	$second_level, $seed1, $length_all_keys, $smart_blob, $rows,
	471	$blob_name, $struct_name, $table_name, $match_name, $prefix );
	472	print_test_binary( $c_name, $h_name, $second_level, $seed1, $length_all_keys,
	473	$smart_blob, $rows, $defines,
	474	$match_name, $prefix );
	475	print_tests( $p_name, $tests );
	476	}
	477
	478	unless (caller) {
	479	my %hash;
	480	{
	481	no warnings;
048bdb72	482	do "../perl/lib/unicore/UCD.pl";
1a4d3814 YO	483	%hash= %utf8::loose_to_file_of;
	484	}
	485	if ($ENV{MERGE_KEYS}) {
	486	my @keys= keys %hash;
	487	foreach my $loose (keys %utf8::loose_property_name_of) {
	488	my $to= $utf8::loose_property_name_of{$loose};
	489	next if $to eq $loose;
	490	foreach my $key (@keys) {
	491	my $copy= $key;
	492	if ($copy=~s/^\Q$to\E(=\|\z)/$loose$1/) {
	493	#print "$key => $copy\n";
	494	$hash{$copy}= $key;
	495	}
	496	}
	497	}
	498	}
	499	foreach my $key (keys %hash) {
	500	my $munged= uc($key);
	501	$munged=~s/\W/__/g;
	502	$hash{$key} = $munged;
	503	}
	504
	505	my $name= shift @ARGV;
	506	$name \|\|= "mph";
	507	make_files(\%hash,$name);
	508	}
	509
	510	1;
	511	__END__