t/re/anyof.t: Fix highest range tests

[perl5.git] / regen / mph.pl
diff --git a/regen/mph.pl b/regen/mph.pl

index 464ef38..00a36ec 100644 (file)
--- a/regen/mph.pl
+++ b/regen/mph.pl
@@ -4,14 +4,36 @@ use warnings;
  use Data::Dumper;
  use Carp;
  use Text::Wrap;
+use bignum;     # Otherwise fails on 32-bit systems
  
  my $DEBUG= 0;
  my $RSHIFT= 8;
  my $FNV_CONST= 16777619;
  
+# The basic idea is that you have a two level structure, and effectively
+# hash the key twice.
+#
+# The first hash finds a bucket in the array which contains a seed which
+# is used for the second hash, which then leads to a bucket with key
+# data which is compared against to determine if the key is a match.
+#
+# If the first hash finds no seed, then the key cannot match.
+#
+# In our case we cheat a bit, and hash the key only once, but use the
+# low bits for the first lookup and the high-bits for the second.
+#
+# So for instance:
+#
+#           h= (h >> RSHIFT) ^ s;
+#
+# is how the second hash is computed. We right shift the original hash
+# value  and then xor in the seed2, which will be non-zero.
+#
+# That then gives us the bucket which contains the key data we need to
+# match for a valid key.
+
  sub _fnv {
      my ($key, $seed)= @_;
-
      my $hash = 0+$seed;
      foreach my $char (split //, $key) {
          $hash = $hash ^ ord($char);
@@ -204,7 +226,7 @@ sub blob_as_code {
      $blob_name ||= "mph_blob";
  
      # output the blob as C code.
-    my @code= (sprintf "const unsigned char %s[] =\n",$blob_name);
+    my @code= (sprintf "STATIC const unsigned char %s[] =\n",$blob_name);
      my $blob_len= length $blob;
      while (length($blob)) {
          push @code, sprintf qq(    "%s"), substr($blob,0,65,"");
@@ -251,22 +273,22 @@ sub build_array_of_struct {
              index($blob,$row->{prefix}//0),
              index($blob,$row->{suffix}//0),
          );
-        $_ > 0xFFFF and die "panic: value exceeds range of uint16_t"
+        $_ > 0xFFFF and die "panic: value exceeds range of U16"
              for @u16;
          my @u8= (
              length($row->{prefix}),
              length($row->{suffix}),
          );
-        $_ > 0xFF and die "panic: value exceeds range of uint8_t"
+        $_ > 0xFF and die "panic: value exceeds range of U8"
              for @u8;
-        push @rows, sprintf("  { %5d, %5d, %5d, %3d, %3d, %s }",
-            @u16, @u8, $row->{value} );
+        push @rows, sprintf("  { %5d, %5d, %5d, %3d, %3d, %s }   /* %s%s */",
+            @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix});
      }
      return \@rows,\%defines,\%tests;
  }
  
-sub print_algo {
-    my ($ofh, $second_level, $seed1, $length_all_keys, $smart_blob, $rows,
+sub make_algo {
+    my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows,
          $blob_name, $struct_name, $table_name, $match_name, $prefix) = @_;
  
      $blob_name ||= "mph_blob";
@@ -274,55 +296,49 @@ sub print_algo {
      $table_name ||= "mph_table";
      $prefix ||= "MPH";
  
-    if (!ref $ofh) {
-        my $file= $ofh;
-        undef $ofh;
-        open $ofh, ">", $file
-            or die "Failed to open '$file': $!";
-    }
-
      my $n= 0+@$second_level;
      my $data_size= 0+@$second_level * 8 + length $smart_blob;
  
-    print $ofh "/*\n";
-    printf $ofh "rows: %s\n", $n;
-    printf $ofh "seed: %s\n", $seed1;
-    printf $ofh "full length of keys: %d\n", $length_all_keys;
-    printf $ofh "blob length: %d\n", length $smart_blob;
-    printf $ofh "ref length: %d\n", 0+@$second_level * 8;
-    printf $ofh "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100;
-    print $ofh "*/\n\n";
+    my @code = "#define ${prefix}_VALt I16\n\n";
+    push @code, "/*\n";
+    push @code, sprintf "rows: %s\n", $n;
+    push @code, sprintf "seed: %s\n", $seed1;
+    push @code, sprintf "full length of keys: %d\n", $length_all_keys;
+    push @code, sprintf "blob length: %d\n", length $smart_blob;
+    push @code, sprintf "ref length: %d\n", 0+@$second_level * 8;
+    push @code, sprintf "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100;
+    push @code, "*/\n\n";
  
-    print $ofh blob_as_code($smart_blob, $blob_name);
-    print $ofh <<"EOF_CODE";
+    push @code, blob_as_code($smart_blob, $blob_name);
+    push @code, <<"EOF_CODE";
  
  struct $struct_name {
-    uint16_t seed2;
-    uint16_t pfx;
-    uint16_t sfx;
-    uint8_t  pfx_len;
-    uint8_t  sfx_len;
+    U16 seed2;
+    U16 pfx;
+    U16 sfx;
+    U8  pfx_len;
+    U8  sfx_len;
      ${prefix}_VALt value;
  };
  
  EOF_CODE
  
-    print $ofh "#define ${prefix}_RSHIFT $RSHIFT\n";
-    print $ofh "#define ${prefix}_BUCKETS $n\n\n";
-    printf $ofh "const uint32_t ${prefix}_SEED1 = 0x%08x;\n", $seed1;
-    printf $ofh "const uint32_t ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST;
+    push @code, "#define ${prefix}_RSHIFT $RSHIFT\n";
+    push @code, "#define ${prefix}_BUCKETS $n\n\n";
+    push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1;
+    push @code, sprintf "STATIC const U32 ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST;
  
-    print $ofh "\n";
-    print $ofh "const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n";
-    print $ofh <<"EOF_CODE";
-${prefix}_VALt $match_name( const unsigned char * const key, const uint16_t key_len ) {
+    push @code, "/* The comments give the input key for the row it is in */\n";
+    push @code, "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n";
+    push @code, <<"EOF_CODE";
+${prefix}_VALt $match_name( const unsigned char * const key, const U16 key_len ) {
      const unsigned char * ptr= key;
      const unsigned char * ptr_end= key + key_len;
-    uint32_t h= ${prefix}_SEED1;
-    uint32_t s;
-    uint32_t n;
+    U32 h= ${prefix}_SEED1;
+    U32 s;
+    U32 n;
      do {
-        h ^= *ptr;
+        h ^= NATIVE_TO_LATIN1(*ptr);    /* table collated in Latin1 */
          h *= ${prefix}_FNV_CONST;
      } while ( ++ptr < ptr_end );
      n= h % ${prefix}_BUCKETS;
@@ -342,12 +358,30 @@ ${prefix}_VALt $match_name( const unsigned char * const key, const uint16_t key_
      return 0;
  }
  EOF_CODE
+
+    return join "", @code;
+}
+
+sub print_algo {
+    my ($ofh, $second_level, $seed1, $long_blob, $smart_blob, $rows,
+        $blob_name, $struct_name, $table_name, $match_name ) = @_;
+
+    if (!ref $ofh) {
+        my $file= $ofh;
+        undef $ofh;
+        open $ofh, ">", $file
+            or die "Failed to open '$file': $!";
+    }
+
+    my $code = make_algo(
+        $second_level, $seed1, $long_blob, $smart_blob, $rows,
+        $blob_name, $struct_name, $table_name, $match_name );
+    print $ofh $code;
  }
  
  sub print_main {
      my ($ofh,$h_file,$match_name,$prefix)=@_;
      print $ofh <<"EOF_CODE";
-#define ${prefix}_VALt int16_t
  #include "$h_file"
  
  int main(int argc, char *argv[]){
@@ -402,8 +436,6 @@ sub make_mph_from_hash {
      my $hash= shift;
  
      # we do this twice because often we can find longer prefixes on the second pass.
-    my @keys= sort {length($b) <=> length($a) || $a cmp $b } keys %$hash;
-
      my ($smart_blob, $res_to_split)= build_split_words($hash,0);
      {
          my ($smart_blob2, $res_to_split2)= build_split_words($hash,1);
@@ -447,7 +479,7 @@ unless (caller) {
      my %hash;
      {
          no warnings;
-        do "../perl/lib/unicore/Heavy.pl";
+        do "../perl/lib/unicore/UCD.pl";
          %hash= %utf8::loose_to_file_of;
      }
      if ($ENV{MERGE_KEYS}) {