This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
add a "hash quality score" to Hash::Util::bucket_stats()
[perl5.git] / ext / Hash-Util / lib / Hash / Util.pm
CommitLineData
96c33d98
YO
1package Hash::Util;
2
3require 5.007003;
4use strict;
5use Carp;
6use warnings;
1e6ffe56 7no warnings 'uninitialized';
96c33d98
YO
8use warnings::register;
9use Scalar::Util qw(reftype);
10
11require Exporter;
12our @ISA = qw(Exporter);
13our @EXPORT_OK = qw(
1e73acc8
AS
14 fieldhash fieldhashes
15
96c33d98
YO
16 all_keys
17 lock_keys unlock_keys
18 lock_value unlock_value
19 lock_hash unlock_hash
08579537
JK
20 lock_keys_plus
21 hash_locked hash_unlocked
22 hashref_locked hashref_unlocked
96c33d98
YO
23 hidden_keys legal_keys
24
25 lock_ref_keys unlock_ref_keys
26 lock_ref_value unlock_ref_value
27 lock_hashref unlock_hashref
08579537 28 lock_ref_keys_plus
96c33d98
YO
29 hidden_ref_keys legal_ref_keys
30
77a92142 31 hash_seed hash_value hv_store
32 bucket_stats bucket_info bucket_array
5114d263 33 lock_hash_recurse unlock_hash_recurse
96c33d98 34 );
a740dcb9 35our $VERSION = '0.15';
b58f046e
NC
36require XSLoader;
37XSLoader::load();
96c33d98 38
1e73acc8
AS
39sub import {
40 my $class = shift;
41 if ( grep /fieldhash/, @_ ) {
42 require Hash::Util::FieldHash;
43 Hash::Util::FieldHash->import(':all'); # for re-export
44 }
45 unshift @_, $class;
46 goto &Exporter::import;
47}
48
96c33d98
YO
49
50=head1 NAME
51
52Hash::Util - A selection of general-utility hash subroutines
53
54=head1 SYNOPSIS
55
1e73acc8
AS
56 # Restricted hashes
57
96c33d98 58 use Hash::Util qw(
08579537
JK
59 fieldhash fieldhashes
60
61 all_keys
96c33d98
YO
62 lock_keys unlock_keys
63 lock_value unlock_value
64 lock_hash unlock_hash
08579537
JK
65 lock_keys_plus
66 hash_locked hash_unlocked
67 hashref_locked hashref_unlocked
96c33d98 68 hidden_keys legal_keys
08579537
JK
69
70 lock_ref_keys unlock_ref_keys
71 lock_ref_value unlock_ref_value
72 lock_hashref unlock_hashref
73 lock_ref_keys_plus
74 hidden_ref_keys legal_ref_keys
75
77a92142 76 hash_seed hash_value hv_store
77 bucket_stats bucket_info bucket_array
5114d263 78 lock_hash_recurse unlock_hash_recurse
96c33d98
YO
79 );
80
81 %hash = (foo => 42, bar => 23);
82 # Ways to restrict a hash
83 lock_keys(%hash);
84 lock_keys(%hash, @keyset);
85 lock_keys_plus(%hash, @additional_keys);
86
72712bfd
RGS
87 # Ways to inspect the properties of a restricted hash
88 my @legal = legal_keys(%hash);
89 my @hidden = hidden_keys(%hash);
90 my $ref = all_keys(%hash,@keys,@hidden);
91 my $is_locked = hash_locked(%hash);
96c33d98 92
72712bfd 93 # Remove restrictions on the hash
96c33d98
YO
94 unlock_keys(%hash);
95
72712bfd 96 # Lock individual values in a hash
96c33d98
YO
97 lock_value (%hash, 'foo');
98 unlock_value(%hash, 'foo');
99
72712bfd 100 # Ways to change the restrictions on both keys and values
96c33d98
YO
101 lock_hash (%hash);
102 unlock_hash(%hash);
103
104 my $hashes_are_randomised = hash_seed() != 0;
105
77a92142 106 my $int_hash_value = hash_value( 'string' );
107
96c33d98
YO
108=head1 DESCRIPTION
109
d74d639b
AS
110C<Hash::Util> and C<Hash::Util::FieldHash> contain special functions
111for manipulating hashes that don't really warrant a keyword.
1e73acc8 112
b299c47c
RGS
113C<Hash::Util> contains a set of functions that support
114L<restricted hashes|/"Restricted hashes">. These are described in
d74d639b
AS
115this document. C<Hash::Util::FieldHash> contains an (unrelated)
116set of functions that support the use of hashes in
117I<inside-out classes>, described in L<Hash::Util::FieldHash>.
1e73acc8 118
d74d639b 119By default C<Hash::Util> does not export anything.
1e73acc8 120
96c33d98
YO
121=head2 Restricted hashes
122
1235.8.0 introduces the ability to restrict a hash to a certain set of
124keys. No keys outside of this set can be added. It also introduces
125the ability to lock an individual key so it cannot be deleted and the
126ability to ensure that an individual value cannot be changed.
127
128This is intended to largely replace the deprecated pseudo-hashes.
129
130=over 4
131
132=item B<lock_keys>
133
134=item B<unlock_keys>
135
136 lock_keys(%hash);
137 lock_keys(%hash, @keys);
138
139Restricts the given %hash's set of keys to @keys. If @keys is not
140given it restricts it to its current keyset. No more keys can be
141added. delete() and exists() will still work, but will not alter
142the set of allowed keys. B<Note>: the current implementation prevents
143the hash from being bless()ed while it is in a locked state. Any attempt
144to do so will raise an exception. Of course you can still bless()
145the hash before you call lock_keys() so this shouldn't be a problem.
146
147 unlock_keys(%hash);
148
149Removes the restriction on the %hash's keyset.
150
5114d263
JK
151B<Note> that if any of the values of the hash have been locked they will not
152be unlocked after this sub executes.
96c33d98
YO
153
154Both routines return a reference to the hash operated on.
155
156=cut
157
158sub lock_ref_keys {
159 my($hash, @keys) = @_;
160
161 Internals::hv_clear_placeholders %$hash;
162 if( @keys ) {
163 my %keys = map { ($_ => 1) } @keys;
164 my %original_keys = map { ($_ => 1) } keys %$hash;
165 foreach my $k (keys %original_keys) {
166 croak "Hash has key '$k' which is not in the new key set"
167 unless $keys{$k};
168 }
169
170 foreach my $k (@keys) {
171 $hash->{$k} = undef unless exists $hash->{$k};
172 }
173 Internals::SvREADONLY %$hash, 1;
174
175 foreach my $k (@keys) {
176 delete $hash->{$k} unless $original_keys{$k};
177 }
178 }
179 else {
180 Internals::SvREADONLY %$hash, 1;
181 }
182
183 return $hash;
184}
185
186sub unlock_ref_keys {
187 my $hash = shift;
188
189 Internals::SvREADONLY %$hash, 0;
190 return $hash;
191}
192
193sub lock_keys (\%;@) { lock_ref_keys(@_) }
194sub unlock_keys (\%) { unlock_ref_keys(@_) }
195
196=item B<lock_keys_plus>
197
198 lock_keys_plus(%hash,@additional_keys)
199
200Similar to C<lock_keys()>, with the difference being that the optional key list
201specifies keys that may or may not be already in the hash. Essentially this is
202an easier way to say
203
204 lock_keys(%hash,@additional_keys,keys %hash);
205
206Returns a reference to %hash
207
208=cut
209
210
211sub lock_ref_keys_plus {
77a92142 212 my ($hash,@keys) = @_;
96c33d98
YO
213 my @delete;
214 Internals::hv_clear_placeholders(%$hash);
215 foreach my $key (@keys) {
216 unless (exists($hash->{$key})) {
217 $hash->{$key}=undef;
218 push @delete,$key;
219 }
220 }
221 Internals::SvREADONLY(%$hash,1);
222 delete @{$hash}{@delete};
223 return $hash
224}
225
226sub lock_keys_plus(\%;@) { lock_ref_keys_plus(@_) }
227
228
229=item B<lock_value>
230
231=item B<unlock_value>
232
233 lock_value (%hash, $key);
234 unlock_value(%hash, $key);
235
236Locks and unlocks the value for an individual key of a hash. The value of a
237locked key cannot be changed.
238
239Unless %hash has already been locked the key/value could be deleted
240regardless of this setting.
241
242Returns a reference to the %hash.
243
244=cut
245
246sub lock_ref_value {
247 my($hash, $key) = @_;
248 # I'm doubtful about this warning, as it seems not to be true.
249 # Marking a value in the hash as RO is useful, regardless
250 # of the status of the hash itself.
251 carp "Cannot usefully lock values in an unlocked hash"
252 if !Internals::SvREADONLY(%$hash) && warnings::enabled;
253 Internals::SvREADONLY $hash->{$key}, 1;
254 return $hash
255}
256
257sub unlock_ref_value {
258 my($hash, $key) = @_;
259 Internals::SvREADONLY $hash->{$key}, 0;
260 return $hash
261}
262
263sub lock_value (\%$) { lock_ref_value(@_) }
264sub unlock_value (\%$) { unlock_ref_value(@_) }
265
266
267=item B<lock_hash>
268
269=item B<unlock_hash>
270
271 lock_hash(%hash);
272
72712bfd 273lock_hash() locks an entire hash, making all keys and values read-only.
96c33d98
YO
274No value can be changed, no keys can be added or deleted.
275
276 unlock_hash(%hash);
277
278unlock_hash() does the opposite of lock_hash(). All keys and values
279are made writable. All values can be changed and keys can be added
280and deleted.
281
282Returns a reference to the %hash.
283
284=cut
285
286sub lock_hashref {
287 my $hash = shift;
288
289 lock_ref_keys($hash);
290
291 foreach my $value (values %$hash) {
292 Internals::SvREADONLY($value,1);
293 }
294
295 return $hash;
296}
297
298sub unlock_hashref {
299 my $hash = shift;
300
301 foreach my $value (values %$hash) {
302 Internals::SvREADONLY($value, 0);
303 }
304
305 unlock_ref_keys($hash);
306
307 return $hash;
308}
309
310sub lock_hash (\%) { lock_hashref(@_) }
311sub unlock_hash (\%) { unlock_hashref(@_) }
312
313=item B<lock_hash_recurse>
314
315=item B<unlock_hash_recurse>
316
317 lock_hash_recurse(%hash);
318
319lock_hash() locks an entire hash and any hashes it references recursively,
72712bfd 320making all keys and values read-only. No value can be changed, no keys can
96c33d98
YO
321be added or deleted.
322
5114d263
JK
323This method B<only> recurses into hashes that are referenced by another hash.
324Thus a Hash of Hashes (HoH) will all be restricted, but a Hash of Arrays of
325Hashes (HoAoH) will only have the top hash restricted.
96c33d98
YO
326
327 unlock_hash_recurse(%hash);
328
329unlock_hash_recurse() does the opposite of lock_hash_recurse(). All keys and
330values are made writable. All values can be changed and keys can be added
331and deleted. Identical recursion restrictions apply as to lock_hash_recurse().
332
333Returns a reference to the %hash.
334
335=cut
336
337sub lock_hashref_recurse {
338 my $hash = shift;
339
340 lock_ref_keys($hash);
341 foreach my $value (values %$hash) {
2793cb42
CO
342 my $type = reftype($value);
343 if (defined($type) and $type eq 'HASH') {
96c33d98
YO
344 lock_hashref_recurse($value);
345 }
346 Internals::SvREADONLY($value,1);
347 }
348 return $hash
349}
350
351sub unlock_hashref_recurse {
352 my $hash = shift;
353
354 foreach my $value (values %$hash) {
2793cb42
CO
355 my $type = reftype($value);
356 if (defined($type) and $type eq 'HASH') {
96c33d98
YO
357 unlock_hashref_recurse($value);
358 }
359 Internals::SvREADONLY($value,1);
360 }
361 unlock_ref_keys($hash);
362 return $hash;
363}
364
365sub lock_hash_recurse (\%) { lock_hashref_recurse(@_) }
366sub unlock_hash_recurse (\%) { unlock_hashref_recurse(@_) }
367
5114d263
JK
368=item B<hashref_locked>
369
08579537
JK
370=item B<hash_locked>
371
5114d263 372 hashref_locked(\%hash) and print "Hash is locked!\n";
08579537
JK
373 hash_locked(%hash) and print "Hash is locked!\n";
374
375Returns true if the hash and its keys are locked.
376
377=cut
378
379sub hashref_locked {
380 my $hash=shift;
5114d263 381 Internals::SvREADONLY(%$hash);
08579537
JK
382}
383
384sub hash_locked(\%) { hashref_locked(@_) }
96c33d98 385
5114d263
JK
386=item B<hashref_unlocked>
387
96c33d98
YO
388=item B<hash_unlocked>
389
5114d263 390 hashref_unlocked(\%hash) and print "Hash is unlocked!\n";
96c33d98
YO
391 hash_unlocked(%hash) and print "Hash is unlocked!\n";
392
393Returns true if the hash and its keys are unlocked.
394
395=cut
396
397sub hashref_unlocked {
398 my $hash=shift;
5114d263 399 !Internals::SvREADONLY(%$hash);
96c33d98
YO
400}
401
402sub hash_unlocked(\%) { hashref_unlocked(@_) }
403
404=for demerphqs_editor
405sub legal_ref_keys{}
406sub hidden_ref_keys{}
407sub all_keys{}
408
409=cut
410
411sub legal_keys(\%) { legal_ref_keys(@_) }
412sub hidden_keys(\%){ hidden_ref_keys(@_) }
413
3a6a06e3 414=item B<legal_keys>
96c33d98 415
72712bfd 416 my @keys = legal_keys(%hash);
96c33d98 417
72712bfd 418Returns the list of the keys that are legal in a restricted hash.
96c33d98
YO
419In the case of an unrestricted hash this is identical to calling
420keys(%hash).
421
422=item B<hidden_keys>
423
72712bfd 424 my @keys = hidden_keys(%hash);
96c33d98 425
72712bfd 426Returns the list of the keys that are legal in a restricted hash but
96c33d98
YO
427do not have a value associated to them. Thus if 'foo' is a
428"hidden" key of the %hash it will return false for both C<defined>
429and C<exists> tests.
430
431In the case of an unrestricted hash this will return an empty list.
432
433B<NOTE> this is an experimental feature that is heavily dependent
434on the current implementation of restricted hashes. Should the
72712bfd 435implementation change, this routine may become meaningless, in which
96c33d98
YO
436case it will return an empty list.
437
438=item B<all_keys>
439
440 all_keys(%hash,@keys,@hidden);
441
442Populates the arrays @keys with the all the keys that would pass
443an C<exists> tests, and populates @hidden with the remaining legal
444keys that have not been utilized.
445
446Returns a reference to the hash.
447
72712bfd 448In the case of an unrestricted hash this will be equivalent to
96c33d98 449
72712bfd
RGS
450 $ref = do {
451 @keys = keys %hash;
452 @hidden = ();
453 \%hash
454 };
96c33d98
YO
455
456B<NOTE> this is an experimental feature that is heavily dependent
457on the current implementation of restricted hashes. Should the
458implementation change this routine may become meaningless in which
459case it will behave identically to how it would behave on an
72712bfd 460unrestricted hash.
96c33d98
YO
461
462=item B<hash_seed>
463
464 my $hash_seed = hash_seed();
465
7dc86639 466hash_seed() returns the seed bytes used to randomise hash ordering.
96c33d98
YO
467
468B<Note that the hash seed is sensitive information>: by knowing it one
469can craft a denial-of-service attack against Perl code, even remotely,
470see L<perlsec/"Algorithmic Complexity Attacks"> for more information.
471B<Do not disclose the hash seed> to people who don't need to know it.
472See also L<perlrun/PERL_HASH_SEED_DEBUG>.
473
7dc86639
YO
474Prior to Perl 5.17.6 this function returned a UV, it now returns a string,
475which may be of nearly any size as determined by the hash function your
476Perl has been built with. Possible sizes may be but are not limited to
4774 bytes (for most hash algorithms) and 16 bytes (for siphash).
478
479=item B<hash_value>
480
481 my $hash_value = hash_value($string);
482
77a92142 483hash_value() returns the current perl's internal hash value for a given
7dc86639
YO
484string.
485
486Returns a 32 bit integer representing the hash value of the string passed
487in. This value is only reliable for the lifetime of the process. It may
488be different depending on invocation, environment variables, perl version,
489architectures, and build options.
490
491B<Note that the hash value of a given string is sensitive information>:
492by knowing it one can deduce the hash seed which in turn can allow one to
493craft a denial-of-service attack against Perl code, even remotely,
494see L<perlsec/"Algorithmic Complexity Attacks"> for more information.
495B<Do not disclose the hash value of a string> to people who don't need to
496know it. See also L<perlrun/PERL_HASH_SEED_DEBUG>.
497
498=item B<bucket_info>
499
500Return a set of basic information about a hash.
501
9fc1b446 502 my ($keys, $buckets, $used, @length_counts)= bucket_info($hash);
7dc86639
YO
503
504Fields are as follows:
505
506 0: Number of keys in the hash
507 1: Number of buckets in the hash
508 2: Number of used buckets in the hash
509 rest : list of counts, Kth element is the number of buckets
510 with K keys in it.
511
512See also bucket_stats() and bucket_array().
513
514=item B<bucket_stats>
515
516Returns a list of statistics about a hash.
517
518 my ($keys, buckets, $used, $utilization_ratio, $collision_pct,
77a92142 519 $mean, $stddev, @length_counts) = bucket_info($hashref);
7dc86639
YO
520
521
522Fields are as follows:
523
524
525 0: Number of keys in the hash
526 1: Number of buckets in the hash
527 2: Number of used buckets in the hash
a740dcb9
YO
528 3: Hash Quality Score
529 4: Percent of buckets used
530 5: Percent of keys which are in collision
531 6: Average bucket length
532 7: Standard Deviation of bucket lengths.
7dc86639
YO
533 rest : list of counts, Kth element is the number of buckets
534 with K keys in it.
535
536See also bucket_info() and bucket_array().
537
a740dcb9
YO
538Note that Hash Quality Score would be 1 for an ideal hash, numbers
539close to and below 1 indicate good hashing, and number significantly
540above indicate a poor score. In practice it should be around 0.95 to 1.05.
541It is defined as:
542
543 $score= sum( $count[$length] * ($length * ($length + 1) / 2) )
544 /
545 ( ( $keys / 2 * $buckets ) *
546 ( $keys + ( 2 * $buckets ) - 1 ) )
547
548The formula is from the Red Dragon book (reformulated to use the data available)
549and is documented at L<http://www.strchr.com/hash_functions>
550
7dc86639
YO
551=item B<bucket_array>
552
553 my $array= bucket_array(\%hash);
554
555Returns a packed representation of the bucket array associated with a hash. Each element
556of the array is either an integer K, in which case it represents K empty buckets, or
557a reference to another array which contains the keys that are in that bucket.
558
559B<Note that the information returned by bucket_array is sensitive information>:
77a92142 560by knowing it one can directly attack perl's hash function which in turn may allow
7dc86639
YO
561one to craft a denial-of-service attack against Perl code, even remotely,
562see L<perlsec/"Algorithmic Complexity Attacks"> for more information.
77a92142 563B<Do not disclose the output of this function> to people who don't need to
7dc86639
YO
564know it. See also L<perlrun/PERL_HASH_SEED_DEBUG>. This function is provided strictly
565for debugging and diagnostics purposes only, it is hard to imagine a reason why it
566would be used in production code.
567
96c33d98
YO
568=cut
569
7dc86639
YO
570
571sub bucket_stats {
77a92142 572 my ($hash) = @_;
573 my ($keys, $buckets, $used, @length_counts) = bucket_info($hash);
7dc86639 574 my $sum;
a740dcb9
YO
575 my $score;
576 for (0 .. $#length_counts) {
577 $sum += ($length_counts[$_] * $_);
578 $score += $length_counts[$_] * ( $_ * ($_ + 1 ) / 2 );
579 }
580 $score = $score /
581 (( $keys / (2 * $buckets )) * ( $keys + ( 2 * $buckets ) - 1 ))
582 if $keys;
7dc86639
YO
583 my $mean= $sum/$buckets;
584 $sum= 0;
585 $sum += ($length_counts[$_] * (($_-$mean)**2)) for 0 .. $#length_counts;
586
587 my $stddev= sqrt($sum/$buckets);
a740dcb9 588 return $keys, $buckets, $used, $keys ? ($score, $used/$buckets, ($keys-$used)/$keys, $mean, $stddev, @length_counts) : ();
96c33d98
YO
589}
590
591=item B<hv_store>
592
72712bfd 593 my $sv = 0;
96c33d98 594 hv_store(%hash,$key,$sv) or die "Failed to alias!";
72712bfd 595 $hash{$key} = 1;
96c33d98
YO
596 print $sv; # prints 1
597
598Stores an alias to a variable in a hash instead of copying the value.
599
600=back
601
602=head2 Operating on references to hashes.
603
72712bfd 604Most subroutines documented in this module have equivalent versions
96c33d98
YO
605that operate on references to hashes instead of native hashes.
606The following is a list of these subs. They are identical except
607in name and in that instead of taking a %hash they take a $hashref,
608and additionally are not prototyped.
609
610=over 4
611
612=item lock_ref_keys
613
614=item unlock_ref_keys
615
616=item lock_ref_keys_plus
617
618=item lock_ref_value
619
620=item unlock_ref_value
621
622=item lock_hashref
623
624=item unlock_hashref
625
626=item lock_hashref_recurse
627
628=item unlock_hashref_recurse
629
630=item hash_ref_unlocked
631
632=item legal_ref_keys
633
634=item hidden_ref_keys
635
636=back
637
638=head1 CAVEATS
639
640Note that the trapping of the restricted operations is not atomic:
641for example
642
643 eval { %hash = (illegal_key => 1) }
644
645leaves the C<%hash> empty rather than with its original contents.
646
647=head1 BUGS
648
649The interface exposed by this module is very close to the current
72712bfd 650implementation of restricted hashes. Over time it is expected that
96c33d98
YO
651this behavior will be extended and the interface abstracted further.
652
653=head1 AUTHOR
654
655Michael G Schwern <schwern@pobox.com> on top of code by Nick
656Ing-Simmons and Jeffrey Friedl.
657
658hv_store() is from Array::RefElem, Copyright 2000 Gisle Aas.
659
660Additional code by Yves Orton.
661
662=head1 SEE ALSO
663
d74d639b
AS
664L<Scalar::Util>, L<List::Util> and L<perlsec/"Algorithmic Complexity Attacks">.
665
666L<Hash::Util::FieldHash>.
96c33d98
YO
667
668=cut
669
6701;