Commit | Line | Data |
---|---|---|
a0cb3900 JH |
1 | # -*- mode: perl; perl-indent-level: 2; -*- |
2 | # Memoize.pm | |
3 | # | |
4 | # Transparent memoization of idempotent functions | |
5 | # | |
899dc88a | 6 | # Copyright 1998, 1999, 2000, 2001 M-J. Dominus. |
a0cb3900 JH |
7 | # You may copy and distribute this program under the |
8 | # same terms as Perl itself. If in doubt, | |
9 | # write to mjd-perl-memoize+@plover.com for a license. | |
10 | # | |
484fdf61 | 11 | # Version 1.01 $Revision: 1.18 $ $Date: 2001/06/24 17:16:47 $ |
a0cb3900 JH |
12 | |
13 | package Memoize; | |
484fdf61 | 14 | $VERSION = '1.01'; |
a0cb3900 JH |
15 | |
# Numeric tags for the two calling contexts.  The empty prototypes let
# perl treat these subs as inlinable constants.
sub LIST   () { 1 }
sub SCALAR () { 0 }
19 | ||
20 | ||
21 | # | |
22 | # Usage memoize(functionname/ref, | |
23 | # { NORMALIZER => coderef, INSTALL => name, | |
24 | # LIST_CACHE => descriptor, SCALAR_CACHE => descriptor } | |
25 | # | |
26 | ||
27 | use Carp; | |
28 | use Exporter; | |
29 | use vars qw($DEBUG); | |
899dc88a | 30 | use Config; # Dammit. |
a0cb3900 JH |
31 | @ISA = qw(Exporter); |
32 | @EXPORT = qw(memoize); | |
33 | @EXPORT_OK = qw(unmemoize flush_cache); | |
34 | use strict; | |
35 | ||
36 | my %memotable; | |
37 | my %revmemotable; | |
38 | my @CONTEXT_TAGS = qw(MERGE TIE MEMORY FAULT HASH); | |
39 | my %IS_CACHE_TAG = map {($_ => 1)} @CONTEXT_TAGS; | |
40 | ||
41 | # Raise an error if the user tries to specify one of these packages as a | |
42 | # tie for LIST_CACHE | |
43 | ||
44 | my %scalar_only = map {($_ => 1)} qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File); | |
45 | ||
# memoize(FUNCTION, OPTIONS...)
#
# Build a caching wrapper around FUNCTION and, unless told otherwise,
# install it under the function's name.
#
#   FUNCTION - a code reference, or a function name.  Unqualified names
#              are resolved in the package memoize() was called from.
#   OPTIONS  - a flat key/value list:
#                NORMALIZER   => code ref or function name used to turn
#                                the argument list into a cache key
#                INSTALL      => name to install the wrapper under;
#                                undef means "do not install at all"
#                SCALAR_CACHE => cache descriptor: a tag string or
#                LIST_CACHE      [tag, args...] (see @CONTEXT_TAGS)
#
# Returns a code reference to the memoized wrapper.  Croaks on a bad
# FUNCTION argument, an unrecognized cache option, a scalar-only tied
# hash used as LIST_CACHE, or if the wrapper cannot be compiled.
sub memoize {
  my $fn = shift;
  my %options = @_;
  my $options = \%options;

  unless (defined($fn) &&
          (ref $fn eq 'CODE' || ref $fn eq '')) {
    croak "Usage: memoize 'functionname'|coderef {OPTIONS}";
  }

  my $uppack = caller;          # Package memoize() was called from
  my $name = (ref $fn ? undef : $fn);

  # Convert function names to code references
  my $cref = _make_cref($fn, $uppack);  # Code reference to original function

  # Locate function prototype, if any
  my $proto = prototype $cref;
  $proto = defined $proto ? "($proto)" : "";

  # I would like to get rid of the eval, but there seems not to be any
  # other way to set the prototype properly.  The switch here for
  # 'usethreads' works around a bug in threadperl having to do with
  # magic goto.  It would be better to fix the bug and use the magic
  # goto version everywhere.
  #
  # The generated code closes over the lexical $cref, so the variable
  # name inside these strings must match the one declared above.
  my $wrapper_src = $Config{usethreads}
    ? "sub $proto { &_memoizer(\$cref, \@_); }"
    : "sub $proto { unshift \@_, \$cref; goto &_memoizer; }";
  my $wrapper = eval $wrapper_src;
  unless (ref $wrapper eq 'CODE') {
    # Without this check a compile failure would silently install and
    # register an undefined value instead of a wrapper.
    croak "Memoize: Couldn't build memoized wrapper: $@; aborting";
  }

  my $normalizer = $options{NORMALIZER};
  if (defined $normalizer && ! ref $normalizer) {
    $normalizer = _make_cref($normalizer, $uppack);
  }

  my $install_name;
  if (defined $options->{INSTALL}) {
    # INSTALL => name
    $install_name = $options->{INSTALL};
  } elsif (! exists $options->{INSTALL}) {
    # No INSTALL option provided; use original name if possible
    $install_name = $name;
  }
  # Otherwise INSTALL => undef was given, which means don't install.

  if (defined $install_name) {
    $install_name = $uppack . '::' . $install_name
        unless $install_name =~ /::/;
    no strict;
    local($^W) = 0;            # ``Subroutine $install_name redefined at ...''
    *{$install_name} = $wrapper; # Install memoized version
  }

  # Map the wrapper back to the original function.  Both are used in
  # their stringified form, since hash keys are strings.
  $revmemotable{$wrapper} = "" . $cref;

  # These will be the caches
  my %caches;
  for my $context (qw(SCALAR LIST)) {
    # suppress subsequent 'uninitialized value' warnings
    $options{"${context}_CACHE"} ||= '';

    # A descriptor is either a bare tag or [tag, args...].
    my $cache_opt = $options{"${context}_CACHE"};
    my @cache_opt_args;
    if (ref $cache_opt) {
      @cache_opt_args = @$cache_opt;
      $cache_opt = shift @cache_opt_args;
    }
    if ($cache_opt eq 'FAULT') { # no cache
      $caches{$context} = undef;
    } elsif ($cache_opt eq 'HASH') { # user-supplied hash
      my $cache = $cache_opt_args[0];
      my $package = ref(tied %$cache);
      if ($context eq 'LIST' && $scalar_only{$package}) {
        croak("You can't use $package for LIST_CACHE because it can only store scalars");
      }
      $caches{$context} = $cache;
    } elsif ($cache_opt eq '' || $IS_CACHE_TAG{$cache_opt}) {
      # default is that we make up an in-memory hash
      $caches{$context} = {};
      # (this might get tied later, or MERGEd away)
    } else {
      croak "Unrecognized option to `${context}_CACHE': `$cache_opt' should be one of (@CONTEXT_TAGS); aborting";
    }
  }

  # Perhaps I should check here that you didn't supply *both* merge
  # options.  But if you did, it does do something reasonable: They
  # both get merged to the same in-memory hash.
  if ($options{SCALAR_CACHE} eq 'MERGE') {
    $caches{SCALAR} = $caches{LIST};
  } elsif ($options{LIST_CACHE} eq 'MERGE') {
    $caches{LIST} = $caches{SCALAR};
  }

  # Now deal with the TIE options.
  # If the relevant option wasn't `TIE', the call does nothing.
  for my $context (qw(SCALAR LIST)) {
    _my_tie($context, $caches{$context}, $options); # Croaks on failure
  }

  # We should put some more stuff in here eventually.
  # We've been saying that for several versions now.
  # And you know what?  More stuff keeps going in!
  $memotable{$cref} =
  {
    O => $options,  # Short keys here for things we need to access frequently
    N => $normalizer,
    U => $cref,
    MEMOIZED => $wrapper,
    PACKAGE => $uppack,
    NAME => $install_name,
    S => $caches{SCALAR},
    L => $caches{LIST},
  };

  $wrapper                      # Return just memoized version
}
169 | ||
# _my_tie(CONTEXT, HASHREF, OPTIONS)
#
# Handle the deprecated `TIE' cache option: load the named tie class
# and tie the cache hash to it.
#
#   CONTEXT - 'SCALAR' or 'LIST'; selects which "${CONTEXT}_CACHE"
#             entry of OPTIONS is examined
#   HASHREF - the cache hash to be tied
#   OPTIONS - reference to the options hash given to memoize()
#
# Returns nothing when the option is not `TIE', and 1 after a
# successful tie.  Croaks if no module name was given, if the module is
# scalar-only and CONTEXT is LIST, if the module cannot be loaded, or if
# the tie itself fails.
sub _my_tie {
  my ($context, $hash, $options) = @_;
  my $fullopt = $options->{"${context}_CACHE"};

  # memoize() has already rejected unrecognized tags by this point.
  my $shortopt = (ref $fullopt) ? $fullopt->[0] : $fullopt;

  return unless defined $shortopt && $shortopt eq 'TIE';
  carp("TIE option to memoize() is deprecated; use HASH instead")
      if $^W;

  # Descriptor layout is [TIE, module, tie-arguments...].  A bare 'TIE'
  # string carries no module at all.
  my (undef, $module, @args) = (ref $fullopt) ? @$fullopt : ();
  unless (defined $module && length $module) {
    # Previously this fell through with an undefined module name and
    # failed later with a confusing "Couldn't load ... `'" message.
    croak "Memoize: TIE option for `${context}_CACHE' needs the name of a tie module; aborting";
  }
  if ($context eq 'LIST' && $scalar_only{$module}) {
    croak("You can't use $module for LIST_CACHE because it can only store scalars");
  }

  # Turn Some::Module into Some/Module.pm so it can be require'd by path.
  my $modulefile = $module . '.pm';
  $modulefile =~ s{::}{/}g;
  eval { require $modulefile };
  if ($@) {
    croak "Memoize: Couldn't load hash tie module `$module': $@; aborting";
  }
  my $rc = (tie %$hash => $module, @args);
  unless ($rc) {
    croak "Memoize: Couldn't tie hash to `$module': $!; aborting";
  }
  1;
}
200 | ||
# flush_cache(FUNCTION)
#
# Discard everything cached for a memoized function.  FUNCTION is the
# memoized function's name (resolved in the caller's package when
# unqualified) or a reference to the memoized wrapper.
#
# Croaks if FUNCTION is not currently memoized, or if one of its caches
# is a tied hash whose class has no CLEAR method.
sub flush_cache {
  my $func = _make_cref($_[0], scalar caller);

  # Wrapper -> original function -> info record.  Check each step so
  # that an unknown function never indexes %memotable with undef.
  my $orig = $revmemotable{$func};
  my $info = defined $orig ? $memotable{$orig} : undef;
  croak "$func not memoized" unless defined $info;

  for my $slot (qw(S L)) {
    my $cache = $info->{$slot};
    next unless defined $cache; # FAULT context: no cache to flush

    my $tie_obj = tied %$cache;
    if ($tie_obj && ! $tie_obj->can('CLEAR')) {
      my $funcname = defined($info->{NAME}) ?
          "function $info->{NAME}" : "anonymous function $func";
      my $context = {S => 'scalar', L => 'list'}->{$slot};
      croak "Tied cache hash for $context-context $funcname does not support flushing";
    }
    %$cache = ();
  }
}
217 | ||
# _memoizer(ORIGINAL, ARGS...)
#
# This is the function that manages the memo tables.  Every memoized
# wrapper calls it with a reference to the original function prepended
# to the caller's arguments.
#
# It turns ARGS into a cache key (with the user's normalizer if there is
# one, otherwise by joining the arguments with chr(28)), then returns
# the cached result for the current calling context, calling the
# original function and storing its result on a miss.
#
# Storage format: a list-context result is stored as an array reference.
# A scalar-context result is stored as the plain value, unless either
# cache option is `MERGE', in which case it is stored as a one-element
# array reference so that both contexts can share entries.
#
# Croaks (via _crap_out) if there is no cache for the current context.
sub _memoizer {
  my $orig = shift;             # ref to original func; stringizes as hash key
  my $info = $memotable{$orig};
  my $normalizer = $info->{N};

  my $argstr;
  my $context = (wantarray() ? LIST : SCALAR);

  if (defined $normalizer) {
    no strict;
    # The normalizer is called in the same context as the wrapper.
    if ($context == SCALAR) {
      $argstr = &{$normalizer}(@_);
    } elsif ($context == LIST) {
      ($argstr) = &{$normalizer}(@_);
    } else {
      croak "Internal error \#41; context was neither LIST nor SCALAR\n";
    }
  } else {                      # Default normalizer
    local $^W = 0;              # undef arguments are expected here
    $argstr = join chr(28),@_;
  }

  if ($context == SCALAR) {
    my $cache = $info->{S};
    _crap_out($info->{NAME}, 'scalar') unless $cache;

    # Computed only after the cache check, so a wrapper with no info
    # record still fails in _crap_out as before.
    my $merged = $info->{O}{SCALAR_CACHE} eq 'MERGE'
              || $info->{O}{LIST_CACHE}   eq 'MERGE';

    if (exists $cache->{$argstr}) {
      # Merged entries are always array refs; unwrap the first element
      # instead of handing the array ref back to the caller.
      return $merged ? $cache->{$argstr}[0] : $cache->{$argstr};
    }
    my $val = &{$info->{U}}(@_);
    $cache->{$argstr} = $merged ? [$val] : $val;
    return $val;
  } elsif ($context == LIST) {
    my $cache = $info->{L};
    _crap_out($info->{NAME}, 'list') unless $cache;

    if (exists $cache->{$argstr}) {
      # Every entry visible here is an array ref, merged or not.
      return @{$cache->{$argstr}};
    }
    my @result = &{$info->{U}}(@_);
    $cache->{$argstr} = \@result;
    # Return our own copy rather than re-reading a possibly tied cache.
    return @result;
  } else {
    croak "Internal error \#42; context was neither LIST nor SCALAR\n";
  }
}
276 | ||
# unmemoize(FUNCTION)
#
# Undo memoize().  FUNCTION is the memoized function's name (resolved in
# the caller's package when unqualified) or a reference to the memoized
# wrapper.  If the wrapper was installed under a name, that name is
# pointed back at the original function.
#
# Returns a code reference to the original function.  Croaks if
# FUNCTION was not memoized.
sub unmemoize {
  my $f = shift;
  my $uppack = caller;
  my $cref = _make_cref($f, $uppack);

  unless (exists $revmemotable{$cref}) {
    croak "Could not unmemoize function `$f', because it was not memoized to begin with";
  }

  my $orig_key = $revmemotable{$cref};
  my $tabent = defined $orig_key ? $memotable{$orig_key} : undef;
  unless (defined $tabent) {
    croak "Could not figure out how to unmemoize function `$f'";
  }

  my $name = $tabent->{NAME};
  if (defined $name) {
    no strict;
    local($^W) = 0;            # ``Subroutine $install_name redefined at ...''
    *{$name} = $tabent->{U};    # Replace with original function
  }

  # Remove the entries outright.  Merely undef-ing the values would
  # leave the keys behind, so a repeated unmemoize would pass the
  # `exists' test above and fail with the wrong message.
  delete $memotable{$orig_key};
  delete $revmemotable{$cref};

  # Note: tied cache hashes are not explicitly untied here.

  $tabent->{U};
}
314 | ||
# _make_cref(FUNCTION, PACKAGE)
#
# Turn FUNCTION into a code reference.
#
#   FUNCTION - a code reference (returned unchanged) or a function name
#   PACKAGE  - package used to qualify FUNCTION when it is a name that
#              does not already contain `::'
#
# Croaks if the named function does not exist, or if FUNCTION is a
# reference to something other than code.  When $DEBUG is true, a trace
# line is written with warn().
sub _make_cref {
  my $fn = shift;
  my $uppack = shift;
  my $cref;
  my $name;

  if (ref $fn eq 'CODE') {
    $cref = $fn;
  } elsif (! ref $fn) {
    $name = ($fn =~ /::/) ? $fn : $uppack . '::' . $fn;
    no strict;                  # symbolic lookups by name below
    unless (defined(&$name)) {
      croak "Cannot operate on nonexistent function `$fn'";
    }
    # Fetch the code slot of the glob directly.
    $cref = *{$name}{CODE};
  } else {
    my $parent = (caller(1))[3]; # Function that called _make_cref
    croak "Usage: argument 1 to `$parent' must be a function name or reference.\n";
  }

  if ($DEBUG) {
    # $name is only set on the by-name path; avoid interpolating undef.
    my $shown = defined $name ? $name : '';
    warn "${shown}($fn) => $cref in _make_cref\n";
  }
  $cref;
}
342 | ||
# _crap_out(FUNCNAME, CONTEXT)
#
# Abort with croak() because a memoized function was called in a context
# that has no cache.  FUNCNAME is the installed name, or undef for an
# anonymous function; CONTEXT is the context name used in the message.
sub _crap_out {
  my ($funcname, $context) = @_;
  croak "Anonymous function called in forbidden $context context; faulting"
    unless defined $funcname;
  croak "Function `$funcname' called in forbidden $context context; faulting";
}
351 | ||
352 | 1; | |
353 | ||
354 | ||
355 | ||
356 | ||
357 | ||
358 | =head1 NAME | |
359 | ||
5189e6fe | 360 | Memoize - Make functions faster by trading space for time |
a0cb3900 JH |
361 | |
362 | =head1 SYNOPSIS | |
363 | ||
484fdf61 | 364 | # This is the documentation for Memoize 1.01 |
a0cb3900 JH |
365 | use Memoize; |
366 | memoize('slow_function'); | |
367 | slow_function(arguments); # Is faster than it was before | |
368 | ||
369 | ||
370 | This is normally all you need to know. However, many options are available: | |
371 | ||
372 | memoize(function, options...); | |
373 | ||
374 | Options include: | |
375 | ||
376 | NORMALIZER => function | |
377 | INSTALL => new_name | |
378 | ||
379 | SCALAR_CACHE => 'MEMORY' | |
380 | SCALAR_CACHE => ['HASH', \%cache_hash ] | |
381 | SCALAR_CACHE => 'FAULT' | |
382 | SCALAR_CACHE => 'MERGE' | |
383 | ||
384 | LIST_CACHE => 'MEMORY' | |
385 | LIST_CACHE => ['HASH', \%cache_hash ] | |
386 | LIST_CACHE => 'FAULT' | |
387 | LIST_CACHE => 'MERGE' | |
388 | ||
389 | =head1 DESCRIPTION | |
390 | ||
391 | `Memoizing' a function makes it faster by trading space for time. It | |
392 | does this by caching the return values of the function in a table. | |
393 | If you call the function again with the same arguments, C<memoize> | |
3d4a255c | 394 | jumps in and gives you the value out of the table, instead of letting |
a0cb3900 JH |
395 | the function compute the value all over again. |
396 | ||
397 | Here is an extreme example. Consider the Fibonacci sequence, defined | |
398 | by the following function: | |
399 | ||
400 | # Compute Fibonacci numbers | |
401 | sub fib { | |
402 | my $n = shift; | |
403 | return $n if $n < 2; | |
404 | fib($n-1) + fib($n-2); | |
405 | } | |
406 | ||
407 | This function is very slow. Why? To compute fib(14), it first wants | |
408 | to compute fib(13) and fib(12), and add the results. But to compute | |
409 | fib(13), it first has to compute fib(12) and fib(11), and then it | |
410 | comes back and computes fib(12) all over again even though the answer | |
411 | is the same. And both of the times that it wants to compute fib(12), | |
412 | it has to compute fib(11) from scratch, and then it has to do it | |
413 | again each time it wants to compute fib(13). This function does so | |
414 | much recomputing of old results that it takes a really long time to | |
415 | run---fib(14) makes 1,200 extra recursive calls to itself, to compute | |
416 | and recompute things that it already computed. | |
417 | ||
418 | This function is a good candidate for memoization. If you memoize the | |
419 | `fib' function above, it will compute fib(14) exactly once, the first | |
420 | time it needs to, and then save the result in a table. Then if you | |
421 | ask for fib(14) again, it gives you the result out of the table. | |
422 | While computing fib(14), instead of computing fib(12) twice, it does | |
423 | it once; the second time it needs the value it gets it from the table. | |
424 | It doesn't compute fib(11) four times; it computes it once, getting it | |
425 | from the table the next three times. Instead of making 1,200 | |
426 | recursive calls to `fib', it makes 15. This makes the function about | |
427 | 150 times faster. | |
428 | ||
429 | You could do the memoization yourself, by rewriting the function, like | |
430 | this: | |
431 | ||
432 | # Compute Fibonacci numbers, memoized version | |
433 | { my @fib; | |
434 | sub fib { | |
435 | my $n = shift; | |
436 | return $fib[$n] if defined $fib[$n]; | |
437 | return $fib[$n] = $n if $n < 2; | |
438 | $fib[$n] = fib($n-1) + fib($n-2); | |
439 | } | |
440 | } | |
441 | ||
442 | Or you could use this module, like this: | |
443 | ||
444 | use Memoize; | |
445 | memoize('fib'); | |
446 | ||
447 | # Rest of the fib function just like the original version. | |
448 | ||
449 | This makes it easy to turn memoizing on and off. | |
450 | ||
451 | Here's an even simpler example: I wrote a simple ray tracer; the | |
452 | program would look in a certain direction, figure out what it was | |
453 | looking at, and then convert the `color' value (typically a string | |
454 | like `red') of that object to a red, green, and blue pixel value, like | |
455 | this: | |
456 | ||
457 | for ($direction = 0; $direction < 300; $direction++) { | |
458 | # Figure out which object is in direction $direction | |
459 | $color = $object->{color}; | |
460 | ($r, $g, $b) = @{&ColorToRGB($color)}; | |
461 | ... | |
462 | } | |
463 | ||
464 | Since there are relatively few objects in a picture, there are only a | |
465 | few colors, which get looked up over and over again. Memoizing | |
5189e6fe | 466 | C<ColorToRGB> sped up the program by several percent. |
a0cb3900 JH |
467 | |
468 | =head1 DETAILS | |
469 | ||
470 | This module exports exactly one function by default, C<memoize>; C<unmemoize> and C<flush_cache> (described below) can be imported on request. The rest of the | |
471 | functions in this package are None of Your Business. | |
472 | ||
473 | You should say | |
474 | ||
475 | memoize(function) | |
476 | ||
477 | where C<function> is the name of the function you want to memoize, or | |
478 | a reference to it. C<memoize> returns a reference to the new, | |
479 | memoized version of the function, or C<undef> on a non-fatal error. | |
480 | At present, there are no non-fatal errors, but there might be some in | |
481 | the future. | |
482 | ||
483 | If C<function> was the name of a function, then C<memoize> hides the | |
484 | old version and installs the new memoized version under the old name, | |
485 | so that C<&function(...)> actually invokes the memoized version. | |
486 | ||
487 | =head1 OPTIONS | |
488 | ||
489 | There are some optional options you can pass to C<memoize> to change | |
490 | the way it behaves a little. To supply options, invoke C<memoize> | |
491 | like this: | |
492 | ||
493 | memoize(function, NORMALIZER => function, | |
494 | INSTALL => newname, | |
495 | SCALAR_CACHE => option, | |
496 | LIST_CACHE => option | |
497 | ); | |
498 | ||
499 | Each of these options is optional; you can include some, all, or none | |
500 | of them. | |
501 | ||
502 | =head2 INSTALL | |
503 | ||
504 | If you supply a function name with C<INSTALL>, memoize will install | |
505 | the new, memoized version of the function under the name you give. | |
506 | For example, | |
507 | ||
508 | memoize('fib', INSTALL => 'fastfib') | |
509 | ||
510 | installs the memoized version of C<fib> as C<fastfib>; without the | |
511 | C<INSTALL> option it would have replaced the old C<fib> with the | |
512 | memoized version. | |
513 | ||
514 | To prevent C<memoize> from installing the memoized version anywhere, use | |
515 | C<INSTALL =E<gt> undef>. | |
516 | ||
517 | =head2 NORMALIZER | |
518 | ||
519 | Suppose your function looks like this: | |
520 | ||
521 | # Typical call: f('aha!', A => 11, B => 12); | |
522 | sub f { | |
523 | my $a = shift; | |
524 | my %hash = @_; | |
525 | $hash{B} ||= 2; # B defaults to 2 | |
526 | $hash{C} ||= 7; # C defaults to 7 | |
527 | ||
528 | # Do something with $a, %hash | |
529 | } | |
530 | ||
531 | Now, the following calls to your function are all completely equivalent: | |
532 | ||
533 | f(OUCH); | |
534 | f(OUCH, B => 2); | |
535 | f(OUCH, C => 7); | |
536 | f(OUCH, B => 2, C => 7); | |
537 | f(OUCH, C => 7, B => 2); | |
538 | (etc.) | |
539 | ||
540 | However, unless you tell C<Memoize> that these calls are equivalent, | |
541 | it will not know that, and it will compute the values for these | |
542 | invocations of your function separately, and store them separately. | |
543 | ||
544 | To prevent this, supply a C<NORMALIZER> function that turns the | |
545 | program arguments into a string in a way that equivalent arguments | |
546 | turn into the same string. A C<NORMALIZER> function for C<f> above | |
547 | might look like this: | |
548 | ||
549 | sub normalize_f { | |
550 | my $a = shift; | |
551 | my %hash = @_; | |
552 | $hash{B} ||= 2; | |
553 | $hash{C} ||= 7; | |
554 | ||
3d4a255c | 555 | join(',', $a, map { ($_, $hash{$_}) } sort keys %hash); |
a0cb3900 JH |
556 | } |
557 | ||
558 | Each of the argument lists above comes out of the C<normalize_f> | |
559 | function looking exactly the same, like this: | |
560 | ||
3d4a255c | 561 | OUCH,B,2,C,7 |
a0cb3900 JH |
562 | |
563 | You would tell C<Memoize> to use this normalizer this way: | |
564 | ||
565 | memoize('f', NORMALIZER => 'normalize_f'); | |
566 | ||
567 | C<memoize> knows that if the normalized version of the arguments is | |
568 | the same for two argument lists, then it can safely look up the value | |
569 | that it computed for one argument list and return it as the result of | |
570 | calling the function with the other argument list, even if the | |
571 | argument lists look different. | |
572 | ||
3d4a255c JH |
573 | The default normalizer just concatenates the arguments with character |
574 | 28 in between. (In ASCII, this is called FS or control-\.) This | |
575 | always works correctly for functions with only one string argument, | |
576 | and also when the arguments never contain character 28. However, it | |
577 | can confuse certain argument lists: | |
a0cb3900 JH |
578 | |
579 | normalizer("a\034", "b") | |
580 | normalizer("a", "\034b") | |
581 | normalizer("a\034\034b") | |
582 | ||
3d4a255c | 583 | for example. |
a0cb3900 | 584 | |
899dc88a JH |
585 | Since hash keys are strings, the default normalizer will not |
586 | distinguish between C<undef> and the empty string. It also won't work | |
3d4a255c JH |
587 | when the function's arguments are references. For example, consider a |
588 | function C<g> which gets two arguments: A number, and a reference to | |
899dc88a | 589 | an array of numbers: |
a0cb3900 JH |
590 | |
591 | g(13, [1,2,3,4,5,6,7]); | |
592 | ||
593 | The default normalizer will turn this into something like | |
3d4a255c | 594 | C<"13\034ARRAY(0x436c1f)">. That would be all right, except that a |
a0cb3900 JH |
595 | subsequent array of numbers might be stored at a different location |
596 | even though it contains the same data. If this happens, C<Memoize> | |
597 | will think that the arguments are different, even though they are | |
598 | equivalent. In this case, a normalizer like this is appropriate: | |
599 | ||
600 | sub normalize { join ' ', $_[0], @{$_[1]} } | |
601 | ||
602 | For the example above, this produces the key "13 1 2 3 4 5 6 7". | |
603 | ||
604 | Another use for normalizers is when the function depends on data other | |
605 | than those in its arguments. Suppose you have a function which | |
606 | returns a value which depends on the current hour of the day: | |
607 | ||
608 | sub on_duty { | |
609 | my ($problem_type) = @_; | |
610 | my $hour = (localtime)[2]; | |
611 | open my $fh, "$DIR/$problem_type" or die...; | |
612 | my $line; | |
613 | while ($hour-- > 0) { | |
614 | $line = <$fh>; | |
615 | } | |
616 | return $line; | |
617 | } | |
618 | ||
3d4a255c | 619 | At 10:23, this function generates the 10th line of a data file; at |
a0cb3900 JH |
620 | 3:45 PM it generates the 15th line instead. By default, C<Memoize> |
621 | will only see the $problem_type argument. To fix this, include the | |
622 | current hour in the normalizer: | |
623 | ||
624 | sub normalize { join ' ', (localtime)[2], @_ } | |
625 | ||
626 | The calling context of the function (scalar or list context) is | |
627 | propagated to the normalizer. This means that if the memoized | |
628 | function will treat its arguments differently in list context than it | |
629 | would in scalar context, you can have the normalizer function select | |
630 | its behavior based on the results of C<wantarray>. Even if called in | |
631 | a list context, a normalizer should still return a single string. | |
632 | ||
633 | =head2 C<SCALAR_CACHE>, C<LIST_CACHE> | |
634 | ||
635 | Normally, C<Memoize> caches your function's return values into an | |
636 | ordinary Perl hash variable. However, you might like to have the | |
637 | values cached on the disk, so that they persist from one run of your | |
638 | program to the next, or you might like to associate some other | |
3d4a255c | 639 | interesting semantics with the cached values. |
a0cb3900 JH |
640 | |
641 | There's a slight complication under the hood of C<Memoize>: There are | |
642 | actually I<two> caches, one for scalar values and one for list values. | |
643 | When your function is called in scalar context, its return value is | |
644 | cached in one hash, and when your function is called in list context, | |
645 | its value is cached in the other hash. You can control the caching | |
646 | behavior of both contexts independently with these options. | |
647 | ||
648 | The argument to C<LIST_CACHE> or C<SCALAR_CACHE> must either be one of | |
649 | the following four strings: | |
650 | ||
651 | MEMORY | |
652 | FAULT | |
653 | MERGE | |
3d4a255c | 654 | HASH |
a0cb3900 JH |
655 | |
656 | or else it must be a reference to a list whose first element is one of | |
657 | these four strings, such as C<[HASH, arguments...]>. | |
658 | ||
659 | =over 4 | |
660 | ||
661 | =item C<MEMORY> | |
662 | ||
663 | C<MEMORY> means that return values from the function will be cached in | |
664 | an ordinary Perl hash variable. The hash variable will not persist | |
665 | after the program exits. This is the default. | |
666 | ||
667 | =item C<HASH> | |
668 | ||
669 | C<HASH> allows you to specify that a particular hash that you supply | |
670 | will be used as the cache. You can tie this hash beforehand to give | |
671 | it any behavior you want. | |
672 | ||
673 | A tied hash can have any semantics at all. It is typically tied to an | |
674 | on-disk database, so that cached values are stored in the database and | |
675 | retrieved from it again when needed, and the disk file typically | |
676 | persists after your program has exited. See C<perltie> for more | |
677 | complete details about C<tie>. | |
678 | ||
679 | A typical example is: | |
680 | ||
3d4a255c | 681 | use DB_File; |
a0cb3900 JH |
682 | tie my %cache => 'DB_File', $filename, O_RDWR|O_CREAT, 0666; |
683 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
684 | ||
685 | This has the effect of storing the cache in a C<DB_File> database | |
686 | whose name is in C<$filename>. The cache will persist after the | |
687 | program has exited. Next time the program runs, it will find the | |
688 | cache already populated from the previous run of the program. Or you | |
689 | can forcibly populate the cache by constructing a batch program that | |
690 | runs in the background and populates the cache file. Then when you | |
691 | come to run your real program the memoized function will be fast | |
692 | because all its results have been precomputed. | |
693 | ||
694 | =item C<TIE> | |
695 | ||
5189e6fe JH |
696 | This option is no longer supported. It is still documented only to |
697 | aid in the debugging of old programs that use it. Old programs should | |
698 | be converted to use the C<HASH> option instead. | |
a0cb3900 | 699 | |
3d4a255c | 700 | memoize ... [TIE, PACKAGE, ARGS...] |
a0cb3900 JH |
701 | |
702 | is merely a shortcut for | |
703 | ||
3d4a255c | 704 | require PACKAGE; |
5189e6fe JH |
705 | { my %cache; |
706 | tie %cache, PACKAGE, ARGS...; | |
707 | } | |
a0cb3900 JH |
708 | memoize ... [HASH => \%cache]; |
709 | ||
a0cb3900 JH |
710 | =item C<FAULT> |
711 | ||
712 | C<FAULT> means that you never expect to call the function in scalar | |
713 | (or list) context, and that if C<Memoize> detects such a call, it | |
714 | should abort the program. The error message is one of | |
715 | ||
716 | Function `foo' called in forbidden list context; faulting at ... line ... | |
717 | Function `foo' called in forbidden scalar context; faulting at ... line ... | |
718 | ||
719 | =item C<MERGE> | |
720 | ||
721 | C<MERGE> normally means the function does not distinguish between list | |
722 | and scalar context, and that return values in both contexts should be | |
723 | stored together. C<LIST_CACHE =E<gt> MERGE> means that list context | |
724 | return values should be stored in the same hash that is used for | |
725 | scalar context returns, and C<SCALAR_CACHE =E<gt> MERGE> means the | |
726 | same, mutatis mutandis. It is an error to specify C<MERGE> for both, | |
727 | but it probably does something useful. | |
728 | ||
729 | Consider this function: | |
730 | ||
731 | sub pi { 3; } | |
732 | ||
733 | Normally, the following code will result in two calls to C<pi>: | |
734 | ||
735 | $x = pi(); | |
736 | ($y) = pi(); | |
737 | $z = pi(); | |
738 | ||
739 | The first call caches the value C<3> in the scalar cache; the second | |
740 | caches the list C<(3)> in the list cache. The third call doesn't call | |
741 | the real C<pi> function; it gets the value from the scalar cache. | |
742 | ||
743 | Obviously, the second call to C<pi> is a waste of time, and storing | |
3d4a255c JH |
744 | its return value is a waste of space. Specifying C<LIST_CACHE =E<gt> |
745 | MERGE> will make C<memoize> use the same cache for scalar and list | |
746 | context return values, so that the second call uses the scalar cache | |
747 | that was populated by the first call. C<pi> ends up being called only | |
748 | once, and both subsequent calls return C<3> from the cache, regardless | |
749 | of the calling context. | |
a0cb3900 JH |
750 | |
751 | Another use for C<MERGE> is when you want both kinds of return values | |
752 | stored in the same disk file; this saves you from having to deal with | |
753 | two disk files instead of one. You can use a normalizer function to | |
754 | keep the two sets of return values separate. For example: | |
755 | ||
756 | tie my %cache => 'MLDBM', 'DB_File', $filename, ...; | |
757 | ||
758 | memoize 'myfunc', | |
759 | NORMALIZER => 'n', | |
760 | SCALAR_CACHE => [HASH => \%cache], | |
761 | LIST_CACHE => MERGE, | |
762 | ; | |
763 | ||
764 | sub n { | |
765 | my $context = wantarray() ? 'L' : 'S'; | |
766 | # ... now compute the hash key from the arguments ... | |
767 | $hashkey = "$context:$hashkey"; | |
768 | } | |
769 | ||
770 | This normalizer function will store scalar context return values in | |
771 | the disk file under keys that begin with C<S:>, and list context | |
772 | return values under keys that begin with C<L:>. | |
773 | ||
774 | =back | |
775 | ||
776 | =head1 OTHER FACILITIES | |
777 | ||
778 | =head2 C<unmemoize> | |
779 | ||
780 | There's an C<unmemoize> function that you can import if you want to. | |
781 | Why would you want to? Here's an example: Suppose you have your cache | |
782 | tied to a DBM file, and you want to make sure that the cache is | |
783 | written out to disk if someone interrupts the program. If the program | |
784 | exits normally, this will happen anyway, but if someone types | |
785 | control-C or something then the program will terminate immediately | |
786 | without synchronizing the database. So what you can do instead is | |
787 | ||
788 | $SIG{INT} = sub { unmemoize 'function' }; | |
789 | ||
a0cb3900 JH |
790 | C<unmemoize> accepts a reference to, or the name of, a previously |
791 | memoized function, and undoes whatever it did to provide the memoized | |
792 | version in the first place, including making the name refer to the | |
793 | unmemoized version if appropriate. It returns a reference to the | |
794 | unmemoized version of the function. | |
795 | ||
796 | If you ask it to unmemoize a function that was never memoized, it | |
797 | croaks. | |
798 | ||
799 | =head2 C<flush_cache> | |
800 | ||
801 | C<flush_cache(function)> will flush out the caches, discarding I<all> | |
3d4a255c | 802 | the cached data. The argument may be a function name or a reference |
a0cb3900 JH |
803 | to a function. For finer control over when data is discarded or |
804 | expired, see the documentation for C<Memoize::Expire>, included in | |
805 | this package. | |
806 | ||
807 | Note that if the cache is a tied hash, C<flush_cache> will attempt to | |
808 | invoke the C<CLEAR> method on the hash. If there is no C<CLEAR> | |
809 | method, this will cause a run-time error. | |
810 | ||
811 | An alternative approach to cache flushing is to use the C<HASH> option | |
812 | (see above) to request that C<Memoize> use a particular hash variable | |
813 | as its cache. Then you can examine or modify the hash at any time in | |
3d4a255c | 814 | any way you desire. You may flush the cache by using C<%hash = ()>. |
a0cb3900 JH |
815 | |
816 | =head1 CAVEATS | |
817 | ||
818 | Memoization is not a cure-all: | |
819 | ||
820 | =over 4 | |
821 | ||
822 | =item * | |
823 | ||
824 | Do not memoize a function whose behavior depends on program | |
825 | state other than its own arguments, such as global variables, the time | |
826 | of day, or file input. These functions will not produce correct | |
827 | results when memoized. For a particularly easy example: | |
828 | ||
829 | sub f { | |
830 | time; | |
831 | } | |
832 | ||
833 | This function takes no arguments, and as far as C<Memoize> is | |
834 | concerned, it always returns the same result. C<Memoize> is wrong, of | |
835 | course, and the memoized version of this function will call C<time> once | |
836 | to get the current time, and it will return that same time | |
837 | every time you call it after that. | |
838 | ||
839 | =item * | |
840 | ||
841 | Do not memoize a function with side effects. | |
842 | ||
843 | sub f { | |
844 | my ($a, $b) = @_; | |
845 | my $s = $a + $b; | |
846 | print "$a + $b = $s.\n"; | |
847 | } | |
848 | ||
849 | This function accepts two arguments, adds them, and prints their sum. | |
850 | Its return value is that of C<print> (1 on success), but you | |
851 | probably didn't care about that. But C<Memoize> doesn't understand | |
852 | that. If you memoize this function, you will get the result you | |
853 | expect the first time you ask it to print the sum of 2 and 3, but | |
854 | subsequent calls will return 1 (the return value of | |
855 | C<print>) without actually printing anything. | |
856 | ||
857 | =item * | |
858 | ||
859 | Do not memoize a function that returns a data structure that is | |
860 | modified by its caller. | |
861 | ||
862 | Consider these functions: C<getusers> returns a list of users somehow, | |
863 | and then C<main> throws away the first user on the list and prints the | |
864 | rest: | |
865 | ||
866 | sub main { | |
867 | my $userlist = getusers(); | |
868 | shift @$userlist; | |
869 | foreach $u (@$userlist) { | |
870 | print "User $u\n"; | |
871 | } | |
872 | } | |
873 | ||
874 | sub getusers { | |
875 | my @users; | |
876 | # Do something to get a list of users; | |
877 | \@users; # Return reference to list. | |
878 | } | |
879 | ||
880 | If you memoize C<getusers> here, it will work right exactly once. The | |
881 | reference to the users list will be stored in the memo table. C<main> | |
882 | will discard the first element from the referenced list. The next | |
883 | time you invoke C<main>, C<Memoize> will not call C<getusers>; it will | |
884 | just return the same reference to the same list it got last time. But | |
885 | this time the list has already had its head removed; C<main> will | |
886 | erroneously remove another element from it. The list will get shorter | |
887 | and shorter every time you call C<main>. | |
888 | ||
889 | Similarly, this: | |
890 | ||
891 | $u1 = getusers(); | |
892 | $u2 = getusers(); | |
893 | pop @$u1; | |
894 | ||
895 | will modify $u2 as well as $u1, because both variables are references | |
896 | to the same array. Had C<getusers> not been memoized, $u1 and $u2 | |
897 | would have referred to different arrays. | |
898 | ||
899 | =item * | |
900 | ||
901 | Do not memoize a very simple function. | |
902 | ||
903 | Recently someone mentioned to me that the Memoize module made his | |
904 | program run slower instead of faster. It turned out that he was | |
905 | memoizing the following function: | |
906 | ||
907 | sub square { | |
908 | $_[0] * $_[0]; | |
909 | } | |
910 | ||
911 | I pointed out that C<Memoize> uses a hash, and that looking up a | |
912 | number in the hash is necessarily going to take a lot longer than a | |
913 | single multiplication. There really is no way to speed up the | |
914 | C<square> function. | |
915 | ||
916 | Memoization is not magical. | |
917 | ||
918 | =back | |
919 | ||
920 | =head1 PERSISTENT CACHE SUPPORT | |
921 | ||
922 | You can tie the cache tables to any sort of tied hash that you want | |
923 | to, as long as it supports C<TIEHASH>, C<FETCH>, C<STORE>, and | |
924 | C<EXISTS>. For example, | |
925 | ||
926 | tie my %cache => 'GDBM_File', $filename, O_RDWR|O_CREAT, 0666; | |
927 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
928 | ||
929 | works just fine. For some storage methods, you need a little glue. | |
930 | ||
931 | C<SDBM_File> doesn't supply an C<EXISTS> method, so included in this | |
932 | package is a glue module called C<Memoize::SDBM_File> which does | |
933 | provide one. Use this instead of plain C<SDBM_File> to store your | |
934 | cache table on disk in an C<SDBM_File> database: | |
935 | ||
936 | tie my %cache => 'Memoize::SDBM_File', $filename, O_RDWR|O_CREAT, 0666; | |
937 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
938 | ||
939 | C<NDBM_File> has the same problem and the same solution. (Use | |
899dc88a | 940 | C<Memoize::NDBM_File> instead of plain C<NDBM_File>.) |
a0cb3900 JH |
941 | |
942 | C<Storable> isn't a tied hash class at all. You can use it to store a | |
943 | hash to disk and retrieve it again, but you can't modify the hash while | |
944 | it's on the disk. So if you want to store your cache table in a | |
945 | C<Storable> database, use C<Memoize::Storable>, which puts a hashlike | |
946 | front-end onto C<Storable>. The hash table is actually kept in | |
947 | memory, and is loaded from your C<Storable> file at the time you | |
948 | memoize the function, and stored back at the time you unmemoize the | |
949 | function (or when your program exits): | |
950 | ||
951 | tie my %cache => 'Memoize::Storable', $filename; | |
952 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
953 | ||
954 | tie my %cache => 'Memoize::Storable', $filename, 'nstore'; | |
955 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
956 | ||
957 | Include the `nstore' option to have the C<Storable> database written | |
958 | in `network order'. (See L<Storable> for more details about this.) | |
959 | ||
3d4a255c JH |
960 | The C<flush_cache()> function will raise a run-time error unless the |
961 | tied package provides a C<CLEAR> method. | |
962 | ||
a0cb3900 JH |
963 | =head1 EXPIRATION SUPPORT |
964 | ||
965 | See Memoize::Expire, which is a plug-in module that adds expiration | |
966 | functionality to Memoize. If you don't like the kinds of policies | |
967 | that Memoize::Expire implements, it is easy to write your own plug-in | |
968 | module to implement whatever policy you desire. Memoize comes with | |
969 | several examples. An expiration manager that implements an LRU policy | |
970 | is available on CPAN as Memoize::ExpireLRU. | |
971 | ||
972 | =head1 BUGS | |
973 | ||
974 | The test suite is much better, but always needs improvement. | |
975 | ||
3d4a255c JH |
976 | There is some problem with the way C<goto &f> works under threaded |
977 | Perl, perhaps because of the lexical scoping of C<@_>. This is a bug | |
978 | in Perl, and until it is resolved, memoized functions will see a | |
979 | slightly different C<caller()> and will perform a little more slowly | |
980 | on threaded perls than unthreaded perls. | |
a0cb3900 | 981 | |
5189e6fe JH |
982 | Some versions of C<DB_File> won't let you store data under a key of |
983 | length 0. That means that if you have a function C<f> which you | |
984 | memoized and the cache is in a C<DB_File> database, then the value of | |
985 | C<f()> (C<f> called with no arguments) will not be memoized. If this | |
986 | is a big problem, you can supply a normalizer function that prepends | |
987 | C<"x"> to every key. | |
a0cb3900 JH |
988 | |
989 | =head1 MAILING LIST | |
990 | ||
991 | To join a very low-traffic mailing list for announcements about | |
992 | C<Memoize>, send an empty note to C<mjd-perl-memoize-request@plover.com>. | |
993 | ||
994 | =head1 AUTHOR | |
995 | ||
996 | Mark-Jason Dominus (C<mjd-perl-memoize+@plover.com>), Plover Systems co. | |
997 | ||
998 | See the C<Memoize.pm> Page at http://www.plover.com/~mjd/perl/Memoize/ | |
999 | for news and upgrades. Near this page, at | |
1000 | http://www.plover.com/~mjd/perl/MiniMemoize/ there is an article about | |
1001 | memoization and about the internals of Memoize that appeared in The | |
1002 | Perl Journal, issue #13. (This article is also included in the | |
1003 | Memoize distribution as `article.html'.) | |
1004 | ||
3d4a255c JH |
1005 | My upcoming book will discuss memoization (and many other fascinating |
1006 | topics) in tremendous detail. It will be published by Morgan Kaufmann | |
1007 | in 2002, possibly under the title I<Perl Advanced Techniques | |
1008 | Handbook>. It will also be available on-line for free. For more | |
1009 | information, visit http://perl.plover.com/book/ . | |
1010 | ||
a0cb3900 JH |
1011 | To join a mailing list for announcements about C<Memoize>, send an |
1012 | empty message to C<mjd-perl-memoize-request@plover.com>. This mailing | |
1013 | list is for announcements only and has extremely low traffic---about | |
3d4a255c | 1014 | two messages per year. |
a0cb3900 | 1015 | |
899dc88a JH |
1016 | =head1 COPYRIGHT AND LICENSE |
1017 | ||
1018 | Copyright 1998, 1999, 2000, 2001 by Mark Jason Dominus | |
1019 | ||
1020 | This library is free software; you may redistribute it and/or modify | |
3d4a255c | 1021 | it under the same terms as Perl itself. |
899dc88a | 1022 | |
a0cb3900 JH |
1023 | =head1 THANK YOU |
1024 | ||
1025 | Many thanks to Jonathan Roy for bug reports and suggestions, to | |
1026 | Michael Schwern for other bug reports and patches, to Mike Cariaso for | |
1027 | helping me to figure out the Right Thing to Do About Expiration, to | |
3d4a255c JH |
1028 | Joshua Gerth, Joshua Chamas, Jonathan Roy (again), Mark D. Anderson, |
1029 | and Andrew Johnson for more suggestions about expiration, to Brent | |
1030 | Powers for the Memoize::ExpireLRU module, to Ariel Scolnicov for | |
1031 | delightful messages about the Fibonacci function, to Dion Almaer for | |
a0cb3900 JH |
1032 | thought-provoking suggestions about the default normalizer, to Walt |
1033 | Mankowski and Kurt Starsinic for much help investigating problems | |
1034 | under threaded Perl, to Alex Dudkevich for reporting the bug in | |
1035 | prototyped functions and for checking my patch, to Tony Bass for many | |
3d4a255c JH |
1036 | helpful suggestions, to Jonathan Roy (again) for finding a use for |
1037 | C<unmemoize()>, to Philippe Verdret for enlightening discussion of | |
1038 | C<Hook::PrePostCall>, to Nat Torkington for advice I ignored, to Chris | |
a0cb3900 JH |
1039 | Nandor for portability advice, to Randal Schwartz for suggesting the |
1040 | C<flush_cache> function, and to Jenda Krynicky for being a light in | |
1041 | the world. | |
1042 | ||
899dc88a JH |
1043 | Special thanks to Jarkko Hietaniemi, the 5.8.0 pumpking, for including |
1044 | this module in the core and for his patient and helpful guidance | |
1045 | during the integration process. | |
3d4a255c | 1046 | |
a0cb3900 | 1047 | =cut |