Commit | Line | Data |
---|---|---|
a0cb3900 JH |
1 | # -*- mode: perl; perl-indent-level: 2; -*- |
2 | # Memoize.pm | |
3 | # | |
4 | # Transparent memoization of idempotent functions | |
5 | # | |
899dc88a | 6 | # Copyright 1998, 1999, 2000, 2001 M-J. Dominus. |
a0cb3900 JH |
7 | # You may copy and distribute this program under the |
8 | # same terms as Perl itself. If in doubt, | |
9 | # write to mjd-perl-memoize+@plover.com for a license. | |
10 | # | |
5189e6fe | 11 | # Version 1.00 $Revision: 1.18 $ $Date: 2001/06/24 17:16:47 $ |
a0cb3900 JH |
12 | |
13 | package Memoize; | |
5189e6fe | 14 | $VERSION = '1.00'; |
a0cb3900 JH |
15 | |
16 | # Compile-time constants | |
17 | sub SCALAR () { 0 } | |
18 | sub LIST () { 1 } | |
19 | ||
20 | ||
21 | # | |
22 | # Usage memoize(functionname/ref, | |
23 | # { NORMALIZER => coderef, INSTALL => name, | |
24 | # LIST_CACHE => descriptor, SCALAR_CACHE => descriptor } | |
25 | # | |
26 | ||
27 | use Carp; | |
28 | use Exporter; | |
29 | use vars qw($DEBUG); | |
899dc88a | 30 | use Config; # Dammit. |
a0cb3900 JH |
31 | @ISA = qw(Exporter); |
32 | @EXPORT = qw(memoize); | |
33 | @EXPORT_OK = qw(unmemoize flush_cache); | |
34 | use strict; | |
35 | ||
36 | my %memotable; | |
37 | my %revmemotable; | |
38 | my @CONTEXT_TAGS = qw(MERGE TIE MEMORY FAULT HASH); | |
39 | my %IS_CACHE_TAG = map {($_ => 1)} @CONTEXT_TAGS; | |
40 | ||
41 | # Raise an error if the user tries to specify one of thesepackage as a | |
42 | # tie for LIST_CACHE | |
43 | ||
44 | my %scalar_only = map {($_ => 1)} qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File); | |
45 | ||
46 | sub memoize { | |
47 | my $fn = shift; | |
48 | my %options = @_; | |
49 | my $options = \%options; | |
50 | ||
51 | unless (defined($fn) && | |
52 | (ref $fn eq 'CODE' || ref $fn eq '')) { | |
53 | croak "Usage: memoize 'functionname'|coderef {OPTIONS}"; | |
54 | } | |
55 | ||
56 | my $uppack = caller; # TCL me Elmo! | |
57 | my $cref; # Code reference to original function | |
58 | my $name = (ref $fn ? undef : $fn); | |
59 | ||
60 | # Convert function names to code references | |
61 | $cref = &_make_cref($fn, $uppack); | |
62 | ||
63 | # Locate function prototype, if any | |
64 | my $proto = prototype $cref; | |
65 | if (defined $proto) { $proto = "($proto)" } | |
66 | else { $proto = "" } | |
67 | ||
899dc88a JH |
68 | # I would like to get rid of the eval, but there seems not to be any |
69 | # other way to set the prototype properly. The switch here for | |
70 | # 'usethreads' works around a bug in threadperl having to do with | |
71 | # magic goto. It would be better to fix the bug and use the magic | |
72 | # goto version everywhere. | |
73 | my $wrapper = | |
74 | $Config{usethreads} | |
75 | ? eval "sub $proto { &_memoizer(\$cref, \@_); }" | |
76 | : eval "sub $proto { unshift \@_, \$cref; goto &_memoizer; }"; | |
a0cb3900 JH |
77 | |
78 | my $normalizer = $options{NORMALIZER}; | |
79 | if (defined $normalizer && ! ref $normalizer) { | |
80 | $normalizer = _make_cref($normalizer, $uppack); | |
81 | } | |
82 | ||
83 | my $install_name; | |
84 | if (defined $options->{INSTALL}) { | |
85 | # INSTALL => name | |
86 | $install_name = $options->{INSTALL}; | |
87 | } elsif (! exists $options->{INSTALL}) { | |
88 | # No INSTALL option provided; use original name if possible | |
89 | $install_name = $name; | |
90 | } else { | |
91 | # INSTALL => undef means don't install | |
92 | } | |
93 | ||
94 | if (defined $install_name) { | |
95 | $install_name = $uppack . '::' . $install_name | |
96 | unless $install_name =~ /::/; | |
97 | no strict; | |
98 | local($^W) = 0; # ``Subroutine $install_name redefined at ...'' | |
99 | *{$install_name} = $wrapper; # Install memoized version | |
100 | } | |
101 | ||
102 | $revmemotable{$wrapper} = "" . $cref; # Turn code ref into hash key | |
103 | ||
104 | # These will be the caches | |
105 | my %caches; | |
106 | for my $context (qw(SCALAR LIST)) { | |
107 | # suppress subsequent 'uninitialized value' warnings | |
108 | $options{"${context}_CACHE"} ||= ''; | |
109 | ||
110 | my $cache_opt = $options{"${context}_CACHE"}; | |
111 | my @cache_opt_args; | |
112 | if (ref $cache_opt) { | |
113 | @cache_opt_args = @$cache_opt; | |
114 | $cache_opt = shift @cache_opt_args; | |
115 | } | |
116 | if ($cache_opt eq 'FAULT') { # no cache | |
117 | $caches{$context} = undef; | |
118 | } elsif ($cache_opt eq 'HASH') { # user-supplied hash | |
899dc88a JH |
119 | my $cache = $cache_opt_args[0]; |
120 | my $package = ref(tied %$cache); | |
121 | if ($context eq 'LIST' && $scalar_only{$package}) { | |
122 | croak("You can't use $package for LIST_CACHE because it can only store scalars"); | |
123 | } | |
124 | $caches{$context} = $cache; | |
a0cb3900 JH |
125 | } elsif ($cache_opt eq '' || $IS_CACHE_TAG{$cache_opt}) { |
126 | # default is that we make up an in-memory hash | |
127 | $caches{$context} = {}; | |
128 | # (this might get tied later, or MERGEd away) | |
129 | } else { | |
130 | croak "Unrecognized option to `${context}_CACHE': `$cache_opt' should be one of (@CONTEXT_TAGS); aborting"; | |
131 | } | |
132 | } | |
133 | ||
134 | # Perhaps I should check here that you didn't supply *both* merge | |
135 | # options. But if you did, it does do something reasonable: They | |
136 | # both get merged to the same in-memory hash. | |
137 | if ($options{SCALAR_CACHE} eq 'MERGE') { | |
138 | $caches{SCALAR} = $caches{LIST}; | |
139 | } elsif ($options{LIST_CACHE} eq 'MERGE') { | |
140 | $caches{LIST} = $caches{SCALAR}; | |
141 | } | |
142 | ||
143 | # Now deal with the TIE options | |
144 | { | |
145 | my $context; | |
146 | foreach $context (qw(SCALAR LIST)) { | |
147 | # If the relevant option wasn't `TIE', this call does nothing. | |
148 | _my_tie($context, $caches{$context}, $options); # Croaks on failure | |
149 | } | |
150 | } | |
151 | ||
152 | # We should put some more stuff in here eventually. | |
153 | # We've been saying that for serveral versions now. | |
154 | # And you know what? More stuff keeps going in! | |
155 | $memotable{$cref} = | |
156 | { | |
157 | O => $options, # Short keys here for things we need to access frequently | |
158 | N => $normalizer, | |
159 | U => $cref, | |
160 | MEMOIZED => $wrapper, | |
161 | PACKAGE => $uppack, | |
162 | NAME => $install_name, | |
163 | S => $caches{SCALAR}, | |
164 | L => $caches{LIST}, | |
165 | }; | |
166 | ||
167 | $wrapper # Return just memoized version | |
168 | } | |
169 | ||
5189e6fe JH |
170 | use warnings::register; |
171 | ||
a0cb3900 JH |
172 | # This function tries to load a tied hash class and tie the hash to it. |
173 | sub _my_tie { | |
174 | my ($context, $hash, $options) = @_; | |
175 | my $fullopt = $options->{"${context}_CACHE"}; | |
176 | ||
177 | # We already checked to make sure that this works. | |
178 | my $shortopt = (ref $fullopt) ? $fullopt->[0] : $fullopt; | |
179 | ||
180 | return unless defined $shortopt && $shortopt eq 'TIE'; | |
5189e6fe JH |
181 | carp("TIE option to memoize() is deprecated; use HASH instead") |
182 | if warnings::enabled('deprecated'); | |
a0cb3900 JH |
183 | |
184 | my @args = ref $fullopt ? @$fullopt : (); | |
185 | shift @args; | |
186 | my $module = shift @args; | |
187 | if ($context eq 'LIST' && $scalar_only{$module}) { | |
188 | croak("You can't use $module for LIST_CACHE because it can only store scalars"); | |
189 | } | |
190 | my $modulefile = $module . '.pm'; | |
191 | $modulefile =~ s{::}{/}g; | |
192 | eval { require $modulefile }; | |
193 | if ($@) { | |
194 | croak "Memoize: Couldn't load hash tie module `$module': $@; aborting"; | |
195 | } | |
a0cb3900 JH |
196 | my $rc = (tie %$hash => $module, @args); |
197 | unless ($rc) { | |
899dc88a | 198 | croak "Memoize: Couldn't tie hash to `$module': $!; aborting"; |
a0cb3900 JH |
199 | } |
200 | 1; | |
201 | } | |
202 | ||
203 | sub flush_cache { | |
204 | my $func = _make_cref($_[0], scalar caller); | |
205 | my $info = $memotable{$revmemotable{$func}}; | |
206 | die "$func not memoized" unless defined $info; | |
207 | for my $context (qw(S L)) { | |
208 | my $cache = $info->{$context}; | |
209 | if (tied %$cache && ! (tied %$cache)->can('CLEAR')) { | |
210 | my $funcname = defined($info->{NAME}) ? | |
211 | "function $info->{NAME}" : "anonymous function $func"; | |
212 | my $context = {S => 'scalar', L => 'list'}->{$context}; | |
213 | croak "Tied cache hash for $context-context $funcname does not support flushing"; | |
214 | } else { | |
215 | %$cache = (); | |
216 | } | |
217 | } | |
218 | } | |
219 | ||
220 | # This is the function that manages the memo tables. | |
221 | sub _memoizer { | |
222 | my $orig = shift; # stringized version of ref to original func. | |
223 | my $info = $memotable{$orig}; | |
224 | my $normalizer = $info->{N}; | |
225 | ||
226 | my $argstr; | |
227 | my $context = (wantarray() ? LIST : SCALAR); | |
228 | ||
229 | if (defined $normalizer) { | |
230 | no strict; | |
231 | if ($context == SCALAR) { | |
232 | $argstr = &{$normalizer}(@_); | |
233 | } elsif ($context == LIST) { | |
234 | ($argstr) = &{$normalizer}(@_); | |
235 | } else { | |
236 | croak "Internal error \#41; context was neither LIST nor SCALAR\n"; | |
237 | } | |
238 | } else { # Default normalizer | |
899dc88a JH |
239 | local $^W = 0; |
240 | $argstr = join chr(28),@_; | |
a0cb3900 JH |
241 | } |
242 | ||
243 | if ($context == SCALAR) { | |
244 | my $cache = $info->{S}; | |
899dc88a | 245 | _crap_out($info->{NAME}, 'scalar') unless $cache; |
a0cb3900 JH |
246 | if (exists $cache->{$argstr}) { |
247 | return $cache->{$argstr}; | |
248 | } else { | |
249 | my $val = &{$info->{U}}(@_); | |
250 | # Scalars are considered to be lists; store appropriately | |
251 | if ($info->{O}{SCALAR_CACHE} eq 'MERGE') { | |
252 | $cache->{$argstr} = [$val]; | |
253 | } else { | |
254 | $cache->{$argstr} = $val; | |
255 | } | |
256 | $val; | |
257 | } | |
258 | } elsif ($context == LIST) { | |
259 | my $cache = $info->{L}; | |
899dc88a | 260 | _crap_out($info->{NAME}, 'list') unless $cache; |
a0cb3900 JH |
261 | if (exists $cache->{$argstr}) { |
262 | my $val = $cache->{$argstr}; | |
a0cb3900 | 263 | # If LISTCONTEXT=>MERGE, then the function never returns lists, |
899dc88a | 264 | # so we have a scalar value cached, so just return it straightaway: |
a0cb3900 | 265 | return ($val) if $info->{O}{LIST_CACHE} eq 'MERGE'; |
899dc88a JH |
266 | # Maybe in a later version we can use a faster test. |
267 | ||
268 | # Otherwise, we cached an array containing the returned list: | |
a0cb3900 JH |
269 | return @$val; |
270 | } else { | |
271 | my $q = $cache->{$argstr} = [&{$info->{U}}(@_)]; | |
272 | @$q; | |
273 | } | |
274 | } else { | |
275 | croak "Internal error \#42; context was neither LIST nor SCALAR\n"; | |
276 | } | |
277 | } | |
278 | ||
279 | sub unmemoize { | |
280 | my $f = shift; | |
281 | my $uppack = caller; | |
282 | my $cref = _make_cref($f, $uppack); | |
283 | ||
284 | unless (exists $revmemotable{$cref}) { | |
285 | croak "Could not unmemoize function `$f', because it was not memoized to begin with"; | |
286 | } | |
287 | ||
288 | my $tabent = $memotable{$revmemotable{$cref}}; | |
289 | unless (defined $tabent) { | |
290 | croak "Could not figure out how to unmemoize function `$f'"; | |
291 | } | |
292 | my $name = $tabent->{NAME}; | |
293 | if (defined $name) { | |
294 | no strict; | |
295 | local($^W) = 0; # ``Subroutine $install_name redefined at ...'' | |
296 | *{$name} = $tabent->{U}; # Replace with original function | |
297 | } | |
298 | undef $memotable{$revmemotable{$cref}}; | |
299 | undef $revmemotable{$cref}; | |
300 | ||
301 | # This removes the last reference to the (possibly tied) memo tables | |
302 | # my ($old_function, $memotabs) = @{$tabent}{'U','S','L'}; | |
303 | # undef $tabent; | |
304 | ||
305 | # # Untie the memo tables if they were tied. | |
306 | # my $i; | |
307 | # for $i (0,1) { | |
308 | # if (tied %{$memotabs->[$i]}) { | |
309 | # warn "Untying hash #$i\n"; | |
310 | # untie %{$memotabs->[$i]}; | |
311 | # } | |
312 | # } | |
313 | ||
314 | $tabent->{U}; | |
315 | } | |
316 | ||
317 | sub _make_cref { | |
318 | my $fn = shift; | |
319 | my $uppack = shift; | |
320 | my $cref; | |
321 | my $name; | |
322 | ||
323 | if (ref $fn eq 'CODE') { | |
324 | $cref = $fn; | |
325 | } elsif (! ref $fn) { | |
326 | if ($fn =~ /::/) { | |
327 | $name = $fn; | |
328 | } else { | |
329 | $name = $uppack . '::' . $fn; | |
330 | } | |
331 | no strict; | |
332 | if (defined $name and !defined(&$name)) { | |
333 | croak "Cannot operate on nonexistent function `$fn'"; | |
334 | } | |
335 | # $cref = \&$name; | |
336 | $cref = *{$name}{CODE}; | |
337 | } else { | |
338 | my $parent = (caller(1))[3]; # Function that called _make_cref | |
339 | croak "Usage: argument 1 to `$parent' must be a function name or reference.\n"; | |
340 | } | |
341 | $DEBUG and warn "${name}($fn) => $cref in _make_cref\n"; | |
342 | $cref; | |
343 | } | |
344 | ||
345 | sub _crap_out { | |
346 | my ($funcname, $context) = @_; | |
347 | if (defined $funcname) { | |
348 | croak "Function `$funcname' called in forbidden $context context; faulting"; | |
349 | } else { | |
350 | croak "Anonymous function called in forbidden $context context; faulting"; | |
351 | } | |
352 | } | |
353 | ||
354 | 1; | |
355 | ||
356 | ||
357 | ||
358 | ||
359 | ||
360 | =head1 NAME | |
361 | ||
5189e6fe | 362 | Memoize - Make functions faster by trading space for time |
a0cb3900 JH |
363 | |
364 | =head1 SYNOPSIS | |
365 | ||
5189e6fe | 366 | # This is the documentation for Memoize 1.00 |
a0cb3900 JH |
367 | use Memoize; |
368 | memoize('slow_function'); | |
369 | slow_function(arguments); # Is faster than it was before | |
370 | ||
371 | ||
372 | This is normally all you need to know. However, many options are available: | |
373 | ||
374 | memoize(function, options...); | |
375 | ||
376 | Options include: | |
377 | ||
378 | NORMALIZER => function | |
379 | INSTALL => new_name | |
380 | ||
381 | SCALAR_CACHE => 'MEMORY' | |
382 | SCALAR_CACHE => ['HASH', \%cache_hash ] | |
383 | SCALAR_CACHE => 'FAULT' | |
384 | SCALAR_CACHE => 'MERGE' | |
385 | ||
386 | LIST_CACHE => 'MEMORY' | |
387 | LIST_CACHE => ['HASH', \%cache_hash ] | |
388 | LIST_CACHE => 'FAULT' | |
389 | LIST_CACHE => 'MERGE' | |
390 | ||
391 | =head1 DESCRIPTION | |
392 | ||
393 | `Memoizing' a function makes it faster by trading space for time. It | |
394 | does this by caching the return values of the function in a table. | |
395 | If you call the function again with the same arguments, C<memoize> | |
3d4a255c | 396 | jumps in and gives you the value out of the table, instead of letting |
a0cb3900 JH |
397 | the function compute the value all over again. |
398 | ||
399 | Here is an extreme example. Consider the Fibonacci sequence, defined | |
400 | by the following function: | |
401 | ||
402 | # Compute Fibonacci numbers | |
403 | sub fib { | |
404 | my $n = shift; | |
405 | return $n if $n < 2; | |
406 | fib($n-1) + fib($n-2); | |
407 | } | |
408 | ||
409 | This function is very slow. Why? To compute fib(14), it first wants | |
410 | to compute fib(13) and fib(12), and add the results. But to compute | |
411 | fib(13), it first has to compute fib(12) and fib(11), and then it | |
412 | comes back and computes fib(12) all over again even though the answer | |
413 | is the same. And both of the times that it wants to compute fib(12), | |
414 | it has to compute fib(11) from scratch, and then it has to do it | |
415 | again each time it wants to compute fib(13). This function does so | |
416 | much recomputing of old results that it takes a really long time to | |
417 | run---fib(14) makes 1,200 extra recursive calls to itself, to compute | |
418 | and recompute things that it already computed. | |
419 | ||
420 | This function is a good candidate for memoization. If you memoize the | |
421 | `fib' function above, it will compute fib(14) exactly once, the first | |
422 | time it needs to, and then save the result in a table. Then if you | |
423 | ask for fib(14) again, it gives you the result out of the table. | |
424 | While computing fib(14), instead of computing fib(12) twice, it does | |
425 | it once; the second time it needs the value it gets it from the table. | |
426 | It doesn't compute fib(11) four times; it computes it once, getting it | |
427 | from the table the next three times. Instead of making 1,200 | |
428 | recursive calls to `fib', it makes 15. This makes the function about | |
429 | 150 times faster. | |
430 | ||
431 | You could do the memoization yourself, by rewriting the function, like | |
432 | this: | |
433 | ||
434 | # Compute Fibonacci numbers, memoized version | |
435 | { my @fib; | |
436 | sub fib { | |
437 | my $n = shift; | |
438 | return $fib[$n] if defined $fib[$n]; | |
439 | return $fib[$n] = $n if $n < 2; | |
440 | $fib[$n] = fib($n-1) + fib($n-2); | |
441 | } | |
442 | } | |
443 | ||
444 | Or you could use this module, like this: | |
445 | ||
446 | use Memoize; | |
447 | memoize('fib'); | |
448 | ||
449 | # Rest of the fib function just like the original version. | |
450 | ||
451 | This makes it easy to turn memoizing on and off. | |
452 | ||
453 | Here's an even simpler example: I wrote a simple ray tracer; the | |
454 | program would look in a certain direction, figure out what it was | |
455 | looking at, and then convert the `color' value (typically a string | |
456 | like `red') of that object to a red, green, and blue pixel value, like | |
457 | this: | |
458 | ||
459 | for ($direction = 0; $direction < 300; $direction++) { | |
460 | # Figure out which object is in direction $direction | |
461 | $color = $object->{color}; | |
462 | ($r, $g, $b) = @{&ColorToRGB($color)}; | |
463 | ... | |
464 | } | |
465 | ||
466 | Since there are relatively few objects in a picture, there are only a | |
467 | few colors, which get looked up over and over again. Memoizing | |
5189e6fe | 468 | C<ColorToRGB> sped up the program by several percent. |
a0cb3900 JH |
469 | |
470 | =head1 DETAILS | |
471 | ||
472 | This module exports exactly one function, C<memoize>. The rest of the | |
473 | functions in this package are None of Your Business. | |
474 | ||
475 | You should say | |
476 | ||
477 | memoize(function) | |
478 | ||
479 | where C<function> is the name of the function you want to memoize, or | |
480 | a reference to it. C<memoize> returns a reference to the new, | |
481 | memoized version of the function, or C<undef> on a non-fatal error. | |
482 | At present, there are no non-fatal errors, but there might be some in | |
483 | the future. | |
484 | ||
485 | If C<function> was the name of a function, then C<memoize> hides the | |
486 | old version and installs the new memoized version under the old name, | |
487 | so that C<&function(...)> actually invokes the memoized version. | |
488 | ||
489 | =head1 OPTIONS | |
490 | ||
491 | There are some optional options you can pass to C<memoize> to change | |
492 | the way it behaves a little. To supply options, invoke C<memoize> | |
493 | like this: | |
494 | ||
495 | memoize(function, NORMALIZER => function, | |
496 | INSTALL => newname, | |
497 | SCALAR_CACHE => option, | |
498 | LIST_CACHE => option | |
499 | ); | |
500 | ||
501 | Each of these options is optional; you can include some, all, or none | |
502 | of them. | |
503 | ||
504 | =head2 INSTALL | |
505 | ||
506 | If you supply a function name with C<INSTALL>, memoize will install | |
507 | the new, memoized version of the function under the name you give. | |
508 | For example, | |
509 | ||
510 | memoize('fib', INSTALL => 'fastfib') | |
511 | ||
512 | installs the memoized version of C<fib> as C<fastfib>; without the | |
513 | C<INSTALL> option it would have replaced the old C<fib> with the | |
514 | memoized version. | |
515 | ||
516 | To prevent C<memoize> from installing the memoized version anywhere, use | |
517 | C<INSTALL =E<gt> undef>. | |
518 | ||
519 | =head2 NORMALIZER | |
520 | ||
521 | Suppose your function looks like this: | |
522 | ||
523 | # Typical call: f('aha!', A => 11, B => 12); | |
524 | sub f { | |
525 | my $a = shift; | |
526 | my %hash = @_; | |
527 | $hash{B} ||= 2; # B defaults to 2 | |
528 | $hash{C} ||= 7; # C defaults to 7 | |
529 | ||
530 | # Do something with $a, %hash | |
531 | } | |
532 | ||
533 | Now, the following calls to your function are all completely equivalent: | |
534 | ||
535 | f(OUCH); | |
536 | f(OUCH, B => 2); | |
537 | f(OUCH, C => 7); | |
538 | f(OUCH, B => 2, C => 7); | |
539 | f(OUCH, C => 7, B => 2); | |
540 | (etc.) | |
541 | ||
542 | However, unless you tell C<Memoize> that these calls are equivalent, | |
543 | it will not know that, and it will compute the values for these | |
544 | invocations of your function separately, and store them separately. | |
545 | ||
546 | To prevent this, supply a C<NORMALIZER> function that turns the | |
547 | program arguments into a string in a way that equivalent arguments | |
548 | turn into the same string. A C<NORMALIZER> function for C<f> above | |
549 | might look like this: | |
550 | ||
551 | sub normalize_f { | |
552 | my $a = shift; | |
553 | my %hash = @_; | |
554 | $hash{B} ||= 2; | |
555 | $hash{C} ||= 7; | |
556 | ||
3d4a255c | 557 | join(',', $a, map ($_ => $hash{$_}) sort keys %hash); |
a0cb3900 JH |
558 | } |
559 | ||
560 | Each of the argument lists above comes out of the C<normalize_f> | |
561 | function looking exactly the same, like this: | |
562 | ||
3d4a255c | 563 | OUCH,B,2,C,7 |
a0cb3900 JH |
564 | |
565 | You would tell C<Memoize> to use this normalizer this way: | |
566 | ||
567 | memoize('f', NORMALIZER => 'normalize_f'); | |
568 | ||
569 | C<memoize> knows that if the normalized version of the arguments is | |
570 | the same for two argument lists, then it can safely look up the value | |
571 | that it computed for one argument list and return it as the result of | |
572 | calling the function with the other argument list, even if the | |
573 | argument lists look different. | |
574 | ||
3d4a255c JH |
575 | The default normalizer just concatenates the arguments with character |
576 | 28 in between. (In ASCII, this is called FS or control-\.) This | |
577 | always works correctly for functions with only one string argument, | |
578 | and also when the arguments never contain character 28. However, it | |
579 | can confuse certain argument lists: | |
a0cb3900 JH |
580 | |
581 | normalizer("a\034", "b") | |
582 | normalizer("a", "\034b") | |
583 | normalizer("a\034\034b") | |
584 | ||
3d4a255c | 585 | for example. |
a0cb3900 | 586 | |
899dc88a JH |
587 | Since hash keys are strings, the default normalizer will not |
588 | distinguish between C<undef> and the empty string. It also won't work | |
3d4a255c JH |
589 | when the function's arguments are references. For example, consider a |
590 | function C<g> which gets two arguments: A number, and a reference to | |
899dc88a | 591 | an array of numbers: |
a0cb3900 JH |
592 | |
593 | g(13, [1,2,3,4,5,6,7]); | |
594 | ||
595 | The default normalizer will turn this into something like | |
3d4a255c | 596 | C<"13\034ARRAY(0x436c1f)">. That would be all right, except that a |
a0cb3900 JH |
597 | subsequent array of numbers might be stored at a different location |
598 | even though it contains the same data. If this happens, C<Memoize> | |
599 | will think that the arguments are different, even though they are | |
600 | equivalent. In this case, a normalizer like this is appropriate: | |
601 | ||
602 | sub normalize { join ' ', $_[0], @{$_[1]} } | |
603 | ||
604 | For the example above, this produces the key "13 1 2 3 4 5 6 7". | |
605 | ||
606 | Another use for normalizers is when the function depends on data other | |
607 | than those in its arguments. Suppose you have a function which | |
608 | returns a value which depends on the current hour of the day: | |
609 | ||
610 | sub on_duty { | |
611 | my ($problem_type) = @_; | |
612 | my $hour = (localtime)[2]; | |
613 | open my $fh, "$DIR/$problem_type" or die...; | |
614 | my $line; | |
615 | while ($hour-- > 0) | |
616 | $line = <$fh>; | |
617 | } | |
618 | return $line; | |
619 | } | |
620 | ||
3d4a255c | 621 | At 10:23, this function generates the 10th line of a data file; at |
a0cb3900 JH |
622 | 3:45 PM it generates the 15th line instead. By default, C<Memoize> |
623 | will only see the $problem_type argument. To fix this, include the | |
624 | current hour in the normalizer: | |
625 | ||
626 | sub normalize { join ' ', (localtime)[2], @_ } | |
627 | ||
628 | The calling context of the function (scalar or list context) is | |
629 | propagated to the normalizer. This means that if the memoized | |
630 | function will treat its arguments differently in list context than it | |
631 | would in scalar context, you can have the normalizer function select | |
632 | its behavior based on the results of C<wantarray>. Even if called in | |
633 | a list context, a normalizer should still return a single string. | |
634 | ||
635 | =head2 C<SCALAR_CACHE>, C<LIST_CACHE> | |
636 | ||
637 | Normally, C<Memoize> caches your function's return values into an | |
638 | ordinary Perl hash variable. However, you might like to have the | |
639 | values cached on the disk, so that they persist from one run of your | |
640 | program to the next, or you might like to associate some other | |
3d4a255c | 641 | interesting semantics with the cached values. |
a0cb3900 JH |
642 | |
643 | There's a slight complication under the hood of C<Memoize>: There are | |
644 | actually I<two> caches, one for scalar values and one for list values. | |
645 | When your function is called in scalar context, its return value is | |
646 | cached in one hash, and when your function is called in list context, | |
647 | its value is cached in the other hash. You can control the caching | |
648 | behavior of both contexts independently with these options. | |
649 | ||
650 | The argument to C<LIST_CACHE> or C<SCALAR_CACHE> must either be one of | |
651 | the following four strings: | |
652 | ||
653 | MEMORY | |
654 | FAULT | |
655 | MERGE | |
3d4a255c | 656 | HASH |
a0cb3900 JH |
657 | |
658 | or else it must be a reference to a list whose first element is one of | |
659 | these four strings, such as C<[HASH, arguments...]>. | |
660 | ||
661 | =over 4 | |
662 | ||
663 | =item C<MEMORY> | |
664 | ||
665 | C<MEMORY> means that return values from the function will be cached in | |
666 | an ordinary Perl hash variable. The hash variable will not persist | |
667 | after the program exits. This is the default. | |
668 | ||
669 | =item C<HASH> | |
670 | ||
671 | C<HASH> allows you to specify that a particular hash that you supply | |
672 | will be used as the cache. You can tie this hash beforehand to give | |
673 | it any behavior you want. | |
674 | ||
675 | A tied hash can have any semantics at all. It is typically tied to an | |
676 | on-disk database, so that cached values are stored in the database and | |
677 | retrieved from it again when needed, and the disk file typically | |
678 | persists after your program has exited. See C<perltie> for more | |
679 | complete details about C<tie>. | |
680 | ||
681 | A typical example is: | |
682 | ||
3d4a255c | 683 | use DB_File; |
a0cb3900 JH |
684 | tie my %cache => 'DB_File', $filename, O_RDWR|O_CREAT, 0666; |
685 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
686 | ||
687 | This has the effect of storing the cache in a C<DB_File> database | |
688 | whose name is in C<$filename>. The cache will persist after the | |
689 | program has exited. Next time the program runs, it will find the | |
690 | cache already populated from the previous run of the program. Or you | |
691 | can forcibly populate the cache by constructing a batch program that | |
692 | runs in the background and populates the cache file. Then when you | |
693 | come to run your real program the memoized function will be fast | |
694 | because all its results have been precomputed. | |
695 | ||
696 | =item C<TIE> | |
697 | ||
5189e6fe JH |
698 | This option is no longer supported. It is still documented only to |
699 | aid in the debugging of old programs that use it. Old programs should | |
700 | be converted to use the C<HASH> option instead. | |
a0cb3900 | 701 | |
3d4a255c | 702 | memoize ... [TIE, PACKAGE, ARGS...] |
a0cb3900 JH |
703 | |
704 | is merely a shortcut for | |
705 | ||
3d4a255c | 706 | require PACKAGE; |
5189e6fe JH |
707 | { my %cache; |
708 | tie %cache, PACKAGE, ARGS...; | |
709 | } | |
a0cb3900 JH |
710 | memoize ... [HASH => \%cache]; |
711 | ||
a0cb3900 JH |
712 | =item C<FAULT> |
713 | ||
714 | C<FAULT> means that you never expect to call the function in scalar | |
715 | (or list) context, and that if C<Memoize> detects such a call, it | |
716 | should abort the program. The error message is one of | |
717 | ||
718 | `foo' function called in forbidden list context at line ... | |
719 | `foo' function called in forbidden scalar context at line ... | |
720 | ||
721 | =item C<MERGE> | |
722 | ||
723 | C<MERGE> normally means the function does not distinguish between list | |
724 | and sclar context, and that return values in both contexts should be | |
725 | stored together. C<LIST_CACHE =E<gt> MERGE> means that list context | |
726 | return values should be stored in the same hash that is used for | |
727 | scalar context returns, and C<SCALAR_CACHE =E<gt> MERGE> means the | |
728 | same, mutatis mutandis. It is an error to specify C<MERGE> for both, | |
729 | but it probably does something useful. | |
730 | ||
731 | Consider this function: | |
732 | ||
733 | sub pi { 3; } | |
734 | ||
735 | Normally, the following code will result in two calls to C<pi>: | |
736 | ||
737 | $x = pi(); | |
738 | ($y) = pi(); | |
739 | $z = pi(); | |
740 | ||
741 | The first call caches the value C<3> in the scalar cache; the second | |
742 | caches the list C<(3)> in the list cache. The third call doesn't call | |
743 | the real C<pi> function; it gets the value from the scalar cache. | |
744 | ||
745 | Obviously, the second call to C<pi> is a waste of time, and storing | |
3d4a255c JH |
746 | its return value is a waste of space. Specifying C<LIST_CACHE =E<gt> |
747 | MERGE> will make C<memoize> use the same cache for scalar and list | |
748 | context return values, so that the second call uses the scalar cache | |
749 | that was populated by the first call. C<pi> ends up being called only | |
750 | once, and both subsequent calls return C<3> from the cache, regardless | |
751 | of the calling context. | |
a0cb3900 JH |
752 | |
753 | Another use for C<MERGE> is when you want both kinds of return values | |
754 | stored in the same disk file; this saves you from having to deal with | |
755 | two disk files instead of one. You can use a normalizer function to | |
756 | keep the two sets of return values separate. For example: | |
757 | ||
758 | tie my %cache => 'MLDBM', 'DB_File', $filename, ...; | |
759 | ||
760 | memoize 'myfunc', | |
761 | NORMALIZER => 'n', | |
762 | SCALAR_CACHE => [HASH => \%cache], | |
763 | LIST_CACHE => MERGE, | |
764 | ; | |
765 | ||
766 | sub n { | |
767 | my $context = wantarray() ? 'L' : 'S'; | |
768 | # ... now compute the hash key from the arguments ... | |
769 | $hashkey = "$context:$hashkey"; | |
770 | } | |
771 | ||
772 | This normalizer function will store scalar context return values in | |
773 | the disk file under keys that begin with C<S:>, and list context | |
774 | return values under keys that begin with C<L:>. | |
775 | ||
776 | =back | |
777 | ||
778 | =head1 OTHER FACILITIES | |
779 | ||
780 | =head2 C<unmemoize> | |
781 | ||
782 | There's an C<unmemoize> function that you can import if you want to. | |
783 | Why would you want to? Here's an example: Suppose you have your cache | |
784 | tied to a DBM file, and you want to make sure that the cache is | |
785 | written out to disk if someone interrupts the program. If the program | |
786 | exits normally, this will happen anyway, but if someone types | |
787 | control-C or something then the program will terminate immediately | |
788 | without synchronizing the database. So what you can do instead is | |
789 | ||
790 | $SIG{INT} = sub { unmemoize 'function' }; | |
791 | ||
a0cb3900 JH |
792 | C<unmemoize> accepts a reference to, or the name of a previously |
793 | memoized function, and undoes whatever it did to provide the memoized | |
794 | version in the first place, including making the name refer to the | |
795 | unmemoized version if appropriate. It returns a reference to the | |
796 | unmemoized version of the function. | |
797 | ||
798 | If you ask it to unmemoize a function that was never memoized, it | |
799 | croaks. | |
800 | ||
801 | =head2 C<flush_cache> | |
802 | ||
803 | C<flush_cache(function)> will flush out the caches, discarding I<all> | |
3d4a255c | 804 | the cached data. The argument may be a function name or a reference |
a0cb3900 JH |
805 | to a function. For finer control over when data is discarded or |
806 | expired, see the documentation for C<Memoize::Expire>, included in | |
807 | this package. | |
808 | ||
809 | Note that if the cache is a tied hash, C<flush_cache> will attempt to | |
810 | invoke the C<CLEAR> method on the hash. If there is no C<CLEAR> | |
811 | method, this will cause a run-time error. | |
812 | ||
813 | An alternative approach to cache flushing is to use the C<HASH> option | |
814 | (see above) to request that C<Memoize> use a particular hash variable | |
815 | as its cache. Then you can examine or modify the hash at any time in | |
3d4a255c | 816 | any way you desire. You may flush the cache by using C<%hash = ()>. |
a0cb3900 JH |
817 | |
818 | =head1 CAVEATS | |
819 | ||
820 | Memoization is not a cure-all: | |
821 | ||
822 | =over 4 | |
823 | ||
824 | =item * | |
825 | ||
826 | Do not memoize a function whose behavior depends on program | |
827 | state other than its own arguments, such as global variables, the time | |
828 | of day, or file input. These functions will not produce correct | |
829 | results when memoized. For a particularly easy example: | |
830 | ||
831 | sub f { | |
832 | time; | |
833 | } | |
834 | ||
835 | This function takes no arguments, and as far as C<Memoize> is | |
836 | concerned, it always returns the same result. C<Memoize> is wrong, of | |
837 | course, and the memoized version of this function will call C<time> once | |
838 | to get the current time, and it will return that same time | |
839 | every time you call it after that. | |
840 | ||
841 | =item * | |
842 | ||
843 | Do not memoize a function with side effects. | |
844 | ||
845 | sub f { | |
846 | my ($a, $b) = @_; | |
847 | my $s = $a + $b; | |
848 | print "$a + $b = $s.\n"; | |
849 | } | |
850 | ||
851 | This function accepts two arguments, adds them, and prints their sum. | |
852 | Its return value is the numuber of characters it printed, but you | |
853 | probably didn't care about that. But C<Memoize> doesn't understand | |
854 | that. If you memoize this function, you will get the result you | |
855 | expect the first time you ask it to print the sum of 2 and 3, but | |
856 | subsequent calls will return 1 (the return value of | |
857 | C<print>) without actually printing anything. | |
858 | ||
859 | =item * | |
860 | ||
861 | Do not memoize a function that returns a data structure that is | |
862 | modified by its caller. | |
863 | ||
864 | Consider these functions: C<getusers> returns a list of users somehow, | |
865 | and then C<main> throws away the first user on the list and prints the | |
866 | rest: | |
867 | ||
868 | sub main { | |
869 | my $userlist = getusers(); | |
870 | shift @$userlist; | |
871 | foreach $u (@$userlist) { | |
872 | print "User $u\n"; | |
873 | } | |
874 | } | |
875 | ||
876 | sub getusers { | |
877 | my @users; | |
878 | # Do something to get a list of users; | |
879 | \@users; # Return reference to list. | |
880 | } | |
881 | ||
882 | If you memoize C<getusers> here, it will work right exactly once. The | |
883 | reference to the users list will be stored in the memo table. C<main> | |
884 | will discard the first element from the referenced list. The next | |
885 | time you invoke C<main>, C<Memoize> will not call C<getusers>; it will | |
886 | just return the same reference to the same list it got last time. But | |
887 | this time the list has already had its head removed; C<main> will | |
888 | erroneously remove another element from it. The list will get shorter | |
889 | and shorter every time you call C<main>. | |
890 | ||
891 | Similarly, this: | |
892 | ||
893 | $u1 = getusers(); | |
894 | $u2 = getusers(); | |
895 | pop @$u1; | |
896 | ||
897 | will modify $u2 as well as $u1, because both variables are references | |
898 | to the same array. Had C<getusers> not been memoized, $u1 and $u2 | |
899 | would have referred to different arrays. | |
900 | ||
901 | =item * | |
902 | ||
903 | Do not memoize a very simple function. | |
904 | ||
905 | Recently someone mentioned to me that the Memoize module made his | |
906 | program run slower instead of faster. It turned out that he was | |
907 | memoizing the following function: | |
908 | ||
909 | sub square { | |
910 | $_[0] * $_[0]; | |
911 | } | |
912 | ||
913 | I pointed out that C<Memoize> uses a hash, and that looking up a | |
914 | number in the hash is necessarily going to take a lot longer than a | |
915 | single multiplication. There really is no way to speed up the | |
916 | C<square> function. | |
917 | ||
918 | Memoization is not magical. | |
919 | ||
920 | =back | |
921 | ||
922 | =head1 PERSISTENT CACHE SUPPORT | |
923 | ||
924 | You can tie the cache tables to any sort of tied hash that you want | |
925 | to, as long as it supports C<TIEHASH>, C<FETCH>, C<STORE>, and | |
926 | C<EXISTS>. For example, | |
927 | ||
928 | tie my %cache => 'GDBM_File', $filename, O_RDWR|O_CREAT, 0666; | |
929 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
930 | ||
931 | works just fine. For some storage methods, you need a little glue. | |
932 | ||
933 | C<SDBM_File> doesn't supply an C<EXISTS> method, so included in this | |
934 | package is a glue module called C<Memoize::SDBM_File> which does | |
935 | provide one. Use this instead of plain C<SDBM_File> to store your | |
936 | cache table on disk in an C<SDBM_File> database: | |
937 | ||
938 | tie my %cache => 'Memoize::SDBM_File', $filename, O_RDWR|O_CREAT, 0666; | |
939 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
940 | ||
941 | C<NDBM_File> has the same problem and the same solution. (Use | |
899dc88a | 942 | C<Memoize::NDBM_File instead of plain NDBM_File.>) |
a0cb3900 JH |
943 | |
944 | C<Storable> isn't a tied hash class at all. You can use it to store a | |
945 | hash to disk and retrieve it again, but you can't modify the hash while | |
946 | it's on the disk. So if you want to store your cache table in a | |
947 | C<Storable> database, use C<Memoize::Storable>, which puts a hashlike | |
948 | front-end onto C<Storable>. The hash table is actually kept in | |
949 | memory, and is loaded from your C<Storable> file at the time you | |
950 | memoize the function, and stored back at the time you unmemoize the | |
951 | function (or when your program exits): | |
952 | ||
953 | tie my %cache => 'Memoize::Storable', $filename; | |
954 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
955 | ||
956 | tie my %cache => 'Memoize::Storable', $filename, 'nstore'; | |
957 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
958 | ||
959 | Include the `nstore' option to have the C<Storable> database written | |
960 | in `network order'. (See L<Storable> for more details about this.) | |
961 | ||
3d4a255c JH |
962 | The C<flush_cache()> function will raise a run-time error unless the |
963 | tied package provides a C<CLEAR> method. | |
964 | ||
a0cb3900 JH |
965 | =head1 EXPIRATION SUPPORT |
966 | ||
967 | See Memoize::Expire, which is a plug-in module that adds expiration | |
968 | functionality to Memoize. If you don't like the kinds of policies | |
969 | that Memoize::Expire implements, it is easy to write your own plug-in | |
970 | module to implement whatever policy you desire. Memoize comes with | |
971 | several examples. An expiration manager that implements a LRU policy | |
972 | is available on CPAN as Memoize::ExpireLRU. | |
973 | ||
974 | =head1 BUGS | |
975 | ||
976 | The test suite is much better, but always needs improvement. | |
977 | ||
3d4a255c JH |
978 | There is some problem with the way C<goto &f> works under threaded |
979 | Perl, perhaps because of the lexical scoping of C<@_>. This is a bug | |
980 | in Perl, and until it is resolved, memoized functions will see a | |
981 | slightly different C<caller()> and will perform a little more slowly | |
982 | on threaded perls than unthreaded perls. | |
a0cb3900 | 983 | |
5189e6fe JH |
984 | Some versions of C<DB_File> won't let you store data under a key of |
985 | length 0. That means that if you have a function C<f> which you | |
986 | memoized and the cache is in a C<DB_File> database, then the value of | |
987 | C<f()> (C<f> called with no arguments) will not be memoized. If this | |
988 | is a big problem, you can supply a normalizer function that prepends | |
989 | C<"x"> to every key. | |
a0cb3900 JH |
990 | |
991 | =head1 MAILING LIST | |
992 | ||
993 | To join a very low-traffic mailing list for announcements about | |
994 | C<Memoize>, send an empty note to C<mjd-perl-memoize-request@plover.com>. | |
995 | ||
996 | =head1 AUTHOR | |
997 | ||
998 | Mark-Jason Dominus (C<mjd-perl-memoize+@plover.com>), Plover Systems co. | |
999 | ||
1000 | See the C<Memoize.pm> Page at http://www.plover.com/~mjd/perl/Memoize/ | |
1001 | for news and upgrades. Near this page, at | |
1002 | http://www.plover.com/~mjd/perl/MiniMemoize/ there is an article about | |
1003 | memoization and about the internals of Memoize that appeared in The | |
1004 | Perl Journal, issue #13. (This article is also included in the | |
1005 | Memoize distribution as `article.html'.) | |
1006 | ||
3d4a255c JH |
1007 | My upcoming book will discuss memoization (and many other fascinating |
1008 | topics) in tremendous detail. It will be published by Morgan Kaufmann | |
1009 | in 2002, possibly under the title I<Perl Advanced Techniques | |
1010 | Handbook>. It will also be available on-line for free. For more | |
1011 | information, visit http://perl.plover.com/book/ . | |
1012 | ||
a0cb3900 JH |
1013 | To join a mailing list for announcements about C<Memoize>, send an |
1014 | empty message to C<mjd-perl-memoize-request@plover.com>. This mailing | |
1015 | list is for announcements only and has extremely low traffic---about | |
3d4a255c | 1016 | two messages per year. |
a0cb3900 | 1017 | |
899dc88a JH |
1018 | =head1 COPYRIGHT AND LICENSE |
1019 | ||
1020 | Copyright 1998, 1999, 2000, 2001 by Mark Jason Dominus | |
1021 | ||
1022 | This library is free software; you may redistribute it and/or modify | |
3d4a255c | 1023 | it under the same terms as Perl itself. |
899dc88a | 1024 | |
a0cb3900 JH |
1025 | =head1 THANK YOU |
1026 | ||
1027 | Many thanks to Jonathan Roy for bug reports and suggestions, to | |
1028 | Michael Schwern for other bug reports and patches, to Mike Cariaso for | |
1029 | helping me to figure out the Right Thing to Do About Expiration, to | |
3d4a255c JH |
1030 | Joshua Gerth, Joshua Chamas, Jonathan Roy (again), Mark D. Anderson, |
1031 | and Andrew Johnson for more suggestions about expiration, to Brent | |
1032 | Powers for the Memoize::ExpireLRU module, to Ariel Scolnicov for | |
1033 | delightful messages about the Fibonacci function, to Dion Almaer for | |
a0cb3900 JH |
1034 | thought-provoking suggestions about the default normalizer, to Walt |
1035 | Mankowski and Kurt Starsinic for much help investigating problems | |
1036 | under threaded Perl, to Alex Dudkevich for reporting the bug in | |
1037 | prototyped functions and for checking my patch, to Tony Bass for many | |
3d4a255c JH |
1038 | helpful suggestions, to Jonathan Roy (again) for finding a use for |
1039 | C<unmemoize()>, to Philippe Verdret for enlightening discussion of | |
1040 | C<Hook::PrePostCall>, to Nat Torkington for advice I ignored, to Chris | |
a0cb3900 JH |
1041 | Nandor for portability advice, to Randal Schwartz for suggesting the |
1042 | 'C<flush_cache> function, and to Jenda Krynicky for being a light in | |
1043 | the world. | |
1044 | ||
899dc88a JH |
1045 | Special thanks to Jarkko Hietaniemi, the 5.8.0 pumpking, for including |
1046 | this module in the core and for his patient and helpful guidance | |
1047 | during the integration process. | |
3d4a255c | 1048 | |
a0cb3900 | 1049 | =cut |