Commit | Line | Data |
---|---|---|
a0cb3900 JH |
1 | # -*- mode: perl; perl-indent-level: 2; -*- |
2 | # Memoize.pm | |
3 | # | |
4 | # Transparent memoization of idempotent functions | |
5 | # | |
899dc88a | 6 | # Copyright 1998, 1999, 2000, 2001 M-J. Dominus. |
a0cb3900 JH |
7 | # You may copy and distribute this program under the |
8 | # same terms as Perl itself. If in doubt, | |
9 | # write to mjd-perl-memoize+@plover.com for a license. | |
10 | # | |
484fdf61 | 11 | # Version 1.01 $Revision: 1.18 $ $Date: 2001/06/24 17:16:47 $ |
a0cb3900 JH |
12 | |
13 | package Memoize; | |
2fe79ddc | 14 | $VERSION = '1.01_02'; |
a0cb3900 JH |
15 | |
16 | # Compile-time constants | |
17 | sub SCALAR () { 0 } | |
18 | sub LIST () { 1 } | |
19 | ||
20 | ||
21 | # | |
22 | # Usage memoize(functionname/ref, | |
23 | # { NORMALIZER => coderef, INSTALL => name, | |
24 | # LIST_CACHE => descriptor, SCALAR_CACHE => descriptor } | |
25 | # | |
26 | ||
27 | use Carp; | |
28 | use Exporter; | |
29 | use vars qw($DEBUG); | |
899dc88a | 30 | use Config; # Dammit. |
a0cb3900 JH |
31 | @ISA = qw(Exporter); |
32 | @EXPORT = qw(memoize); | |
33 | @EXPORT_OK = qw(unmemoize flush_cache); | |
34 | use strict; | |
35 | ||
36 | my %memotable; | |
37 | my %revmemotable; | |
38 | my @CONTEXT_TAGS = qw(MERGE TIE MEMORY FAULT HASH); | |
39 | my %IS_CACHE_TAG = map {($_ => 1)} @CONTEXT_TAGS; | |
40 | ||
41 | # Raise an error if the user tries to specify one of thesepackage as a | |
42 | # tie for LIST_CACHE | |
43 | ||
44 | my %scalar_only = map {($_ => 1)} qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File); | |
45 | ||
46 | sub memoize { | |
47 | my $fn = shift; | |
48 | my %options = @_; | |
49 | my $options = \%options; | |
50 | ||
51 | unless (defined($fn) && | |
52 | (ref $fn eq 'CODE' || ref $fn eq '')) { | |
53 | croak "Usage: memoize 'functionname'|coderef {OPTIONS}"; | |
54 | } | |
55 | ||
56 | my $uppack = caller; # TCL me Elmo! | |
57 | my $cref; # Code reference to original function | |
58 | my $name = (ref $fn ? undef : $fn); | |
59 | ||
60 | # Convert function names to code references | |
61 | $cref = &_make_cref($fn, $uppack); | |
62 | ||
63 | # Locate function prototype, if any | |
64 | my $proto = prototype $cref; | |
65 | if (defined $proto) { $proto = "($proto)" } | |
66 | else { $proto = "" } | |
67 | ||
899dc88a JH |
68 | # I would like to get rid of the eval, but there seems not to be any |
69 | # other way to set the prototype properly. The switch here for | |
70 | # 'usethreads' works around a bug in threadperl having to do with | |
71 | # magic goto. It would be better to fix the bug and use the magic | |
72 | # goto version everywhere. | |
73 | my $wrapper = | |
74 | $Config{usethreads} | |
75 | ? eval "sub $proto { &_memoizer(\$cref, \@_); }" | |
76 | : eval "sub $proto { unshift \@_, \$cref; goto &_memoizer; }"; | |
a0cb3900 JH |
77 | |
78 | my $normalizer = $options{NORMALIZER}; | |
79 | if (defined $normalizer && ! ref $normalizer) { | |
80 | $normalizer = _make_cref($normalizer, $uppack); | |
81 | } | |
82 | ||
83 | my $install_name; | |
84 | if (defined $options->{INSTALL}) { | |
85 | # INSTALL => name | |
86 | $install_name = $options->{INSTALL}; | |
87 | } elsif (! exists $options->{INSTALL}) { | |
88 | # No INSTALL option provided; use original name if possible | |
89 | $install_name = $name; | |
90 | } else { | |
91 | # INSTALL => undef means don't install | |
92 | } | |
93 | ||
94 | if (defined $install_name) { | |
95 | $install_name = $uppack . '::' . $install_name | |
96 | unless $install_name =~ /::/; | |
97 | no strict; | |
98 | local($^W) = 0; # ``Subroutine $install_name redefined at ...'' | |
99 | *{$install_name} = $wrapper; # Install memoized version | |
100 | } | |
101 | ||
102 | $revmemotable{$wrapper} = "" . $cref; # Turn code ref into hash key | |
103 | ||
104 | # These will be the caches | |
105 | my %caches; | |
106 | for my $context (qw(SCALAR LIST)) { | |
107 | # suppress subsequent 'uninitialized value' warnings | |
108 | $options{"${context}_CACHE"} ||= ''; | |
109 | ||
110 | my $cache_opt = $options{"${context}_CACHE"}; | |
111 | my @cache_opt_args; | |
112 | if (ref $cache_opt) { | |
113 | @cache_opt_args = @$cache_opt; | |
114 | $cache_opt = shift @cache_opt_args; | |
115 | } | |
116 | if ($cache_opt eq 'FAULT') { # no cache | |
117 | $caches{$context} = undef; | |
118 | } elsif ($cache_opt eq 'HASH') { # user-supplied hash | |
899dc88a JH |
119 | my $cache = $cache_opt_args[0]; |
120 | my $package = ref(tied %$cache); | |
121 | if ($context eq 'LIST' && $scalar_only{$package}) { | |
122 | croak("You can't use $package for LIST_CACHE because it can only store scalars"); | |
123 | } | |
124 | $caches{$context} = $cache; | |
a0cb3900 JH |
125 | } elsif ($cache_opt eq '' || $IS_CACHE_TAG{$cache_opt}) { |
126 | # default is that we make up an in-memory hash | |
127 | $caches{$context} = {}; | |
128 | # (this might get tied later, or MERGEd away) | |
129 | } else { | |
130 | croak "Unrecognized option to `${context}_CACHE': `$cache_opt' should be one of (@CONTEXT_TAGS); aborting"; | |
131 | } | |
132 | } | |
133 | ||
134 | # Perhaps I should check here that you didn't supply *both* merge | |
135 | # options. But if you did, it does do something reasonable: They | |
136 | # both get merged to the same in-memory hash. | |
137 | if ($options{SCALAR_CACHE} eq 'MERGE') { | |
138 | $caches{SCALAR} = $caches{LIST}; | |
139 | } elsif ($options{LIST_CACHE} eq 'MERGE') { | |
140 | $caches{LIST} = $caches{SCALAR}; | |
141 | } | |
142 | ||
143 | # Now deal with the TIE options | |
144 | { | |
145 | my $context; | |
146 | foreach $context (qw(SCALAR LIST)) { | |
147 | # If the relevant option wasn't `TIE', this call does nothing. | |
148 | _my_tie($context, $caches{$context}, $options); # Croaks on failure | |
149 | } | |
150 | } | |
151 | ||
152 | # We should put some more stuff in here eventually. | |
153 | # We've been saying that for serveral versions now. | |
154 | # And you know what? More stuff keeps going in! | |
155 | $memotable{$cref} = | |
156 | { | |
157 | O => $options, # Short keys here for things we need to access frequently | |
158 | N => $normalizer, | |
159 | U => $cref, | |
160 | MEMOIZED => $wrapper, | |
161 | PACKAGE => $uppack, | |
162 | NAME => $install_name, | |
163 | S => $caches{SCALAR}, | |
164 | L => $caches{LIST}, | |
165 | }; | |
166 | ||
167 | $wrapper # Return just memoized version | |
168 | } | |
169 | ||
170 | # This function tries to load a tied hash class and tie the hash to it. | |
171 | sub _my_tie { | |
172 | my ($context, $hash, $options) = @_; | |
173 | my $fullopt = $options->{"${context}_CACHE"}; | |
174 | ||
175 | # We already checked to make sure that this works. | |
176 | my $shortopt = (ref $fullopt) ? $fullopt->[0] : $fullopt; | |
177 | ||
178 | return unless defined $shortopt && $shortopt eq 'TIE'; | |
5189e6fe | 179 | carp("TIE option to memoize() is deprecated; use HASH instead") |
484fdf61 | 180 | if $^W; |
a0cb3900 JH |
181 | |
182 | my @args = ref $fullopt ? @$fullopt : (); | |
183 | shift @args; | |
184 | my $module = shift @args; | |
185 | if ($context eq 'LIST' && $scalar_only{$module}) { | |
186 | croak("You can't use $module for LIST_CACHE because it can only store scalars"); | |
187 | } | |
188 | my $modulefile = $module . '.pm'; | |
189 | $modulefile =~ s{::}{/}g; | |
190 | eval { require $modulefile }; | |
191 | if ($@) { | |
192 | croak "Memoize: Couldn't load hash tie module `$module': $@; aborting"; | |
193 | } | |
a0cb3900 JH |
194 | my $rc = (tie %$hash => $module, @args); |
195 | unless ($rc) { | |
899dc88a | 196 | croak "Memoize: Couldn't tie hash to `$module': $!; aborting"; |
a0cb3900 JH |
197 | } |
198 | 1; | |
199 | } | |
200 | ||
201 | sub flush_cache { | |
202 | my $func = _make_cref($_[0], scalar caller); | |
203 | my $info = $memotable{$revmemotable{$func}}; | |
204 | die "$func not memoized" unless defined $info; | |
205 | for my $context (qw(S L)) { | |
206 | my $cache = $info->{$context}; | |
207 | if (tied %$cache && ! (tied %$cache)->can('CLEAR')) { | |
208 | my $funcname = defined($info->{NAME}) ? | |
209 | "function $info->{NAME}" : "anonymous function $func"; | |
210 | my $context = {S => 'scalar', L => 'list'}->{$context}; | |
211 | croak "Tied cache hash for $context-context $funcname does not support flushing"; | |
212 | } else { | |
213 | %$cache = (); | |
214 | } | |
215 | } | |
216 | } | |
217 | ||
218 | # This is the function that manages the memo tables. | |
219 | sub _memoizer { | |
220 | my $orig = shift; # stringized version of ref to original func. | |
221 | my $info = $memotable{$orig}; | |
222 | my $normalizer = $info->{N}; | |
223 | ||
224 | my $argstr; | |
225 | my $context = (wantarray() ? LIST : SCALAR); | |
226 | ||
227 | if (defined $normalizer) { | |
228 | no strict; | |
229 | if ($context == SCALAR) { | |
230 | $argstr = &{$normalizer}(@_); | |
231 | } elsif ($context == LIST) { | |
232 | ($argstr) = &{$normalizer}(@_); | |
233 | } else { | |
234 | croak "Internal error \#41; context was neither LIST nor SCALAR\n"; | |
235 | } | |
236 | } else { # Default normalizer | |
899dc88a JH |
237 | local $^W = 0; |
238 | $argstr = join chr(28),@_; | |
a0cb3900 JH |
239 | } |
240 | ||
241 | if ($context == SCALAR) { | |
242 | my $cache = $info->{S}; | |
899dc88a | 243 | _crap_out($info->{NAME}, 'scalar') unless $cache; |
a0cb3900 JH |
244 | if (exists $cache->{$argstr}) { |
245 | return $cache->{$argstr}; | |
246 | } else { | |
247 | my $val = &{$info->{U}}(@_); | |
248 | # Scalars are considered to be lists; store appropriately | |
249 | if ($info->{O}{SCALAR_CACHE} eq 'MERGE') { | |
250 | $cache->{$argstr} = [$val]; | |
251 | } else { | |
252 | $cache->{$argstr} = $val; | |
253 | } | |
254 | $val; | |
255 | } | |
256 | } elsif ($context == LIST) { | |
257 | my $cache = $info->{L}; | |
899dc88a | 258 | _crap_out($info->{NAME}, 'list') unless $cache; |
a0cb3900 JH |
259 | if (exists $cache->{$argstr}) { |
260 | my $val = $cache->{$argstr}; | |
a0cb3900 | 261 | # If LISTCONTEXT=>MERGE, then the function never returns lists, |
899dc88a | 262 | # so we have a scalar value cached, so just return it straightaway: |
a0cb3900 | 263 | return ($val) if $info->{O}{LIST_CACHE} eq 'MERGE'; |
899dc88a JH |
264 | # Maybe in a later version we can use a faster test. |
265 | ||
266 | # Otherwise, we cached an array containing the returned list: | |
a0cb3900 JH |
267 | return @$val; |
268 | } else { | |
2fe79ddc FC |
269 | my @q = &{$info->{U}}(@_); |
270 | $cache->{$argstr} = $info->{O}{LIST_CACHE} eq 'MERGE' ? $q [0] : \@q; | |
271 | @q; | |
a0cb3900 JH |
272 | } |
273 | } else { | |
274 | croak "Internal error \#42; context was neither LIST nor SCALAR\n"; | |
275 | } | |
276 | } | |
277 | ||
278 | sub unmemoize { | |
279 | my $f = shift; | |
280 | my $uppack = caller; | |
281 | my $cref = _make_cref($f, $uppack); | |
282 | ||
283 | unless (exists $revmemotable{$cref}) { | |
284 | croak "Could not unmemoize function `$f', because it was not memoized to begin with"; | |
285 | } | |
286 | ||
287 | my $tabent = $memotable{$revmemotable{$cref}}; | |
288 | unless (defined $tabent) { | |
289 | croak "Could not figure out how to unmemoize function `$f'"; | |
290 | } | |
291 | my $name = $tabent->{NAME}; | |
292 | if (defined $name) { | |
293 | no strict; | |
294 | local($^W) = 0; # ``Subroutine $install_name redefined at ...'' | |
295 | *{$name} = $tabent->{U}; # Replace with original function | |
296 | } | |
297 | undef $memotable{$revmemotable{$cref}}; | |
298 | undef $revmemotable{$cref}; | |
299 | ||
300 | # This removes the last reference to the (possibly tied) memo tables | |
301 | # my ($old_function, $memotabs) = @{$tabent}{'U','S','L'}; | |
302 | # undef $tabent; | |
303 | ||
304 | # # Untie the memo tables if they were tied. | |
305 | # my $i; | |
306 | # for $i (0,1) { | |
307 | # if (tied %{$memotabs->[$i]}) { | |
308 | # warn "Untying hash #$i\n"; | |
309 | # untie %{$memotabs->[$i]}; | |
310 | # } | |
311 | # } | |
312 | ||
313 | $tabent->{U}; | |
314 | } | |
315 | ||
316 | sub _make_cref { | |
317 | my $fn = shift; | |
318 | my $uppack = shift; | |
319 | my $cref; | |
320 | my $name; | |
321 | ||
322 | if (ref $fn eq 'CODE') { | |
323 | $cref = $fn; | |
324 | } elsif (! ref $fn) { | |
325 | if ($fn =~ /::/) { | |
326 | $name = $fn; | |
327 | } else { | |
328 | $name = $uppack . '::' . $fn; | |
329 | } | |
330 | no strict; | |
331 | if (defined $name and !defined(&$name)) { | |
332 | croak "Cannot operate on nonexistent function `$fn'"; | |
333 | } | |
334 | # $cref = \&$name; | |
335 | $cref = *{$name}{CODE}; | |
336 | } else { | |
337 | my $parent = (caller(1))[3]; # Function that called _make_cref | |
338 | croak "Usage: argument 1 to `$parent' must be a function name or reference.\n"; | |
339 | } | |
340 | $DEBUG and warn "${name}($fn) => $cref in _make_cref\n"; | |
341 | $cref; | |
342 | } | |
343 | ||
344 | sub _crap_out { | |
345 | my ($funcname, $context) = @_; | |
346 | if (defined $funcname) { | |
347 | croak "Function `$funcname' called in forbidden $context context; faulting"; | |
348 | } else { | |
349 | croak "Anonymous function called in forbidden $context context; faulting"; | |
350 | } | |
351 | } | |
352 | ||
353 | 1; | |
354 | ||
355 | ||
356 | ||
357 | ||
358 | ||
359 | =head1 NAME | |
360 | ||
5189e6fe | 361 | Memoize - Make functions faster by trading space for time |
a0cb3900 JH |
362 | |
363 | =head1 SYNOPSIS | |
364 | ||
484fdf61 | 365 | # This is the documentation for Memoize 1.01 |
a0cb3900 JH |
366 | use Memoize; |
367 | memoize('slow_function'); | |
368 | slow_function(arguments); # Is faster than it was before | |
369 | ||
370 | ||
371 | This is normally all you need to know. However, many options are available: | |
372 | ||
373 | memoize(function, options...); | |
374 | ||
375 | Options include: | |
376 | ||
377 | NORMALIZER => function | |
378 | INSTALL => new_name | |
379 | ||
380 | SCALAR_CACHE => 'MEMORY' | |
381 | SCALAR_CACHE => ['HASH', \%cache_hash ] | |
382 | SCALAR_CACHE => 'FAULT' | |
383 | SCALAR_CACHE => 'MERGE' | |
384 | ||
385 | LIST_CACHE => 'MEMORY' | |
386 | LIST_CACHE => ['HASH', \%cache_hash ] | |
387 | LIST_CACHE => 'FAULT' | |
388 | LIST_CACHE => 'MERGE' | |
389 | ||
390 | =head1 DESCRIPTION | |
391 | ||
392 | `Memoizing' a function makes it faster by trading space for time. It | |
393 | does this by caching the return values of the function in a table. | |
394 | If you call the function again with the same arguments, C<memoize> | |
3d4a255c | 395 | jumps in and gives you the value out of the table, instead of letting |
a0cb3900 JH |
396 | the function compute the value all over again. |
397 | ||
398 | Here is an extreme example. Consider the Fibonacci sequence, defined | |
399 | by the following function: | |
400 | ||
401 | # Compute Fibonacci numbers | |
402 | sub fib { | |
403 | my $n = shift; | |
404 | return $n if $n < 2; | |
405 | fib($n-1) + fib($n-2); | |
406 | } | |
407 | ||
408 | This function is very slow. Why? To compute fib(14), it first wants | |
409 | to compute fib(13) and fib(12), and add the results. But to compute | |
410 | fib(13), it first has to compute fib(12) and fib(11), and then it | |
411 | comes back and computes fib(12) all over again even though the answer | |
412 | is the same. And both of the times that it wants to compute fib(12), | |
413 | it has to compute fib(11) from scratch, and then it has to do it | |
414 | again each time it wants to compute fib(13). This function does so | |
415 | much recomputing of old results that it takes a really long time to | |
416 | run---fib(14) makes 1,200 extra recursive calls to itself, to compute | |
417 | and recompute things that it already computed. | |
418 | ||
419 | This function is a good candidate for memoization. If you memoize the | |
420 | `fib' function above, it will compute fib(14) exactly once, the first | |
421 | time it needs to, and then save the result in a table. Then if you | |
422 | ask for fib(14) again, it gives you the result out of the table. | |
423 | While computing fib(14), instead of computing fib(12) twice, it does | |
424 | it once; the second time it needs the value it gets it from the table. | |
425 | It doesn't compute fib(11) four times; it computes it once, getting it | |
426 | from the table the next three times. Instead of making 1,200 | |
427 | recursive calls to `fib', it makes 15. This makes the function about | |
428 | 150 times faster. | |
429 | ||
430 | You could do the memoization yourself, by rewriting the function, like | |
431 | this: | |
432 | ||
433 | # Compute Fibonacci numbers, memoized version | |
434 | { my @fib; | |
435 | sub fib { | |
436 | my $n = shift; | |
437 | return $fib[$n] if defined $fib[$n]; | |
438 | return $fib[$n] = $n if $n < 2; | |
439 | $fib[$n] = fib($n-1) + fib($n-2); | |
440 | } | |
441 | } | |
442 | ||
443 | Or you could use this module, like this: | |
444 | ||
445 | use Memoize; | |
446 | memoize('fib'); | |
447 | ||
448 | # Rest of the fib function just like the original version. | |
449 | ||
450 | This makes it easy to turn memoizing on and off. | |
451 | ||
452 | Here's an even simpler example: I wrote a simple ray tracer; the | |
453 | program would look in a certain direction, figure out what it was | |
454 | looking at, and then convert the `color' value (typically a string | |
455 | like `red') of that object to a red, green, and blue pixel value, like | |
456 | this: | |
457 | ||
458 | for ($direction = 0; $direction < 300; $direction++) { | |
459 | # Figure out which object is in direction $direction | |
460 | $color = $object->{color}; | |
461 | ($r, $g, $b) = @{&ColorToRGB($color)}; | |
462 | ... | |
463 | } | |
464 | ||
465 | Since there are relatively few objects in a picture, there are only a | |
466 | few colors, which get looked up over and over again. Memoizing | |
5189e6fe | 467 | C<ColorToRGB> sped up the program by several percent. |
a0cb3900 JH |
468 | |
469 | =head1 DETAILS | |
470 | ||
471 | This module exports exactly one function, C<memoize>. The rest of the | |
472 | functions in this package are None of Your Business. | |
473 | ||
474 | You should say | |
475 | ||
476 | memoize(function) | |
477 | ||
478 | where C<function> is the name of the function you want to memoize, or | |
479 | a reference to it. C<memoize> returns a reference to the new, | |
480 | memoized version of the function, or C<undef> on a non-fatal error. | |
481 | At present, there are no non-fatal errors, but there might be some in | |
482 | the future. | |
483 | ||
484 | If C<function> was the name of a function, then C<memoize> hides the | |
485 | old version and installs the new memoized version under the old name, | |
486 | so that C<&function(...)> actually invokes the memoized version. | |
487 | ||
488 | =head1 OPTIONS | |
489 | ||
490 | There are some optional options you can pass to C<memoize> to change | |
491 | the way it behaves a little. To supply options, invoke C<memoize> | |
492 | like this: | |
493 | ||
494 | memoize(function, NORMALIZER => function, | |
495 | INSTALL => newname, | |
496 | SCALAR_CACHE => option, | |
497 | LIST_CACHE => option | |
498 | ); | |
499 | ||
500 | Each of these options is optional; you can include some, all, or none | |
501 | of them. | |
502 | ||
503 | =head2 INSTALL | |
504 | ||
505 | If you supply a function name with C<INSTALL>, memoize will install | |
506 | the new, memoized version of the function under the name you give. | |
507 | For example, | |
508 | ||
509 | memoize('fib', INSTALL => 'fastfib') | |
510 | ||
511 | installs the memoized version of C<fib> as C<fastfib>; without the | |
512 | C<INSTALL> option it would have replaced the old C<fib> with the | |
513 | memoized version. | |
514 | ||
515 | To prevent C<memoize> from installing the memoized version anywhere, use | |
516 | C<INSTALL =E<gt> undef>. | |
517 | ||
518 | =head2 NORMALIZER | |
519 | ||
520 | Suppose your function looks like this: | |
521 | ||
522 | # Typical call: f('aha!', A => 11, B => 12); | |
523 | sub f { | |
524 | my $a = shift; | |
525 | my %hash = @_; | |
526 | $hash{B} ||= 2; # B defaults to 2 | |
527 | $hash{C} ||= 7; # C defaults to 7 | |
528 | ||
529 | # Do something with $a, %hash | |
530 | } | |
531 | ||
532 | Now, the following calls to your function are all completely equivalent: | |
533 | ||
534 | f(OUCH); | |
535 | f(OUCH, B => 2); | |
536 | f(OUCH, C => 7); | |
537 | f(OUCH, B => 2, C => 7); | |
538 | f(OUCH, C => 7, B => 2); | |
539 | (etc.) | |
540 | ||
541 | However, unless you tell C<Memoize> that these calls are equivalent, | |
542 | it will not know that, and it will compute the values for these | |
543 | invocations of your function separately, and store them separately. | |
544 | ||
545 | To prevent this, supply a C<NORMALIZER> function that turns the | |
546 | program arguments into a string in a way that equivalent arguments | |
547 | turn into the same string. A C<NORMALIZER> function for C<f> above | |
548 | might look like this: | |
549 | ||
550 | sub normalize_f { | |
551 | my $a = shift; | |
552 | my %hash = @_; | |
553 | $hash{B} ||= 2; | |
554 | $hash{C} ||= 7; | |
555 | ||
3d4a255c | 556 | join(',', $a, map ($_ => $hash{$_}) sort keys %hash); |
a0cb3900 JH |
557 | } |
558 | ||
559 | Each of the argument lists above comes out of the C<normalize_f> | |
560 | function looking exactly the same, like this: | |
561 | ||
3d4a255c | 562 | OUCH,B,2,C,7 |
a0cb3900 JH |
563 | |
564 | You would tell C<Memoize> to use this normalizer this way: | |
565 | ||
566 | memoize('f', NORMALIZER => 'normalize_f'); | |
567 | ||
568 | C<memoize> knows that if the normalized version of the arguments is | |
569 | the same for two argument lists, then it can safely look up the value | |
570 | that it computed for one argument list and return it as the result of | |
571 | calling the function with the other argument list, even if the | |
572 | argument lists look different. | |
573 | ||
3d4a255c JH |
574 | The default normalizer just concatenates the arguments with character |
575 | 28 in between. (In ASCII, this is called FS or control-\.) This | |
576 | always works correctly for functions with only one string argument, | |
577 | and also when the arguments never contain character 28. However, it | |
578 | can confuse certain argument lists: | |
a0cb3900 JH |
579 | |
580 | normalizer("a\034", "b") | |
581 | normalizer("a", "\034b") | |
582 | normalizer("a\034\034b") | |
583 | ||
3d4a255c | 584 | for example. |
a0cb3900 | 585 | |
899dc88a JH |
586 | Since hash keys are strings, the default normalizer will not |
587 | distinguish between C<undef> and the empty string. It also won't work | |
3d4a255c JH |
588 | when the function's arguments are references. For example, consider a |
589 | function C<g> which gets two arguments: A number, and a reference to | |
899dc88a | 590 | an array of numbers: |
a0cb3900 JH |
591 | |
592 | g(13, [1,2,3,4,5,6,7]); | |
593 | ||
594 | The default normalizer will turn this into something like | |
3d4a255c | 595 | C<"13\034ARRAY(0x436c1f)">. That would be all right, except that a |
a0cb3900 JH |
596 | subsequent array of numbers might be stored at a different location |
597 | even though it contains the same data. If this happens, C<Memoize> | |
598 | will think that the arguments are different, even though they are | |
599 | equivalent. In this case, a normalizer like this is appropriate: | |
600 | ||
601 | sub normalize { join ' ', $_[0], @{$_[1]} } | |
602 | ||
603 | For the example above, this produces the key "13 1 2 3 4 5 6 7". | |
604 | ||
605 | Another use for normalizers is when the function depends on data other | |
606 | than those in its arguments. Suppose you have a function which | |
607 | returns a value which depends on the current hour of the day: | |
608 | ||
609 | sub on_duty { | |
610 | my ($problem_type) = @_; | |
611 | my $hour = (localtime)[2]; | |
612 | open my $fh, "$DIR/$problem_type" or die...; | |
613 | my $line; | |
614 | while ($hour-- > 0) | |
615 | $line = <$fh>; | |
616 | } | |
617 | return $line; | |
618 | } | |
619 | ||
3d4a255c | 620 | At 10:23, this function generates the 10th line of a data file; at |
a0cb3900 JH |
621 | 3:45 PM it generates the 15th line instead. By default, C<Memoize> |
622 | will only see the $problem_type argument. To fix this, include the | |
623 | current hour in the normalizer: | |
624 | ||
625 | sub normalize { join ' ', (localtime)[2], @_ } | |
626 | ||
627 | The calling context of the function (scalar or list context) is | |
628 | propagated to the normalizer. This means that if the memoized | |
629 | function will treat its arguments differently in list context than it | |
630 | would in scalar context, you can have the normalizer function select | |
631 | its behavior based on the results of C<wantarray>. Even if called in | |
632 | a list context, a normalizer should still return a single string. | |
633 | ||
634 | =head2 C<SCALAR_CACHE>, C<LIST_CACHE> | |
635 | ||
636 | Normally, C<Memoize> caches your function's return values into an | |
637 | ordinary Perl hash variable. However, you might like to have the | |
638 | values cached on the disk, so that they persist from one run of your | |
639 | program to the next, or you might like to associate some other | |
3d4a255c | 640 | interesting semantics with the cached values. |
a0cb3900 JH |
641 | |
642 | There's a slight complication under the hood of C<Memoize>: There are | |
643 | actually I<two> caches, one for scalar values and one for list values. | |
644 | When your function is called in scalar context, its return value is | |
645 | cached in one hash, and when your function is called in list context, | |
646 | its value is cached in the other hash. You can control the caching | |
647 | behavior of both contexts independently with these options. | |
648 | ||
649 | The argument to C<LIST_CACHE> or C<SCALAR_CACHE> must either be one of | |
650 | the following four strings: | |
651 | ||
652 | MEMORY | |
653 | FAULT | |
654 | MERGE | |
3d4a255c | 655 | HASH |
a0cb3900 JH |
656 | |
657 | or else it must be a reference to a list whose first element is one of | |
658 | these four strings, such as C<[HASH, arguments...]>. | |
659 | ||
660 | =over 4 | |
661 | ||
662 | =item C<MEMORY> | |
663 | ||
664 | C<MEMORY> means that return values from the function will be cached in | |
665 | an ordinary Perl hash variable. The hash variable will not persist | |
666 | after the program exits. This is the default. | |
667 | ||
668 | =item C<HASH> | |
669 | ||
670 | C<HASH> allows you to specify that a particular hash that you supply | |
671 | will be used as the cache. You can tie this hash beforehand to give | |
672 | it any behavior you want. | |
673 | ||
674 | A tied hash can have any semantics at all. It is typically tied to an | |
675 | on-disk database, so that cached values are stored in the database and | |
676 | retrieved from it again when needed, and the disk file typically | |
677 | persists after your program has exited. See C<perltie> for more | |
678 | complete details about C<tie>. | |
679 | ||
680 | A typical example is: | |
681 | ||
3d4a255c | 682 | use DB_File; |
a0cb3900 JH |
683 | tie my %cache => 'DB_File', $filename, O_RDWR|O_CREAT, 0666; |
684 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
685 | ||
686 | This has the effect of storing the cache in a C<DB_File> database | |
687 | whose name is in C<$filename>. The cache will persist after the | |
688 | program has exited. Next time the program runs, it will find the | |
689 | cache already populated from the previous run of the program. Or you | |
690 | can forcibly populate the cache by constructing a batch program that | |
691 | runs in the background and populates the cache file. Then when you | |
692 | come to run your real program the memoized function will be fast | |
693 | because all its results have been precomputed. | |
694 | ||
695 | =item C<TIE> | |
696 | ||
5189e6fe JH |
697 | This option is no longer supported. It is still documented only to |
698 | aid in the debugging of old programs that use it. Old programs should | |
699 | be converted to use the C<HASH> option instead. | |
a0cb3900 | 700 | |
3d4a255c | 701 | memoize ... [TIE, PACKAGE, ARGS...] |
a0cb3900 JH |
702 | |
703 | is merely a shortcut for | |
704 | ||
3d4a255c | 705 | require PACKAGE; |
5189e6fe JH |
706 | { my %cache; |
707 | tie %cache, PACKAGE, ARGS...; | |
708 | } | |
a0cb3900 JH |
709 | memoize ... [HASH => \%cache]; |
710 | ||
a0cb3900 JH |
711 | =item C<FAULT> |
712 | ||
713 | C<FAULT> means that you never expect to call the function in scalar | |
714 | (or list) context, and that if C<Memoize> detects such a call, it | |
715 | should abort the program. The error message is one of | |
716 | ||
717 | `foo' function called in forbidden list context at line ... | |
718 | `foo' function called in forbidden scalar context at line ... | |
719 | ||
720 | =item C<MERGE> | |
721 | ||
722 | C<MERGE> normally means the function does not distinguish between list | |
723 | and sclar context, and that return values in both contexts should be | |
724 | stored together. C<LIST_CACHE =E<gt> MERGE> means that list context | |
725 | return values should be stored in the same hash that is used for | |
726 | scalar context returns, and C<SCALAR_CACHE =E<gt> MERGE> means the | |
727 | same, mutatis mutandis. It is an error to specify C<MERGE> for both, | |
728 | but it probably does something useful. | |
729 | ||
730 | Consider this function: | |
731 | ||
732 | sub pi { 3; } | |
733 | ||
734 | Normally, the following code will result in two calls to C<pi>: | |
735 | ||
736 | $x = pi(); | |
737 | ($y) = pi(); | |
738 | $z = pi(); | |
739 | ||
740 | The first call caches the value C<3> in the scalar cache; the second | |
741 | caches the list C<(3)> in the list cache. The third call doesn't call | |
742 | the real C<pi> function; it gets the value from the scalar cache. | |
743 | ||
744 | Obviously, the second call to C<pi> is a waste of time, and storing | |
3d4a255c JH |
745 | its return value is a waste of space. Specifying C<LIST_CACHE =E<gt> |
746 | MERGE> will make C<memoize> use the same cache for scalar and list | |
747 | context return values, so that the second call uses the scalar cache | |
748 | that was populated by the first call. C<pi> ends up being called only | |
749 | once, and both subsequent calls return C<3> from the cache, regardless | |
750 | of the calling context. | |
a0cb3900 JH |
751 | |
752 | Another use for C<MERGE> is when you want both kinds of return values | |
753 | stored in the same disk file; this saves you from having to deal with | |
754 | two disk files instead of one. You can use a normalizer function to | |
755 | keep the two sets of return values separate. For example: | |
756 | ||
757 | tie my %cache => 'MLDBM', 'DB_File', $filename, ...; | |
758 | ||
759 | memoize 'myfunc', | |
760 | NORMALIZER => 'n', | |
761 | SCALAR_CACHE => [HASH => \%cache], | |
762 | LIST_CACHE => MERGE, | |
763 | ; | |
764 | ||
765 | sub n { | |
766 | my $context = wantarray() ? 'L' : 'S'; | |
767 | # ... now compute the hash key from the arguments ... | |
768 | $hashkey = "$context:$hashkey"; | |
769 | } | |
770 | ||
771 | This normalizer function will store scalar context return values in | |
772 | the disk file under keys that begin with C<S:>, and list context | |
773 | return values under keys that begin with C<L:>. | |
774 | ||
775 | =back | |
776 | ||
777 | =head1 OTHER FACILITIES | |
778 | ||
779 | =head2 C<unmemoize> | |
780 | ||
781 | There's an C<unmemoize> function that you can import if you want to. | |
782 | Why would you want to? Here's an example: Suppose you have your cache | |
783 | tied to a DBM file, and you want to make sure that the cache is | |
784 | written out to disk if someone interrupts the program. If the program | |
785 | exits normally, this will happen anyway, but if someone types | |
786 | control-C or something then the program will terminate immediately | |
787 | without synchronizing the database. So what you can do instead is | |
788 | ||
789 | $SIG{INT} = sub { unmemoize 'function' }; | |
790 | ||
a0cb3900 JH |
791 | C<unmemoize> accepts a reference to, or the name of a previously |
792 | memoized function, and undoes whatever it did to provide the memoized | |
793 | version in the first place, including making the name refer to the | |
794 | unmemoized version if appropriate. It returns a reference to the | |
795 | unmemoized version of the function. | |
796 | ||
797 | If you ask it to unmemoize a function that was never memoized, it | |
798 | croaks. | |
799 | ||
800 | =head2 C<flush_cache> | |
801 | ||
802 | C<flush_cache(function)> will flush out the caches, discarding I<all> | |
3d4a255c | 803 | the cached data. The argument may be a function name or a reference |
a0cb3900 JH |
804 | to a function. For finer control over when data is discarded or |
805 | expired, see the documentation for C<Memoize::Expire>, included in | |
806 | this package. | |
807 | ||
808 | Note that if the cache is a tied hash, C<flush_cache> will attempt to | |
809 | invoke the C<CLEAR> method on the hash. If there is no C<CLEAR> | |
810 | method, this will cause a run-time error. | |
811 | ||
812 | An alternative approach to cache flushing is to use the C<HASH> option | |
813 | (see above) to request that C<Memoize> use a particular hash variable | |
814 | as its cache. Then you can examine or modify the hash at any time in | |
3d4a255c | 815 | any way you desire. You may flush the cache by using C<%hash = ()>. |
a0cb3900 JH |
816 | |
817 | =head1 CAVEATS | |
818 | ||
819 | Memoization is not a cure-all: | |
820 | ||
821 | =over 4 | |
822 | ||
823 | =item * | |
824 | ||
825 | Do not memoize a function whose behavior depends on program | |
826 | state other than its own arguments, such as global variables, the time | |
827 | of day, or file input. These functions will not produce correct | |
828 | results when memoized. For a particularly easy example: | |
829 | ||
830 | sub f { | |
831 | time; | |
832 | } | |
833 | ||
834 | This function takes no arguments, and as far as C<Memoize> is | |
835 | concerned, it always returns the same result. C<Memoize> is wrong, of | |
836 | course, and the memoized version of this function will call C<time> once | |
837 | to get the current time, and it will return that same time | |
838 | every time you call it after that. | |
839 | ||
840 | =item * | |
841 | ||
842 | Do not memoize a function with side effects. | |
843 | ||
844 | sub f { | |
845 | my ($a, $b) = @_; | |
846 | my $s = $a + $b; | |
847 | print "$a + $b = $s.\n"; | |
848 | } | |
849 | ||
850 | This function accepts two arguments, adds them, and prints their sum. | |
851 | Its return value is the numuber of characters it printed, but you | |
852 | probably didn't care about that. But C<Memoize> doesn't understand | |
853 | that. If you memoize this function, you will get the result you | |
854 | expect the first time you ask it to print the sum of 2 and 3, but | |
855 | subsequent calls will return 1 (the return value of | |
856 | C<print>) without actually printing anything. | |
857 | ||
858 | =item * | |
859 | ||
860 | Do not memoize a function that returns a data structure that is | |
861 | modified by its caller. | |
862 | ||
863 | Consider these functions: C<getusers> returns a list of users somehow, | |
864 | and then C<main> throws away the first user on the list and prints the | |
865 | rest: | |
866 | ||
867 | sub main { | |
868 | my $userlist = getusers(); | |
869 | shift @$userlist; | |
870 | foreach $u (@$userlist) { | |
871 | print "User $u\n"; | |
872 | } | |
873 | } | |
874 | ||
875 | sub getusers { | |
876 | my @users; | |
877 | # Do something to get a list of users; | |
878 | \@users; # Return reference to list. | |
879 | } | |
880 | ||
881 | If you memoize C<getusers> here, it will work right exactly once. The | |
882 | reference to the users list will be stored in the memo table. C<main> | |
883 | will discard the first element from the referenced list. The next | |
884 | time you invoke C<main>, C<Memoize> will not call C<getusers>; it will | |
885 | just return the same reference to the same list it got last time. But | |
886 | this time the list has already had its head removed; C<main> will | |
887 | erroneously remove another element from it. The list will get shorter | |
888 | and shorter every time you call C<main>. | |
889 | ||
890 | Similarly, this: | |
891 | ||
892 | $u1 = getusers(); | |
893 | $u2 = getusers(); | |
894 | pop @$u1; | |
895 | ||
896 | will modify $u2 as well as $u1, because both variables are references | |
897 | to the same array. Had C<getusers> not been memoized, $u1 and $u2 | |
898 | would have referred to different arrays. | |
899 | ||
900 | =item * | |
901 | ||
902 | Do not memoize a very simple function. | |
903 | ||
904 | Recently someone mentioned to me that the Memoize module made his | |
905 | program run slower instead of faster. It turned out that he was | |
906 | memoizing the following function: | |
907 | ||
908 | sub square { | |
909 | $_[0] * $_[0]; | |
910 | } | |
911 | ||
912 | I pointed out that C<Memoize> uses a hash, and that looking up a | |
913 | number in the hash is necessarily going to take a lot longer than a | |
914 | single multiplication. There really is no way to speed up the | |
915 | C<square> function. | |
916 | ||
917 | Memoization is not magical. | |
918 | ||
919 | =back | |
920 | ||
921 | =head1 PERSISTENT CACHE SUPPORT | |
922 | ||
923 | You can tie the cache tables to any sort of tied hash that you want | |
924 | to, as long as it supports C<TIEHASH>, C<FETCH>, C<STORE>, and | |
925 | C<EXISTS>. For example, | |
926 | ||
927 | tie my %cache => 'GDBM_File', $filename, O_RDWR|O_CREAT, 0666; | |
928 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
929 | ||
930 | works just fine. For some storage methods, you need a little glue. | |
931 | ||
932 | C<SDBM_File> doesn't supply an C<EXISTS> method, so included in this | |
933 | package is a glue module called C<Memoize::SDBM_File> which does | |
934 | provide one. Use this instead of plain C<SDBM_File> to store your | |
935 | cache table on disk in an C<SDBM_File> database: | |
936 | ||
937 | tie my %cache => 'Memoize::SDBM_File', $filename, O_RDWR|O_CREAT, 0666; | |
938 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
939 | ||
940 | C<NDBM_File> has the same problem and the same solution. (Use | |
899dc88a | 941 | C<Memoize::NDBM_File instead of plain NDBM_File.>) |
a0cb3900 JH |
942 | |
943 | C<Storable> isn't a tied hash class at all. You can use it to store a | |
944 | hash to disk and retrieve it again, but you can't modify the hash while | |
945 | it's on the disk. So if you want to store your cache table in a | |
946 | C<Storable> database, use C<Memoize::Storable>, which puts a hashlike | |
947 | front-end onto C<Storable>. The hash table is actually kept in | |
948 | memory, and is loaded from your C<Storable> file at the time you | |
949 | memoize the function, and stored back at the time you unmemoize the | |
950 | function (or when your program exits): | |
951 | ||
952 | tie my %cache => 'Memoize::Storable', $filename; | |
953 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
954 | ||
955 | tie my %cache => 'Memoize::Storable', $filename, 'nstore'; | |
956 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
957 | ||
958 | Include the `nstore' option to have the C<Storable> database written | |
959 | in `network order'. (See L<Storable> for more details about this.) | |
960 | ||
3d4a255c JH |
961 | The C<flush_cache()> function will raise a run-time error unless the |
962 | tied package provides a C<CLEAR> method. | |
963 | ||
a0cb3900 JH |
964 | =head1 EXPIRATION SUPPORT |
965 | ||
966 | See Memoize::Expire, which is a plug-in module that adds expiration | |
967 | functionality to Memoize. If you don't like the kinds of policies | |
968 | that Memoize::Expire implements, it is easy to write your own plug-in | |
969 | module to implement whatever policy you desire. Memoize comes with | |
970 | several examples. An expiration manager that implements a LRU policy | |
971 | is available on CPAN as Memoize::ExpireLRU. | |
972 | ||
973 | =head1 BUGS | |
974 | ||
975 | The test suite is much better, but always needs improvement. | |
976 | ||
3d4a255c JH |
977 | There is some problem with the way C<goto &f> works under threaded |
978 | Perl, perhaps because of the lexical scoping of C<@_>. This is a bug | |
979 | in Perl, and until it is resolved, memoized functions will see a | |
980 | slightly different C<caller()> and will perform a little more slowly | |
981 | on threaded perls than unthreaded perls. | |
a0cb3900 | 982 | |
5189e6fe JH |
983 | Some versions of C<DB_File> won't let you store data under a key of |
984 | length 0. That means that if you have a function C<f> which you | |
985 | memoized and the cache is in a C<DB_File> database, then the value of | |
986 | C<f()> (C<f> called with no arguments) will not be memoized. If this | |
987 | is a big problem, you can supply a normalizer function that prepends | |
988 | C<"x"> to every key. | |
a0cb3900 JH |
989 | |
990 | =head1 MAILING LIST | |
991 | ||
992 | To join a very low-traffic mailing list for announcements about | |
993 | C<Memoize>, send an empty note to C<mjd-perl-memoize-request@plover.com>. | |
994 | ||
995 | =head1 AUTHOR | |
996 | ||
997 | Mark-Jason Dominus (C<mjd-perl-memoize+@plover.com>), Plover Systems co. | |
998 | ||
999 | See the C<Memoize.pm> Page at http://www.plover.com/~mjd/perl/Memoize/ | |
1000 | for news and upgrades. Near this page, at | |
1001 | http://www.plover.com/~mjd/perl/MiniMemoize/ there is an article about | |
1002 | memoization and about the internals of Memoize that appeared in The | |
1003 | Perl Journal, issue #13. (This article is also included in the | |
1004 | Memoize distribution as `article.html'.) | |
1005 | ||
3d4a255c JH |
1006 | My upcoming book will discuss memoization (and many other fascinating |
1007 | topics) in tremendous detail. It will be published by Morgan Kaufmann | |
1008 | in 2002, possibly under the title I<Perl Advanced Techniques | |
1009 | Handbook>. It will also be available on-line for free. For more | |
1010 | information, visit http://perl.plover.com/book/ . | |
1011 | ||
a0cb3900 JH |
1012 | To join a mailing list for announcements about C<Memoize>, send an |
1013 | empty message to C<mjd-perl-memoize-request@plover.com>. This mailing | |
1014 | list is for announcements only and has extremely low traffic---about | |
3d4a255c | 1015 | two messages per year. |
a0cb3900 | 1016 | |
899dc88a JH |
1017 | =head1 COPYRIGHT AND LICENSE |
1018 | ||
1019 | Copyright 1998, 1999, 2000, 2001 by Mark Jason Dominus | |
1020 | ||
1021 | This library is free software; you may redistribute it and/or modify | |
3d4a255c | 1022 | it under the same terms as Perl itself. |
899dc88a | 1023 | |
a0cb3900 JH |
1024 | =head1 THANK YOU |
1025 | ||
1026 | Many thanks to Jonathan Roy for bug reports and suggestions, to | |
1027 | Michael Schwern for other bug reports and patches, to Mike Cariaso for | |
1028 | helping me to figure out the Right Thing to Do About Expiration, to | |
3d4a255c JH |
1029 | Joshua Gerth, Joshua Chamas, Jonathan Roy (again), Mark D. Anderson, |
1030 | and Andrew Johnson for more suggestions about expiration, to Brent | |
1031 | Powers for the Memoize::ExpireLRU module, to Ariel Scolnicov for | |
1032 | delightful messages about the Fibonacci function, to Dion Almaer for | |
a0cb3900 JH |
1033 | thought-provoking suggestions about the default normalizer, to Walt |
1034 | Mankowski and Kurt Starsinic for much help investigating problems | |
1035 | under threaded Perl, to Alex Dudkevich for reporting the bug in | |
1036 | prototyped functions and for checking my patch, to Tony Bass for many | |
3d4a255c JH |
1037 | helpful suggestions, to Jonathan Roy (again) for finding a use for |
1038 | C<unmemoize()>, to Philippe Verdret for enlightening discussion of | |
1039 | C<Hook::PrePostCall>, to Nat Torkington for advice I ignored, to Chris | |
a0cb3900 JH |
1040 | Nandor for portability advice, to Randal Schwartz for suggesting the |
1041 | 'C<flush_cache> function, and to Jenda Krynicky for being a light in | |
1042 | the world. | |
1043 | ||
899dc88a JH |
1044 | Special thanks to Jarkko Hietaniemi, the 5.8.0 pumpking, for including |
1045 | this module in the core and for his patient and helpful guidance | |
1046 | during the integration process. | |
3d4a255c | 1047 | |
a0cb3900 | 1048 | =cut |