Commit | Line | Data |
---|---|---|
a0cb3900 JH |
1 | # -*- mode: perl; perl-indent-level: 2; -*- |
2 | # Memoize.pm | |
3 | # | |
4 | # Transparent memoization of idempotent functions | |
5 | # | |
8114efa0 | 6 | # Copyright 1998, 1999, 2000, 2001, 2012 M. J. Dominus. |
a0cb3900 JH |
7 | # You may copy and distribute this program under the |
8 | # same terms as Perl itself. If in doubt, | |
9 | # write to mjd-perl-memoize+@plover.com for a license. | |
a0cb3900 JH |
10 | |
11 | package Memoize; | |
8114efa0 | 12 | $VERSION = '1.03'; |
a0cb3900 JH |
13 | |
14 | # Compile-time constants | |
15 | sub SCALAR () { 0 } | |
16 | sub LIST () { 1 } | |
17 | ||
18 | ||
19 | # | |
20 | # Usage memoize(functionname/ref, | |
21 | # { NORMALIZER => coderef, INSTALL => name, | |
22 | # LIST_CACHE => descriptor, SCALAR_CACHE => descriptor } | |
23 | # | |
24 | ||
25 | use Carp; | |
26 | use Exporter; | |
27 | use vars qw($DEBUG); | |
899dc88a | 28 | use Config; # Dammit. |
a0cb3900 JH |
29 | @ISA = qw(Exporter); |
30 | @EXPORT = qw(memoize); | |
31 | @EXPORT_OK = qw(unmemoize flush_cache); | |
32 | use strict; | |
33 | ||
34 | my %memotable; | |
35 | my %revmemotable; | |
36 | my @CONTEXT_TAGS = qw(MERGE TIE MEMORY FAULT HASH); | |
37 | my %IS_CACHE_TAG = map {($_ => 1)} @CONTEXT_TAGS; | |
38 | ||
39 | # Raise an error if the user tries to specify one of thesepackage as a | |
40 | # tie for LIST_CACHE | |
41 | ||
42 | my %scalar_only = map {($_ => 1)} qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File); | |
43 | ||
44 | sub memoize { | |
45 | my $fn = shift; | |
46 | my %options = @_; | |
47 | my $options = \%options; | |
48 | ||
49 | unless (defined($fn) && | |
50 | (ref $fn eq 'CODE' || ref $fn eq '')) { | |
51 | croak "Usage: memoize 'functionname'|coderef {OPTIONS}"; | |
52 | } | |
53 | ||
54 | my $uppack = caller; # TCL me Elmo! | |
55 | my $cref; # Code reference to original function | |
56 | my $name = (ref $fn ? undef : $fn); | |
57 | ||
58 | # Convert function names to code references | |
59 | $cref = &_make_cref($fn, $uppack); | |
60 | ||
61 | # Locate function prototype, if any | |
62 | my $proto = prototype $cref; | |
63 | if (defined $proto) { $proto = "($proto)" } | |
64 | else { $proto = "" } | |
65 | ||
899dc88a JH |
66 | # I would like to get rid of the eval, but there seems not to be any |
67 | # other way to set the prototype properly. The switch here for | |
68 | # 'usethreads' works around a bug in threadperl having to do with | |
69 | # magic goto. It would be better to fix the bug and use the magic | |
70 | # goto version everywhere. | |
71 | my $wrapper = | |
72 | $Config{usethreads} | |
73 | ? eval "sub $proto { &_memoizer(\$cref, \@_); }" | |
74 | : eval "sub $proto { unshift \@_, \$cref; goto &_memoizer; }"; | |
a0cb3900 JH |
75 | |
76 | my $normalizer = $options{NORMALIZER}; | |
77 | if (defined $normalizer && ! ref $normalizer) { | |
78 | $normalizer = _make_cref($normalizer, $uppack); | |
79 | } | |
80 | ||
81 | my $install_name; | |
82 | if (defined $options->{INSTALL}) { | |
83 | # INSTALL => name | |
84 | $install_name = $options->{INSTALL}; | |
85 | } elsif (! exists $options->{INSTALL}) { | |
86 | # No INSTALL option provided; use original name if possible | |
87 | $install_name = $name; | |
88 | } else { | |
89 | # INSTALL => undef means don't install | |
90 | } | |
91 | ||
92 | if (defined $install_name) { | |
93 | $install_name = $uppack . '::' . $install_name | |
94 | unless $install_name =~ /::/; | |
95 | no strict; | |
96 | local($^W) = 0; # ``Subroutine $install_name redefined at ...'' | |
97 | *{$install_name} = $wrapper; # Install memoized version | |
98 | } | |
99 | ||
100 | $revmemotable{$wrapper} = "" . $cref; # Turn code ref into hash key | |
101 | ||
102 | # These will be the caches | |
103 | my %caches; | |
104 | for my $context (qw(SCALAR LIST)) { | |
105 | # suppress subsequent 'uninitialized value' warnings | |
106 | $options{"${context}_CACHE"} ||= ''; | |
107 | ||
108 | my $cache_opt = $options{"${context}_CACHE"}; | |
109 | my @cache_opt_args; | |
110 | if (ref $cache_opt) { | |
111 | @cache_opt_args = @$cache_opt; | |
112 | $cache_opt = shift @cache_opt_args; | |
113 | } | |
114 | if ($cache_opt eq 'FAULT') { # no cache | |
115 | $caches{$context} = undef; | |
116 | } elsif ($cache_opt eq 'HASH') { # user-supplied hash | |
899dc88a JH |
117 | my $cache = $cache_opt_args[0]; |
118 | my $package = ref(tied %$cache); | |
119 | if ($context eq 'LIST' && $scalar_only{$package}) { | |
120 | croak("You can't use $package for LIST_CACHE because it can only store scalars"); | |
121 | } | |
122 | $caches{$context} = $cache; | |
a0cb3900 JH |
123 | } elsif ($cache_opt eq '' || $IS_CACHE_TAG{$cache_opt}) { |
124 | # default is that we make up an in-memory hash | |
125 | $caches{$context} = {}; | |
126 | # (this might get tied later, or MERGEd away) | |
127 | } else { | |
128 | croak "Unrecognized option to `${context}_CACHE': `$cache_opt' should be one of (@CONTEXT_TAGS); aborting"; | |
129 | } | |
130 | } | |
131 | ||
132 | # Perhaps I should check here that you didn't supply *both* merge | |
133 | # options. But if you did, it does do something reasonable: They | |
134 | # both get merged to the same in-memory hash. | |
8114efa0 CBW |
135 | if ($options{SCALAR_CACHE} eq 'MERGE' || $options{LIST_CACHE} eq 'MERGE') { |
136 | $options{MERGED} = 1; | |
a0cb3900 | 137 | $caches{SCALAR} = $caches{LIST}; |
a0cb3900 JH |
138 | } |
139 | ||
140 | # Now deal with the TIE options | |
141 | { | |
142 | my $context; | |
143 | foreach $context (qw(SCALAR LIST)) { | |
144 | # If the relevant option wasn't `TIE', this call does nothing. | |
145 | _my_tie($context, $caches{$context}, $options); # Croaks on failure | |
146 | } | |
147 | } | |
148 | ||
149 | # We should put some more stuff in here eventually. | |
150 | # We've been saying that for serveral versions now. | |
151 | # And you know what? More stuff keeps going in! | |
152 | $memotable{$cref} = | |
153 | { | |
154 | O => $options, # Short keys here for things we need to access frequently | |
155 | N => $normalizer, | |
156 | U => $cref, | |
157 | MEMOIZED => $wrapper, | |
158 | PACKAGE => $uppack, | |
159 | NAME => $install_name, | |
160 | S => $caches{SCALAR}, | |
161 | L => $caches{LIST}, | |
162 | }; | |
163 | ||
164 | $wrapper # Return just memoized version | |
165 | } | |
166 | ||
167 | # This function tries to load a tied hash class and tie the hash to it. | |
168 | sub _my_tie { | |
169 | my ($context, $hash, $options) = @_; | |
170 | my $fullopt = $options->{"${context}_CACHE"}; | |
171 | ||
172 | # We already checked to make sure that this works. | |
173 | my $shortopt = (ref $fullopt) ? $fullopt->[0] : $fullopt; | |
174 | ||
175 | return unless defined $shortopt && $shortopt eq 'TIE'; | |
5189e6fe | 176 | carp("TIE option to memoize() is deprecated; use HASH instead") |
484fdf61 | 177 | if $^W; |
a0cb3900 JH |
178 | |
179 | my @args = ref $fullopt ? @$fullopt : (); | |
180 | shift @args; | |
181 | my $module = shift @args; | |
182 | if ($context eq 'LIST' && $scalar_only{$module}) { | |
183 | croak("You can't use $module for LIST_CACHE because it can only store scalars"); | |
184 | } | |
185 | my $modulefile = $module . '.pm'; | |
186 | $modulefile =~ s{::}{/}g; | |
187 | eval { require $modulefile }; | |
188 | if ($@) { | |
189 | croak "Memoize: Couldn't load hash tie module `$module': $@; aborting"; | |
190 | } | |
a0cb3900 JH |
191 | my $rc = (tie %$hash => $module, @args); |
192 | unless ($rc) { | |
899dc88a | 193 | croak "Memoize: Couldn't tie hash to `$module': $!; aborting"; |
a0cb3900 JH |
194 | } |
195 | 1; | |
196 | } | |
197 | ||
198 | sub flush_cache { | |
199 | my $func = _make_cref($_[0], scalar caller); | |
200 | my $info = $memotable{$revmemotable{$func}}; | |
201 | die "$func not memoized" unless defined $info; | |
202 | for my $context (qw(S L)) { | |
203 | my $cache = $info->{$context}; | |
204 | if (tied %$cache && ! (tied %$cache)->can('CLEAR')) { | |
205 | my $funcname = defined($info->{NAME}) ? | |
206 | "function $info->{NAME}" : "anonymous function $func"; | |
207 | my $context = {S => 'scalar', L => 'list'}->{$context}; | |
208 | croak "Tied cache hash for $context-context $funcname does not support flushing"; | |
209 | } else { | |
210 | %$cache = (); | |
211 | } | |
212 | } | |
213 | } | |
214 | ||
215 | # This is the function that manages the memo tables. | |
216 | sub _memoizer { | |
217 | my $orig = shift; # stringized version of ref to original func. | |
218 | my $info = $memotable{$orig}; | |
219 | my $normalizer = $info->{N}; | |
220 | ||
221 | my $argstr; | |
222 | my $context = (wantarray() ? LIST : SCALAR); | |
223 | ||
224 | if (defined $normalizer) { | |
225 | no strict; | |
226 | if ($context == SCALAR) { | |
227 | $argstr = &{$normalizer}(@_); | |
228 | } elsif ($context == LIST) { | |
229 | ($argstr) = &{$normalizer}(@_); | |
230 | } else { | |
231 | croak "Internal error \#41; context was neither LIST nor SCALAR\n"; | |
232 | } | |
233 | } else { # Default normalizer | |
899dc88a JH |
234 | local $^W = 0; |
235 | $argstr = join chr(28),@_; | |
a0cb3900 JH |
236 | } |
237 | ||
238 | if ($context == SCALAR) { | |
239 | my $cache = $info->{S}; | |
899dc88a | 240 | _crap_out($info->{NAME}, 'scalar') unless $cache; |
a0cb3900 | 241 | if (exists $cache->{$argstr}) { |
8114efa0 CBW |
242 | return $info->{O}{MERGED} |
243 | ? $cache->{$argstr}[0] : $cache->{$argstr}; | |
a0cb3900 JH |
244 | } else { |
245 | my $val = &{$info->{U}}(@_); | |
246 | # Scalars are considered to be lists; store appropriately | |
8114efa0 | 247 | if ($info->{O}{MERGED}) { |
a0cb3900 JH |
248 | $cache->{$argstr} = [$val]; |
249 | } else { | |
250 | $cache->{$argstr} = $val; | |
251 | } | |
252 | $val; | |
253 | } | |
254 | } elsif ($context == LIST) { | |
255 | my $cache = $info->{L}; | |
899dc88a | 256 | _crap_out($info->{NAME}, 'list') unless $cache; |
a0cb3900 | 257 | if (exists $cache->{$argstr}) { |
8114efa0 | 258 | return @{$cache->{$argstr}}; |
a0cb3900 | 259 | } else { |
28502098 | 260 | my @q = &{$info->{U}}(@_); |
8114efa0 | 261 | $cache->{$argstr} = \@q; |
28502098 | 262 | @q; |
a0cb3900 JH |
263 | } |
264 | } else { | |
265 | croak "Internal error \#42; context was neither LIST nor SCALAR\n"; | |
266 | } | |
267 | } | |
268 | ||
269 | sub unmemoize { | |
270 | my $f = shift; | |
271 | my $uppack = caller; | |
272 | my $cref = _make_cref($f, $uppack); | |
273 | ||
274 | unless (exists $revmemotable{$cref}) { | |
275 | croak "Could not unmemoize function `$f', because it was not memoized to begin with"; | |
276 | } | |
277 | ||
278 | my $tabent = $memotable{$revmemotable{$cref}}; | |
279 | unless (defined $tabent) { | |
280 | croak "Could not figure out how to unmemoize function `$f'"; | |
281 | } | |
282 | my $name = $tabent->{NAME}; | |
283 | if (defined $name) { | |
284 | no strict; | |
285 | local($^W) = 0; # ``Subroutine $install_name redefined at ...'' | |
286 | *{$name} = $tabent->{U}; # Replace with original function | |
287 | } | |
288 | undef $memotable{$revmemotable{$cref}}; | |
289 | undef $revmemotable{$cref}; | |
290 | ||
291 | # This removes the last reference to the (possibly tied) memo tables | |
292 | # my ($old_function, $memotabs) = @{$tabent}{'U','S','L'}; | |
293 | # undef $tabent; | |
294 | ||
295 | # # Untie the memo tables if they were tied. | |
296 | # my $i; | |
297 | # for $i (0,1) { | |
298 | # if (tied %{$memotabs->[$i]}) { | |
299 | # warn "Untying hash #$i\n"; | |
300 | # untie %{$memotabs->[$i]}; | |
301 | # } | |
302 | # } | |
303 | ||
304 | $tabent->{U}; | |
305 | } | |
306 | ||
307 | sub _make_cref { | |
308 | my $fn = shift; | |
309 | my $uppack = shift; | |
310 | my $cref; | |
311 | my $name; | |
312 | ||
313 | if (ref $fn eq 'CODE') { | |
314 | $cref = $fn; | |
315 | } elsif (! ref $fn) { | |
316 | if ($fn =~ /::/) { | |
317 | $name = $fn; | |
318 | } else { | |
319 | $name = $uppack . '::' . $fn; | |
320 | } | |
321 | no strict; | |
322 | if (defined $name and !defined(&$name)) { | |
323 | croak "Cannot operate on nonexistent function `$fn'"; | |
324 | } | |
325 | # $cref = \&$name; | |
326 | $cref = *{$name}{CODE}; | |
327 | } else { | |
328 | my $parent = (caller(1))[3]; # Function that called _make_cref | |
329 | croak "Usage: argument 1 to `$parent' must be a function name or reference.\n"; | |
330 | } | |
331 | $DEBUG and warn "${name}($fn) => $cref in _make_cref\n"; | |
332 | $cref; | |
333 | } | |
334 | ||
335 | sub _crap_out { | |
336 | my ($funcname, $context) = @_; | |
337 | if (defined $funcname) { | |
338 | croak "Function `$funcname' called in forbidden $context context; faulting"; | |
339 | } else { | |
340 | croak "Anonymous function called in forbidden $context context; faulting"; | |
341 | } | |
342 | } | |
343 | ||
344 | 1; | |
345 | ||
346 | ||
347 | ||
348 | ||
349 | ||
350 | =head1 NAME | |
351 | ||
5189e6fe | 352 | Memoize - Make functions faster by trading space for time |
a0cb3900 JH |
353 | |
354 | =head1 SYNOPSIS | |
355 | ||
8114efa0 | 356 | # This is the documentation for Memoize 1.03 |
a0cb3900 JH |
357 | use Memoize; |
358 | memoize('slow_function'); | |
359 | slow_function(arguments); # Is faster than it was before | |
360 | ||
361 | ||
362 | This is normally all you need to know. However, many options are available: | |
363 | ||
364 | memoize(function, options...); | |
365 | ||
366 | Options include: | |
367 | ||
368 | NORMALIZER => function | |
369 | INSTALL => new_name | |
370 | ||
371 | SCALAR_CACHE => 'MEMORY' | |
372 | SCALAR_CACHE => ['HASH', \%cache_hash ] | |
373 | SCALAR_CACHE => 'FAULT' | |
374 | SCALAR_CACHE => 'MERGE' | |
375 | ||
376 | LIST_CACHE => 'MEMORY' | |
377 | LIST_CACHE => ['HASH', \%cache_hash ] | |
378 | LIST_CACHE => 'FAULT' | |
379 | LIST_CACHE => 'MERGE' | |
380 | ||
381 | =head1 DESCRIPTION | |
382 | ||
383 | `Memoizing' a function makes it faster by trading space for time. It | |
384 | does this by caching the return values of the function in a table. | |
385 | If you call the function again with the same arguments, C<memoize> | |
3d4a255c | 386 | jumps in and gives you the value out of the table, instead of letting |
a0cb3900 JH |
387 | the function compute the value all over again. |
388 | ||
389 | Here is an extreme example. Consider the Fibonacci sequence, defined | |
390 | by the following function: | |
391 | ||
392 | # Compute Fibonacci numbers | |
393 | sub fib { | |
394 | my $n = shift; | |
395 | return $n if $n < 2; | |
396 | fib($n-1) + fib($n-2); | |
397 | } | |
398 | ||
399 | This function is very slow. Why? To compute fib(14), it first wants | |
400 | to compute fib(13) and fib(12), and add the results. But to compute | |
401 | fib(13), it first has to compute fib(12) and fib(11), and then it | |
402 | comes back and computes fib(12) all over again even though the answer | |
403 | is the same. And both of the times that it wants to compute fib(12), | |
404 | it has to compute fib(11) from scratch, and then it has to do it | |
405 | again each time it wants to compute fib(13). This function does so | |
406 | much recomputing of old results that it takes a really long time to | |
407 | run---fib(14) makes 1,200 extra recursive calls to itself, to compute | |
408 | and recompute things that it already computed. | |
409 | ||
410 | This function is a good candidate for memoization. If you memoize the | |
411 | `fib' function above, it will compute fib(14) exactly once, the first | |
412 | time it needs to, and then save the result in a table. Then if you | |
413 | ask for fib(14) again, it gives you the result out of the table. | |
414 | While computing fib(14), instead of computing fib(12) twice, it does | |
415 | it once; the second time it needs the value it gets it from the table. | |
416 | It doesn't compute fib(11) four times; it computes it once, getting it | |
417 | from the table the next three times. Instead of making 1,200 | |
418 | recursive calls to `fib', it makes 15. This makes the function about | |
419 | 150 times faster. | |
420 | ||
421 | You could do the memoization yourself, by rewriting the function, like | |
422 | this: | |
423 | ||
424 | # Compute Fibonacci numbers, memoized version | |
425 | { my @fib; | |
426 | sub fib { | |
427 | my $n = shift; | |
428 | return $fib[$n] if defined $fib[$n]; | |
429 | return $fib[$n] = $n if $n < 2; | |
430 | $fib[$n] = fib($n-1) + fib($n-2); | |
431 | } | |
432 | } | |
433 | ||
434 | Or you could use this module, like this: | |
435 | ||
436 | use Memoize; | |
437 | memoize('fib'); | |
438 | ||
439 | # Rest of the fib function just like the original version. | |
440 | ||
441 | This makes it easy to turn memoizing on and off. | |
442 | ||
443 | Here's an even simpler example: I wrote a simple ray tracer; the | |
444 | program would look in a certain direction, figure out what it was | |
445 | looking at, and then convert the `color' value (typically a string | |
446 | like `red') of that object to a red, green, and blue pixel value, like | |
447 | this: | |
448 | ||
449 | for ($direction = 0; $direction < 300; $direction++) { | |
450 | # Figure out which object is in direction $direction | |
451 | $color = $object->{color}; | |
452 | ($r, $g, $b) = @{&ColorToRGB($color)}; | |
453 | ... | |
454 | } | |
455 | ||
456 | Since there are relatively few objects in a picture, there are only a | |
457 | few colors, which get looked up over and over again. Memoizing | |
5189e6fe | 458 | C<ColorToRGB> sped up the program by several percent. |
a0cb3900 JH |
459 | |
460 | =head1 DETAILS | |
461 | ||
462 | This module exports exactly one function, C<memoize>. The rest of the | |
463 | functions in this package are None of Your Business. | |
464 | ||
465 | You should say | |
466 | ||
467 | memoize(function) | |
468 | ||
469 | where C<function> is the name of the function you want to memoize, or | |
470 | a reference to it. C<memoize> returns a reference to the new, | |
471 | memoized version of the function, or C<undef> on a non-fatal error. | |
472 | At present, there are no non-fatal errors, but there might be some in | |
473 | the future. | |
474 | ||
475 | If C<function> was the name of a function, then C<memoize> hides the | |
476 | old version and installs the new memoized version under the old name, | |
477 | so that C<&function(...)> actually invokes the memoized version. | |
478 | ||
479 | =head1 OPTIONS | |
480 | ||
481 | There are some optional options you can pass to C<memoize> to change | |
482 | the way it behaves a little. To supply options, invoke C<memoize> | |
483 | like this: | |
484 | ||
485 | memoize(function, NORMALIZER => function, | |
486 | INSTALL => newname, | |
487 | SCALAR_CACHE => option, | |
488 | LIST_CACHE => option | |
489 | ); | |
490 | ||
491 | Each of these options is optional; you can include some, all, or none | |
492 | of them. | |
493 | ||
494 | =head2 INSTALL | |
495 | ||
496 | If you supply a function name with C<INSTALL>, memoize will install | |
497 | the new, memoized version of the function under the name you give. | |
498 | For example, | |
499 | ||
500 | memoize('fib', INSTALL => 'fastfib') | |
501 | ||
502 | installs the memoized version of C<fib> as C<fastfib>; without the | |
503 | C<INSTALL> option it would have replaced the old C<fib> with the | |
504 | memoized version. | |
505 | ||
506 | To prevent C<memoize> from installing the memoized version anywhere, use | |
507 | C<INSTALL =E<gt> undef>. | |
508 | ||
509 | =head2 NORMALIZER | |
510 | ||
511 | Suppose your function looks like this: | |
512 | ||
513 | # Typical call: f('aha!', A => 11, B => 12); | |
514 | sub f { | |
515 | my $a = shift; | |
516 | my %hash = @_; | |
517 | $hash{B} ||= 2; # B defaults to 2 | |
518 | $hash{C} ||= 7; # C defaults to 7 | |
519 | ||
520 | # Do something with $a, %hash | |
521 | } | |
522 | ||
523 | Now, the following calls to your function are all completely equivalent: | |
524 | ||
525 | f(OUCH); | |
526 | f(OUCH, B => 2); | |
527 | f(OUCH, C => 7); | |
528 | f(OUCH, B => 2, C => 7); | |
529 | f(OUCH, C => 7, B => 2); | |
530 | (etc.) | |
531 | ||
532 | However, unless you tell C<Memoize> that these calls are equivalent, | |
533 | it will not know that, and it will compute the values for these | |
534 | invocations of your function separately, and store them separately. | |
535 | ||
536 | To prevent this, supply a C<NORMALIZER> function that turns the | |
537 | program arguments into a string in a way that equivalent arguments | |
538 | turn into the same string. A C<NORMALIZER> function for C<f> above | |
539 | might look like this: | |
540 | ||
541 | sub normalize_f { | |
542 | my $a = shift; | |
543 | my %hash = @_; | |
544 | $hash{B} ||= 2; | |
545 | $hash{C} ||= 7; | |
546 | ||
3d4a255c | 547 | join(',', $a, map ($_ => $hash{$_}) sort keys %hash); |
a0cb3900 JH |
548 | } |
549 | ||
550 | Each of the argument lists above comes out of the C<normalize_f> | |
551 | function looking exactly the same, like this: | |
552 | ||
3d4a255c | 553 | OUCH,B,2,C,7 |
a0cb3900 JH |
554 | |
555 | You would tell C<Memoize> to use this normalizer this way: | |
556 | ||
557 | memoize('f', NORMALIZER => 'normalize_f'); | |
558 | ||
559 | C<memoize> knows that if the normalized version of the arguments is | |
560 | the same for two argument lists, then it can safely look up the value | |
561 | that it computed for one argument list and return it as the result of | |
562 | calling the function with the other argument list, even if the | |
563 | argument lists look different. | |
564 | ||
3d4a255c JH |
565 | The default normalizer just concatenates the arguments with character |
566 | 28 in between. (In ASCII, this is called FS or control-\.) This | |
567 | always works correctly for functions with only one string argument, | |
568 | and also when the arguments never contain character 28. However, it | |
569 | can confuse certain argument lists: | |
a0cb3900 JH |
570 | |
571 | normalizer("a\034", "b") | |
572 | normalizer("a", "\034b") | |
573 | normalizer("a\034\034b") | |
574 | ||
3d4a255c | 575 | for example. |
a0cb3900 | 576 | |
899dc88a JH |
577 | Since hash keys are strings, the default normalizer will not |
578 | distinguish between C<undef> and the empty string. It also won't work | |
3d4a255c JH |
579 | when the function's arguments are references. For example, consider a |
580 | function C<g> which gets two arguments: A number, and a reference to | |
899dc88a | 581 | an array of numbers: |
a0cb3900 JH |
582 | |
583 | g(13, [1,2,3,4,5,6,7]); | |
584 | ||
585 | The default normalizer will turn this into something like | |
3d4a255c | 586 | C<"13\034ARRAY(0x436c1f)">. That would be all right, except that a |
a0cb3900 JH |
587 | subsequent array of numbers might be stored at a different location |
588 | even though it contains the same data. If this happens, C<Memoize> | |
589 | will think that the arguments are different, even though they are | |
590 | equivalent. In this case, a normalizer like this is appropriate: | |
591 | ||
592 | sub normalize { join ' ', $_[0], @{$_[1]} } | |
593 | ||
594 | For the example above, this produces the key "13 1 2 3 4 5 6 7". | |
595 | ||
596 | Another use for normalizers is when the function depends on data other | |
597 | than those in its arguments. Suppose you have a function which | |
598 | returns a value which depends on the current hour of the day: | |
599 | ||
600 | sub on_duty { | |
601 | my ($problem_type) = @_; | |
602 | my $hour = (localtime)[2]; | |
603 | open my $fh, "$DIR/$problem_type" or die...; | |
604 | my $line; | |
605 | while ($hour-- > 0) | |
606 | $line = <$fh>; | |
607 | } | |
608 | return $line; | |
609 | } | |
610 | ||
3d4a255c | 611 | At 10:23, this function generates the 10th line of a data file; at |
a0cb3900 JH |
612 | 3:45 PM it generates the 15th line instead. By default, C<Memoize> |
613 | will only see the $problem_type argument. To fix this, include the | |
614 | current hour in the normalizer: | |
615 | ||
616 | sub normalize { join ' ', (localtime)[2], @_ } | |
617 | ||
618 | The calling context of the function (scalar or list context) is | |
619 | propagated to the normalizer. This means that if the memoized | |
620 | function will treat its arguments differently in list context than it | |
621 | would in scalar context, you can have the normalizer function select | |
622 | its behavior based on the results of C<wantarray>. Even if called in | |
623 | a list context, a normalizer should still return a single string. | |
624 | ||
625 | =head2 C<SCALAR_CACHE>, C<LIST_CACHE> | |
626 | ||
627 | Normally, C<Memoize> caches your function's return values into an | |
628 | ordinary Perl hash variable. However, you might like to have the | |
629 | values cached on the disk, so that they persist from one run of your | |
630 | program to the next, or you might like to associate some other | |
3d4a255c | 631 | interesting semantics with the cached values. |
a0cb3900 JH |
632 | |
633 | There's a slight complication under the hood of C<Memoize>: There are | |
634 | actually I<two> caches, one for scalar values and one for list values. | |
635 | When your function is called in scalar context, its return value is | |
636 | cached in one hash, and when your function is called in list context, | |
637 | its value is cached in the other hash. You can control the caching | |
638 | behavior of both contexts independently with these options. | |
639 | ||
640 | The argument to C<LIST_CACHE> or C<SCALAR_CACHE> must either be one of | |
641 | the following four strings: | |
642 | ||
643 | MEMORY | |
644 | FAULT | |
645 | MERGE | |
3d4a255c | 646 | HASH |
a0cb3900 | 647 | |
8114efa0 | 648 | or else it must be a reference to an array whose first element is one of |
a0cb3900 JH |
649 | these four strings, such as C<[HASH, arguments...]>. |
650 | ||
651 | =over 4 | |
652 | ||
653 | =item C<MEMORY> | |
654 | ||
655 | C<MEMORY> means that return values from the function will be cached in | |
656 | an ordinary Perl hash variable. The hash variable will not persist | |
657 | after the program exits. This is the default. | |
658 | ||
659 | =item C<HASH> | |
660 | ||
661 | C<HASH> allows you to specify that a particular hash that you supply | |
662 | will be used as the cache. You can tie this hash beforehand to give | |
663 | it any behavior you want. | |
664 | ||
665 | A tied hash can have any semantics at all. It is typically tied to an | |
666 | on-disk database, so that cached values are stored in the database and | |
667 | retrieved from it again when needed, and the disk file typically | |
668 | persists after your program has exited. See C<perltie> for more | |
669 | complete details about C<tie>. | |
670 | ||
671 | A typical example is: | |
672 | ||
3d4a255c | 673 | use DB_File; |
a0cb3900 JH |
674 | tie my %cache => 'DB_File', $filename, O_RDWR|O_CREAT, 0666; |
675 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
676 | ||
677 | This has the effect of storing the cache in a C<DB_File> database | |
678 | whose name is in C<$filename>. The cache will persist after the | |
679 | program has exited. Next time the program runs, it will find the | |
680 | cache already populated from the previous run of the program. Or you | |
681 | can forcibly populate the cache by constructing a batch program that | |
682 | runs in the background and populates the cache file. Then when you | |
683 | come to run your real program the memoized function will be fast | |
684 | because all its results have been precomputed. | |
685 | ||
8114efa0 CBW |
686 | Another reason to use C<HASH> is to provide your own hash variable. |
687 | You can then inspect or modify the contents of the hash to gain finer | |
688 | control over the cache management. | |
689 | ||
a0cb3900 JH |
690 | =item C<TIE> |
691 | ||
5189e6fe JH |
692 | This option is no longer supported. It is still documented only to |
693 | aid in the debugging of old programs that use it. Old programs should | |
694 | be converted to use the C<HASH> option instead. | |
a0cb3900 | 695 | |
8114efa0 | 696 | memoize ... ['TIE', PACKAGE, ARGS...] |
a0cb3900 JH |
697 | |
698 | is merely a shortcut for | |
699 | ||
3d4a255c | 700 | require PACKAGE; |
8114efa0 CBW |
701 | { tie my %cache, PACKAGE, ARGS...; |
702 | memoize ... [HASH => \%cache]; | |
703 | } | |
a0cb3900 | 704 | |
a0cb3900 JH |
705 | =item C<FAULT> |
706 | ||
707 | C<FAULT> means that you never expect to call the function in scalar | |
708 | (or list) context, and that if C<Memoize> detects such a call, it | |
709 | should abort the program. The error message is one of | |
710 | ||
711 | `foo' function called in forbidden list context at line ... | |
712 | `foo' function called in forbidden scalar context at line ... | |
713 | ||
714 | =item C<MERGE> | |
715 | ||
8114efa0 CBW |
716 | C<MERGE> normally means that the memoized function does not |
717 | distinguish between list and sclar context, and that return values in | |
718 | both contexts should be stored together. Both C<LIST_CACHE =E<gt> | |
719 | MERGE> and C<SCALAR_CACHE =E<gt> MERGE> mean the same thing. | |
720 | ||
721 | Consider this function: | |
722 | ||
723 | sub complicated { | |
724 | # ... time-consuming calculation of $result | |
725 | return $result; | |
726 | } | |
727 | ||
728 | The C<complicated> function will return the same numeric C<$result> | |
729 | regardless of whether it is called in list or in scalar context. | |
730 | ||
731 | Normally, the following code will result in two calls to C<complicated>, even | |
732 | if C<complicated> is memoized: | |
733 | ||
734 | $x = complicated(142); | |
735 | ($y) = complicated(142); | |
736 | $z = complicated(142); | |
737 | ||
738 | The first call will cache the result, say 37, in the scalar cache; the | |
739 | second will cach the list C<(37)> in the list cache. The third call | |
740 | doesn't call the real C<complicated> function; it gets the value 37 | |
741 | from the scalar cache. | |
742 | ||
743 | Obviously, the second call to C<complicated> is a waste of time, and | |
744 | storing its return value is a waste of space. Specifying C<LIST_CACHE | |
745 | =E<gt> MERGE> will make C<memoize> use the same cache for scalar and | |
746 | list context return values, so that the second call uses the scalar | |
747 | cache that was populated by the first call. C<complicated> ends up | |
748 | being called only once, and both subsequent calls return C<3> from the | |
749 | cache, regardless of the calling context. | |
750 | ||
751 | =head3 List values in scalar context | |
a0cb3900 JH |
752 | |
753 | Consider this function: | |
754 | ||
8114efa0 CBW |
755 | sub iota { return reverse (1..$_[0]) } |
756 | ||
757 | This function normally returns a list. Suppose you memoize it and | |
758 | merge the caches: | |
a0cb3900 | 759 | |
8114efa0 | 760 | memoize 'iota', SCALAR_CACHE => 'MERGE'; |
a0cb3900 | 761 | |
8114efa0 CBW |
762 | @i7 = iota(7); |
763 | $i7 = iota(7); | |
a0cb3900 | 764 | |
8114efa0 CBW |
765 | Here the first call caches the list (1,2,3,4,5,6,7). The second call |
766 | does not really make sense. C<Memoize> cannot guess what behavior | |
767 | C<iota> should have in scalar context without actually calling it in | |
768 | scalar context. Normally C<Memoize> I<would> call C<iota> in scalar | |
769 | context and cache the result, but the C<SCALAR_CACHE =E<gt> 'MERGE'> | |
770 | option says not to do that, but to use the cache list-context value | |
771 | instead. But it cannot return a list of seven elements in a scalar | |
772 | context. In this case C<$i7> will receive the B<first element> of the | |
773 | cached list value, namely 7. | |
a0cb3900 | 774 | |
8114efa0 | 775 | =head3 Merged disk caches |
a0cb3900 JH |
776 | |
777 | Another use for C<MERGE> is when you want both kinds of return values | |
778 | stored in the same disk file; this saves you from having to deal with | |
779 | two disk files instead of one. You can use a normalizer function to | |
780 | keep the two sets of return values separate. For example: | |
781 | ||
782 | tie my %cache => 'MLDBM', 'DB_File', $filename, ...; | |
783 | ||
784 | memoize 'myfunc', | |
785 | NORMALIZER => 'n', | |
786 | SCALAR_CACHE => [HASH => \%cache], | |
8114efa0 | 787 | LIST_CACHE => 'MERGE', |
a0cb3900 JH |
788 | ; |
789 | ||
790 | sub n { | |
791 | my $context = wantarray() ? 'L' : 'S'; | |
792 | # ... now compute the hash key from the arguments ... | |
793 | $hashkey = "$context:$hashkey"; | |
794 | } | |
795 | ||
796 | This normalizer function will store scalar context return values in | |
797 | the disk file under keys that begin with C<S:>, and list context | |
798 | return values under keys that begin with C<L:>. | |
799 | ||
800 | =back | |
801 | ||
802 | =head1 OTHER FACILITIES | |
803 | ||
804 | =head2 C<unmemoize> | |
805 | ||
806 | There's an C<unmemoize> function that you can import if you want to. | |
807 | Why would you want to? Here's an example: Suppose you have your cache | |
808 | tied to a DBM file, and you want to make sure that the cache is | |
809 | written out to disk if someone interrupts the program. If the program | |
810 | exits normally, this will happen anyway, but if someone types | |
811 | control-C or something then the program will terminate immediately | |
812 | without synchronizing the database. So what you can do instead is | |
813 | ||
814 | $SIG{INT} = sub { unmemoize 'function' }; | |
815 | ||
a0cb3900 JH |
816 | C<unmemoize> accepts a reference to, or the name of a previously |
817 | memoized function, and undoes whatever it did to provide the memoized | |
818 | version in the first place, including making the name refer to the | |
819 | unmemoized version if appropriate. It returns a reference to the | |
820 | unmemoized version of the function. | |
821 | ||
822 | If you ask it to unmemoize a function that was never memoized, it | |
823 | croaks. | |
824 | ||
825 | =head2 C<flush_cache> | |
826 | ||
827 | C<flush_cache(function)> will flush out the caches, discarding I<all> | |
3d4a255c | 828 | the cached data. The argument may be a function name or a reference |
a0cb3900 JH |
829 | to a function. For finer control over when data is discarded or |
830 | expired, see the documentation for C<Memoize::Expire>, included in | |
831 | this package. | |
832 | ||
833 | Note that if the cache is a tied hash, C<flush_cache> will attempt to | |
834 | invoke the C<CLEAR> method on the hash. If there is no C<CLEAR> | |
835 | method, this will cause a run-time error. | |
836 | ||
837 | An alternative approach to cache flushing is to use the C<HASH> option | |
838 | (see above) to request that C<Memoize> use a particular hash variable | |
839 | as its cache. Then you can examine or modify the hash at any time in | |
3d4a255c | 840 | any way you desire. You may flush the cache by using C<%hash = ()>. |
a0cb3900 JH |
841 | |
842 | =head1 CAVEATS | |
843 | ||
844 | Memoization is not a cure-all: | |
845 | ||
846 | =over 4 | |
847 | ||
848 | =item * | |
849 | ||
850 | Do not memoize a function whose behavior depends on program | |
851 | state other than its own arguments, such as global variables, the time | |
852 | of day, or file input. These functions will not produce correct | |
853 | results when memoized. For a particularly easy example: | |
854 | ||
855 | sub f { | |
856 | time; | |
857 | } | |
858 | ||
859 | This function takes no arguments, and as far as C<Memoize> is | |
860 | concerned, it always returns the same result. C<Memoize> is wrong, of | |
861 | course, and the memoized version of this function will call C<time> once | |
862 | to get the current time, and it will return that same time | |
863 | every time you call it after that. | |
864 | ||
865 | =item * | |
866 | ||
867 | Do not memoize a function with side effects. | |
868 | ||
869 | sub f { | |
870 | my ($a, $b) = @_; | |
871 | my $s = $a + $b; | |
872 | print "$a + $b = $s.\n"; | |
873 | } | |
874 | ||
875 | This function accepts two arguments, adds them, and prints their sum. | |
876 | Its return value is the numuber of characters it printed, but you | |
877 | probably didn't care about that. But C<Memoize> doesn't understand | |
878 | that. If you memoize this function, you will get the result you | |
879 | expect the first time you ask it to print the sum of 2 and 3, but | |
880 | subsequent calls will return 1 (the return value of | |
881 | C<print>) without actually printing anything. | |
882 | ||
883 | =item * | |
884 | ||
885 | Do not memoize a function that returns a data structure that is | |
886 | modified by its caller. | |
887 | ||
888 | Consider these functions: C<getusers> returns a list of users somehow, | |
889 | and then C<main> throws away the first user on the list and prints the | |
890 | rest: | |
891 | ||
892 | sub main { | |
893 | my $userlist = getusers(); | |
894 | shift @$userlist; | |
895 | foreach $u (@$userlist) { | |
896 | print "User $u\n"; | |
897 | } | |
898 | } | |
899 | ||
900 | sub getusers { | |
901 | my @users; | |
902 | # Do something to get a list of users; | |
903 | \@users; # Return reference to list. | |
904 | } | |
905 | ||
906 | If you memoize C<getusers> here, it will work right exactly once. The | |
907 | reference to the users list will be stored in the memo table. C<main> | |
908 | will discard the first element from the referenced list. The next | |
909 | time you invoke C<main>, C<Memoize> will not call C<getusers>; it will | |
910 | just return the same reference to the same list it got last time. But | |
911 | this time the list has already had its head removed; C<main> will | |
912 | erroneously remove another element from it. The list will get shorter | |
913 | and shorter every time you call C<main>. | |
914 | ||
915 | Similarly, this: | |
916 | ||
917 | $u1 = getusers(); | |
918 | $u2 = getusers(); | |
919 | pop @$u1; | |
920 | ||
921 | will modify $u2 as well as $u1, because both variables are references | |
922 | to the same array. Had C<getusers> not been memoized, $u1 and $u2 | |
923 | would have referred to different arrays. | |
924 | ||
925 | =item * | |
926 | ||
927 | Do not memoize a very simple function. | |
928 | ||
929 | Recently someone mentioned to me that the Memoize module made his | |
930 | program run slower instead of faster. It turned out that he was | |
931 | memoizing the following function: | |
932 | ||
933 | sub square { | |
934 | $_[0] * $_[0]; | |
935 | } | |
936 | ||
937 | I pointed out that C<Memoize> uses a hash, and that looking up a | |
938 | number in the hash is necessarily going to take a lot longer than a | |
939 | single multiplication. There really is no way to speed up the | |
940 | C<square> function. | |
941 | ||
942 | Memoization is not magical. | |
943 | ||
944 | =back | |
945 | ||
946 | =head1 PERSISTENT CACHE SUPPORT | |
947 | ||
948 | You can tie the cache tables to any sort of tied hash that you want | |
949 | to, as long as it supports C<TIEHASH>, C<FETCH>, C<STORE>, and | |
950 | C<EXISTS>. For example, | |
951 | ||
952 | tie my %cache => 'GDBM_File', $filename, O_RDWR|O_CREAT, 0666; | |
953 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
954 | ||
955 | works just fine. For some storage methods, you need a little glue. | |
956 | ||
957 | C<SDBM_File> doesn't supply an C<EXISTS> method, so included in this | |
958 | package is a glue module called C<Memoize::SDBM_File> which does | |
959 | provide one. Use this instead of plain C<SDBM_File> to store your | |
960 | cache table on disk in an C<SDBM_File> database: | |
961 | ||
962 | tie my %cache => 'Memoize::SDBM_File', $filename, O_RDWR|O_CREAT, 0666; | |
963 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
964 | ||
965 | C<NDBM_File> has the same problem and the same solution. (Use | |
899dc88a | 966 | C<Memoize::NDBM_File instead of plain NDBM_File.>) |
a0cb3900 JH |
967 | |
968 | C<Storable> isn't a tied hash class at all. You can use it to store a | |
969 | hash to disk and retrieve it again, but you can't modify the hash while | |
970 | it's on the disk. So if you want to store your cache table in a | |
971 | C<Storable> database, use C<Memoize::Storable>, which puts a hashlike | |
972 | front-end onto C<Storable>. The hash table is actually kept in | |
973 | memory, and is loaded from your C<Storable> file at the time you | |
974 | memoize the function, and stored back at the time you unmemoize the | |
975 | function (or when your program exits): | |
976 | ||
977 | tie my %cache => 'Memoize::Storable', $filename; | |
978 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
979 | ||
980 | tie my %cache => 'Memoize::Storable', $filename, 'nstore'; | |
981 | memoize 'function', SCALAR_CACHE => [HASH => \%cache]; | |
982 | ||
983 | Include the `nstore' option to have the C<Storable> database written | |
984 | in `network order'. (See L<Storable> for more details about this.) | |
985 | ||
3d4a255c JH |
986 | The C<flush_cache()> function will raise a run-time error unless the |
987 | tied package provides a C<CLEAR> method. | |
988 | ||
a0cb3900 JH |
989 | =head1 EXPIRATION SUPPORT |
990 | ||
991 | See Memoize::Expire, which is a plug-in module that adds expiration | |
992 | functionality to Memoize. If you don't like the kinds of policies | |
993 | that Memoize::Expire implements, it is easy to write your own plug-in | |
994 | module to implement whatever policy you desire. Memoize comes with | |
995 | several examples. An expiration manager that implements a LRU policy | |
996 | is available on CPAN as Memoize::ExpireLRU. | |
997 | ||
998 | =head1 BUGS | |
999 | ||
1000 | The test suite is much better, but always needs improvement. | |
1001 | ||
3d4a255c JH |
1002 | There is some problem with the way C<goto &f> works under threaded |
1003 | Perl, perhaps because of the lexical scoping of C<@_>. This is a bug | |
1004 | in Perl, and until it is resolved, memoized functions will see a | |
1005 | slightly different C<caller()> and will perform a little more slowly | |
1006 | on threaded perls than unthreaded perls. | |
a0cb3900 | 1007 | |
5189e6fe JH |
1008 | Some versions of C<DB_File> won't let you store data under a key of |
1009 | length 0. That means that if you have a function C<f> which you | |
1010 | memoized and the cache is in a C<DB_File> database, then the value of | |
1011 | C<f()> (C<f> called with no arguments) will not be memoized. If this | |
1012 | is a big problem, you can supply a normalizer function that prepends | |
1013 | C<"x"> to every key. | |
a0cb3900 JH |
1014 | |
1015 | =head1 MAILING LIST | |
1016 | ||
1017 | To join a very low-traffic mailing list for announcements about | |
1018 | C<Memoize>, send an empty note to C<mjd-perl-memoize-request@plover.com>. | |
1019 | ||
1020 | =head1 AUTHOR | |
1021 | ||
1022 | Mark-Jason Dominus (C<mjd-perl-memoize+@plover.com>), Plover Systems co. | |
1023 | ||
8114efa0 | 1024 | See the C<Memoize.pm> Page at http://perl.plover.com/Memoize/ |
a0cb3900 | 1025 | for news and upgrades. Near this page, at |
8114efa0 | 1026 | http://perl.plover.com/MiniMemoize/ there is an article about |
a0cb3900 JH |
1027 | memoization and about the internals of Memoize that appeared in The |
1028 | Perl Journal, issue #13. (This article is also included in the | |
1029 | Memoize distribution as `article.html'.) | |
1030 | ||
8114efa0 CBW |
1031 | The author's book I<Higher-Order Perl> (2005, ISBN 1558607013, published |
1032 | by Morgan Kaufmann) discusses memoization (and many other | |
1033 | topics) in tremendous detail. It is available on-line for free. | |
1034 | For more information, visit http://hop.perl.plover.com/ . | |
3d4a255c | 1035 | |
a0cb3900 JH |
1036 | To join a mailing list for announcements about C<Memoize>, send an |
1037 | empty message to C<mjd-perl-memoize-request@plover.com>. This mailing | |
8114efa0 | 1038 | list is for announcements only and has extremely low traffic---fewer than |
3d4a255c | 1039 | two messages per year. |
a0cb3900 | 1040 | |
899dc88a JH |
1041 | =head1 COPYRIGHT AND LICENSE |
1042 | ||
8114efa0 | 1043 | Copyright 1998, 1999, 2000, 2001, 2012 by Mark Jason Dominus |
899dc88a JH |
1044 | |
1045 | This library is free software; you may redistribute it and/or modify | |
3d4a255c | 1046 | it under the same terms as Perl itself. |
899dc88a | 1047 | |
a0cb3900 JH |
1048 | =head1 THANK YOU |
1049 | ||
8114efa0 CBW |
1050 | Many thanks to Florian Ragwitz for administration and packaging |
1051 | assistance, to John Tromp for bug reports, to Jonathan Roy for bug reports | |
1052 | and suggestions, to Michael Schwern for other bug reports and patches, | |
1053 | to Mike Cariaso for helping me to figure out the Right Thing to Do | |
1054 | About Expiration, to Joshua Gerth, Joshua Chamas, Jonathan Roy | |
1055 | (again), Mark D. Anderson, and Andrew Johnson for more suggestions | |
1056 | about expiration, to Brent Powers for the Memoize::ExpireLRU module, | |
1057 | to Ariel Scolnicov for delightful messages about the Fibonacci | |
1058 | function, to Dion Almaer for thought-provoking suggestions about the | |
1059 | default normalizer, to Walt Mankowski and Kurt Starsinic for much help | |
1060 | investigating problems under threaded Perl, to Alex Dudkevich for | |
1061 | reporting the bug in prototyped functions and for checking my patch, | |
1062 | to Tony Bass for many helpful suggestions, to Jonathan Roy (again) for | |
1063 | finding a use for C<unmemoize()>, to Philippe Verdret for enlightening | |
1064 | discussion of C<Hook::PrePostCall>, to Nat Torkington for advice I | |
1065 | ignored, to Chris Nandor for portability advice, to Randal Schwartz | |
1066 | for suggesting the 'C<flush_cache> function, and to Jenda Krynicky for | |
1067 | being a light in the world. | |
a0cb3900 | 1068 | |
899dc88a JH |
1069 | Special thanks to Jarkko Hietaniemi, the 5.8.0 pumpking, for including |
1070 | this module in the core and for his patient and helpful guidance | |
1071 | during the integration process. | |
3d4a255c | 1072 | |
a0cb3900 | 1073 | =cut |