This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
DJGPP tweaks from Laszlo.
[perl5.git] / lib / Tie / File.pm
CommitLineData
b5aed31e
AMS
1
2package Tie::File;
6fc0ea7e 3require 5.005;
b5aed31e
AMS
4use Carp;
5use POSIX 'SEEK_SET';
51efdd02 6use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX';
b5aed31e 7
6fc0ea7e 8$VERSION = "0.90";
b3fe5a4c 9my $DEFAULT_MEMORY_SIZE = 1<<21; # 2 megabytes
6fc0ea7e
JH
10my $DEFAULT_AUTODEFER_THRESHHOLD = 3; # 3 records
11my $DEFAULT_AUTODEFER_FILELEN_THRESHHOLD = 65536; # 16 disk blocksful
b3fe5a4c
AMS
12
13my %good_opt = map {$_ => 1, "-$_" => 1}
6fc0ea7e 14 qw(memory dw_size mode recsep discipline autodefer autochomp);
b5aed31e
AMS
15
16sub TIEARRAY {
17 if (@_ % 2 != 0) {
18 croak "usage: tie \@array, $_[0], filename, [option => value]...";
19 }
20 my ($pack, $file, %opts) = @_;
21
22 # transform '-foo' keys into 'foo' keys
23 for my $key (keys %opts) {
b3fe5a4c
AMS
24 unless ($good_opt{$key}) {
25 croak("$pack: Unrecognized option '$key'\n");
26 }
b5aed31e
AMS
27 my $okey = $key;
28 if ($key =~ s/^-+//) {
29 $opts{$key} = delete $opts{$okey};
30 }
31 }
32
b3fe5a4c
AMS
33 unless (defined $opts{memory}) {
34 # default is the larger of the default cache size and the
35 # deferred-write buffer size (if specified)
36 $opts{memory} = $DEFAULT_MEMORY_SIZE;
37 $opts{memory} = $opts{dw_size}
38 if defined $opts{dw_size} && $opts{dw_size} > $DEFAULT_MEMORY_SIZE;
57c7bc08 39 # Dora Winifred Read
b3fe5a4c
AMS
40 }
41 $opts{dw_size} = $opts{memory} unless defined $opts{dw_size};
42 if ($opts{dw_size} > $opts{memory}) {
43 croak("$pack: dw_size may not be larger than total memory allocation\n");
44 }
57c7bc08
AMS
45 # are we in deferred-write mode?
46 $opts{defer} = 0 unless defined $opts{defer};
47 $opts{deferred} = {}; # no records are presently deferred
b3fe5a4c 48 $opts{deferred_s} = 0; # count of total bytes in ->{deferred}
6fc0ea7e 49 $opts{deferred_max} = -1; # empty
b5aed31e
AMS
50
51 # the cache is a hash instead of an array because it is likely to be
52 # sparsely populated
6fc0ea7e
JH
53 $opts{cache} = Tie::File::Cache->new($opts{memory});
54
55 # autodeferment is enabled by default
56 $opts{autodefer} = 1 unless defined $opts{autodefer};
57 $opts{autodeferring} = 0; # but is not initially active
58 $opts{ad_history} = [];
59 $opts{autodefer_threshhold} = $DEFAULT_AUTODEFER_THRESHHOLD
60 unless defined $opts{autodefer_threshhold};
61 $opts{autodefer_filelen_threshhold} = $DEFAULT_AUTODEFER_FILELEN_THRESHHOLD
62 unless defined $opts{autodefer_filelen_threshhold};
b5aed31e
AMS
63
64 $opts{offsets} = [0];
65 $opts{filename} = $file;
b3fe5a4c
AMS
66 unless (defined $opts{recsep}) {
67 $opts{recsep} = _default_recsep();
68 }
b5aed31e
AMS
69 $opts{recseplen} = length($opts{recsep});
70 if ($opts{recseplen} == 0) {
71 croak "Empty record separator not supported by $pack";
72 }
73
0b28bc9a
AMS
74 $opts{autochomp} = 1 unless defined $opts{autochomp};
75
b5aed31e 76 my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
fa408a35 77 my $fh;
b5aed31e 78
fa408a35 79 if (UNIVERSAL::isa($file, 'GLOB')) {
57c7bc08
AMS
80 # We use 1 here on the theory that some systems
81 # may not indicate failure if we use 0.
82 # MSWin32 does not indicate failure with 0, but I don't know if
83 # it will indicate failure with 1 or not.
84 unless (seek $file, 1, SEEK_SET) {
fa408a35
AMS
85 croak "$pack: your filehandle does not appear to be seekable";
86 }
57c7bc08
AMS
87 seek $file, 0, SEEK_SET # put it back
88 $fh = $file; # setting binmode is the user's problem
fa408a35
AMS
89 } elsif (ref $file) {
90 croak "usage: tie \@array, $pack, filename, [option => value]...";
91 } else {
92 $fh = \do { local *FH }; # only works in 5.005 and later
93 sysopen $fh, $file, $mode, 0666 or return;
94 binmode $fh;
95 }
b5aed31e 96 { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
b3fe5a4c
AMS
97 if (defined $opts{discipline} && $] >= 5.006) {
98 # This avoids a compile-time warning under 5.005
99 eval 'binmode($fh, $opts{discipline})';
100 croak $@ if $@ =~ /unknown discipline/i;
101 die if $@;
102 }
b5aed31e
AMS
103 $opts{fh} = $fh;
104
105 bless \%opts => $pack;
106}
107
108sub FETCH {
109 my ($self, $n) = @_;
6fc0ea7e
JH
110 my $rec;
111
112 # check the defer buffer
113 if ($self->_is_deferring && exists $self->{deferred}{$n}) {
114 $rec = $self->{deferred}{$n};
115 } else {
116 $rec = $self->_fetch($n);
117 }
118
57c7bc08 119 $self->_chomp1($rec);
0b28bc9a
AMS
120}
121
122# Chomp many records in-place; return nothing useful
123sub _chomp {
124 my $self = shift;
125 return unless $self->{autochomp};
126 if ($self->{autochomp}) {
127 for (@_) {
128 next unless defined;
129 substr($_, - $self->{recseplen}) = "";
130 }
131 }
132}
133
134# Chomp one record in-place; return modified record
135sub _chomp1 {
136 my ($self, $rec) = @_;
137 return $rec unless $self->{autochomp};
138 return unless defined $rec;
139 substr($rec, - $self->{recseplen}) = "";
140 $rec;
141}
142
143sub _fetch {
144 my ($self, $n) = @_;
b5aed31e
AMS
145
146 # check the record cache
6fc0ea7e 147 { my $cached = $self->{cache}->lookup($n);
b5aed31e
AMS
148 return $cached if defined $cached;
149 }
150
151 unless ($#{$self->{offsets}} >= $n) {
152 my $o = $self->_fill_offsets_to($n);
153 # If it's still undefined, there is no such record, so return 'undef'
154 return unless defined $o;
155 }
156
157 my $fh = $self->{FH};
158 $self->_seek($n); # we can do this now that offsets is populated
159 my $rec = $self->_read_record;
b3fe5a4c
AMS
160
161# If we happen to have just read the first record, check to see if
162# the length of the record matches what 'tell' says. If not, Tie::File
163# won't work, and should drop dead.
164#
165# if ($n == 0 && defined($rec) && tell($self->{fh}) != length($rec)) {
166# if (defined $self->{discipline}) {
167# croak "I/O discipline $self->{discipline} not supported";
168# } else {
169# croak "File encoding not supported";
170# }
171# }
172
6fc0ea7e 173 $self->{cache}->insert($n, $rec) if defined $rec && not $self->{flushing};
b5aed31e
AMS
174 $rec;
175}
176
177sub STORE {
178 my ($self, $n, $rec) = @_;
6fc0ea7e 179 die "STORE called from _check_integrity!" if $DIAGNOSTIC;
b5aed31e
AMS
180
181 $self->_fixrecs($rec);
182
6fc0ea7e
JH
183 if ($self->{autodefer}) {
184 $self->_annotate_ad_history($n);
185 }
186
187 return $self->_store_deferred($n, $rec) if $self->_is_deferring;
188
b5aed31e
AMS
189
190 # We need this to decide whether the new record will fit
191 # It incidentally populates the offsets table
192 # Note we have to do this before we alter the cache
6fc0ea7e 193 # 20020324 Wait, but this DOES alter the cache. TODO BUG?
0b28bc9a 194 my $oldrec = $self->_fetch($n);
b5aed31e 195
6fc0ea7e
JH
196 if (defined($self->{cache}->lookup($n))) {
197 $self->{cache}->update($n, $rec);
fa408a35 198 }
b5aed31e
AMS
199
200 if (not defined $oldrec) {
201 # We're storing a record beyond the end of the file
51efdd02 202 $self->_extend_file_to($n+1);
b5aed31e
AMS
203 $oldrec = $self->{recsep};
204 }
205 my $len_diff = length($rec) - length($oldrec);
206
b3fe5a4c 207 # length($oldrec) here is not consistent with text mode TODO XXX BUG
b5aed31e
AMS
208 $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
209
210 # now update the offsets
211 # array slice goes from element $n+1 (the first one to move)
212 # to the end
213 for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
214 $_ += $len_diff;
215 }
216}
217
b3fe5a4c
AMS
218sub _store_deferred {
219 my ($self, $n, $rec) = @_;
6fc0ea7e 220 $self->{cache}->remove($n);
b3fe5a4c 221 my $old_deferred = $self->{deferred}{$n};
6fc0ea7e
JH
222
223 if (defined $self->{deferred_max} && $n > $self->{deferred_max}) {
224 $self->{deferred_max} = $n;
225 }
b3fe5a4c 226 $self->{deferred}{$n} = $rec;
6fc0ea7e
JH
227
228 my $len_diff = length($rec);
229 $len_diff -= length($old_deferred) if defined $old_deferred;
230 $self->{deferred_s} += $len_diff;
231 $self->{cache}->adj_limit(-$len_diff);
b3fe5a4c 232 if ($self->{deferred_s} > $self->{dw_size}) {
57c7bc08
AMS
233 $self->_flush;
234 } elsif ($self->_cache_too_full) {
b3fe5a4c
AMS
235 $self->_cache_flush;
236 }
237}
238
57c7bc08
AMS
239# Remove a single record from the deferred-write buffer without writing it
240# The record need not be present
241sub _delete_deferred {
242 my ($self, $n) = @_;
243 my $rec = delete $self->{deferred}{$n};
244 return unless defined $rec;
6fc0ea7e
JH
245
246 if (defined $self->{deferred_max}
247 && $n == $self->{deferred_max}) {
248 undef $self->{deferred_max};
249 }
250
57c7bc08 251 $self->{deferred_s} -= length $rec;
6fc0ea7e 252 $self->{cache}->adj_limit(length $rec);
57c7bc08
AMS
253}
254
b5aed31e
AMS
255sub FETCHSIZE {
256 my $self = shift;
257 my $n = $#{$self->{offsets}};
57c7bc08 258 # 20020317 Change this to binary search
b5aed31e
AMS
259 while (defined ($self->_fill_offsets_to($n+1))) {
260 ++$n;
261 }
6fc0ea7e
JH
262 my $top_deferred = $self->_defer_max;
263 $n = $top_deferred+1 if defined $top_deferred && $n < $top_deferred+1;
b5aed31e
AMS
264 $n;
265}
266
267sub STORESIZE {
268 my ($self, $len) = @_;
6fc0ea7e
JH
269
270 if ($self->{autodefer}) {
271 $self->_annotate_ad_history('STORESIZE');
272 }
273
b5aed31e
AMS
274 my $olen = $self->FETCHSIZE;
275 return if $len == $olen; # Woo-hoo!
276
277 # file gets longer
278 if ($len > $olen) {
6fc0ea7e 279 if ($self->_is_deferring) {
57c7bc08
AMS
280 for ($olen .. $len-1) {
281 $self->_store_deferred($_, $self->{recsep});
282 }
283 } else {
284 $self->_extend_file_to($len);
285 }
b5aed31e
AMS
286 return;
287 }
288
289 # file gets shorter
6fc0ea7e
JH
290 if ($self->_is_deferring) {
291 # TODO maybe replace this with map-plus-assignment?
57c7bc08
AMS
292 for (grep $_ >= $len, keys %{$self->{deferred}}) {
293 $self->_delete_deferred($_);
294 }
6fc0ea7e 295 $self->{deferred_max} = $len-1;
57c7bc08
AMS
296 }
297
b5aed31e
AMS
298 $self->_seek($len);
299 $self->_chop_file;
836d9961 300 $#{$self->{offsets}} = $len;
b3fe5a4c 301# $self->{offsets}[0] = 0; # in case we just chopped this
6fc0ea7e
JH
302
303 $self->{cache}->remove(grep $_ >= $len, $self->{cache}->keys);
b5aed31e
AMS
304}
305
51efdd02
AMS
306sub PUSH {
307 my $self = shift;
308 $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
57c7bc08 309# $self->FETCHSIZE; # av.c takes care of this for me
51efdd02
AMS
310}
311
312sub POP {
313 my $self = shift;
7b6b3db1
JH
314 my $size = $self->FETCHSIZE;
315 return if $size == 0;
316# print STDERR "# POPPITY POP POP POP\n";
317 scalar $self->SPLICE($size-1, 1);
51efdd02
AMS
318}
319
320sub SHIFT {
321 my $self = shift;
322 scalar $self->SPLICE(0, 1);
323}
324
325sub UNSHIFT {
326 my $self = shift;
327 $self->SPLICE(0, 0, @_);
57c7bc08 328 # $self->FETCHSIZE; # av.c takes care of this for me
51efdd02
AMS
329}
330
331sub CLEAR {
51efdd02 332 my $self = shift;
6fc0ea7e
JH
333
334 if ($self->{autodefer}) {
335 $self->_annotate_ad_history('CLEAR');
336 }
337
51efdd02
AMS
338 $self->_seekb(0);
339 $self->_chop_file;
6fc0ea7e
JH
340 $self->{cache}->set_limit($self->{memory});
341 $self->{cache}->empty;
51efdd02 342 @{$self->{offsets}} = (0);
57c7bc08
AMS
343 %{$self->{deferred}}= ();
344 $self->{deferred_s} = 0;
6fc0ea7e 345 $self->{deferred_max} = -1;
51efdd02
AMS
346}
347
348sub EXTEND {
349 my ($self, $n) = @_;
57c7bc08
AMS
350
351 # No need to pre-extend anything in this case
6fc0ea7e 352 return if $self->_is_deferring;
57c7bc08 353
51efdd02
AMS
354 $self->_fill_offsets_to($n);
355 $self->_extend_file_to($n);
356}
357
358sub DELETE {
359 my ($self, $n) = @_;
6fc0ea7e
JH
360
361 if ($self->{autodefer}) {
362 $self->_annotate_ad_history('DELETE');
363 }
364
51efdd02 365 my $lastrec = $self->FETCHSIZE-1;
57c7bc08 366 my $rec = $self->FETCH($n);
6fc0ea7e 367 $self->_delete_deferred($n) if $self->_is_deferring;
51efdd02
AMS
368 if ($n == $lastrec) {
369 $self->_seek($n);
370 $self->_chop_file;
fa408a35 371 $#{$self->{offsets}}--;
6fc0ea7e 372 $self->{cache}->remove($n);
51efdd02 373 # perhaps in this case I should also remove trailing null records?
57c7bc08
AMS
374 # 20020316
375 # Note that delete @a[-3..-1] deletes the records in the wrong order,
376 # so we only chop the very last one out of the file. We could repair this
377 # by tracking deleted records inside the object.
378 } elsif ($n < $lastrec) {
51efdd02
AMS
379 $self->STORE($n, "");
380 }
57c7bc08 381 $rec;
51efdd02
AMS
382}
383
384sub EXISTS {
385 my ($self, $n) = @_;
57c7bc08
AMS
386 return 1 if exists $self->{deferred}{$n};
387 $self->_fill_offsets_to($n); # I think this is unnecessary
388 $n < $self->FETCHSIZE;
51efdd02
AMS
389}
390
b5aed31e 391sub SPLICE {
b3fe5a4c 392 my $self = shift;
6fc0ea7e
JH
393
394 if ($self->{autodefer}) {
395 $self->_annotate_ad_history('SPLICE');
396 }
397
398 $self->_flush if $self->_is_deferring; # move this up?
0b28bc9a
AMS
399 if (wantarray) {
400 $self->_chomp(my @a = $self->_splice(@_));
401 @a;
402 } else {
403 $self->_chomp1(scalar $self->_splice(@_));
404 }
b3fe5a4c
AMS
405}
406
407sub DESTROY {
57c7bc08 408 my $self = shift;
6fc0ea7e
JH
409 $self->flush if $self->_is_deferring;
410 $self->{cache}->delink if defined $self->{cache}; # break circular link
b3fe5a4c
AMS
411}
412
413sub _splice {
b5aed31e
AMS
414 my ($self, $pos, $nrecs, @data) = @_;
415 my @result;
416
7b6b3db1
JH
417 $pos = 0 unless defined $pos;
418
419 # Deal with negative and other out-of-range positions
420 # Also set default for $nrecs
51efdd02
AMS
421 {
422 my $oldsize = $self->FETCHSIZE;
7b6b3db1 423 $nrecs = $oldsize unless defined $nrecs;
51efdd02
AMS
424 my $oldpos = $pos;
425
426 if ($pos < 0) {
427 $pos += $oldsize;
428 if ($pos < 0) {
429 croak "Modification of non-creatable array value attempted, subscript $oldpos";
430 }
431 }
432
433 if ($pos > $oldsize) {
434 return unless @data;
435 $pos = $oldsize; # This is what perl does for normal arrays
436 }
437 }
b5aed31e
AMS
438
439 $self->_fixrecs(@data);
440 my $data = join '', @data;
441 my $datalen = length $data;
442 my $oldlen = 0;
443
444 # compute length of data being removed
445 for ($pos .. $pos+$nrecs-1) {
6fc0ea7e 446 $self->_fill_offsets_to($_);
0b28bc9a 447 my $rec = $self->_fetch($_);
b5aed31e
AMS
448 last unless defined $rec;
449 push @result, $rec;
6fc0ea7e
JH
450
451 # Why don't we just use length($rec) here?
452 # Because that record might have come from the cache. _splice
453 # might have been called to flush out the deferred-write records,
454 # and in this case length($rec) is the length of the record to be *written*,
455 # not the length of the actual record in the file. But the offsets are
456 # still true. 20020322
457 $oldlen += $self->{offsets}[$_+1] - $self->{offsets}[$_]
458 if defined $self->{offsets}[$_+1];
b5aed31e
AMS
459 }
460
51efdd02 461 # Modify the file
b5aed31e
AMS
462 $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
463
464 # update the offsets table part 1
465 # compute the offsets of the new records:
466 my @new_offsets;
467 if (@data) {
468 push @new_offsets, $self->{offsets}[$pos];
469 for (0 .. $#data-1) {
470 push @new_offsets, $new_offsets[-1] + length($data[$_]);
471 }
472 }
473 splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
474
475 # update the offsets table part 2
476 # adjust the offsets of the following old records
477 for ($pos+@data .. $#{$self->{offsets}}) {
478 $self->{offsets}[$_] += $datalen - $oldlen;
479 }
480 # If we scrubbed out all known offsets, regenerate the trivial table
481 # that knows that the file does indeed start at 0.
482 $self->{offsets}[0] = 0 unless @{$self->{offsets}};
483
51efdd02
AMS
484 # Perhaps the following cache foolery could be factored out
485 # into a bunch of mor opaque cache functions. For example,
486 # it's odd to delete a record from the cache and then remove
487 # it from the LRU queue later on; there should be a function to
488 # do both at once.
489
b5aed31e
AMS
490 # update the read cache, part 1
491 # modified records
b5aed31e 492 for ($pos .. $pos+$nrecs-1) {
b5aed31e
AMS
493 my $new = $data[$_-$pos];
494 if (defined $new) {
6fc0ea7e 495 $self->{cache}->update($_, $new);
b5aed31e 496 } else {
6fc0ea7e 497 $self->{cache}->remove($_);
b5aed31e
AMS
498 }
499 }
6fc0ea7e 500
b5aed31e
AMS
501 # update the read cache, part 2
502 # moved records - records past the site of the change
503 # need to be renumbered
504 # Maybe merge this with the previous block?
b3fe5a4c 505 {
6fc0ea7e
JH
506 my @oldkeys = grep $_ >= $pos + $nrecs, $self->{cache}->keys;
507 my @newkeys = map $_-$nrecs+@data, @oldkeys;
508 $self->{cache}->rekey(\@oldkeys, \@newkeys);
b5aed31e 509 }
b5aed31e 510
b3fe5a4c
AMS
511 # Now there might be too much data in the cache, if we spliced out
512 # some short records and spliced in some long ones. If so, flush
513 # the cache.
514 $self->_cache_flush;
515
51efdd02
AMS
516 # Yes, the return value of 'splice' *is* actually this complicated
517 wantarray ? @result : @result ? $result[-1] : undef;
b5aed31e
AMS
518}
519
520# write data into the file
521# $data is the data to be written.
522# it should be written at position $pos, and should overwrite
523# exactly $len of the following bytes.
524# Note that if length($data) > $len, the subsequent bytes will have to
525# be moved up, and if length($data) < $len, they will have to
526# be moved down
527sub _twrite {
528 my ($self, $data, $pos, $len) = @_;
529
530 unless (defined $pos) {
531 die "\$pos was undefined in _twrite";
532 }
533
534 my $len_diff = length($data) - $len;
535
536 if ($len_diff == 0) { # Woo-hoo!
537 my $fh = $self->{fh};
538 $self->_seekb($pos);
539 $self->_write_record($data);
540 return; # well, that was easy.
541 }
542
543 # the two records are of different lengths
544 # our strategy here: rewrite the tail of the file,
545 # reading ahead one buffer at a time
546 # $bufsize is required to be at least as large as the data we're overwriting
547 my $bufsize = _bufsize($len_diff);
548 my ($writepos, $readpos) = ($pos, $pos+$len);
51efdd02 549 my $next_block;
6fc0ea7e 550 my $more_data;
b5aed31e
AMS
551
552 # Seems like there ought to be a way to avoid the repeated code
553 # and the special case here. The read(1) is also a little weird.
554 # Think about this.
555 do {
556 $self->_seekb($readpos);
51efdd02 557 my $br = read $self->{fh}, $next_block, $bufsize;
6fc0ea7e 558 $more_data = read $self->{fh}, my($dummy), 1;
b5aed31e
AMS
559 $self->_seekb($writepos);
560 $self->_write_record($data);
561 $readpos += $br;
562 $writepos += length $data;
563 $data = $next_block;
6fc0ea7e 564 } while $more_data; # BUG XXX TODO how could this have worked?
51efdd02
AMS
565 $self->_seekb($writepos);
566 $self->_write_record($next_block);
b5aed31e
AMS
567
568 # There might be leftover data at the end of the file
569 $self->_chop_file if $len_diff < 0;
570}
571
572# If a record does not already end with the appropriate terminator
573# string, append one.
574sub _fixrecs {
575 my $self = shift;
576 for (@_) {
577 $_ .= $self->{recsep}
578 unless substr($_, - $self->{recseplen}) eq $self->{recsep};
579 }
580}
581
57c7bc08
AMS
582
583################################################################
584#
585# Basic read, write, and seek
586#
587
b5aed31e
AMS
588# seek to the beginning of record #$n
589# Assumes that the offsets table is already correctly populated
590#
591# Note that $n=-1 has a special meaning here: It means the start of
592# the last known record; this may or may not be the very last record
593# in the file, depending on whether the offsets table is fully populated.
594#
595sub _seek {
596 my ($self, $n) = @_;
597 my $o = $self->{offsets}[$n];
598 defined($o)
599 or confess("logic error: undefined offset for record $n");
600 seek $self->{fh}, $o, SEEK_SET
601 or die "Couldn't seek filehandle: $!"; # "Should never happen."
602}
603
604sub _seekb {
605 my ($self, $b) = @_;
606 seek $self->{fh}, $b, SEEK_SET
607 or die "Couldn't seek filehandle: $!"; # "Should never happen."
608}
609
610# populate the offsets table up to the beginning of record $n
611# return the offset of record $n
612sub _fill_offsets_to {
613 my ($self, $n) = @_;
614 my $fh = $self->{fh};
615 local *OFF = $self->{offsets};
616 my $rec;
617
618 until ($#OFF >= $n) {
619 my $o = $OFF[-1];
620 $self->_seek(-1); # tricky -- see comment at _seek
621 $rec = $self->_read_record;
622 if (defined $rec) {
51efdd02 623 push @OFF, tell $fh;
b5aed31e
AMS
624 } else {
625 return; # It turns out there is no such record
626 }
627 }
628
629 # we have now read all the records up to record n-1,
630 # so we can return the offset of record n
631 return $OFF[$n];
632}
633
634# assumes that $rec is already suitably terminated
635sub _write_record {
636 my ($self, $rec) = @_;
637 my $fh = $self->{fh};
638 print $fh $rec
639 or die "Couldn't write record: $!"; # "Should never happen."
6fc0ea7e 640 $self->{_written} += length($rec);
b5aed31e
AMS
641}
642
643sub _read_record {
644 my $self = shift;
645 my $rec;
646 { local $/ = $self->{recsep};
647 my $fh = $self->{fh};
648 $rec = <$fh>;
649 }
6fc0ea7e 650 $self->{_read} += length($rec) if defined $rec;
b5aed31e
AMS
651 $rec;
652}
653
6fc0ea7e
JH
654sub _rw_stats {
655 @{$self}{'_read', '_written'};
656}
657
57c7bc08
AMS
658################################################################
659#
660# Read cache management
661
6fc0ea7e
JH
662sub _cache_flush {
663 my ($self) = @_;
664 $self->{cache}->reduce_size_to($self->{memory} - $self->{deferred_s});
b5aed31e
AMS
665}
666
57c7bc08
AMS
667sub _cache_too_full {
668 my $self = shift;
6fc0ea7e 669 $self->{cache}->bytes + $self->{deferred_s} >= $self->{memory};
b5aed31e
AMS
670}
671
57c7bc08
AMS
672################################################################
673#
674# File custodial services
675#
676
677
b5aed31e
AMS
678# We have read to the end of the file and have the offsets table
679# entirely populated. Now we need to write a new record beyond
680# the end of the file. We prepare for this by writing
681# empty records into the file up to the position we want
51efdd02
AMS
682#
683# assumes that the offsets table already contains the offset of record $n,
684# if it exists, and extends to the end of the file if not.
b5aed31e
AMS
685sub _extend_file_to {
686 my ($self, $n) = @_;
687 $self->_seek(-1); # position after the end of the last record
688 my $pos = $self->{offsets}[-1];
689
690 # the offsets table has one entry more than the total number of records
6fc0ea7e 691 my $extras = $n - $#{$self->{offsets}};
b5aed31e
AMS
692
693 # Todo : just use $self->{recsep} x $extras here?
694 while ($extras-- > 0) {
695 $self->_write_record($self->{recsep});
fa408a35 696 push @{$self->{offsets}}, tell $self->{fh};
b5aed31e
AMS
697 }
698}
699
700# Truncate the file at the current position
701sub _chop_file {
702 my $self = shift;
703 truncate $self->{fh}, tell($self->{fh});
704}
705
57c7bc08 706
b5aed31e
AMS
707# compute the size of a buffer suitable for moving
708# all the data in a file forward $n bytes
709# ($n may be negative)
710# The result should be at least $n.
711sub _bufsize {
712 my $n = shift;
713 return 8192 if $n < 0;
714 my $b = $n & ~8191;
715 $b += 8192 if $n & 8191;
716 $b;
717}
718
57c7bc08
AMS
719################################################################
720#
721# Miscellaneous public methods
722#
723
51efdd02
AMS
724# Lock the file
725sub flock {
726 my ($self, $op) = @_;
727 unless (@_ <= 3) {
728 my $pack = ref $self;
729 croak "Usage: $pack\->flock([OPERATION])";
730 }
731 my $fh = $self->{fh};
732 $op = LOCK_EX unless defined $op;
733 flock $fh, $op;
734}
b5aed31e 735
0b28bc9a
AMS
736# Get/set autochomp option
737sub autochomp {
738 my $self = shift;
739 if (@_) {
740 my $old = $self->{autochomp};
741 $self->{autochomp} = shift;
742 $old;
743 } else {
744 $self->{autochomp};
745 }
746}
747
57c7bc08
AMS
748################################################################
749#
750# Matters related to deferred writing
751#
752
753# Defer writes
754sub defer {
755 my $self = shift;
6fc0ea7e
JH
756 $self->_stop_autodeferring;
757 @{$self->{ad_history}} = ();
57c7bc08
AMS
758 $self->{defer} = 1;
759}
760
b3fe5a4c
AMS
761# Flush deferred writes
762#
763# This could be better optimized to write the file in one pass, instead
764# of one pass per block of records. But that will require modifications
765# to _twrite, so I should have a good _twite test suite first.
766sub flush {
767 my $self = shift;
768
769 $self->_flush;
770 $self->{defer} = 0;
771}
772
773sub _flush {
774 my $self = shift;
775 my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});
776
777 while (@writable) {
778 # gather all consecutive records from the front of @writable
779 my $first_rec = shift @writable;
780 my $last_rec = $first_rec+1;
781 ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
782 --$last_rec;
783 $self->_fill_offsets_to($last_rec);
784 $self->_extend_file_to($last_rec);
785 $self->_splice($first_rec, $last_rec-$first_rec+1,
786 @{$self->{deferred}}{$first_rec .. $last_rec});
787 }
788
57c7bc08 789 $self->_discard; # clear out defered-write-cache
b3fe5a4c
AMS
790}
791
57c7bc08 792# Discard deferred writes and disable future deferred writes
b3fe5a4c
AMS
793sub discard {
794 my $self = shift;
57c7bc08 795 $self->_discard;
b3fe5a4c
AMS
796 $self->{defer} = 0;
797}
798
57c7bc08
AMS
799# Discard deferred writes, but retain old deferred writing mode
800sub _discard {
801 my $self = shift;
6fc0ea7e
JH
802 %{$self->{deferred}} = ();
803 $self->{deferred_s} = 0;
804 $self->{deferred_max} = -1;
805 $self->{cache}->set_limit($self->{memory});
806}
807
808# Deferred writing is enabled, either explicitly ($self->{defer})
809# or automatically ($self->{autodeferring})
810sub _is_deferring {
811 my $self = shift;
812 $self->{defer} || $self->{autodeferring};
813}
814
815# The largest record number of any deferred record
816sub _defer_max {
817 my $self = shift;
818 return $self->{deferred_max} if defined $self->{deferred_max};
819 my $max = -1;
820 for my $key (keys %{$self->{deferred}}) {
821 $max = $key if $key > $max;
822 }
823 $self->{deferred_max} = $max;
824 $max;
57c7bc08
AMS
825}
826
6fc0ea7e
JH
827################################################################
828#
829# Matters related to autodeferment
830#
831
832# Get/set autodefer option
833sub autodefer {
834 my $self = shift;
835 if (@_) {
836 my $old = $self->{autodefer};
837 $self->{autodefer} = shift;
838 if ($old) {
839 $self->_stop_autodeferring;
840 @{$self->{ad_history}} = ();
841 }
842 $old;
843 } else {
844 $self->{autodefer};
845 }
846}
847
848# The user is trying to store record #$n Record that in the history,
849# and then enable (or disable) autodeferment if that seems useful.
850# Note that it's OK for $n to be a non-number, as long as the function
851# is prepared to deal with that. Nobody else looks at the ad_history.
852#
853# Now, what does the ad_history mean, and what is this function doing?
854# Essentially, the idea is to enable autodeferring when we see that the
855# user has made three consecutive STORE calls to three consecutive records.
856# ("Three" is actually ->{autodefer_threshhold}.)
857# A STORE call for record #$n inserts $n into the autodefer history,
858# and if the history contains three consecutive records, we enable
859# autodeferment. An ad_history of [X, Y] means that the most recent
860# STOREs were for records X, X+1, ..., Y, in that order.
861#
862# Inserting a nonconsecutive number erases the history and starts over.
863#
864# Performing a special operation like SPLICE erases the history.
865#
866# There's one special case: CLEAR means that CLEAR was just called.
867# In this case, we prime the history with [-2, -1] so that if the next
868# write is for record 0, autodeferring goes on immediately. This is for
869# the common special case of "@a = (...)".
870#
871sub _annotate_ad_history {
872 my ($self, $n) = @_;
873 return unless $self->{autodefer}; # feature is disabled
874 return if $self->{defer}; # already in explicit defer mode
875 return unless $self->{offsets}[-1] >= $self->{autodefer_filelen_threshhold};
876
877 local *H = $self->{ad_history};
878 if ($n eq 'CLEAR') {
879 @H = (-2, -1); # prime the history with fake records
880 $self->_stop_autodeferring;
881 } elsif ($n =~ /^\d+$/) {
882 if (@H == 0) {
883 @H = ($n, $n);
884 } else { # @H == 2
885 if ($H[1] == $n-1) { # another consecutive record
886 $H[1]++;
887 if ($H[1] - $H[0] + 1 >= $self->{autodefer_threshhold}) {
888 $self->{autodeferring} = 1;
889 }
890 } else { # nonconsecutive- erase and start over
891 @H = ($n, $n);
892 $self->_stop_autodeferring;
893 }
894 }
895 } else { # SPLICE or STORESIZE or some such
896 @H = ();
897 $self->_stop_autodeferring;
898 }
899}
900
901# If autodferring was enabled, cut it out and discard the history
902sub _stop_autodeferring {
903 my $self = shift;
904 if ($self->{autodeferring}) {
905 $self->_flush;
906 }
907 $self->{autodeferring} = 0;
908}
909
910################################################################
911
b3fe5a4c 912
57c7bc08
AMS
913# This is NOT a method. It is here for two reasons:
914# 1. To factor a fairly complicated block out of the constructor
915# 2. To provide access for the test suite, which need to be sure
916# files are being written properly.
b3fe5a4c
AMS
917sub _default_recsep {
918 my $recsep = $/;
57c7bc08 919 if ($^O eq 'MSWin32') { # Dos too?
b3fe5a4c
AMS
920 # Windows users expect files to be terminated with \r\n
921 # But $/ is set to \n instead
922 # Note that this also transforms \n\n into \r\n\r\n.
923 # That is a feature.
924 $recsep =~ s/\n/\r\n/g;
925 }
926 $recsep;
927}
928
57c7bc08
AMS
929# Utility function for _check_integrity
930sub _ci_warn {
931 my $msg = shift;
932 $msg =~ s/\n/\\n/g;
933 $msg =~ s/\r/\\r/g;
934 print "# $msg\n";
935}
936
b5aed31e 937# Given a file, make sure the cache is consistent with the
57c7bc08
AMS
938# file contents and the internal data structures are consistent with
939# each other. Returns true if everything checks out, false if not
940#
941# The $file argument is no longer used. It is retained for compatibility
942# with the existing test suite.
b5aed31e
AMS
943sub _check_integrity {
944 my ($self, $file, $warn) = @_;
6fc0ea7e
JH
945 my $rsl = $self->{recseplen};
946 my $rs = $self->{recsep};
b5aed31e 947 my $good = 1;
6fc0ea7e
JH
948 local *_; # local $_ does not work here
949 local $DIAGNOSTIC = 1;
950
951 if (not defined $rs) {
952 _ci_warn("recsep is undef!");
953 $good = 0;
954 } elsif ($rs eq "") {
955 _ci_warn("recsep is empty!");
956 $good = 0;
957 } elsif ($rsl != length $rs) {
958 my $ln = length $rs;
959 _ci_warn("recsep <$rs> has length $ln, should be $rsl");
960 $good = 0;
961 }
fa408a35 962
836d9961 963 if (not defined $self->{offsets}[0]) {
57c7bc08 964 _ci_warn("offset 0 is missing!");
836d9961
JH
965 $good = 0;
966 } elsif ($self->{offsets}[0] != 0) {
57c7bc08 967 _ci_warn("rec 0: offset <$self->{offsets}[0]> s/b 0!");
b5aed31e
AMS
968 $good = 0;
969 }
fa408a35 970
57c7bc08 971 my $cached = 0;
6fc0ea7e
JH
972 {
973 local *F = $self->{fh};
974 seek F, 0, SEEK_SET;
975 local $. = 0;
976 local $/ = $rs;
977
978 while (<F>) {
979 my $n = $. - 1;
980 my $cached = $self->{cache}->_produce($n);
981 my $offset = $self->{offsets}[$.];
982 my $ao = tell F;
983 if (defined $offset && $offset != $ao) {
984 _ci_warn("rec $n: offset <$offset> actual <$ao>");
985 $good = 0;
986 }
987 if (defined $cached && $_ ne $cached && ! $self->{deferred}{$n}) {
988 $good = 0;
989 _ci_warn("rec $n: cached <$cached> actual <$_>");
990 }
991 if (defined $cached && substr($cached, -$rsl) ne $rs) {
992 _ci_warn("rec $n in the cache is missing the record separator");
993 }
994 }
b5aed31e 995
6fc0ea7e
JH
996 my $deferring = $self->_is_deferring;
997 for my $n ($self->{cache}->keys) {
998 my $r = $self->{cache}->_produce($n);
999 $cached += length($r);
1000 next if $n+1 <= $.; # checked this already
1001 _ci_warn("spurious caching of record $n");
b5aed31e
AMS
1002 $good = 0;
1003 }
6fc0ea7e
JH
1004 my $b = $self->{cache}->bytes;
1005 if ($cached != $b) {
1006 _ci_warn("cache size is $b, should be $cached");
b5aed31e
AMS
1007 $good = 0;
1008 }
1009 }
1010
6fc0ea7e
JH
1011 $good = 0 unless $self->{cache}->_check_integrity;
1012
57c7bc08
AMS
1013 # Now let's check the deferbuffer
1014 # Unless deferred writing is enabled, it should be empty
6fc0ea7e 1015 if (! $self->_is_deferring && %{$self->{deferred}}) {
57c7bc08
AMS
1016 _ci_warn("deferred writing disabled, but deferbuffer nonempty");
1017 $good = 0;
1018 }
1019
1020 # Any record in the deferbuffer should *not* be present in the readcache
1021 my $deferred_s = 0;
1022 while (my ($n, $r) = each %{$self->{deferred}}) {
1023 $deferred_s += length($r);
6fc0ea7e 1024 if (defined $self->{cache}->_produce($n)) {
57c7bc08
AMS
1025 _ci_warn("record $n is in the deferbuffer *and* the readcache");
1026 $good = 0;
1027 }
6fc0ea7e 1028 if (substr($r, -$rsl) ne $rs) {
57c7bc08
AMS
1029 _ci_warn("rec $n in the deferbuffer is missing the record separator");
1030 $good = 0;
1031 }
1032 }
1033
1034 # Total size of deferbuffer should match internal total
1035 if ($deferred_s != $self->{deferred_s}) {
1036 _ci_warn("buffer size is $self->{deferred_s}, should be $deferred_s");
1037 $good = 0;
1038 }
1039
1040 # Total size of deferbuffer should not exceed the specified limit
1041 if ($deferred_s > $self->{dw_size}) {
1042 _ci_warn("buffer size is $self->{deferred_s} which exceeds the limit of $self->{dw_size}");
1043 $good = 0;
1044 }
1045
1046 # Total size of cached data should not exceed the specified limit
1047 if ($deferred_s + $cached > $self->{memory}) {
1048 my $total = $deferred_s + $cached;
1049 _ci_warn("total stored data size is $total which exceeds the limit of $self->{memory}");
1050 $good = 0;
1051 }
1052
6fc0ea7e
JH
1053 # Stuff related to autodeferment
1054 if (!$self->{autodefer} && @{$self->{ad_history}}) {
1055 _ci_warn("autodefer is disabled, but ad_history is nonempty");
1056 $good = 0;
1057 }
1058 if ($self->{autodeferring} && $self->{defer}) {
1059 _ci_warn("both autodeferring and explicit deferring are active");
1060 $good = 0;
1061 }
1062 if (@{$self->{ad_history}} == 0) {
1063 # That's OK, no additional tests required
1064 } elsif (@{$self->{ad_history}} == 2) {
1065 my @non_number = grep !/^-?\d+$/, @{$self->{ad_history}};
1066 if (@non_number) {
1067 my $msg;
1068 { local $" = ')(';
1069 $msg = "ad_history contains non-numbers (@{$self->{ad_history}})";
1070 }
1071 _ci_warn($msg);
1072 $good = 0;
1073 } elsif ($self->{ad_history}[1] < $self->{ad_history}[0]) {
1074 _ci_warn("ad_history has nonsensical values @{$self->{ad_history}}");
1075 $good = 0;
1076 }
1077 } else {
1078 _ci_warn("ad_history has bad length <@{$self->{ad_history}}>");
1079 $good = 0;
1080 }
1081
b5aed31e
AMS
1082 $good;
1083}
1084
6fc0ea7e
JH
1085################################################################
1086#
1087# Tie::File::Cache
1088#
1089# Read cache
1090
1091package Tie::File::Cache;
1092$Tie::File::Cache::VERSION = $Tie::File::VERSION;
1093use Carp ':DEFAULT', 'confess';
1094
1095sub HEAP () { 0 }
1096sub HASH () { 1 }
1097sub MAX () { 2 }
1098sub BYTES() { 3 }
1099use strict 'vars';
1100
1101sub new {
1102 my ($pack, $max) = @_;
1103 local *_;
1104 croak "missing argument to ->new" unless defined $max;
1105 my $self = [];
1106 bless $self => $pack;
1107 @$self = (Tie::File::Heap->new($self), {}, $max, 0);
1108 $self;
1109}
1110
1111sub adj_limit {
1112 my ($self, $n) = @_;
1113 $self->[MAX] += $n;
1114}
1115
1116sub set_limit {
1117 my ($self, $n) = @_;
1118 $self->[MAX] = $n;
1119}
1120
1121# For internal use only
1122# Will be called by the heap structure to notify us that a certain
1123# piece of data has moved from one heap element to another.
1124# $k is the hash key of the item
1125# $n is the new index into the heap at which it is stored
1126# If $n is undefined, the item has been removed from the heap.
1127sub _heap_move {
1128 my ($self, $k, $n) = @_;
1129 if (defined $n) {
1130 $self->[HASH]{$k} = $n;
1131 } else {
1132 delete $self->[HASH]{$k};
1133 }
1134}
1135
1136sub insert {
1137 my ($self, $key, $val) = @_;
1138 local *_;
1139 croak "missing argument to ->insert" unless defined $key;
1140 unless (defined $self->[MAX]) {
1141 confess "undefined max" ;
1142 }
1143 confess "undefined val" unless defined $val;
1144 return if length($val) > $self->[MAX];
1145 my $oldnode = $self->[HASH]{$key};
1146 if (defined $oldnode) {
1147 my $oldval = $self->[HEAP]->set_val($oldnode, $val);
1148 $self->[BYTES] -= length($oldval);
1149 } else {
1150 $self->[HEAP]->insert($key, $val);
1151 }
1152 $self->[BYTES] += length($val);
1153 $self->flush;
1154}
1155
1156sub expire {
1157 my $self = shift;
1158 my $old_data = $self->[HEAP]->popheap;
1159 return unless defined $old_data;
1160 $self->[BYTES] -= length $old_data;
1161 $old_data;
1162}
1163
1164sub remove {
1165 my ($self, @keys) = @_;
1166 my @result;
1167 for my $key (@keys) {
1168 next unless exists $self->[HASH]{$key};
1169 my $old_data = $self->[HEAP]->remove($self->[HASH]{$key});
1170 $self->[BYTES] -= length $old_data;
1171 push @result, $old_data;
1172 }
1173 @result;
1174}
1175
1176sub lookup {
1177 my ($self, $key) = @_;
1178 local *_;
1179 croak "missing argument to ->lookup" unless defined $key;
1180 if (exists $self->[HASH]{$key}) {
1181 $self->[HEAP]->lookup($self->[HASH]{$key});
1182 } else {
1183 return;
1184 }
1185}
1186
1187# For internal use only
1188sub _produce {
1189 my ($self, $key) = @_;
1190 my $loc = $self->[HASH]{$key};
1191 return unless defined $loc;
1192 $self->[HEAP][$loc][2];
1193}
1194
1195# For internal use only
1196sub _promote {
1197 my ($self, $key) = @_;
1198 $self->[HEAP]->promote($self->[HASH]{$key});
1199}
1200
1201sub empty {
1202 my ($self) = @_;
1203 %{$self->[HASH]} = ();
1204 $self->[BYTES] = 0;
1205 $self->[HEAP]->empty;
1206}
1207
1208sub is_empty {
1209 my ($self) = @_;
1210 keys %{$self->[HASH]} == 0;
1211}
1212
1213sub update {
1214 my ($self, $key, $val) = @_;
1215 local *_;
1216 croak "missing argument to ->update" unless defined $key;
1217 if (length($val) > $self->[MAX]) {
1218 my $oldval = $self->remove($key);
1219 $self->[BYTES] -= length($oldval) if defined $oldval;
1220 } elsif (exists $self->[HASH]{$key}) {
1221 my $oldval = $self->[HEAP]->set_val($self->[HASH]{$key}, $val);
1222 $self->[BYTES] += length($val);
1223 $self->[BYTES] -= length($oldval) if defined $oldval;
1224 } else {
1225 $self->[HEAP]->insert($key, $val);
1226 $self->[BYTES] += length($val);
1227 }
1228 $self->flush;
1229}
1230
1231sub rekey {
1232 my ($self, $okeys, $nkeys) = @_;
1233 local *_;
1234 my %map;
1235 @map{@$okeys} = @$nkeys;
1236 croak "missing argument to ->rekey" unless defined $nkeys;
1237 croak "length mismatch in ->rekey arguments" unless @$nkeys == @$okeys;
1238 my %adjusted; # map new keys to heap indices
1239 # You should be able to cut this to one loop TODO XXX
1240 for (0 .. $#$okeys) {
1241 $adjusted{$nkeys->[$_]} = delete $self->[HASH]{$okeys->[$_]};
1242 }
1243 while (my ($nk, $ix) = each %adjusted) {
1244 # @{$self->[HASH]}{keys %adjusted} = values %adjusted;
1245 $self->[HEAP]->rekey($ix, $nk);
1246 $self->[HASH]{$nk} = $ix;
1247 }
1248}
1249
1250sub keys {
1251 my $self = shift;
1252 my @a = keys %{$self->[HASH]};
1253 @a;
1254}
1255
1256sub bytes {
1257 my $self = shift;
1258 $self->[BYTES];
1259}
1260
1261sub reduce_size_to {
1262 my ($self, $max) = @_;
1263 until ($self->is_empty || $self->[BYTES] <= $max) {
1264 $self->expire;
1265 }
1266}
1267
1268sub flush {
1269 my $self = shift;
1270 until ($self->is_empty || $self->[BYTES] <= $self->[MAX]) {
1271 $self->expire;
1272 }
1273}
1274
1275# For internal use only
1276sub _produce_lru {
1277 my $self = shift;
1278 $self->[HEAP]->expire_order;
1279}
1280
1281sub _check_integrity {
1282 my $self = shift;
1283 $self->[HEAP]->_check_integrity;
1284}
1285
1286sub delink {
1287 my $self = shift;
1288 $self->[HEAP] = undef; # Bye bye heap
1289}
1290
1291################################################################
1292#
1293# Tie::File::Heap
1294#
1295# Heap data structure for use by cache LRU routines
1296
1297package Tie::File::Heap;
1298use Carp ':DEFAULT', 'confess';
1299$Tie::File::Heap::VERSION = $Tie::File::Cache::VERSION;
1300sub SEQ () { 0 };
1301sub KEY () { 1 };
1302sub DAT () { 2 };
1303
1304sub new {
1305 my ($pack, $cache) = @_;
1306 die "$pack: Parent cache object $cache does not support _heap_move method"
1307 unless eval { $cache->can('_heap_move') };
1308 my $self = [[0,$cache,0]];
1309 bless $self => $pack;
1310}
1311
1312# Allocate a new sequence number, larger than all previously allocated numbers
1313sub _nseq {
1314 my $self = shift;
1315 $self->[0][0]++;
1316}
1317
1318sub _cache {
1319 my $self = shift;
1320 $self->[0][1];
1321}
1322
1323sub _nelts {
1324 my $self = shift;
1325 $self->[0][2];
1326}
1327
1328sub _nelts_inc {
1329 my $self = shift;
1330 ++$self->[0][2];
1331}
1332
1333sub _nelts_dec {
1334 my $self = shift;
1335 --$self->[0][2];
1336}
1337
1338sub is_empty {
1339 my $self = shift;
1340 $self->_nelts == 0;
1341}
1342
1343sub empty {
1344 my $self = shift;
1345 $#$self = 0;
1346 $self->[0][2] = 0;
1347 $self->[0][0] = 0; # might as well reset the sequence numbers
1348}
1349
1350# notify the parent cache objec tthat we moved something
1351sub _heap_move {
1352 my $self = shift;
1353 $self->_cache->_heap_move(@_);
1354}
1355
1356# Insert a piece of data into the heap with the indicated sequence number.
1357# The item with the smallest sequence number is always at the top.
1358# If no sequence number is specified, allocate a new one and insert the
1359# item at the bottom.
1360sub insert {
1361 my ($self, $key, $data, $seq) = @_;
1362 $seq = $self->_nseq unless defined $seq;
1363 $self->_insert_new([$seq, $key, $data]);
1364}
1365
1366# Insert a new, fresh item at the bottom of the heap
1367sub _insert_new {
1368 my ($self, $item) = @_;
1369 my $i = @$self;
1370 $i = int($i/2) until defined $self->[$i/2];
1371 $self->[$i] = $item;
1372 $self->_heap_move($self->[$i][KEY], $i);
1373 $self->_nelts_inc;
1374}
1375
1376# Insert [$data, $seq] pair at or below item $i in the heap.
1377# If $i is omitted, default to 1 (the top element.)
1378sub _insert {
1379 my ($self, $item, $i) = @_;
1380 $self->_check_loc($i) if defined $i;
1381 $i = 1 unless defined $i;
1382 until (! defined $self->[$i]) {
1383 if ($self->[$i][SEQ] > $item->[SEQ]) { # inserted item is older
1384 ($self->[$i], $item) = ($item, $self->[$i]);
1385 $self->_heap_move($self->[$i][KEY], $i);
1386 }
1387 # If either is undefined, go that way. Otherwise, choose at random
1388 my $dir;
1389 $dir = 0 if !defined $self->[2*$i];
1390 $dir = 1 if !defined $self->[2*$i+1];
1391 $dir = int(rand(2)) unless defined $dir;
1392 $i = 2*$i + $dir;
1393 }
1394 $self->[$i] = $item;
1395 $self->_heap_move($self->[$i][KEY], $i);
1396 $self->_nelts_inc;
1397}
1398
1399# Remove the item at node $i from the heap, moving child items upwards.
1400# The item with the smallest sequence number is always at the top.
1401# Moving items upwards maintains this condition.
1402# Return the removed item.
1403sub remove {
1404 my ($self, $i) = @_;
1405 $i = 1 unless defined $i;
1406 my $top = $self->[$i];
1407 return unless defined $top;
1408 while (1) {
1409 my $ii;
1410 my ($L, $R) = (2*$i, 2*$i+1);
1411
1412 # If either is undefined, go the other way.
1413 # Otherwise, go towards the smallest.
1414 last unless defined $self->[$L] || defined $self->[$R];
1415 $ii = $R if not defined $self->[$L];
1416 $ii = $L if not defined $self->[$R];
1417 unless (defined $ii) {
1418 $ii = $self->[$L][SEQ] < $self->[$R][SEQ] ? $L : $R;
1419 }
1420
1421 $self->[$i] = $self->[$ii]; # Promote child to fill vacated spot
1422 $self->_heap_move($self->[$i][KEY], $i);
1423 $i = $ii; # Fill new vacated spot
1424 }
1425 $self->_heap_move($top->[KEY], undef);
1426 undef $self->[$i];
1427 $self->_nelts_dec;
1428 return $top->[DAT];
1429}
1430
1431sub popheap {
1432 my $self = shift;
1433 $self->remove(1);
1434}
1435
1436# set the sequence number of the indicated item to a higher number
1437# than any other item in the heap, and bubble the item down to the
1438# bottom.
1439sub promote {
1440 my ($self, $n) = @_;
1441 $self->_check_loc($n);
1442 $self->[$n][SEQ] = $self->_nseq;
1443 my $i = $n;
1444 while (1) {
1445 my ($L, $R) = (2*$i, 2*$i+1);
1446 my $dir;
1447 last unless defined $self->[$L] || defined $self->[$R];
1448 $dir = $R unless defined $self->[$L];
1449 $dir = $L unless defined $self->[$R];
1450 unless (defined $dir) {
1451 $dir = $self->[$L][SEQ] < $self->[$R][SEQ] ? $L : $R;
1452 }
1453 @{$self}[$i, $dir] = @{$self}[$dir, $i];
1454 for ($i, $dir) {
1455 $self->_heap_move($self->[$_][KEY], $_) if defined $self->[$_];
1456 }
1457 $i = $dir;
1458 }
1459}
1460
1461# Return item $n from the heap, promoting its LRU status
1462sub lookup {
1463 my ($self, $n) = @_;
1464 $self->_check_loc($n);
1465 my $val = $self->[$n];
1466 $self->promote($n);
1467 $val->[DAT];
1468}
1469
1470
1471# Assign a new value for node $n, promoting it to the bottom of the heap
1472sub set_val {
1473 my ($self, $n, $val) = @_;
1474 $self->_check_loc($n);
1475 my $oval = $self->[$n][DAT];
1476 $self->[$n][DAT] = $val;
1477 $self->promote($n);
1478 return $oval;
1479}
1480
1481# The hask key has changed for an item;
1482# alter the heap's record of the hash key
1483sub rekey {
1484 my ($self, $n, $new_key) = @_;
1485 $self->_check_loc($n);
1486 $self->[$n][KEY] = $new_key;
1487}
1488
1489sub _check_loc {
1490 my ($self, $n) = @_;
1491 unless (defined $self->[$n]) {
1492 confess "_check_loc($n) failed";
1493 }
1494}
1495
1496sub _check_integrity {
1497 my $self = shift;
1498 my $good = 1;
1499 unless (eval {$self->[0][1]->isa("Tie::File::Cache")}) {
1500 print "# Element 0 of heap corrupt\n";
1501 $good = 0;
1502 }
1503 $good = 0 unless $self->_satisfies_heap_condition(1);
1504 for my $i (2 .. $#{$self}) {
1505 my $p = int($i/2); # index of parent node
1506 if (defined $self->[$i] && ! defined $self->[$p]) {
1507 print "# Element $i of heap defined, but parent $p isn't\n";
1508 $good = 0;
1509 }
1510 }
1511 return $good;
1512}
1513
1514sub _satisfies_heap_condition {
1515 my $self = shift;
1516 my $n = shift || 1;
1517 my $good = 1;
1518 for (0, 1) {
1519 my $c = $n*2 + $_;
1520 next unless defined $self->[$c];
1521 if ($self->[$n][SEQ] >= $self->[$c]) {
1522 print "# Node $n of heap does not predate node $c\n";
1523 $good = 0 ;
1524 }
1525 $good = 0 unless $self->_satisfies_heap_condition($c);
1526 }
1527 return $good;
1528}
1529
1530# Return a list of all the values, sorted by expiration order
1531sub expire_order {
1532 my $self = shift;
1533 my @nodes = sort {$a->[SEQ] <=> $b->[SEQ]} $self->_nodes;
1534 map { $_->[KEY] } @nodes;
1535}
1536
1537sub _nodes {
1538 my $self = shift;
1539 my $i = shift || 1;
1540 return unless defined $self->[$i];
1541 ($self->[$i], $self->_nodes($i*2), $self->_nodes($i*2+1));
1542}
1543
15441;
1545
1546
1547
fa408a35
AMS
1548"Cogito, ergo sum."; # don't forget to return a true value from the file
1549
b5aed31e
AMS
1550=head1 NAME
1551
1552Tie::File - Access the lines of a disk file via a Perl array
1553
1554=head1 SYNOPSIS
1555
6fc0ea7e 1556 # This file documents Tie::File version 0.90
b5aed31e
AMS
1557
1558 tie @array, 'Tie::File', filename or die ...;
1559
1560 $array[13] = 'blah'; # line 13 of the file is now 'blah'
1561 print $array[42]; # display line 42 of the file
1562
1563 $n_recs = @array; # how many records are in the file?
57c7bc08
AMS
1564 $#array -= 2; # chop two records off the end
1565
b5aed31e 1566
57c7bc08
AMS
1567 for (@array) {
1568 s/PERL/Perl/g; # Replace PERL with Perl everywhere in the file
1569 }
1570
1571 # These are just like regular push, pop, unshift, shift, and splice
1572 # Except that they modify the file in the way you would expect
51efdd02
AMS
1573
1574 push @array, new recs...;
1575 my $r1 = pop @array;
1576 unshift @array, new recs...;
1577 my $r1 = shift @array;
b5aed31e
AMS
1578 @old_recs = splice @array, 3, 7, new recs...;
1579
1580 untie @array; # all finished
1581
57c7bc08 1582
b5aed31e
AMS
1583=head1 DESCRIPTION
1584
1585C<Tie::File> represents a regular text file as a Perl array. Each
1586element in the array corresponds to a record in the file. The first
1587line of the file is element 0 of the array; the second line is element
15881, and so on.
1589
1590The file is I<not> loaded into memory, so this will work even for
1591gigantic files.
1592
1593Changes to the array are reflected in the file immediately.
1594
57c7bc08 1595Lazy people and beginners may now stop reading the manual.
b3fe5a4c 1596
b5aed31e
AMS
1597=head2 C<recsep>
1598
1599What is a 'record'? By default, the meaning is the same as for the
1600C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
b3fe5a4c
AMS
1601probably C<"\n">. (Minor exception: on dos and Win32 systems, a
1602'record' is a string terminated by C<"\r\n">.) You may change the
1603definition of "record" by supplying the C<recsep> option in the C<tie>
1604call:
b5aed31e
AMS
1605
1606 tie @array, 'Tie::File', $file, recsep => 'es';
1607
b3fe5a4c
AMS
1608This says that records are delimited by the string C<es>. If the file
1609contained the following data:
b5aed31e
AMS
1610
1611 Curse these pesky flies!\n
1612
1613then the C<@array> would appear to have four elements:
1614
0b28bc9a
AMS
1615 "Curse th"
1616 "e p"
1617 "ky fli"
b5aed31e
AMS
1618 "!\n"
1619
1620An undefined value is not permitted as a record separator. Perl's
1621special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
1622emulated.
1623
0b28bc9a
AMS
1624Records read from the tied array do not have the record separator
1625string on the end; this is to allow
1626
1627 $array[17] .= "extra";
1628
1629to work as expected.
1630
1631(See L<"autochomp">, below.) Records stored into the array will have
1632the record separator string appended before they are written to the
1633file, if they don't have one already. For example, if the record
1634separator string is C<"\n">, then the following two lines do exactly
1635the same thing:
b5aed31e
AMS
1636
1637 $array[17] = "Cherry pie";
1638 $array[17] = "Cherry pie\n";
1639
1640The result is that the contents of line 17 of the file will be
1641replaced with "Cherry pie"; a newline character will separate line 17
7b6b3db1 1642from line 18. This means that in particular, this will do nothing:
b5aed31e
AMS
1643
1644 chomp $array[17];
1645
1646Because the C<chomp>ed value will have the separator reattached when
1647it is written back to the file. There is no way to create a file
1648whose trailing record separator string is missing.
1649
1650Inserting records that I<contain> the record separator string will
1651produce a reasonable result, but if you can't foresee what this result
1652will be, you'd better avoid doing this.
1653
0b28bc9a
AMS
1654=head2 C<autochomp>
1655
1656Normally, array elements have the record separator removed, so that if
1657the file contains the text
1658
1659 Gold
1660 Frankincense
1661 Myrrh
1662
57c7bc08
AMS
1663the tied array will appear to contain C<("Gold", "Frankincense",
1664"Myrrh")>. If you set C<autochomp> to a false value, the record
1665separator will not be removed. If the file above was tied with
0b28bc9a
AMS
1666
1667 tie @gifts, "Tie::File", $gifts, autochomp => 0;
1668
1669then the array C<@gifts> would appear to contain C<("Gold\n",
1670"Frankincense\n", "Myrrh\n")>, or (on Win32 systems) C<("Gold\r\n",
1671"Frankincense\r\n", "Myrrh\r\n")>.
1672
b5aed31e
AMS
1673=head2 C<mode>
1674
1675Normally, the specified file will be opened for read and write access,
1676and will be created if it does not exist. (That is, the flags
1677C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
1678change this, you may supply alternative flags in the C<mode> option.
1679See L<Fcntl> for a listing of available flags.
1680For example:
1681
1682 # open the file if it exists, but fail if it does not exist
1683 use Fcntl 'O_RDWR';
1684 tie @array, 'Tie::File', $file, mode => O_RDWR;
1685
1686 # create the file if it does not exist
1687 use Fcntl 'O_RDWR', 'O_CREAT';
1688 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
1689
1690 # open an existing file in read-only mode
1691 use Fcntl 'O_RDONLY';
1692 tie @array, 'Tie::File', $file, mode => O_RDONLY;
1693
1694Opening the data file in write-only or append mode is not supported.
1695
b3fe5a4c
AMS
1696=head2 C<memory>
1697
57c7bc08
AMS
1698This is an upper limit on the amount of memory that C<Tie::File> will
1699consume at any time while managing the file. This is used for two
1700things: managing the I<read cache> and managing the I<deferred write
1701buffer>.
b5aed31e
AMS
1702
1703Records read in from the file are cached, to avoid having to re-read
1704them repeatedly. If you read the same record twice, the first time it
1705will be stored in memory, and the second time it will be fetched from
b3fe5a4c
AMS
1706the I<read cache>. The amount of data in the read cache will not
1707exceed the value you specified for C<memory>. If C<Tie::File> wants
1708to cache a new record, but the read cache is full, it will make room
1709by expiring the least-recently visited records from the read cache.
b5aed31e 1710
b3fe5a4c
AMS
1711The default memory limit is 2Mib. You can adjust the maximum read
1712cache size by supplying the C<memory> option. The argument is the
1713desired cache size, in bytes.
b5aed31e
AMS
1714
1715 # I have a lot of memory, so use a large cache to speed up access
b3fe5a4c 1716 tie @array, 'Tie::File', $file, memory => 20_000_000;
b5aed31e 1717
b3fe5a4c 1718Setting the memory limit to 0 will inhibit caching; records will be
b5aed31e
AMS
1719fetched from disk every time you examine them.
1720
57c7bc08
AMS
1721=head2 C<dw_size>
1722
1723(This is an advanced feature. Skip this section on first reading.)
1724
1725If you use deferred writing (See L<"Deferred Writing">, below) then
1726data you write into the array will not be written directly to the
1727file; instead, it will be saved in the I<deferred write buffer> to be
1728written out later. Data in the deferred write buffer is also charged
1729against the memory limit you set with the C<memory> option.
1730
1731You may set the C<dw_size> option to limit the amount of data that can
1732be saved in the deferred write buffer. This limit may not exceed the
1733total memory limit. For example, if you set C<dw_size> to 1000 and
1734C<memory> to 2500, that means that no more than 1000 bytes of deferred
1735writes will be saved up. The space available for the read cache will
1736vary, but it will always be at least 1500 bytes (if the deferred write
1737buffer is full) and it could grow as large as 2500 bytes (if the
1738deferred write buffer is empty.)
1739
1740If you don't specify a C<dw_size>, it defaults to the entire memory
1741limit.
1742
b5aed31e
AMS
1743=head2 Option Format
1744
1745C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
b3fe5a4c 1746C<recsep>. C<-memory> is a synonym for C<memory>. You get the
b5aed31e
AMS
1747idea.
1748
1749=head1 Public Methods
1750
1751The C<tie> call returns an object, say C<$o>. You may call
1752
1753 $rec = $o->FETCH($n);
1754 $o->STORE($n, $rec);
1755
b3fe5a4c
AMS
1756to fetch or store the record at line C<$n>, respectively; similarly
1757the other tied array methods. (See L<perltie> for details.) You may
1758also call the following methods on this object:
51efdd02
AMS
1759
1760=head2 C<flock>
1761
1762 $o->flock(MODE)
1763
1764will lock the tied file. C<MODE> has the same meaning as the second
1765argument to the Perl built-in C<flock> function; for example
1766C<LOCK_SH> or C<LOCK_EX | LOCK_NB>. (These constants are provided by
1767the C<use Fcntl ':flock'> declaration.)
1768
57c7bc08
AMS
1769C<MODE> is optional; the default is C<LOCK_EX>.
1770
1771C<Tie::File> promises that the following sequence of operations will
1772be safe:
1773
1774 my $o = tie @array, "Tie::File", $filename;
1775 $o->flock;
1776
1777In particular, C<Tie::File> will I<not> read or write the file during
1778the C<tie> call. (Exception: Using C<mode =E<gt> O_TRUNC> will, of
1779course, erase the file during the C<tie> call. If you want to do this
1780safely, then open the file without C<O_TRUNC>, lock the file, and use
1781C<@array = ()>.)
51efdd02
AMS
1782
1783The best way to unlock a file is to discard the object and untie the
1784array. It is probably unsafe to unlock the file without also untying
1785it, because if you do, changes may remain unwritten inside the object.
1786That is why there is no shortcut for unlocking. If you really want to
1787unlock the file prematurely, you know what to do; if you don't know
1788what to do, then don't do it.
1789
1790All the usual warnings about file locking apply here. In particular,
1791note that file locking in Perl is B<advisory>, which means that
1792holding a lock will not prevent anyone else from reading, writing, or
1793erasing the file; it only prevents them from getting another lock at
1794the same time. Locks are analogous to green traffic lights: If you
1795have a green light, that does not prevent the idiot coming the other
1796way from plowing into you sideways; it merely guarantees to you that
1797the idiot does not also have a green light at the same time.
b5aed31e 1798
0b28bc9a
AMS
1799=head2 C<autochomp>
1800
1801 my $old_value = $o->autochomp(0); # disable autochomp option
1802 my $old_value = $o->autochomp(1); # enable autochomp option
1803
1804 my $ac = $o->autochomp(); # recover current value
1805
1806See L<"autochomp">, above.
1807
6fc0ea7e 1808=head2 C<defer>, C<flush>, C<discard>, and C<autodefer>
57c7bc08
AMS
1809
1810See L<"Deferred Writing">, below.
1811
0b28bc9a 1812=head1 Tying to an already-opened filehandle
fa408a35
AMS
1813
1814If C<$fh> is a filehandle, such as is returned by C<IO::File> or one
1815of the other C<IO> modules, you may use:
1816
1817 tie @array, 'Tie::File', $fh, ...;
1818
1819Similarly if you opened that handle C<FH> with regular C<open> or
1820C<sysopen>, you may use:
1821
1822 tie @array, 'Tie::File', \*FH, ...;
1823
1824Handles that were opened write-only won't work. Handles that were
57c7bc08
AMS
1825opened read-only will work as long as you don't try to modify the
1826array. Handles must be attached to seekable sources of data---that
1827means no pipes or sockets. If you supply a non-seekable handle, the
1828C<tie> call will try to throw an exception. (On Unix systems, it
1829B<will> throw an exception.)
1830
1831=head1 Deferred Writing
1832
1833(This is an advanced feature. Skip this section on first reading.)
1834
1835Normally, modifying a C<Tie::File> array writes to the underlying file
1836immediately. Every assignment like C<$a[3] = ...> rewrites as much of
1837the file as is necessary; typically, everything from line 3 through
1838the end will need to be rewritten. This is the simplest and most
1839transparent behavior. Performance even for large files is reasonably
1840good.
1841
1842However, under some circumstances, this behavior may be excessively
1843slow. For example, suppose you have a million-record file, and you
1844want to do:
1845
1846 for (@FILE) {
1847 $_ = "> $_";
1848 }
1849
1850The first time through the loop, you will rewrite the entire file,
1851from line 0 through the end. The second time through the loop, you
1852will rewrite the entire file from line 1 through the end. The third
1853time through the loop, you will rewrite the entire file from line 2 to
1854the end. And so on.
1855
1856If the performance in such cases is unacceptable, you may defer the
1857actual writing, and then have it done all at once. The following loop
1858will perform much better for large files:
1859
1860 (tied @a)->defer;
1861 for (@a) {
1862 $_ = "> $_";
1863 }
1864 (tied @a)->flush;
1865
1866If C<Tie::File>'s memory limit is large enough, all the writing will
1867done in memory. Then, when you call C<-E<gt>flush>, the entire file
1868will be rewritten in a single pass.
1869
6fc0ea7e
JH
1870(Actually, the preceding discussion is something of a fib. You don't
1871need to enable deferred writing to get good performance for this
1872common case, because C<Tie::File> will do it for you automatically
1873unless you specifically tell it not to. See L<"autodeferring">,
1874below.)
1875
57c7bc08
AMS
1876Calling C<-E<gt>flush> returns the array to immediate-write mode. If
1877you wish to discard the deferred writes, you may call C<-E<gt>discard>
1878instead of C<-E<gt>flush>. Note that in some cases, some of the data
1879will have been written already, and it will be too late for
6fc0ea7e
JH
1880C<-E<gt>discard> to discard all the changes. Support for
1881C<-E<gt>discard> may be withdrawn in a future version of C<Tie::File>.
57c7bc08
AMS
1882
1883Deferred writes are cached in memory up to the limit specified by the
1884C<dw_size> option (see above). If the deferred-write buffer is full
1885and you try to write still more deferred data, the buffer will be
1886flushed. All buffered data will be written immediately, the buffer
1887will be emptied, and the now-empty space will be used for future
1888deferred writes.
1889
1890If the deferred-write buffer isn't yet full, but the total size of the
1891buffer and the read cache would exceed the C<memory> limit, the oldest
1892records will be flushed out of the read cache until total usage is
1893under the limit.
1894
1895C<push>, C<pop>, C<shift>, C<unshift>, and C<splice> cannot be
1896deferred. When you perform one of these operations, any deferred data
1897is written to the file and the operation is performed immediately.
1898This may change in a future version.
1899
6fc0ea7e
JH
1900If you resize the array with deferred writing enabled, the file will
1901be resized immediately, but deferred records will not be written.
1902
1903=head2 Autodeferring
1904
1905C<Tie::File> tries to guess when deferred writing might be helpful,
1906and to turn it on and off automatically. In the example above, only
1907the first two assignments will be done immediately; after this, all
1908the changes to the file will be deferred up to the user-specified
1909memory limit.
1910
1911You should usually be able to ignore this and just use the module
1912without thinking about deferring. However, special applications may
1913require fine control over which writes are deferred, or may require
1914that all writes be immediate. To disable the autodeferment feature,
1915use
57c7bc08
AMS
1916
1917 (tied @o)->autodefer(0);
1918
6fc0ea7e
JH
1919or
1920
1921 tie @array, 'Tie::File', $file, autodefer => 0;
1922
fa408a35 1923
b5aed31e
AMS
1924=head1 CAVEATS
1925
1926(That's Latin for 'warnings'.)
1927
b3fe5a4c
AMS
1928=over 4
1929
1930=item *
1931
1932This is BETA RELEASE SOFTWARE. It may have bugs. See the discussion
1933below about the (lack of any) warranty.
1934
6fc0ea7e
JH
1935In particular, this means that the interface may change in
1936incompatible ways from one version to the next, without warning. That
1937has happened at least once already. The interface will freeze before
1938Perl 5.8 is released, probably sometime in April 2002.
1939
b3fe5a4c 1940=item *
b5aed31e 1941
6fc0ea7e 1942Reasonable effort was made to make this module efficient. Nevertheless,
b5aed31e 1943changing the size of a record in the middle of a large file will
b3fe5a4c
AMS
1944always be fairly slow, because everything after the new record must be
1945moved.
b5aed31e 1946
b3fe5a4c
AMS
1947=item *
1948
1949The behavior of tied arrays is not precisely the same as for regular
1950arrays. For example:
b5aed31e 1951
57c7bc08
AMS
1952 # This DOES print "How unusual!"
1953 undef $a[10]; print "How unusual!\n" if defined $a[10];
b3fe5a4c
AMS
1954
1955C<undef>-ing a C<Tie::File> array element just blanks out the
1956corresponding record in the file. When you read it back again, you'll
57c7bc08
AMS
1957get the empty string, so the supposedly-C<undef>'ed value will be
1958defined. Similarly, if you have C<autochomp> disabled, then
1959
1960 # This DOES print "How unusual!" if 'autochomp' is disabled
1961 undef $a[10];
1962 print "How unusual!\n" if $a[10];
1963
1964Because when C<autochomp> is disabled, C<$a[10]> will read back as
1965C<"\n"> (or whatever the record separator string is.)
b5aed31e 1966
6fc0ea7e
JH
1967There are other minor differences, particularly regarding C<exists>
1968and C<delete>, but in general, the correspondence is extremely close.
b3fe5a4c
AMS
1969
1970=item *
1971
1972Not quite every effort was made to make this module as efficient as
b5aed31e
AMS
1973possible. C<FETCHSIZE> should use binary search instead of linear
1974search. The cache's LRU queue should be a heap instead of a list.
57c7bc08
AMS
1975
1976The performance of the C<flush> method could be improved. At present,
1977it still rewrites the tail of the file once for each block of
1978contiguous lines to be changed. In the typical case, this will result
1979in only one rewrite, but in peculiar cases it might be bad. It should
1980be possible to perform I<all> deferred writing with a single rewrite.
1981
b5aed31e 1982These defects are probably minor; in any event, they will be fixed in
57c7bc08 1983a future version of the module.
b5aed31e 1984
b3fe5a4c 1985=item *
b5aed31e
AMS
1986
1987The author has supposed that since this module is concerned with file
1988I/O, almost all normal use of it will be heavily I/O bound, and that
1989the time to maintain complicated data structures inside the module
1990will be dominated by the time to actually perform the I/O. This
6fc0ea7e
JH
1991suggests, for example, that an LRU read-cache is a good tradeoff, even
1992if it requires substantial bookkeeping following a C<splice>
b5aed31e
AMS
1993operation.
1994
57c7bc08 1995=item *
6fc0ea7e 1996
57c7bc08
AMS
1997You might be tempted to think that deferred writing is like
1998transactions, with C<flush> as C<commit> and C<discard> as
6fc0ea7e 1999C<rollback>, but it isn't, so don't.
57c7bc08 2000
b3fe5a4c 2001=back
51efdd02 2002
57c7bc08
AMS
2003=head1 SUBCLASSING
2004
2005This version promises absolutely nothing about the internals, which
2006may change without notice. A future version of the module will have a
2007well-defined and stable subclassing API.
2008
b3fe5a4c 2009=head1 WHAT ABOUT C<DB_File>?
51efdd02 2010
b3fe5a4c
AMS
2011C<DB_File>'s C<DB_RECNO> feature does something similar to
2012C<Tie::File>, but there are a number of reasons that you might prefer
2013C<Tie::File>. C<DB_File> is a great piece of software, but the
2014C<DB_RECNO> part is less great than the rest of it.
b5aed31e 2015
b3fe5a4c 2016=over 4
51efdd02 2017
b3fe5a4c 2018=item *
51efdd02 2019
b3fe5a4c
AMS
2020C<DB_File> reads your entire file into memory, modifies it in memory,
2021and the writes out the entire file again when you untie the file.
2022This is completely impractical for large files.
2023
2024C<Tie::File> does not do any of those things. It doesn't try to read
2025the entire file into memory; instead it uses a lazy approach and
2026caches recently-used records. The cache size is strictly bounded by
2027the C<memory> option. DB_File's C<-E<gt>{cachesize}> doesn't prevent
2028your process from blowing up when reading a big file.
2029
2030=item *
2031
2032C<DB_File> has an extremely poor writing strategy. If you have a
2033ten-megabyte file and tie it with C<DB_File>, and then use
2034
2035 $a[0] =~ s/PERL/Perl/;
2036
2037C<DB_file> will then read the entire ten-megabyte file into memory, do
2038the change, and write the entire file back to disk, reading ten
2039megabytes and writing ten megabytes. C<Tie::File> will read and write
2040only the first record.
2041
2042If you have a million-record file and tie it with C<DB_File>, and then
2043use
2044
2045 $a[999998] =~ s/Larry/Larry Wall/;
2046
2047C<DB_File> will read the entire million-record file into memory, do
2048the change, and write the entire file back to disk. C<Tie::File> will
2049only rewrite records 999998 and 999999. During the writing process,
2050it will never have more than a few kilobytes of data in memory at any
2051time, even if the two records are very large.
2052
2053=item *
2054
2055Since changes to C<DB_File> files only appear when you do C<untie>, it
2056can be inconvenient to arrange for concurrent access to the same file
2057by two or more processes. Each process needs to call C<$db-E<gt>sync>
2058after every write. When you change a C<Tie::File> array, the changes
2059are reflected in the file immediately; no explicit C<-E<gt>sync> call
57c7bc08
AMS
2060is required. (Or you can enable deferred writing mode to require that
2061changes be explicitly sync'ed.)
b3fe5a4c
AMS
2062
2063=item *
2064
2065C<DB_File> is only installed by default if you already have the C<db>
2066library on your system; C<Tie::File> is pure Perl and is installed by
2067default no matter what. Starting with Perl 5.7.3 you can be
2068absolutely sure it will be everywhere. You will never have that
2069surety with C<DB_File>. If you don't have C<DB_File> yet, it requires
2070a C compiler. You can install C<Tie::File> from CPAN in five minutes
2071with no compiler.
2072
2073=item *
2074
2075C<DB_File> is written in C, so if you aren't allowed to install
2076modules on your system, it is useless. C<Tie::File> is written in Perl,
2077so even if you aren't allowed to install modules, you can look into
2078the source code, see how it works, and copy the subroutines or the
2079ideas from the subroutines directly into your own Perl program.
2080
2081=item *
2082
2083Except in very old, unsupported versions, C<DB_File>'s free license
2084requires that you distribute the source code for your entire
2085application. If you are not able to distribute the source code for
2086your application, you must negotiate an alternative license from
2087Sleepycat, possibly for a fee. Tie::File is under the Perl Artistic
2088license and can be distributed free under the same terms as Perl
2089itself.
2090
2091=back
b5aed31e
AMS
2092
2093=head1 AUTHOR
2094
2095Mark Jason Dominus
2096
2097To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
2098
2099To receive an announcement whenever a new version of this module is
2100released, send a blank email message to
2101C<mjd-perl-tiefile-subscribe@plover.com>.
2102
57c7bc08
AMS
2103The most recent version of this module, including documentation and
2104any news of importance, will be available at
2105
2106 http://perl.plover.com/TieFile/
2107
2108
b5aed31e
AMS
2109=head1 LICENSE
2110
6fc0ea7e 2111C<Tie::File> version 0.90 is copyright (C) 2002 Mark Jason Dominus.
7b6b3db1
JH
2112
2113This library is free software; you may redistribute it and/or modify
2114it under the same terms as Perl itself.
b5aed31e 2115
57c7bc08
AMS
2116These terms are your choice of any of (1) the Perl Artistic Licence,
2117or (2) version 2 of the GNU General Public License as published by the
7b6b3db1
JH
2118Free Software Foundation, or (3) any later version of the GNU General
2119Public License.
b5aed31e 2120
7b6b3db1 2121This library is distributed in the hope that it will be useful,
b5aed31e
AMS
2122but WITHOUT ANY WARRANTY; without even the implied warranty of
2123MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2124GNU General Public License for more details.
2125
2126You should have received a copy of the GNU General Public License
7b6b3db1
JH
2127along with this library program; it should be in the file C<COPYING>.
2128If not, write to the Free Software Foundation, Inc., 59 Temple Place,
2129Suite 330, Boston, MA 02111 USA
b5aed31e
AMS
2130
2131For licensing inquiries, contact the author at:
2132
2133 Mark Jason Dominus
2134 255 S. Warnock St.
2135 Philadelphia, PA 19107
2136
2137=head1 WARRANTY
2138
6fc0ea7e 2139C<Tie::File> version 0.90 comes with ABSOLUTELY NO WARRANTY.
b5aed31e
AMS
2140For details, see the license.
2141
fa408a35
AMS
2142=head1 THANKS
2143
2144Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the
2145core when I hadn't written it yet, and for generally being helpful,
2146supportive, and competent. (Usually the rule is "choose any one.")
2147Also big thanks to Abhijit Menon-Sen for all of the same things.
2148
57c7bc08
AMS
2149Special thanks to Craig Berry and Peter Prymmer (for VMS portability
2150help), Randy Kobes (for Win32 portability help), Clinton Pierce and
2151Autrijus Tang (for heroic eleventh-hour Win32 testing above and beyond
6fc0ea7e
JH
2152the call of duty), Michael G Schwern (for testing advice), and the
2153rest of the CPAN testers (for testing generally).
b5aed31e 2154
57c7bc08 2155Additional thanks to:
b3fe5a4c 2156Edward Avis /
fa408a35 2157Gerrit Haase /
b3fe5a4c 2158Nikola Knezevic /
836d9961 2159Nick Ing-Simmons /
fa408a35
AMS
2160Tassilo von Parseval /
2161H. Dieter Pearcey /
b3fe5a4c 2162Slaven Rezic /
6fc0ea7e 2163Peter Scott /
fa408a35 2164Peter Somu /
57c7bc08 2165Autrijus Tang (again) /
fa408a35 2166Tels
7b6b3db1 2167
fa408a35
AMS
2168=head1 TODO
2169
6fc0ea7e
JH
2170More tests. (_twrite should be tested separately, because there are a
2171lot of weird special cases lurking in there.)
b5aed31e 2172
6fc0ea7e 2173Improve SPLICE algorithm to use deferred writing machinery.
b5aed31e
AMS
2174
2175More tests. (Stuff I didn't think of yet.)
2176
b5aed31e
AMS
2177Paragraph mode?
2178
6fc0ea7e 2179Fixed-length mode. Leave-blanks mode.
b5aed31e 2180
fa408a35
AMS
2181Maybe an autolocking mode?
2182
6fc0ea7e
JH
2183Record locking with fcntl()? Then the module might support an undo
2184log and get real transactions. What a tour de force that would be.
b3fe5a4c 2185
6fc0ea7e 2186Cleverer strategy for flushing deferred writes.
b3fe5a4c 2187
6fc0ea7e 2188oMore tests.
b3fe5a4c 2189
b5aed31e
AMS
2190=cut
2191
6fc0ea7e
JH
2192
2193
2194
2195
2196
2197