This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
There seems to be a trend for the *time_r to be hidden.
[perl5.git] / lib / Tie / File.pm
CommitLineData
b5aed31e
AMS
1
2package Tie::File;
3use Carp;
4use POSIX 'SEEK_SET';
51efdd02 5use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX';
b5aed31e
AMS
6require 5.005;
7
b3fe5a4c 8$VERSION = "0.19";
b5aed31e
AMS
9
10# Idea: The object will always contain an array of byte offsets
11# this will be filled in as is necessary and convenient.
12# fetch will do seek-read.
13# There will be a cache parameter that controls the amount of cached *data*
14# Also an LRU queue of cached records
15# store will read the relevant record into the cache
16# If it's the same length as what is being written, it will overwrite it in
17# place; if not, it will do a from-to copying write.
18# The record separator string is also a parameter
19
20# Record numbers start at ZERO.
21
b3fe5a4c
AMS
22my $DEFAULT_MEMORY_SIZE = 1<<21; # 2 megabytes
23
24my %good_opt = map {$_ => 1, "-$_" => 1}
25 qw(memory dw_size mode recsep discipline);
b5aed31e
AMS
26
27sub TIEARRAY {
28 if (@_ % 2 != 0) {
29 croak "usage: tie \@array, $_[0], filename, [option => value]...";
30 }
31 my ($pack, $file, %opts) = @_;
32
33 # transform '-foo' keys into 'foo' keys
34 for my $key (keys %opts) {
b3fe5a4c
AMS
35 unless ($good_opt{$key}) {
36 croak("$pack: Unrecognized option '$key'\n");
37 }
b5aed31e
AMS
38 my $okey = $key;
39 if ($key =~ s/^-+//) {
40 $opts{$key} = delete $opts{$okey};
41 }
42 }
43
b3fe5a4c
AMS
44 unless (defined $opts{memory}) {
45 # default is the larger of the default cache size and the
46 # deferred-write buffer size (if specified)
47 $opts{memory} = $DEFAULT_MEMORY_SIZE;
48 $opts{memory} = $opts{dw_size}
49 if defined $opts{dw_size} && $opts{dw_size} > $DEFAULT_MEMORY_SIZE;
50 }
51 $opts{dw_size} = $opts{memory} unless defined $opts{dw_size};
52 if ($opts{dw_size} > $opts{memory}) {
53 croak("$pack: dw_size may not be larger than total memory allocation\n");
54 }
55 $opts{deferred} = {}; # no records presently deferred
56 $opts{deferred_s} = 0; # count of total bytes in ->{deferred}
b5aed31e
AMS
57
58 # the cache is a hash instead of an array because it is likely to be
59 # sparsely populated
60 $opts{cache} = {};
61 $opts{cached} = 0; # total size of cached data
62 $opts{lru} = []; # replace with heap in later version
63
64 $opts{offsets} = [0];
65 $opts{filename} = $file;
b3fe5a4c
AMS
66 unless (defined $opts{recsep}) {
67 $opts{recsep} = _default_recsep();
68 }
b5aed31e
AMS
69 $opts{recseplen} = length($opts{recsep});
70 if ($opts{recseplen} == 0) {
71 croak "Empty record separator not supported by $pack";
72 }
73
74 my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
fa408a35 75 my $fh;
b5aed31e 76
fa408a35
AMS
77 if (UNIVERSAL::isa($file, 'GLOB')) {
78 unless (seek $file, 0, SEEK_SET) {
79 croak "$pack: your filehandle does not appear to be seekable";
80 }
81 $fh = $file;
82 } elsif (ref $file) {
83 croak "usage: tie \@array, $pack, filename, [option => value]...";
84 } else {
85 $fh = \do { local *FH }; # only works in 5.005 and later
86 sysopen $fh, $file, $mode, 0666 or return;
87 binmode $fh;
88 }
b5aed31e 89 { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
b3fe5a4c
AMS
90 if (defined $opts{discipline} && $] >= 5.006) {
91 # This avoids a compile-time warning under 5.005
92 eval 'binmode($fh, $opts{discipline})';
93 croak $@ if $@ =~ /unknown discipline/i;
94 die if $@;
95 }
b5aed31e
AMS
96 $opts{fh} = $fh;
97
98 bless \%opts => $pack;
99}
100
101sub FETCH {
102 my ($self, $n) = @_;
103
104 # check the record cache
105 { my $cached = $self->_check_cache($n);
106 return $cached if defined $cached;
107 }
108
109 unless ($#{$self->{offsets}} >= $n) {
110 my $o = $self->_fill_offsets_to($n);
111 # If it's still undefined, there is no such record, so return 'undef'
112 return unless defined $o;
113 }
114
115 my $fh = $self->{FH};
116 $self->_seek($n); # we can do this now that offsets is populated
117 my $rec = $self->_read_record;
b3fe5a4c
AMS
118
119# If we happen to have just read the first record, check to see if
120# the length of the record matches what 'tell' says. If not, Tie::File
121# won't work, and should drop dead.
122#
123# if ($n == 0 && defined($rec) && tell($self->{fh}) != length($rec)) {
124# if (defined $self->{discipline}) {
125# croak "I/O discipline $self->{discipline} not supported";
126# } else {
127# croak "File encoding not supported";
128# }
129# }
130
b5aed31e
AMS
131 $self->_cache_insert($n, $rec) if defined $rec;
132 $rec;
133}
134
135sub STORE {
136 my ($self, $n, $rec) = @_;
137
138 $self->_fixrecs($rec);
139
b3fe5a4c 140 return $self->_store_deferred($n, $rec) if $self->{defer};
b5aed31e
AMS
141
142 # We need this to decide whether the new record will fit
143 # It incidentally populates the offsets table
144 # Note we have to do this before we alter the cache
145 my $oldrec = $self->FETCH($n);
146
147 # _check_cache promotes record $n to MRU. Is this correct behavior?
fa408a35 148 if (my $cached = $self->_check_cache($n)) {
b3fe5a4c 149 my $len_diff = length($rec) - length($cached);
fa408a35 150 $self->{cache}{$n} = $rec;
b3fe5a4c
AMS
151 $self->{cached} += $len_diff;
152 $self->_cache_flush
153 if $len_diff > 0
154 && $self->{deferred_s} + $self->{cached} > $self->{memory};
fa408a35 155 }
b5aed31e
AMS
156
157 if (not defined $oldrec) {
158 # We're storing a record beyond the end of the file
51efdd02 159 $self->_extend_file_to($n+1);
b5aed31e
AMS
160 $oldrec = $self->{recsep};
161 }
162 my $len_diff = length($rec) - length($oldrec);
163
b3fe5a4c 164 # length($oldrec) here is not consistent with text mode TODO XXX BUG
b5aed31e
AMS
165 $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
166
167 # now update the offsets
168 # array slice goes from element $n+1 (the first one to move)
169 # to the end
170 for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
171 $_ += $len_diff;
172 }
173}
174
b3fe5a4c
AMS
175sub _store_deferred {
176 my ($self, $n, $rec) = @_;
177 $self->_uncache($n);
178 my $old_deferred = $self->{deferred}{$n};
179 $self->{deferred}{$n} = $rec;
180 $self->{deferred_s} += length($rec);
181 $self->{deferred_s} -= length($old_deferred) if defined $old_deferred;
182 if ($self->{deferred_s} > $self->{dw_size}) {
183 $self->flush;
184 $self->defer; # flush clears the 'defer' flag
185 } elsif ($self->{deferred_s} + $self->{cached} > $self->{memory}) {
186 $self->_cache_flush;
187 }
188}
189
b5aed31e
AMS
190sub FETCHSIZE {
191 my $self = shift;
192 my $n = $#{$self->{offsets}};
193 while (defined ($self->_fill_offsets_to($n+1))) {
194 ++$n;
195 }
196 $n;
197}
198
199sub STORESIZE {
200 my ($self, $len) = @_;
201 my $olen = $self->FETCHSIZE;
202 return if $len == $olen; # Woo-hoo!
203
204 # file gets longer
205 if ($len > $olen) {
51efdd02 206 $self->_extend_file_to($len);
b5aed31e
AMS
207 return;
208 }
209
210 # file gets shorter
211 $self->_seek($len);
212 $self->_chop_file;
836d9961 213 $#{$self->{offsets}} = $len;
b3fe5a4c 214# $self->{offsets}[0] = 0; # in case we just chopped this
836d9961
JH
215 my @cached = grep $_ >= $len, keys %{$self->{cache}};
216 $self->_uncache(@cached);
b5aed31e
AMS
217}
218
51efdd02
AMS
219sub PUSH {
220 my $self = shift;
221 $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
222 $self->FETCHSIZE;
223}
224
225sub POP {
226 my $self = shift;
7b6b3db1
JH
227 my $size = $self->FETCHSIZE;
228 return if $size == 0;
229# print STDERR "# POPPITY POP POP POP\n";
230 scalar $self->SPLICE($size-1, 1);
51efdd02
AMS
231}
232
233sub SHIFT {
234 my $self = shift;
235 scalar $self->SPLICE(0, 1);
236}
237
238sub UNSHIFT {
239 my $self = shift;
240 $self->SPLICE(0, 0, @_);
241 $self->FETCHSIZE;
242}
243
244sub CLEAR {
245 # And enable auto-defer mode, since it's likely that they just
246 # did @a = (...);
247 my $self = shift;
248 $self->_seekb(0);
249 $self->_chop_file;
250 %{$self->{cache}} = ();
251 $self->{cached} = 0;
252 @{$self->{lru}} = ();
253 @{$self->{offsets}} = (0);
254}
255
256sub EXTEND {
257 my ($self, $n) = @_;
258 $self->_fill_offsets_to($n);
259 $self->_extend_file_to($n);
260}
261
262sub DELETE {
263 my ($self, $n) = @_;
264 my $lastrec = $self->FETCHSIZE-1;
265 if ($n == $lastrec) {
266 $self->_seek($n);
267 $self->_chop_file;
fa408a35 268 $#{$self->{offsets}}--;
836d9961 269 $self->_uncache($n);
51efdd02
AMS
270 # perhaps in this case I should also remove trailing null records?
271 } else {
272 $self->STORE($n, "");
273 }
274}
275
276sub EXISTS {
277 my ($self, $n) = @_;
278 $self->_fill_offsets_to($n);
279 0 <= $n && $n < $self->FETCHSIZE;
280}
281
b5aed31e 282sub SPLICE {
b3fe5a4c
AMS
283 my $self = shift;
284 $self->_flush if $self->{defer};
285 $self->_splice(@_);
286}
287
288sub DESTROY {
289 $self->flush if $self->{defer};
290}
291
292sub _splice {
b5aed31e
AMS
293 my ($self, $pos, $nrecs, @data) = @_;
294 my @result;
295
7b6b3db1
JH
296 $pos = 0 unless defined $pos;
297
298 # Deal with negative and other out-of-range positions
299 # Also set default for $nrecs
51efdd02
AMS
300 {
301 my $oldsize = $self->FETCHSIZE;
7b6b3db1 302 $nrecs = $oldsize unless defined $nrecs;
51efdd02
AMS
303 my $oldpos = $pos;
304
305 if ($pos < 0) {
306 $pos += $oldsize;
307 if ($pos < 0) {
308 croak "Modification of non-creatable array value attempted, subscript $oldpos";
309 }
310 }
311
312 if ($pos > $oldsize) {
313 return unless @data;
314 $pos = $oldsize; # This is what perl does for normal arrays
315 }
316 }
b5aed31e
AMS
317
318 $self->_fixrecs(@data);
319 my $data = join '', @data;
320 my $datalen = length $data;
321 my $oldlen = 0;
322
323 # compute length of data being removed
51efdd02 324 # Incidentally fills offsets table
b5aed31e
AMS
325 for ($pos .. $pos+$nrecs-1) {
326 my $rec = $self->FETCH($_);
327 last unless defined $rec;
328 push @result, $rec;
329 $oldlen += length($rec);
330 }
331
51efdd02 332 # Modify the file
b5aed31e
AMS
333 $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
334
335 # update the offsets table part 1
336 # compute the offsets of the new records:
337 my @new_offsets;
338 if (@data) {
339 push @new_offsets, $self->{offsets}[$pos];
340 for (0 .. $#data-1) {
341 push @new_offsets, $new_offsets[-1] + length($data[$_]);
342 }
343 }
344 splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
345
346 # update the offsets table part 2
347 # adjust the offsets of the following old records
348 for ($pos+@data .. $#{$self->{offsets}}) {
349 $self->{offsets}[$_] += $datalen - $oldlen;
350 }
351 # If we scrubbed out all known offsets, regenerate the trivial table
352 # that knows that the file does indeed start at 0.
353 $self->{offsets}[0] = 0 unless @{$self->{offsets}};
354
51efdd02
AMS
355 # Perhaps the following cache foolery could be factored out
356 # into a bunch of mor opaque cache functions. For example,
357 # it's odd to delete a record from the cache and then remove
358 # it from the LRU queue later on; there should be a function to
359 # do both at once.
360
b5aed31e
AMS
361 # update the read cache, part 1
362 # modified records
363 # Consider this carefully for correctness
364 for ($pos .. $pos+$nrecs-1) {
365 my $cached = $self->{cache}{$_};
366 next unless defined $cached;
367 my $new = $data[$_-$pos];
368 if (defined $new) {
369 $self->{cached} += length($new) - length($cached);
370 $self->{cache}{$_} = $new;
371 } else {
836d9961 372 $self->_uncache($_);
b5aed31e
AMS
373 }
374 }
375 # update the read cache, part 2
376 # moved records - records past the site of the change
377 # need to be renumbered
378 # Maybe merge this with the previous block?
b3fe5a4c
AMS
379 {
380 my %adjusted;
381 for (keys %{$self->{cache}}) {
382 next unless $_ >= $pos + $nrecs;
383 $adjusted{$_-$nrecs+@data} = delete $self->{cache}{$_};
384 }
385 @{$self->{cache}}{keys %adjusted} = values %adjusted;
386# for (keys %{$self->{cache}}) {
387# next unless $_ >= $pos + $nrecs;
388# $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
389# }
b5aed31e 390 }
b3fe5a4c 391
b5aed31e
AMS
392 # fix the LRU queue
393 my(@new, @changed);
394 for (@{$self->{lru}}) {
395 if ($_ >= $pos + $nrecs) {
396 push @new, $_ + @data - $nrecs;
397 } elsif ($_ >= $pos) {
398 push @changed, $_ if $_ < $pos + @data;
399 } else {
400 push @new, $_;
401 }
402 }
403 @{$self->{lru}} = (@new, @changed);
404
b3fe5a4c
AMS
405 # Now there might be too much data in the cache, if we spliced out
406 # some short records and spliced in some long ones. If so, flush
407 # the cache.
408 $self->_cache_flush;
409
51efdd02
AMS
410 # Yes, the return value of 'splice' *is* actually this complicated
411 wantarray ? @result : @result ? $result[-1] : undef;
b5aed31e
AMS
412}
413
414# write data into the file
415# $data is the data to be written.
416# it should be written at position $pos, and should overwrite
417# exactly $len of the following bytes.
418# Note that if length($data) > $len, the subsequent bytes will have to
419# be moved up, and if length($data) < $len, they will have to
420# be moved down
421sub _twrite {
422 my ($self, $data, $pos, $len) = @_;
423
424 unless (defined $pos) {
425 die "\$pos was undefined in _twrite";
426 }
427
428 my $len_diff = length($data) - $len;
429
430 if ($len_diff == 0) { # Woo-hoo!
431 my $fh = $self->{fh};
432 $self->_seekb($pos);
433 $self->_write_record($data);
434 return; # well, that was easy.
435 }
436
437 # the two records are of different lengths
438 # our strategy here: rewrite the tail of the file,
439 # reading ahead one buffer at a time
440 # $bufsize is required to be at least as large as the data we're overwriting
441 my $bufsize = _bufsize($len_diff);
442 my ($writepos, $readpos) = ($pos, $pos+$len);
51efdd02 443 my $next_block;
b5aed31e
AMS
444
445 # Seems like there ought to be a way to avoid the repeated code
446 # and the special case here. The read(1) is also a little weird.
447 # Think about this.
448 do {
449 $self->_seekb($readpos);
51efdd02 450 my $br = read $self->{fh}, $next_block, $bufsize;
b5aed31e
AMS
451 my $more_data = read $self->{fh}, my($dummy), 1;
452 $self->_seekb($writepos);
453 $self->_write_record($data);
454 $readpos += $br;
455 $writepos += length $data;
456 $data = $next_block;
b5aed31e 457 } while $more_data;
51efdd02
AMS
458 $self->_seekb($writepos);
459 $self->_write_record($next_block);
b5aed31e
AMS
460
461 # There might be leftover data at the end of the file
462 $self->_chop_file if $len_diff < 0;
463}
464
465# If a record does not already end with the appropriate terminator
466# string, append one.
467sub _fixrecs {
468 my $self = shift;
469 for (@_) {
470 $_ .= $self->{recsep}
471 unless substr($_, - $self->{recseplen}) eq $self->{recsep};
472 }
473}
474
475# seek to the beginning of record #$n
476# Assumes that the offsets table is already correctly populated
477#
478# Note that $n=-1 has a special meaning here: It means the start of
479# the last known record; this may or may not be the very last record
480# in the file, depending on whether the offsets table is fully populated.
481#
482sub _seek {
483 my ($self, $n) = @_;
484 my $o = $self->{offsets}[$n];
485 defined($o)
486 or confess("logic error: undefined offset for record $n");
487 seek $self->{fh}, $o, SEEK_SET
488 or die "Couldn't seek filehandle: $!"; # "Should never happen."
489}
490
491sub _seekb {
492 my ($self, $b) = @_;
493 seek $self->{fh}, $b, SEEK_SET
494 or die "Couldn't seek filehandle: $!"; # "Should never happen."
495}
496
497# populate the offsets table up to the beginning of record $n
498# return the offset of record $n
499sub _fill_offsets_to {
500 my ($self, $n) = @_;
501 my $fh = $self->{fh};
502 local *OFF = $self->{offsets};
503 my $rec;
504
505 until ($#OFF >= $n) {
506 my $o = $OFF[-1];
507 $self->_seek(-1); # tricky -- see comment at _seek
508 $rec = $self->_read_record;
509 if (defined $rec) {
51efdd02 510 push @OFF, tell $fh;
b5aed31e
AMS
511 } else {
512 return; # It turns out there is no such record
513 }
514 }
515
516 # we have now read all the records up to record n-1,
517 # so we can return the offset of record n
518 return $OFF[$n];
519}
520
521# assumes that $rec is already suitably terminated
522sub _write_record {
523 my ($self, $rec) = @_;
524 my $fh = $self->{fh};
525 print $fh $rec
526 or die "Couldn't write record: $!"; # "Should never happen."
527
528}
529
530sub _read_record {
531 my $self = shift;
532 my $rec;
533 { local $/ = $self->{recsep};
534 my $fh = $self->{fh};
535 $rec = <$fh>;
536 }
537 $rec;
538}
539
540sub _cache_insert {
541 my ($self, $n, $rec) = @_;
542
543 # Do not cache records that are too big to fit in the cache.
b3fe5a4c 544 return unless length $rec <= $self->{memory};
b5aed31e
AMS
545
546 $self->{cache}{$n} = $rec;
547 $self->{cached} += length $rec;
548 push @{$self->{lru}}, $n; # most-recently-used is at the END
549
b3fe5a4c 550 $self->_cache_flush if $self->{cached} > $self->{memory};
b5aed31e
AMS
551}
552
836d9961
JH
553sub _uncache {
554 my $self = shift;
555 for my $n (@_) {
556 my $cached = delete $self->{cache}{$n};
557 next unless defined $cached;
558 @{$self->{lru}} = grep $_ != $n, @{$self->{lru}};
559 $self->{cached} -= length($cached);
560 }
561}
562
b5aed31e
AMS
563sub _check_cache {
564 my ($self, $n) = @_;
565 my $rec;
566 return unless defined($rec = $self->{cache}{$n});
567
568 # cache hit; update LRU queue and return $rec
569 # replace this with a heap in a later version
570 @{$self->{lru}} = ((grep $_ ne $n, @{$self->{lru}}), $n);
571 $rec;
572}
573
574sub _cache_flush {
575 my ($self) = @_;
b3fe5a4c 576 while ($self->{cached} + $self->{deferred_s} > $self->{memory}) {
b5aed31e 577 my $lru = shift @{$self->{lru}};
b3fe5a4c
AMS
578 my $rec = delete $self->{cache}{$lru};
579 $self->{cached} -= length $rec;
b5aed31e
AMS
580 }
581}
582
583# We have read to the end of the file and have the offsets table
584# entirely populated. Now we need to write a new record beyond
585# the end of the file. We prepare for this by writing
586# empty records into the file up to the position we want
51efdd02
AMS
587#
588# assumes that the offsets table already contains the offset of record $n,
589# if it exists, and extends to the end of the file if not.
b5aed31e
AMS
590sub _extend_file_to {
591 my ($self, $n) = @_;
592 $self->_seek(-1); # position after the end of the last record
593 my $pos = $self->{offsets}[-1];
594
595 # the offsets table has one entry more than the total number of records
51efdd02 596 $extras = $n - $#{$self->{offsets}};
b5aed31e
AMS
597
598 # Todo : just use $self->{recsep} x $extras here?
599 while ($extras-- > 0) {
600 $self->_write_record($self->{recsep});
fa408a35 601 push @{$self->{offsets}}, tell $self->{fh};
b5aed31e
AMS
602 }
603}
604
605# Truncate the file at the current position
606sub _chop_file {
607 my $self = shift;
608 truncate $self->{fh}, tell($self->{fh});
609}
610
611# compute the size of a buffer suitable for moving
612# all the data in a file forward $n bytes
613# ($n may be negative)
614# The result should be at least $n.
615sub _bufsize {
616 my $n = shift;
617 return 8192 if $n < 0;
618 my $b = $n & ~8191;
619 $b += 8192 if $n & 8191;
620 $b;
621}
622
51efdd02
AMS
623# Lock the file
624sub flock {
625 my ($self, $op) = @_;
626 unless (@_ <= 3) {
627 my $pack = ref $self;
628 croak "Usage: $pack\->flock([OPERATION])";
629 }
630 my $fh = $self->{fh};
631 $op = LOCK_EX unless defined $op;
632 flock $fh, $op;
633}
b5aed31e 634
b3fe5a4c
AMS
635# Defer writes
636sub defer {
637 my $self = shift;
638 $self->{defer} = 1;
639}
640
641# Flush deferred writes
642#
643# This could be better optimized to write the file in one pass, instead
644# of one pass per block of records. But that will require modifications
645# to _twrite, so I should have a good _twite test suite first.
646sub flush {
647 my $self = shift;
648
649 $self->_flush;
650 $self->{defer} = 0;
651}
652
653sub _flush {
654 my $self = shift;
655 my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});
656
657 while (@writable) {
658 # gather all consecutive records from the front of @writable
659 my $first_rec = shift @writable;
660 my $last_rec = $first_rec+1;
661 ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
662 --$last_rec;
663 $self->_fill_offsets_to($last_rec);
664 $self->_extend_file_to($last_rec);
665 $self->_splice($first_rec, $last_rec-$first_rec+1,
666 @{$self->{deferred}}{$first_rec .. $last_rec});
667 }
668
669 $self->discard; # clear out defered-write-cache
670}
671
672# Discard deferred writes
673sub discard {
674 my $self = shift;
675 undef $self->{deferred};
676 $self->{deferred_s} = 0;
677 $self->{defer} = 0;
678}
679
680# Not yet implemented
681sub autodefer { }
682
683sub _default_recsep {
684 my $recsep = $/;
685 if ($^O eq 'MSWin32') {
686 # Windows users expect files to be terminated with \r\n
687 # But $/ is set to \n instead
688 # Note that this also transforms \n\n into \r\n\r\n.
689 # That is a feature.
690 $recsep =~ s/\n/\r\n/g;
691 }
692 $recsep;
693}
694
b5aed31e
AMS
695# Given a file, make sure the cache is consistent with the
696# file contents
697sub _check_integrity {
698 my ($self, $file, $warn) = @_;
699 my $good = 1;
fa408a35 700
836d9961
JH
701 if (not defined $self->{offsets}[0]) {
702 $warn && print STDERR "# offset 0 is missing!\n";
703 $good = 0;
704 } elsif ($self->{offsets}[0] != 0) {
b5aed31e
AMS
705 $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
706 $good = 0;
707 }
fa408a35
AMS
708
709 local *F = $self->{fh};
710 seek F, 0, SEEK_SET;
711 local $/ = $self->{recsep};
712 $. = 0;
713
b5aed31e
AMS
714 while (<F>) {
715 my $n = $. - 1;
716 my $cached = $self->{cache}{$n};
717 my $offset = $self->{offsets}[$.];
718 my $ao = tell F;
719 if (defined $offset && $offset != $ao) {
720 $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
fa408a35 721 $good = 0;
b5aed31e
AMS
722 }
723 if (defined $cached && $_ ne $cached) {
724 $good = 0;
725 chomp $cached;
726 chomp;
727 $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
728 }
729 }
730
b3fe5a4c 731 my $memory = 0;
b5aed31e 732 while (my ($n, $r) = each %{$self->{cache}}) {
b3fe5a4c 733 $memory += length($r);
b5aed31e
AMS
734 next if $n+1 <= $.; # checked this already
735 $warn && print STDERR "# spurious caching of record $n\n";
736 $good = 0;
737 }
b3fe5a4c
AMS
738 if ($memory != $self->{cached}) {
739 $warn && print STDERR "# cache size is $self->{cached}, should be $memory\n";
b5aed31e
AMS
740 $good = 0;
741 }
742
743 my (%seen, @duplicate);
744 for (@{$self->{lru}}) {
745 $seen{$_}++;
746 if (not exists $self->{cache}{$_}) {
b3fe5a4c 747 $warn && print "# $_ is mentioned in the LRU queue, but not in the cache\n";
b5aed31e
AMS
748 $good = 0;
749 }
750 }
751 @duplicate = grep $seen{$_}>1, keys %seen;
752 if (@duplicate) {
753 my $records = @duplicate == 1 ? 'Record' : 'Records';
754 my $appear = @duplicate == 1 ? 'appears' : 'appear';
b3fe5a4c 755 $warn && print "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
b5aed31e
AMS
756 $good = 0;
757 }
758 for (keys %{$self->{cache}}) {
759 unless (exists $seen{$_}) {
836d9961 760 print "# record $_ is in the cache but not the LRU queue\n";
b5aed31e
AMS
761 $good = 0;
762 }
763 }
764
765 $good;
766}
767
fa408a35
AMS
768"Cogito, ergo sum."; # don't forget to return a true value from the file
769
b5aed31e
AMS
770=head1 NAME
771
772Tie::File - Access the lines of a disk file via a Perl array
773
774=head1 SYNOPSIS
775
b3fe5a4c 776 # This file documents Tie::File version 0.19
b5aed31e
AMS
777
778 tie @array, 'Tie::File', filename or die ...;
779
780 $array[13] = 'blah'; # line 13 of the file is now 'blah'
781 print $array[42]; # display line 42 of the file
782
783 $n_recs = @array; # how many records are in the file?
784 $#array = $n_recs - 2; # chop records off the end
785
51efdd02
AMS
786 # As you would expect:
787
788 push @array, new recs...;
789 my $r1 = pop @array;
790 unshift @array, new recs...;
791 my $r1 = shift @array;
b5aed31e
AMS
792 @old_recs = splice @array, 3, 7, new recs...;
793
794 untie @array; # all finished
795
796=head1 DESCRIPTION
797
798C<Tie::File> represents a regular text file as a Perl array. Each
799element in the array corresponds to a record in the file. The first
800line of the file is element 0 of the array; the second line is element
8011, and so on.
802
803The file is I<not> loaded into memory, so this will work even for
804gigantic files.
805
806Changes to the array are reflected in the file immediately.
807
b3fe5a4c
AMS
808Lazy people may now stop reading the manual.
809
b5aed31e
AMS
810=head2 C<recsep>
811
812What is a 'record'? By default, the meaning is the same as for the
813C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
b3fe5a4c
AMS
814probably C<"\n">. (Minor exception: on dos and Win32 systems, a
815'record' is a string terminated by C<"\r\n">.) You may change the
816definition of "record" by supplying the C<recsep> option in the C<tie>
817call:
b5aed31e
AMS
818
819 tie @array, 'Tie::File', $file, recsep => 'es';
820
b3fe5a4c
AMS
821This says that records are delimited by the string C<es>. If the file
822contained the following data:
b5aed31e
AMS
823
824 Curse these pesky flies!\n
825
826then the C<@array> would appear to have four elements:
827
828 "Curse thes"
829 "e pes"
830 "ky flies"
831 "!\n"
832
833An undefined value is not permitted as a record separator. Perl's
834special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
835emulated.
836
837Records read from the tied array will have the record separator string
838on the end, just as if they were read from the C<E<lt>...E<gt>>
839operator. Records stored into the array will have the record
840separator string appended before they are written to the file, if they
841don't have one already. For example, if the record separator string
842is C<"\n">, then the following two lines do exactly the same thing:
843
844 $array[17] = "Cherry pie";
845 $array[17] = "Cherry pie\n";
846
847The result is that the contents of line 17 of the file will be
848replaced with "Cherry pie"; a newline character will separate line 17
7b6b3db1 849from line 18. This means that in particular, this will do nothing:
b5aed31e
AMS
850
851 chomp $array[17];
852
853Because the C<chomp>ed value will have the separator reattached when
854it is written back to the file. There is no way to create a file
855whose trailing record separator string is missing.
856
857Inserting records that I<contain> the record separator string will
858produce a reasonable result, but if you can't foresee what this result
859will be, you'd better avoid doing this.
860
861=head2 C<mode>
862
863Normally, the specified file will be opened for read and write access,
864and will be created if it does not exist. (That is, the flags
865C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
866change this, you may supply alternative flags in the C<mode> option.
867See L<Fcntl> for a listing of available flags.
868For example:
869
870 # open the file if it exists, but fail if it does not exist
871 use Fcntl 'O_RDWR';
872 tie @array, 'Tie::File', $file, mode => O_RDWR;
873
874 # create the file if it does not exist
875 use Fcntl 'O_RDWR', 'O_CREAT';
876 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
877
878 # open an existing file in read-only mode
879 use Fcntl 'O_RDONLY';
880 tie @array, 'Tie::File', $file, mode => O_RDONLY;
881
882Opening the data file in write-only or append mode is not supported.
883
b3fe5a4c
AMS
884=head2 C<memory>
885
886This is an (inexact) upper limit on the amount of memory that
887C<Tie::File> will consume at any time while managing the file.
888At present, this is used as a bound on the size of the read cache.
b5aed31e
AMS
889
890Records read in from the file are cached, to avoid having to re-read
891them repeatedly. If you read the same record twice, the first time it
892will be stored in memory, and the second time it will be fetched from
b3fe5a4c
AMS
893the I<read cache>. The amount of data in the read cache will not
894exceed the value you specified for C<memory>. If C<Tie::File> wants
895to cache a new record, but the read cache is full, it will make room
896by expiring the least-recently visited records from the read cache.
b5aed31e 897
b3fe5a4c
AMS
898The default memory limit is 2Mib. You can adjust the maximum read
899cache size by supplying the C<memory> option. The argument is the
900desired cache size, in bytes.
b5aed31e
AMS
901
902 # I have a lot of memory, so use a large cache to speed up access
b3fe5a4c 903 tie @array, 'Tie::File', $file, memory => 20_000_000;
b5aed31e 904
b3fe5a4c 905Setting the memory limit to 0 will inhibit caching; records will be
b5aed31e
AMS
906fetched from disk every time you examine them.
907
908=head2 Option Format
909
910C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
b3fe5a4c 911C<recsep>. C<-memory> is a synonym for C<memory>. You get the
b5aed31e
AMS
912idea.
913
914=head1 Public Methods
915
916The C<tie> call returns an object, say C<$o>. You may call
917
918 $rec = $o->FETCH($n);
919 $o->STORE($n, $rec);
920
b3fe5a4c
AMS
921to fetch or store the record at line C<$n>, respectively; similarly
922the other tied array methods. (See L<perltie> for details.) You may
923also call the following methods on this object:
51efdd02
AMS
924
925=head2 C<flock>
926
927 $o->flock(MODE)
928
929will lock the tied file. C<MODE> has the same meaning as the second
930argument to the Perl built-in C<flock> function; for example
931C<LOCK_SH> or C<LOCK_EX | LOCK_NB>. (These constants are provided by
932the C<use Fcntl ':flock'> declaration.)
933
fa408a35 934C<MODE> is optional; C<$o-E<gt>flock> simply locks the file with
51efdd02
AMS
935C<LOCK_EX>.
936
937The best way to unlock a file is to discard the object and untie the
938array. It is probably unsafe to unlock the file without also untying
939it, because if you do, changes may remain unwritten inside the object.
940That is why there is no shortcut for unlocking. If you really want to
941unlock the file prematurely, you know what to do; if you don't know
942what to do, then don't do it.
943
944All the usual warnings about file locking apply here. In particular,
945note that file locking in Perl is B<advisory>, which means that
946holding a lock will not prevent anyone else from reading, writing, or
947erasing the file; it only prevents them from getting another lock at
948the same time. Locks are analogous to green traffic lights: If you
949have a green light, that does not prevent the idiot coming the other
950way from plowing into you sideways; it merely guarantees to you that
951the idiot does not also have a green light at the same time.
b5aed31e 952
fa408a35
AMS
953=head2 Tying to an already-opened filehandle
954
955If C<$fh> is a filehandle, such as is returned by C<IO::File> or one
956of the other C<IO> modules, you may use:
957
958 tie @array, 'Tie::File', $fh, ...;
959
960Similarly if you opened that handle C<FH> with regular C<open> or
961C<sysopen>, you may use:
962
963 tie @array, 'Tie::File', \*FH, ...;
964
965Handles that were opened write-only won't work. Handles that were
966opened read-only will work as long as you don't try to write to them.
967Handles must be attached to seekable sources of data---that means no
b3fe5a4c
AMS
968pipes or sockets. If you supply a non-seekable handle, the C<tie>
969call will try to abort your program.
fa408a35 970
b5aed31e
AMS
971=head1 CAVEATS
972
973(That's Latin for 'warnings'.)
974
b3fe5a4c
AMS
975=over 4
976
977=item *
978
979This is BETA RELEASE SOFTWARE. It may have bugs. See the discussion
980below about the (lack of any) warranty.
981
982=item *
b5aed31e
AMS
983
984Every effort was made to make this module efficient. Nevertheless,
985changing the size of a record in the middle of a large file will
b3fe5a4c
AMS
986always be fairly slow, because everything after the new record must be
987moved.
b5aed31e 988
b3fe5a4c
AMS
989In particular, note that the following innocent-looking loop has very
990bad behavior:
b5aed31e 991
b3fe5a4c
AMS
992 # million-line file
993 for (@file_array) {
994 $_ .= 'x';
995 }
b5aed31e 996
b3fe5a4c
AMS
997This is likely to be very slow, because the first iteration must
998relocate lines 1 through 999,999; the second iteration must relocate
999lines 2 through 999,999, and so on. The relocation is done using
1000block writes, however, so it's not as slow as it might be.
b5aed31e 1001
836d9961
JH
1002A soon-to-be-released version of this module will provide a mechanism
1003for getting better performance in such cases, by deferring the writing
b3fe5a4c
AMS
1004until it can be done all at once. This deferred writing feature might
1005be enabled automagically if C<Tie::File> guesses that you are about to write many consecutive records. To disable this feature, use
1006
1007 (tied @o)->autodefer(0);
1008
1009(At present, this call does nothing.)
1010
1011=item *
1012
1013The behavior of tied arrays is not precisely the same as for regular
1014arrays. For example:
b5aed31e 1015
b3fe5a4c
AMS
1016 undef $a[10]; print "How unusual!\n" if $a[10];
1017
1018C<undef>-ing a C<Tie::File> array element just blanks out the
1019corresponding record in the file. When you read it back again, you'll
1020see the record separator (typically, $a[10] will appear to contain
1021"\n") so the supposedly-C<undef>'ed value will be true.
b5aed31e 1022
b3fe5a4c
AMS
1023There are other minor differences, but in general, the correspondence
1024is extremely close.
1025
1026=item *
1027
1028Not quite every effort was made to make this module as efficient as
b5aed31e
AMS
1029possible. C<FETCHSIZE> should use binary search instead of linear
1030search. The cache's LRU queue should be a heap instead of a list.
1031These defects are probably minor; in any event, they will be fixed in
1032a later version of the module.
1033
b3fe5a4c 1034=item *
b5aed31e
AMS
1035
1036The author has supposed that since this module is concerned with file
1037I/O, almost all normal use of it will be heavily I/O bound, and that
1038the time to maintain complicated data structures inside the module
1039will be dominated by the time to actually perform the I/O. This
fa408a35 1040suggests, for example, that an LRU read-cache is a good tradeoff,
b5aed31e
AMS
1041even if it requires substantial adjustment following a C<splice>
1042operation.
1043
b3fe5a4c 1044=back
51efdd02 1045
b3fe5a4c 1046=head1 WHAT ABOUT C<DB_File>?
51efdd02 1047
b3fe5a4c
AMS
1048C<DB_File>'s C<DB_RECNO> feature does something similar to
1049C<Tie::File>, but there are a number of reasons that you might prefer
1050C<Tie::File>. C<DB_File> is a great piece of software, but the
1051C<DB_RECNO> part is less great than the rest of it.
b5aed31e 1052
b3fe5a4c 1053=over 4
51efdd02 1054
b3fe5a4c 1055=item *
51efdd02 1056
b3fe5a4c
AMS
1057C<DB_File> reads your entire file into memory, modifies it in memory,
1058and the writes out the entire file again when you untie the file.
1059This is completely impractical for large files.
1060
1061C<Tie::File> does not do any of those things. It doesn't try to read
1062the entire file into memory; instead it uses a lazy approach and
1063caches recently-used records. The cache size is strictly bounded by
1064the C<memory> option. DB_File's C<-E<gt>{cachesize}> doesn't prevent
1065your process from blowing up when reading a big file.
1066
1067=item *
1068
1069C<DB_File> has an extremely poor writing strategy. If you have a
1070ten-megabyte file and tie it with C<DB_File>, and then use
1071
1072 $a[0] =~ s/PERL/Perl/;
1073
1074C<DB_file> will then read the entire ten-megabyte file into memory, do
1075the change, and write the entire file back to disk, reading ten
1076megabytes and writing ten megabytes. C<Tie::File> will read and write
1077only the first record.
1078
1079If you have a million-record file and tie it with C<DB_File>, and then
1080use
1081
1082 $a[999998] =~ s/Larry/Larry Wall/;
1083
1084C<DB_File> will read the entire million-record file into memory, do
1085the change, and write the entire file back to disk. C<Tie::File> will
1086only rewrite records 999998 and 999999. During the writing process,
1087it will never have more than a few kilobytes of data in memory at any
1088time, even if the two records are very large.
1089
1090=item *
1091
1092Since changes to C<DB_File> files only appear when you do C<untie>, it
1093can be inconvenient to arrange for concurrent access to the same file
1094by two or more processes. Each process needs to call C<$db-E<gt>sync>
1095after every write. When you change a C<Tie::File> array, the changes
1096are reflected in the file immediately; no explicit C<-E<gt>sync> call
1097is required. (The forthcoming "deferred writing" mode will allow you
1098to request that writes be held in memory until explicitly C<sync>'ed.)
1099
1100=item *
1101
1102C<DB_File> is only installed by default if you already have the C<db>
1103library on your system; C<Tie::File> is pure Perl and is installed by
1104default no matter what. Starting with Perl 5.7.3 you can be
1105absolutely sure it will be everywhere. You will never have that
1106surety with C<DB_File>. If you don't have C<DB_File> yet, it requires
1107a C compiler. You can install C<Tie::File> from CPAN in five minutes
1108with no compiler.
1109
1110=item *
1111
1112C<DB_File> is written in C, so if you aren't allowed to install
1113modules on your system, it is useless. C<Tie::File> is written in Perl,
1114so even if you aren't allowed to install modules, you can look into
1115the source code, see how it works, and copy the subroutines or the
1116ideas from the subroutines directly into your own Perl program.
1117
1118=item *
1119
1120Except in very old, unsupported versions, C<DB_File>'s free license
1121requires that you distribute the source code for your entire
1122application. If you are not able to distribute the source code for
1123your application, you must negotiate an alternative license from
1124Sleepycat, possibly for a fee. Tie::File is under the Perl Artistic
1125license and can be distributed free under the same terms as Perl
1126itself.
1127
1128=back
b5aed31e
AMS
1129
1130=head1 AUTHOR
1131
1132Mark Jason Dominus
1133
1134To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
1135
1136To receive an announcement whenever a new version of this module is
1137released, send a blank email message to
1138C<mjd-perl-tiefile-subscribe@plover.com>.
1139
1140=head1 LICENSE
1141
b3fe5a4c 1142C<Tie::File> version 0.19 is copyright (C) 2002 Mark Jason Dominus.
7b6b3db1
JH
1143
1144This library is free software; you may redistribute it and/or modify
1145it under the same terms as Perl itself.
b5aed31e 1146
7b6b3db1
JH
1147These terms include your choice of (1) the Perl Artistic Licence, or
1148(2) version 2 of the GNU General Public License as published by the
1149Free Software Foundation, or (3) any later version of the GNU General
1150Public License.
b5aed31e 1151
7b6b3db1 1152This library is distributed in the hope that it will be useful,
b5aed31e
AMS
1153but WITHOUT ANY WARRANTY; without even the implied warranty of
1154MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1155GNU General Public License for more details.
1156
1157You should have received a copy of the GNU General Public License
7b6b3db1
JH
1158along with this library program; it should be in the file C<COPYING>.
1159If not, write to the Free Software Foundation, Inc., 59 Temple Place,
1160Suite 330, Boston, MA 02111 USA
b5aed31e
AMS
1161
1162For licensing inquiries, contact the author at:
1163
1164 Mark Jason Dominus
1165 255 S. Warnock St.
1166 Philadelphia, PA 19107
1167
1168=head1 WARRANTY
1169
b3fe5a4c 1170C<Tie::File> version 0.19 comes with ABSOLUTELY NO WARRANTY.
b5aed31e
AMS
1171For details, see the license.
1172
fa408a35
AMS
1173=head1 THANKS
1174
1175Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the
1176core when I hadn't written it yet, and for generally being helpful,
1177supportive, and competent. (Usually the rule is "choose any one.")
1178Also big thanks to Abhijit Menon-Sen for all of the same things.
1179
1180Special thanks to Craig Berry (for VMS portability help), Randy Kobes
b3fe5a4c
AMS
1181(for Win32 portability help), Clinton Pierce and Autrijus Tang (for
1182heroic eleventh-hour Win32 testing above and beyond the call of duty),
1183and the rest of the CPAN testers (for testing generally).
b5aed31e 1184
fa408a35 1185More thanks to:
b3fe5a4c 1186Edward Avis /
fa408a35 1187Gerrit Haase /
b3fe5a4c 1188Nikola Knezevic /
836d9961 1189Nick Ing-Simmons /
fa408a35
AMS
1190Tassilo von Parseval /
1191H. Dieter Pearcey /
b3fe5a4c 1192Slaven Rezic /
fa408a35
AMS
1193Peter Somu /
1194Tels
7b6b3db1 1195
fa408a35
AMS
1196=head1 TODO
1197
1198Test DELETE machinery more carefully.
b5aed31e 1199
b3fe5a4c
AMS
1200More tests. (C<mode> option. _twrite should be tested separately,
1201because there are a lot of weird special cases lurking in there.)
b5aed31e
AMS
1202
1203More tests. (Stuff I didn't think of yet.)
1204
b5aed31e
AMS
1205Paragraph mode?
1206
1207More tests.
1208
1209Fixed-length mode.
1210
fa408a35
AMS
1211Maybe an autolocking mode?
1212
b3fe5a4c
AMS
1213Finish deferred writing.
1214
1215Autodeferment.
1216
1217Record locking with fcntl()? Then you might support an undo log and
1218get real transactions. What a coup that would be.
1219
1220Leave-blanks mode
1221
b5aed31e
AMS
1222=cut
1223