This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add Tie::File 0.12 from MJD.
[perl5.git] / lib / Tie / File.pm
CommitLineData
b5aed31e
AMS
1
2package Tie::File;
3use Carp;
4use POSIX 'SEEK_SET';
5use Fcntl 'O_CREAT', 'O_RDWR';
6require 5.005;
7
8$VERSION = "0.12";
9
10# Idea: The object will always contain an array of byte offsets
11# this will be filled in as is necessary and convenient.
12# fetch will do seek-read.
13# There will be a cache parameter that controls the amount of cached *data*
14# Also an LRU queue of cached records
15# store will read the relevant record into the cache
16# If it's the same length as what is being written, it will overwrite it in
17# place; if not, it will do a from-to copying write.
18# The record separator string is also a parameter
19
20# Record numbers start at ZERO.
21
22my $DEFAULT_CACHE_SIZE = 1<<21; # 2 megabytes
23
# Constructor for the tied array: tie @array, 'Tie::File', $filename, %opts.
# Recognized options are mode, recsep, and cachesize, with or without a
# leading dash.  Returns the blessed object on success, or false if the
# file cannot be opened.
sub TIEARRAY {
  if (@_ % 2 != 0) {
    croak "usage: tie \@array, $_[0], filename, [option => value]...";
  }
  my ($pack, $file, %opts) = @_;

  # transform '-foo' keys into 'foo' keys
  for my $key (keys %opts) {
    my $okey = $key;
    if ($key =~ s/^-+//) {
      $opts{$key} = delete $opts{$okey};
    }
  }

  $opts{cachesize} ||= $DEFAULT_CACHE_SIZE;

  # the cache is a hash instead of an array because it is likely to be
  # sparsely populated
  $opts{cache} = {};
  $opts{cached} = 0;            # total size of cached data, in bytes
  $opts{lru} = [];              # replace with heap in later version

  $opts{offsets} = [0];
  # FIX: this line was garbled as the syntactically invalid
  # '$opts(unknown) = $file'; store the filename under the 'file' key
  $opts{file} = $file;
  $opts{recsep} = $/ unless defined $opts{recsep};
  $opts{recseplen} = length($opts{recsep});
  if ($opts{recseplen} == 0) {
    croak "Empty record separator not supported by $pack";
  }

  my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;

  my $fh = \do { local *FH };   # anonymous glob ref; only works in 5.005 and later
  sysopen $fh, $file, $mode, 0666 or return;
  binmode $fh;
  { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
  $opts{fh} = $fh;

  bless \%opts => $pack;
}
64
# Return record $n from the file (with its record separator attached),
# or undef if there is no such record.  Consults the read cache first,
# and caches any record it has to read from disk.
sub FETCH {
  my ($self, $n) = @_;

  # check the record cache
  { my $cached = $self->_check_cache($n);
    return $cached if defined $cached;
  }

  # make sure we know the byte offset of record $n
  unless ($#{$self->{offsets}} >= $n) {
    my $o = $self->_fill_offsets_to($n);
    # If it's still undefined, there is no such record, so return 'undef'
    return unless defined $o;
  }

  # FIX: removed the dead "my $fh = $self->{FH}" line -- the hash key is
  # 'fh', not 'FH', and the variable was never used anyway
  $self->_seek($n);             # we can do this now that offsets is populated
  my $rec = $self->_read_record;
  $self->_cache_insert($n, $rec) if defined $rec;
  $rec;
}
85
# Store $rec as record number $n, replacing the previous contents.
# A missing record separator is appended first.  Storing past the
# current end of file pads the gap with empty records.
sub STORE {
  my ($self, $n, $rec) = @_;

  $self->_fixrecs($rec);

  # We need the old record to decide whether the new record will fit.
  # Fetching it incidentally populates the offsets table (and may cache
  # the old record), so we must do it before we alter the cache below.
  my $oldrec = $self->FETCH($n);

  # If the old version of record $n is cached, install the new text.
  # (_check_cache also promotes record $n to MRU.)
  # FIX: test defined() rather than truth -- a record whose text is
  # false (e.g. "0" under a "0" record separator) was previously
  # skipped -- and keep the cached-bytes count in step with the size
  # change, which the old code forgot to do.
  if (defined $self->_check_cache($n)) {
    $self->{cached} += length($rec) - length($self->{cache}{$n});
    $self->{cache}{$n} = $rec;
  }

  if (not defined $oldrec) {
    # We're storing a record beyond the end of the file
    $self->_extend_file_to($n);
    $oldrec = $self->{recsep};
  }
  my $len_diff = length($rec) - length($oldrec);

  $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));

  # now update the offsets
  # array slice goes from element $n+1 (the first one to move)
  # to the end
  for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
    $_ += $len_diff;
  }
}
119
# Return the number of records in the file, scanning forward from the
# last known offset until no further record can be read.
# (Linear scan -- see "Efficiency Note 2" in the POD.)
sub FETCHSIZE {
  my $self = shift;
  my $n = $#{$self->{offsets}};
  ++$n while defined $self->_fill_offsets_to($n + 1);
  return $n;
}
128
# Set the number of records in the file to $len, either by padding the
# file with empty records or by truncating it.
sub STORESIZE {
  my ($self, $len) = @_;
  my $olen = $self->FETCHSIZE;
  return if $len == $olen;      # Woo-hoo!

  # file gets longer
  if ($len > $olen) {
    $self->_extend_file_to($len-1); # record numbers from 0 .. $len-1
    return;
  }

  # file gets shorter
  $self->_seek($len);
  $self->_chop_file;
  $#{$self->{offsets}} = $len-1;

  # Discard cached copies of the records that no longer exist.
  # FIX: was '$_ > $len', which wrongly kept a stale copy of record $len
  # (records are numbered 0 .. $len-1 after truncation)
  my @cached = grep $_ >= $len, keys %{$self->{cache}};
  if (@cached) {
    # FIX: keep the cached-bytes count and the LRU queue consistent with
    # the deletions; the old code deleted the entries but left both stale
    $self->{cached} -= length($self->{cache}{$_}) for @cached;
    delete @{$self->{cache}}{@cached};
    @{$self->{lru}} = grep $_ < $len, @{$self->{lru}};
  }
}
147
# Remove $nrecs records starting at record $pos, replacing them with
# @data, as in Perl's built-in splice.  Returns the list of records
# removed.  Rewrites the file tail once, then patches up the offsets
# table, the read cache, and the LRU queue to match the new layout.
sub SPLICE {
  my ($self, $pos, $nrecs, @data) = @_;
  my @result;

  # a negative position counts back from the end of the file
  $pos += $self->FETCHSIZE if $pos < 0;

  $self->_fixrecs(@data);
  my $data = join '', @data;    # all new records as one string, written in one go
  my $datalen = length $data;
  my $oldlen = 0;

  # compute length of data being removed
  for ($pos .. $pos+$nrecs-1) {
    my $rec = $self->FETCH($_);
    last unless defined $rec;   # fewer than $nrecs records actually exist
    push @result, $rec;
    $oldlen += length($rec);
  }

  $self->_fill_offsets_to($pos);
  $self->_twrite($data, $self->{offsets}[$pos], $oldlen);

  # update the offsets table part 1
  # compute the offsets of the new records:
  my @new_offsets;
  if (@data) {
    push @new_offsets, $self->{offsets}[$pos];
    for (0 .. $#data-1) {
      push @new_offsets, $new_offsets[-1] + length($data[$_]);
    }
  }
  splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);

  # update the offsets table part 2
  # adjust the offsets of the following old records
  for ($pos+@data .. $#{$self->{offsets}}) {
    $self->{offsets}[$_] += $datalen - $oldlen;
  }
  # If we scrubbed out all known offsets, regenerate the trivial table
  # that knows that the file does indeed start at 0.
  $self->{offsets}[0] = 0 unless @{$self->{offsets}};

  # update the read cache, part 1
  # modified records
  # Consider this carefully for correctness
  for ($pos .. $pos+$nrecs-1) {
    my $cached = $self->{cache}{$_};
    next unless defined $cached;
    my $new = $data[$_-$pos];
    if (defined $new) {
      # a new record landed on this slot: swap it in, adjusting byte count
      $self->{cached} += length($new) - length($cached);
      $self->{cache}{$_} = $new;
    } else {
      # slot removed with no replacement: drop it from the cache
      delete $self->{cache}{$_};
      $self->{cached} -= length($cached);
    }
  }
  # update the read cache, part 2
  # moved records - records past the site of the change
  # need to be renumbered
  # Maybe merge this with the previous block?
  for (keys %{$self->{cache}}) {
    next unless $_ >= $pos + $nrecs;
    $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
  }

  # fix the LRU queue: entries past the splice shift by (@data - $nrecs);
  # entries inside the spliced range survive only if a new record
  # actually landed on their slot
  my(@new, @changed);
  for (@{$self->{lru}}) {
    if ($_ >= $pos + $nrecs) {
      push @new, $_ + @data - $nrecs;
    } elsif ($_ >= $pos) {
      push @changed, $_ if $_ < $pos + @data;
    } else {
      push @new, $_;
    }
  }
  @{$self->{lru}} = (@new, @changed);

  @result;
}
229
230# write data into the file
231# $data is the data to be written.
232# it should be written at position $pos, and should overwrite
233# exactly $len of the following bytes.
234# Note that if length($data) > $len, the subsequent bytes will have to
235# be moved up, and if length($data) < $len, they will have to
236# be moved down
# Write $data into the file at byte position $pos, overwriting exactly
# $len of the following bytes.  If length($data) != $len, the rest of
# the file is shifted up or down by rewriting it one buffer at a time.
sub _twrite {
  my ($self, $data, $pos, $len) = @_;

  unless (defined $pos) {
    die "\$pos was undefined in _twrite";
  }

  my $len_diff = length($data) - $len;

  if ($len_diff == 0) {         # Woo-hoo!
    my $fh = $self->{fh};
    $self->_seekb($pos);
    $self->_write_record($data);
    return;                     # well, that was easy.
  }

  # the two records are of different lengths
  # our strategy here: rewrite the tail of the file,
  # reading ahead one buffer at a time
  # $bufsize is required to be at least as large as the data we're overwriting
  my $bufsize = _bufsize($len_diff);
  my ($writepos, $readpos) = ($pos, $pos+$len);

  # FIX: declare these OUTSIDE the do-block.  The 'while $more_data'
  # condition of a do-BLOCK is not in the block's lexical scope, so the
  # old 'my $more_data' inside the block meant the condition tested a
  # different, never-set variable and the loop always ran exactly once,
  # corrupting any shift larger than one buffer.
  my ($next_block, $more_data);
  do {
    $self->_seekb($readpos);
    my $br = read $self->{fh}, $next_block, $bufsize;
    $more_data = read $self->{fh}, my($dummy), 1;  # peek: is there anything after this block?
    $self->_seekb($writepos);
    $self->_write_record($data);
    $readpos += $br;
    $writepos += length $data;
    $data = $next_block;
    unless ($more_data) {
      # last iteration: flush the final carried block
      $self->_seekb($writepos);
      $self->_write_record($next_block);
    }
  } while $more_data;

  # There might be leftover data at the end of the file
  $self->_chop_file if $len_diff < 0;
}
281
282# If a record does not already end with the appropriate terminator
283# string, append one.
# Ensure every argument string ends with the record separator, appending
# it when absent.  Operates on the @_ aliases, so the caller's own
# variables are modified in place.
sub _fixrecs {
  my $self = shift;
  my ($sep, $seplen) = ($self->{recsep}, $self->{recseplen});
  for my $rec (@_) {
    $rec .= $sep unless substr($rec, -$seplen) eq $sep;
  }
}
291
292# seek to the beginning of record #$n
293# Assumes that the offsets table is already correctly populated
294#
295# Note that $n=-1 has a special meaning here: It means the start of
296# the last known record; this may or may not be the very last record
297# in the file, depending on whether the offsets table is fully populated.
298#
# Position the filehandle at the start of record #$n, which must already
# be present in the offsets table.  As a special case, $n == -1 seeks to
# the position just past the last KNOWN record (which may or may not be
# the true end of the file).
sub _seek {
  my ($self, $n) = @_;
  my $offset = $self->{offsets}[$n];
  confess("logic error: undefined offset for record $n")
    unless defined $offset;
  seek($self->{fh}, $offset, SEEK_SET)
    or die "Couldn't seek filehandle: $!"; # "Should never happen."
}
307
# Seek the filehandle to an absolute byte position $byte.
sub _seekb {
  my ($self, $byte) = @_;
  seek($self->{fh}, $byte, SEEK_SET)
    or die "Couldn't seek filehandle: $!"; # "Should never happen."
}
313
314# populate the offsets table up to the beginning of record $n
315# return the offset of record $n
# Populate the offsets table up to the beginning of record $n by reading
# records forward from the last known position.  Returns the byte offset
# of record $n, or nothing if the file ends before record $n exists.
sub _fill_offsets_to {
  my ($self, $n) = @_;
  my $fh = $self->{fh};
  # Glob assignment aliases @OFF to the offsets array, so pushes below
  # update $self->{offsets} directly.
  local *OFF = $self->{offsets};
  my $rec;

  until ($#OFF >= $n) {
    my $o = $OFF[-1];
    $self->_seek(-1);           # seek to start of last KNOWN record -- see comment at _seek
    $rec = $self->_read_record;
    if (defined $rec) {
      # next record starts where this one started plus its length
      push @OFF, $o+length($rec);
    } else {
      return;                   # It turns out there is no such record
    }
  }

  # we have now read all the records up to record n-1,
  # so we can return the offset of record n
  return $OFF[$n];
}
337
338# assumes that $rec is already suitably terminated
# Write $rec to the file at the current position.  Assumes $rec is
# already suitably terminated with the record separator.
sub _write_record {
  my ($self, $rec) = @_;
  my $fh = $self->{fh};
  print {$fh} $rec
    or die "Couldn't write record: $!"; # "Should never happen."
}
346
# Read the next record from the current file position, using the
# object's record separator as $/.  Returns the record (separator
# attached), or undef at end of file.
sub _read_record {
  my $self = shift;
  local $/ = $self->{recsep};  # restored automatically when the sub exits
  my $fh = $self->{fh};
  my $rec = <$fh>;
  return $rec;
}
356
# Insert record $n with text $rec into the read cache, marking it as
# most-recently-used, and flush old entries if the cache grew too big.
# Records larger than the whole cache are never cached.
sub _cache_insert {
  my ($self, $n, $rec) = @_;

  my $size = length $rec;
  return if $size > $self->{cachesize};  # too big to ever fit

  $self->{cache}{$n} = $rec;
  $self->{cached} += $size;
  push @{$self->{lru}}, $n;              # most-recently-used is at the END

  $self->_cache_flush if $self->{cached} > $self->{cachesize};
}
369
# Look up record $n in the read cache.  On a hit, promote it to
# most-recently-used (the end of the LRU queue) and return its text;
# on a miss, return nothing.
# (Queue scan is linear; replace with a heap in a later version.)
sub _check_cache {
  my ($self, $n) = @_;
  my $rec = $self->{cache}{$n};
  return unless defined $rec;

  my @others = grep { $_ ne $n } @{$self->{lru}};
  @{$self->{lru}} = (@others, $n);
  return $rec;
}
380
# Evict least-recently-used records until the cached data fits within
# the configured cache size.
sub _cache_flush {
  my ($self) = @_;
  while ($self->{cached} > $self->{cachesize}) {
    my $lru = shift @{$self->{lru}};  # record NUMBER of the LRU entry
    # FIX: subtract the size of the evicted RECORD, not the length of
    # its record number ('length $lru'); the old code corrupted the
    # byte count and could loop forever once the LRU queue emptied.
    $self->{cached} -= length $self->{cache}{$lru};
    delete $self->{cache}{$lru};
  }
}
389
390# We have read to the end of the file and have the offsets table
391# entirely populated. Now we need to write a new record beyond
392# the end of the file. We prepare for this by writing
393# empty records into the file up to the position we want
394# $n here is the record number of the last record we're going to write
# We have read to the end of the file and have the offsets table
# entirely populated.  Now we need to write a new record beyond
# the end of the file.  We prepare for this by writing
# empty records into the file up to the position we want.
# $n here is the record number of the last record we're going to write.
sub _extend_file_to {
  my ($self, $n) = @_;
  $self->_seek(-1);             # position after the end of the last record
  my $pos = $self->{offsets}[-1];

  # the offsets table has one entry more than the total number of records
  # FIX: declare $extras with 'my' instead of leaking a package global
  my $extras = $n - ($#{$self->{offsets}} - 1);

  # Todo : just use $self->{recsep} x $extras here?
  while ($extras-- > 0) {
    $self->_write_record($self->{recsep});
    $pos += $self->{recseplen};
    push @{$self->{offsets}}, $pos;
  }
}
410
411# Truncate the file at the current position
# Truncate the file at the filehandle's current position.
sub _chop_file {
  my $self = shift;
  my $fh = $self->{fh};
  truncate $fh, tell($fh);
}
416
417# compute the size of a buffer suitable for moving
418# all the data in a file forward $n bytes
419# ($n may be negative)
420# The result should be at least $n.
# Return a buffer size suitable for moving all the data in a file
# forward $n bytes: $n rounded up to a whole number of 8192-byte
# blocks.  A negative $n (data moving backward) gets the minimum
# single block of 8192.
sub _bufsize {
  my $n = shift;
  return 8192 if $n < 0;
  my $blocks = int($n / 8192);
  $blocks++ if $n % 8192;
  return $blocks * 8192;
}
428
429
430# Given a file, make sure the cache is consistent with the
431# file contents
# Debugging/testing aid: verify that the offsets table, the read cache,
# the cached-bytes count, and the LRU queue are all consistent with the
# actual contents of $file.  Returns true if everything checks out.
# Diagnostics are printed to STDERR when $warn is true.
sub _check_integrity {
  my ($self, $file, $warn) = @_;
  my $good = 1;
  local *F;
  # FIX: 3-arg open so a funny filename cannot smuggle in an open mode
  open F, '<', $file or die "Couldn't open file $file: $!";
  local $/ = $self->{recsep};
  unless ($self->{offsets}[0] == 0) {
    $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
    $good = 0;
  }
  while (<F>) {
    my $n = $. - 1;
    my $cached = $self->{cache}{$n};
    my $offset = $self->{offsets}[$.];
    my $ao = tell F;
    if (defined $offset && $offset != $ao) {
      $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
    }
    if (defined $cached && $_ ne $cached) {
      $good = 0;
      chomp $cached;
      chomp;
      $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
    }
  }

  # every cached record must correspond to a record actually in the file,
  # and the cached-bytes count must add up
  my $cachesize = 0;
  while (my ($n, $r) = each %{$self->{cache}}) {
    $cachesize += length($r);
    next if $n+1 <= $.;         # checked this already
    $warn && print STDERR "# spurious caching of record $n\n";
    $good = 0;
  }
  if ($cachesize != $self->{cached}) {
    $warn && print STDERR "# cache size is $self->{cached}, should be $cachesize\n";
    $good = 0;
  }

  # the LRU queue and the cache must list exactly the same records,
  # and the queue must not contain duplicates
  my (%seen, @duplicate);
  for (@{$self->{lru}}) {
    $seen{$_}++;
    if (not exists $self->{cache}{$_}) {
      # FIX: route this diagnostic through $warn/STDERR like the others
      $warn && print STDERR "# $_ is mentioned in the LRU queue, but not in the cache\n";
      $good = 0;
    }
  }
  @duplicate = grep $seen{$_}>1, keys %seen;
  if (@duplicate) {
    my $records = @duplicate == 1 ? 'Record' : 'Records';
    my $appear  = @duplicate == 1 ? 'appears' : 'appear';
    $warn && print STDERR "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
    $good = 0;
  }
  for (keys %{$self->{cache}}) {
    unless (exists $seen{$_}) {
      # FIX: '$record' here was an undeclared variable that interpolated
      # as the empty string; say what we mean
      $warn && print STDERR "# record $_ is in the cache but not the LRU queue\n";
      $good = 0;
    }
  }

  $good;
}
494
495=head1 NAME
496
497Tie::File - Access the lines of a disk file via a Perl array
498
499=head1 SYNOPSIS
500
501 # This file documents Tie::File version 0.12
502
503 tie @array, 'Tie::File', filename or die ...;
504
505 $array[13] = 'blah'; # line 13 of the file is now 'blah'
506 print $array[42]; # display line 42 of the file
507
508 $n_recs = @array; # how many records are in the file?
509 $#array = $n_recs - 2; # chop records off the end
510
511 # As you would expect
512 @old_recs = splice @array, 3, 7, new recs...;
513
514 untie @array; # all finished
515
516=head1 DESCRIPTION
517
518C<Tie::File> represents a regular text file as a Perl array. Each
519element in the array corresponds to a record in the file. The first
520line of the file is element 0 of the array; the second line is element
5211, and so on.
522
523The file is I<not> loaded into memory, so this will work even for
524gigantic files.
525
526Changes to the array are reflected in the file immediately.
527
528=head2 C<recsep>
529
530What is a 'record'? By default, the meaning is the same as for the
531C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
532probably C<"\n"> or C<"\r\n">. You may change the definition of
533"record" by supplying the C<recsep> option in the C<tie> call:
534
535 tie @array, 'Tie::File', $file, recsep => 'es';
536
537This says that records are delimited by the string C<es>. If the file contained the following data:
538
539 Curse these pesky flies!\n
540
541then the C<@array> would appear to have four elements:
542
543 "Curse thes"
544 "e pes"
545 "ky flies"
546 "!\n"
547
548An undefined value is not permitted as a record separator. Perl's
549special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
550emulated.
551
552Records read from the tied array will have the record separator string
553on the end, just as if they were read from the C<E<lt>...E<gt>>
554operator. Records stored into the array will have the record
555separator string appended before they are written to the file, if they
556don't have one already. For example, if the record separator string
557is C<"\n">, then the following two lines do exactly the same thing:
558
559 $array[17] = "Cherry pie";
560 $array[17] = "Cherry pie\n";
561
562The result is that the contents of line 17 of the file will be
563replaced with "Cherry pie"; a newline character will separate line 17
from line 18.  This means that, in particular, this will do nothing:
565
566 chomp $array[17];
567
568Because the C<chomp>ed value will have the separator reattached when
569it is written back to the file. There is no way to create a file
570whose trailing record separator string is missing.
571
572Inserting records that I<contain> the record separator string will
573produce a reasonable result, but if you can't foresee what this result
574will be, you'd better avoid doing this.
575
576=head2 C<mode>
577
578Normally, the specified file will be opened for read and write access,
579and will be created if it does not exist. (That is, the flags
580C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
581change this, you may supply alternative flags in the C<mode> option.
582See L<Fcntl> for a listing of available flags.
583For example:
584
585 # open the file if it exists, but fail if it does not exist
586 use Fcntl 'O_RDWR';
587 tie @array, 'Tie::File', $file, mode => O_RDWR;
588
589 # create the file if it does not exist
590 use Fcntl 'O_RDWR', 'O_CREAT';
591 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
592
593 # open an existing file in read-only mode
594 use Fcntl 'O_RDONLY';
595 tie @array, 'Tie::File', $file, mode => O_RDONLY;
596
597Opening the data file in write-only or append mode is not supported.
598
599=head2 C<cachesize>
600
601Records read in from the file are cached, to avoid having to re-read
602them repeatedly. If you read the same record twice, the first time it
603will be stored in memory, and the second time it will be fetched from
604memory.
605
606The cache has a bounded size; when it exceeds this size, the
607least-recently visited records will be purged from the cache. The
default size is 2MiB.  You can adjust the amount of space used for the
609cache by supplying the C<cachesize> option. The argument is the desired cache size, in bytes.
610
611 # I have a lot of memory, so use a large cache to speed up access
612 tie @array, 'Tie::File', $file, cachesize => 20_000_000;
613
614Setting the cache size to 0 will inhibit caching; records will be
615fetched from disk every time you examine them.
616
617=head2 Option Format
618
619C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
620C<recsep>. C<-cachesize> is a synonym for C<cachesize>. You get the
621idea.
622
623=head1 Public Methods
624
625The C<tie> call returns an object, say C<$o>. You may call
626
627 $rec = $o->FETCH($n);
628 $o->STORE($n, $rec);
629
630to fetch or store the record at line C<$n>, respectively. There are
631no other public methods in this package.
632
633=head1 CAVEATS
634
635(That's Latin for 'warnings'.)
636
637=head2 Efficiency Note
638
639Every effort was made to make this module efficient. Nevertheless,
640changing the size of a record in the middle of a large file will
always be slow, because everything after the new record must be moved.
642
643In particular, note that:
644
645 # million-line file
646 for (@file_array) {
647 $_ .= 'x';
648 }
649
650is likely to be very slow, because the first iteration must relocate
651lines 1 through 999,999; the second iteration must relocate lines 2
652through 999,999, and so on. The relocation is done using block
653writes, however, so it's not as slow as it might be.
654
655A future version of this module will provide some mechanism for
656getting better performance in such cases, by deferring the writing
657until it can be done all at once.
658
659=head2 Efficiency Note 2
660
661Not every effort was made to make this module as efficient as
662possible. C<FETCHSIZE> should use binary search instead of linear
663search. The cache's LRU queue should be a heap instead of a list.
664These defects are probably minor; in any event, they will be fixed in
665a later version of the module.
666
667=head2 Efficiency Note 3
668
669The author has supposed that since this module is concerned with file
670I/O, almost all normal use of it will be heavily I/O bound, and that
671the time to maintain complicated data structures inside the module
672will be dominated by the time to actually perform the I/O. This
suggests, for example, that an LRU read-cache is a good tradeoff,
674even if it requires substantial adjustment following a C<splice>
675operation.
676
677=head2 Missing Methods
678
679The tied array does not yet support C<push>, C<pop>, C<shift>,
680C<unshift>, C<splice>, or size-setting via C<$#array = $n>. I will
681put these in soon.
682
683=head1 AUTHOR
684
685Mark Jason Dominus
686
687To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
688
689To receive an announcement whenever a new version of this module is
690released, send a blank email message to
691C<mjd-perl-tiefile-subscribe@plover.com>.
692
693=head1 LICENSE
694
695C<Tie::File> version 0.12 is copyright (C) 2002 Mark Jason Dominus.
696
697This program is free software; you can redistribute it and/or modify
698it under the terms of the GNU General Public License as published by
699the Free Software Foundation; either version 2 of the License, or (at
700your option) any later version.
701
702This program is distributed in the hope that it will be useful,
703but WITHOUT ANY WARRANTY; without even the implied warranty of
704MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
705GNU General Public License for more details.
706
707You should have received a copy of the GNU General Public License
708along with this program; it should be in the file C<COPYING>. If not,
709write to the Free Software Foundation, Inc., 59 Temple Place, Suite
710330, Boston, MA 02111 USA
711
712For licensing inquiries, contact the author at:
713
714 Mark Jason Dominus
715 255 S. Warnock St.
716 Philadelphia, PA 19107
717
718=head1 WARRANTY
719
720C<Tie::File> version 0.12 comes with ABSOLUTELY NO WARRANTY.
721For details, see the license.
722
723=head1 TODO
724
725C<push>, C<pop>, C<shift>, C<unshift>.
726
More tests. (Configuration options, cache flushery. _twrite should
728be tested separately, because there are a lot of weird special cases
729lurking in there.)
730
731More tests. (Stuff I didn't think of yet.)
732
733File locking.
734
735Deferred writing. (!!!)
736
737Paragraph mode?
738
739More tests.
740
741Fixed-length mode.
742
743=cut
744