This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
More CJK (just C, really) tables from Autrijus Tang
[perl5.git] / lib / Tie / File.pm
CommitLineData
b5aed31e
AMS
1
2package Tie::File;
3use Carp;
4use POSIX 'SEEK_SET';
5use Fcntl 'O_CREAT', 'O_RDWR';
6require 5.005;
7
1768807e 8$VERSION = "0.13";
b5aed31e
AMS
9
10# Idea: The object will always contain an array of byte offsets
11# this will be filled in as is necessary and convenient.
12# fetch will do seek-read.
13# There will be a cache parameter that controls the amount of cached *data*
14# Also an LRU queue of cached records
15# store will read the relevant record into the cache
16# If it's the same length as what is being written, it will overwrite it in
17# place; if not, it will do a from-to copying write.
18# The record separator string is also a parameter
19
20# Record numbers start at ZERO.
21
22my $DEFAULT_CACHE_SIZE = 1<<21; # 2 megabytes
23
24sub TIEARRAY {
25 if (@_ % 2 != 0) {
26 croak "usage: tie \@array, $_[0], filename, [option => value]...";
27 }
28 my ($pack, $file, %opts) = @_;
29
30 # transform '-foo' keys into 'foo' keys
31 for my $key (keys %opts) {
32 my $okey = $key;
33 if ($key =~ s/^-+//) {
34 $opts{$key} = delete $opts{$okey};
35 }
36 }
37
38 $opts{cachesize} ||= $DEFAULT_CACHE_SIZE;
39
40 # the cache is a hash instead of an array because it is likely to be
41 # sparsely populated
42 $opts{cache} = {};
43 $opts{cached} = 0; # total size of cached data
44 $opts{lru} = []; # replace with heap in later version
45
46 $opts{offsets} = [0];
47 $opts{filename} = $file;
48 $opts{recsep} = $/ unless defined $opts{recsep};
49 $opts{recseplen} = length($opts{recsep});
50 if ($opts{recseplen} == 0) {
51 croak "Empty record separator not supported by $pack";
52 }
53
54 my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
55
56 my $fh = \do { local *FH }; # only works in 5.005 and later
57 sysopen $fh, $file, $mode, 0666 or return;
58 binmode $fh;
59 { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
60 $opts{fh} = $fh;
61
62 bless \%opts => $pack;
63}
64
65sub FETCH {
66 my ($self, $n) = @_;
67
68 # check the record cache
69 { my $cached = $self->_check_cache($n);
70 return $cached if defined $cached;
71 }
72
73 unless ($#{$self->{offsets}} >= $n) {
74 my $o = $self->_fill_offsets_to($n);
75 # If it's still undefined, there is no such record, so return 'undef'
76 return unless defined $o;
77 }
78
79 my $fh = $self->{FH};
80 $self->_seek($n); # we can do this now that offsets is populated
81 my $rec = $self->_read_record;
82 $self->_cache_insert($n, $rec) if defined $rec;
83 $rec;
84}
85
86sub STORE {
87 my ($self, $n, $rec) = @_;
88
89 $self->_fixrecs($rec);
90
91 # TODO: what should we do about the cache? Install the new record
92 # in the cache only if the old version of the same record was
93 # already there?
94
95 # We need this to decide whether the new record will fit
96 # It incidentally populates the offsets table
97 # Note we have to do this before we alter the cache
98 my $oldrec = $self->FETCH($n);
99
100 # _check_cache promotes record $n to MRU. Is this correct behavior?
101 $self->{cache}{$n} = $rec if $self->_check_cache($n);
102
103 if (not defined $oldrec) {
104 # We're storing a record beyond the end of the file
105 $self->_extend_file_to($n);
106 $oldrec = $self->{recsep};
107 }
108 my $len_diff = length($rec) - length($oldrec);
109
110 $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
111
112 # now update the offsets
113 # array slice goes from element $n+1 (the first one to move)
114 # to the end
115 for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
116 $_ += $len_diff;
117 }
118}
119
120sub FETCHSIZE {
121 my $self = shift;
122 my $n = $#{$self->{offsets}};
123 while (defined ($self->_fill_offsets_to($n+1))) {
124 ++$n;
125 }
126 $n;
127}
128
129sub STORESIZE {
130 my ($self, $len) = @_;
131 my $olen = $self->FETCHSIZE;
132 return if $len == $olen; # Woo-hoo!
133
134 # file gets longer
135 if ($len > $olen) {
136 $self->_extend_file_to($len-1); # record numbers from 0 .. $len-1
137 return;
138 }
139
140 # file gets shorter
141 $self->_seek($len);
142 $self->_chop_file;
143 $#{$self->{offsets}} = $len-1;
144 my @cached = grep $_ > $len, keys %{$self->{cache}};
145 delete @{$self->{cache}}{@cached} if @cached;
146}
147
148sub SPLICE {
149 my ($self, $pos, $nrecs, @data) = @_;
150 my @result;
151
152 $pos += $self->FETCHSIZE if $pos < 0;
153
154 $self->_fixrecs(@data);
155 my $data = join '', @data;
156 my $datalen = length $data;
157 my $oldlen = 0;
158
159 # compute length of data being removed
160 for ($pos .. $pos+$nrecs-1) {
161 my $rec = $self->FETCH($_);
162 last unless defined $rec;
163 push @result, $rec;
164 $oldlen += length($rec);
165 }
166
167 $self->_fill_offsets_to($pos);
168 $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
169
170 # update the offsets table part 1
171 # compute the offsets of the new records:
172 my @new_offsets;
173 if (@data) {
174 push @new_offsets, $self->{offsets}[$pos];
175 for (0 .. $#data-1) {
176 push @new_offsets, $new_offsets[-1] + length($data[$_]);
177 }
178 }
179 splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
180
181 # update the offsets table part 2
182 # adjust the offsets of the following old records
183 for ($pos+@data .. $#{$self->{offsets}}) {
184 $self->{offsets}[$_] += $datalen - $oldlen;
185 }
186 # If we scrubbed out all known offsets, regenerate the trivial table
187 # that knows that the file does indeed start at 0.
188 $self->{offsets}[0] = 0 unless @{$self->{offsets}};
189
190 # update the read cache, part 1
191 # modified records
192 # Consider this carefully for correctness
193 for ($pos .. $pos+$nrecs-1) {
194 my $cached = $self->{cache}{$_};
195 next unless defined $cached;
196 my $new = $data[$_-$pos];
197 if (defined $new) {
198 $self->{cached} += length($new) - length($cached);
199 $self->{cache}{$_} = $new;
200 } else {
201 delete $self->{cache}{$_};
202 $self->{cached} -= length($cached);
203 }
204 }
205 # update the read cache, part 2
206 # moved records - records past the site of the change
207 # need to be renumbered
208 # Maybe merge this with the previous block?
209 for (keys %{$self->{cache}}) {
210 next unless $_ >= $pos + $nrecs;
211 $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
212 }
213
214 # fix the LRU queue
215 my(@new, @changed);
216 for (@{$self->{lru}}) {
217 if ($_ >= $pos + $nrecs) {
218 push @new, $_ + @data - $nrecs;
219 } elsif ($_ >= $pos) {
220 push @changed, $_ if $_ < $pos + @data;
221 } else {
222 push @new, $_;
223 }
224 }
225 @{$self->{lru}} = (@new, @changed);
226
227 @result;
228}
229
230# write data into the file
231# $data is the data to be written.
232# it should be written at position $pos, and should overwrite
233# exactly $len of the following bytes.
234# Note that if length($data) > $len, the subsequent bytes will have to
235# be moved up, and if length($data) < $len, they will have to
236# be moved down
237sub _twrite {
238 my ($self, $data, $pos, $len) = @_;
239
240 unless (defined $pos) {
241 die "\$pos was undefined in _twrite";
242 }
243
244 my $len_diff = length($data) - $len;
245
246 if ($len_diff == 0) { # Woo-hoo!
247 my $fh = $self->{fh};
248 $self->_seekb($pos);
249 $self->_write_record($data);
250 return; # well, that was easy.
251 }
252
253 # the two records are of different lengths
254 # our strategy here: rewrite the tail of the file,
255 # reading ahead one buffer at a time
256 # $bufsize is required to be at least as large as the data we're overwriting
257 my $bufsize = _bufsize($len_diff);
258 my ($writepos, $readpos) = ($pos, $pos+$len);
259
260 # Seems like there ought to be a way to avoid the repeated code
261 # and the special case here. The read(1) is also a little weird.
262 # Think about this.
263 do {
264 $self->_seekb($readpos);
265 my $br = read $self->{fh}, my($next_block), $bufsize;
266 my $more_data = read $self->{fh}, my($dummy), 1;
267 $self->_seekb($writepos);
268 $self->_write_record($data);
269 $readpos += $br;
270 $writepos += length $data;
271 $data = $next_block;
272 unless ($more_data) {
273 $self->_seekb($writepos);
274 $self->_write_record($next_block);
275 }
276 } while $more_data;
277
278 # There might be leftover data at the end of the file
279 $self->_chop_file if $len_diff < 0;
280}
281
282# If a record does not already end with the appropriate terminator
283# string, append one.
284sub _fixrecs {
285 my $self = shift;
286 for (@_) {
287 $_ .= $self->{recsep}
288 unless substr($_, - $self->{recseplen}) eq $self->{recsep};
289 }
290}
291
292# seek to the beginning of record #$n
293# Assumes that the offsets table is already correctly populated
294#
295# Note that $n=-1 has a special meaning here: It means the start of
296# the last known record; this may or may not be the very last record
297# in the file, depending on whether the offsets table is fully populated.
298#
299sub _seek {
300 my ($self, $n) = @_;
301 my $o = $self->{offsets}[$n];
302 defined($o)
303 or confess("logic error: undefined offset for record $n");
304 seek $self->{fh}, $o, SEEK_SET
305 or die "Couldn't seek filehandle: $!"; # "Should never happen."
306}
307
308sub _seekb {
309 my ($self, $b) = @_;
310 seek $self->{fh}, $b, SEEK_SET
311 or die "Couldn't seek filehandle: $!"; # "Should never happen."
312}
313
314# populate the offsets table up to the beginning of record $n
315# return the offset of record $n
316sub _fill_offsets_to {
317 my ($self, $n) = @_;
318 my $fh = $self->{fh};
319 local *OFF = $self->{offsets};
320 my $rec;
321
322 until ($#OFF >= $n) {
323 my $o = $OFF[-1];
324 $self->_seek(-1); # tricky -- see comment at _seek
325 $rec = $self->_read_record;
326 if (defined $rec) {
327 push @OFF, $o+length($rec);
328 } else {
329 return; # It turns out there is no such record
330 }
331 }
332
333 # we have now read all the records up to record n-1,
334 # so we can return the offset of record n
335 return $OFF[$n];
336}
337
338# assumes that $rec is already suitably terminated
339sub _write_record {
340 my ($self, $rec) = @_;
341 my $fh = $self->{fh};
342 print $fh $rec
343 or die "Couldn't write record: $!"; # "Should never happen."
344
345}
346
347sub _read_record {
348 my $self = shift;
349 my $rec;
350 { local $/ = $self->{recsep};
351 my $fh = $self->{fh};
352 $rec = <$fh>;
353 }
354 $rec;
355}
356
357sub _cache_insert {
358 my ($self, $n, $rec) = @_;
359
360 # Do not cache records that are too big to fit in the cache.
361 return unless length $rec <= $self->{cachesize};
362
363 $self->{cache}{$n} = $rec;
364 $self->{cached} += length $rec;
365 push @{$self->{lru}}, $n; # most-recently-used is at the END
366
367 $self->_cache_flush if $self->{cached} > $self->{cachesize};
368}
369
370sub _check_cache {
371 my ($self, $n) = @_;
372 my $rec;
373 return unless defined($rec = $self->{cache}{$n});
374
375 # cache hit; update LRU queue and return $rec
376 # replace this with a heap in a later version
377 @{$self->{lru}} = ((grep $_ ne $n, @{$self->{lru}}), $n);
378 $rec;
379}
380
381sub _cache_flush {
382 my ($self) = @_;
383 while ($self->{cached} > $self->{cachesize}) {
384 my $lru = shift @{$self->{lru}};
385 $self->{cached} -= length $lru;
386 delete $self->{cache}{$lru};
387 }
388}
389
390# We have read to the end of the file and have the offsets table
391# entirely populated. Now we need to write a new record beyond
392# the end of the file. We prepare for this by writing
393# empty records into the file up to the position we want
394# $n here is the record number of the last record we're going to write
395sub _extend_file_to {
396 my ($self, $n) = @_;
397 $self->_seek(-1); # position after the end of the last record
398 my $pos = $self->{offsets}[-1];
399
400 # the offsets table has one entry more than the total number of records
401 $extras = $n - ($#{$self->{offsets}} - 1);
402
403 # Todo : just use $self->{recsep} x $extras here?
404 while ($extras-- > 0) {
405 $self->_write_record($self->{recsep});
406 $pos += $self->{recseplen};
407 push @{$self->{offsets}}, $pos;
408 }
409}
410
411# Truncate the file at the current position
412sub _chop_file {
413 my $self = shift;
414 truncate $self->{fh}, tell($self->{fh});
415}
416
417# compute the size of a buffer suitable for moving
418# all the data in a file forward $n bytes
419# ($n may be negative)
420# The result should be at least $n.
421sub _bufsize {
422 my $n = shift;
423 return 8192 if $n < 0;
424 my $b = $n & ~8191;
425 $b += 8192 if $n & 8191;
426 $b;
427}
428
429
430# Given a file, make sure the cache is consistent with the
431# file contents
432sub _check_integrity {
433 my ($self, $file, $warn) = @_;
434 my $good = 1;
435 local *F;
436 open F, $file or die "Couldn't open file $file: $!";
1768807e 437 binmode F;
b5aed31e
AMS
438 local $/ = $self->{recsep};
439 unless ($self->{offsets}[0] == 0) {
440 $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
441 $good = 0;
442 }
443 while (<F>) {
444 my $n = $. - 1;
445 my $cached = $self->{cache}{$n};
446 my $offset = $self->{offsets}[$.];
447 my $ao = tell F;
448 if (defined $offset && $offset != $ao) {
449 $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
450 }
451 if (defined $cached && $_ ne $cached) {
452 $good = 0;
453 chomp $cached;
454 chomp;
455 $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
456 }
457 }
458
459 my $cachesize = 0;
460 while (my ($n, $r) = each %{$self->{cache}}) {
461 $cachesize += length($r);
462 next if $n+1 <= $.; # checked this already
463 $warn && print STDERR "# spurious caching of record $n\n";
464 $good = 0;
465 }
466 if ($cachesize != $self->{cached}) {
467 $warn && print STDERR "# cache size is $self->{cached}, should be $cachesize\n";
468 $good = 0;
469 }
470
471 my (%seen, @duplicate);
472 for (@{$self->{lru}}) {
473 $seen{$_}++;
474 if (not exists $self->{cache}{$_}) {
475 print "# $_ is mentioned in the LRU queue, but not in the cache\n";
476 $good = 0;
477 }
478 }
479 @duplicate = grep $seen{$_}>1, keys %seen;
480 if (@duplicate) {
481 my $records = @duplicate == 1 ? 'Record' : 'Records';
482 my $appear = @duplicate == 1 ? 'appears' : 'appear';
483 print "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
484 $good = 0;
485 }
486 for (keys %{$self->{cache}}) {
487 unless (exists $seen{$_}) {
488 print "# $record $_ is in the cache but not the LRU queue\n";
489 $good = 0;
490 }
491 }
492
493 $good;
494}
495
496=head1 NAME
497
498Tie::File - Access the lines of a disk file via a Perl array
499
500=head1 SYNOPSIS
501
1768807e 502 # This file documents Tie::File version 0.13
b5aed31e
AMS
503
504 tie @array, 'Tie::File', filename or die ...;
505
506 $array[13] = 'blah'; # line 13 of the file is now 'blah'
507 print $array[42]; # display line 42 of the file
508
509 $n_recs = @array; # how many records are in the file?
510 $#array = $n_recs - 2; # chop records off the end
511
512 # As you would expect
513 @old_recs = splice @array, 3, 7, new recs...;
514
515 untie @array; # all finished
516
517=head1 DESCRIPTION
518
519C<Tie::File> represents a regular text file as a Perl array. Each
520element in the array corresponds to a record in the file. The first
521line of the file is element 0 of the array; the second line is element
5221, and so on.
523
524The file is I<not> loaded into memory, so this will work even for
525gigantic files.
526
527Changes to the array are reflected in the file immediately.
528
529=head2 C<recsep>
530
531What is a 'record'? By default, the meaning is the same as for the
532C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
533probably C<"\n"> or C<"\r\n">. You may change the definition of
534"record" by supplying the C<recsep> option in the C<tie> call:
535
536 tie @array, 'Tie::File', $file, recsep => 'es';
537
538This says that records are delimited by the string C<es>. If the file contained the following data:
539
540 Curse these pesky flies!\n
541
542then the C<@array> would appear to have four elements:
543
544 "Curse thes"
545 "e pes"
546 "ky flies"
547 "!\n"
548
549An undefined value is not permitted as a record separator. Perl's
550special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
551emulated.
552
553Records read from the tied array will have the record separator string
554on the end, just as if they were read from the C<E<lt>...E<gt>>
555operator. Records stored into the array will have the record
556separator string appended before they are written to the file, if they
557don't have one already. For example, if the record separator string
558is C<"\n">, then the following two lines do exactly the same thing:
559
560 $array[17] = "Cherry pie";
561 $array[17] = "Cherry pie\n";
562
563The result is that the contents of line 17 of the file will be
564replaced with "Cherry pie"; a newline character will separate line 17
565from line 18. This means that inparticular, this will do nothing:
566
567 chomp $array[17];
568
569Because the C<chomp>ed value will have the separator reattached when
570it is written back to the file. There is no way to create a file
571whose trailing record separator string is missing.
572
573Inserting records that I<contain> the record separator string will
574produce a reasonable result, but if you can't foresee what this result
575will be, you'd better avoid doing this.
576
577=head2 C<mode>
578
579Normally, the specified file will be opened for read and write access,
580and will be created if it does not exist. (That is, the flags
581C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
582change this, you may supply alternative flags in the C<mode> option.
583See L<Fcntl> for a listing of available flags.
584For example:
585
586 # open the file if it exists, but fail if it does not exist
587 use Fcntl 'O_RDWR';
588 tie @array, 'Tie::File', $file, mode => O_RDWR;
589
590 # create the file if it does not exist
591 use Fcntl 'O_RDWR', 'O_CREAT';
592 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
593
594 # open an existing file in read-only mode
595 use Fcntl 'O_RDONLY';
596 tie @array, 'Tie::File', $file, mode => O_RDONLY;
597
598Opening the data file in write-only or append mode is not supported.
599
600=head2 C<cachesize>
601
602Records read in from the file are cached, to avoid having to re-read
603them repeatedly. If you read the same record twice, the first time it
604will be stored in memory, and the second time it will be fetched from
605memory.
606
607The cache has a bounded size; when it exceeds this size, the
608least-recently visited records will be purged from the cache. The
609default size is 2Mib. You can adjust the amount of space used for the
610cache by supplying the C<cachesize> option. The argument is the desired cache size, in bytes.
611
612 # I have a lot of memory, so use a large cache to speed up access
613 tie @array, 'Tie::File', $file, cachesize => 20_000_000;
614
615Setting the cache size to 0 will inhibit caching; records will be
616fetched from disk every time you examine them.
617
618=head2 Option Format
619
620C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
621C<recsep>. C<-cachesize> is a synonym for C<cachesize>. You get the
622idea.
623
624=head1 Public Methods
625
626The C<tie> call returns an object, say C<$o>. You may call
627
628 $rec = $o->FETCH($n);
629 $o->STORE($n, $rec);
630
631to fetch or store the record at line C<$n>, respectively. There are
632no other public methods in this package.
633
634=head1 CAVEATS
635
636(That's Latin for 'warnings'.)
637
638=head2 Efficiency Note
639
640Every effort was made to make this module efficient. Nevertheless,
641changing the size of a record in the middle of a large file will
642always be slow, because everything after the new record must be move.
643
644In particular, note that:
645
646 # million-line file
647 for (@file_array) {
648 $_ .= 'x';
649 }
650
651is likely to be very slow, because the first iteration must relocate
652lines 1 through 999,999; the second iteration must relocate lines 2
653through 999,999, and so on. The relocation is done using block
654writes, however, so it's not as slow as it might be.
655
656A future version of this module will provide some mechanism for
657getting better performance in such cases, by deferring the writing
658until it can be done all at once.
659
660=head2 Efficiency Note 2
661
662Not every effort was made to make this module as efficient as
663possible. C<FETCHSIZE> should use binary search instead of linear
664search. The cache's LRU queue should be a heap instead of a list.
665These defects are probably minor; in any event, they will be fixed in
666a later version of the module.
667
668=head2 Efficiency Note 3
669
670The author has supposed that since this module is concerned with file
671I/O, almost all normal use of it will be heavily I/O bound, and that
672the time to maintain complicated data structures inside the module
673will be dominated by the time to actually perform the I/O. This
674suggests, for example, that and LRU read-cache is a good tradeoff,
675even if it requires substantial adjustment following a C<splice>
676operation.
677
678=head2 Missing Methods
679
680The tied array does not yet support C<push>, C<pop>, C<shift>,
681C<unshift>, C<splice>, or size-setting via C<$#array = $n>. I will
682put these in soon.
683
684=head1 AUTHOR
685
686Mark Jason Dominus
687
688To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
689
690To receive an announcement whenever a new version of this module is
691released, send a blank email message to
692C<mjd-perl-tiefile-subscribe@plover.com>.
693
694=head1 LICENSE
695
1768807e 696C<Tie::File> version 0.13 is copyright (C) 2002 Mark Jason Dominus.
b5aed31e
AMS
697
698This program is free software; you can redistribute it and/or modify
699it under the terms of the GNU General Public License as published by
700the Free Software Foundation; either version 2 of the License, or (at
701your option) any later version.
702
703This program is distributed in the hope that it will be useful,
704but WITHOUT ANY WARRANTY; without even the implied warranty of
705MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
706GNU General Public License for more details.
707
708You should have received a copy of the GNU General Public License
709along with this program; it should be in the file C<COPYING>. If not,
710write to the Free Software Foundation, Inc., 59 Temple Place, Suite
711330, Boston, MA 02111 USA
712
713For licensing inquiries, contact the author at:
714
715 Mark Jason Dominus
716 255 S. Warnock St.
717 Philadelphia, PA 19107
718
719=head1 WARRANTY
720
1768807e 721C<Tie::File> version 0.13 comes with ABSOLUTELY NO WARRANTY.
b5aed31e
AMS
722For details, see the license.
723
724=head1 TODO
725
726C<push>, C<pop>, C<shift>, C<unshift>.
727
728More tests. (Configuration options, cache flushery. _twrite shoule
729be tested separately, because there are a lot of weird special cases
730lurking in there.)
731
732More tests. (Stuff I didn't think of yet.)
733
734File locking.
735
736Deferred writing. (!!!)
737
738Paragraph mode?
739
740More tests.
741
742Fixed-length mode.
743
744=cut
745