This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Simple speed gainback (as it was in 5.6): for non-Unicode
[perl5.git] / lib / Tie / File.pm
CommitLineData
b5aed31e
AMS
1
2package Tie::File;
3use Carp;
4use POSIX 'SEEK_SET';
51efdd02 5use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX';
b5aed31e
AMS
6require 5.005;
7
0b28bc9a 8$VERSION = "0.20";
b5aed31e
AMS
9
10# Idea: The object will always contain an array of byte offsets
11# this will be filled in as is necessary and convenient.
12# fetch will do seek-read.
13# There will be a cache parameter that controls the amount of cached *data*
14# Also an LRU queue of cached records
15# store will read the relevant record into the cache
16# If it's the same length as what is being written, it will overwrite it in
17# place; if not, it will do a from-to copying write.
18# The record separator string is also a parameter
19
20# Record numbers start at ZERO.
21
b3fe5a4c
AMS
22my $DEFAULT_MEMORY_SIZE = 1<<21; # 2 megabytes
23
24my %good_opt = map {$_ => 1, "-$_" => 1}
0b28bc9a 25 qw(memory dw_size mode recsep discipline autochomp);
b5aed31e
AMS
26
27sub TIEARRAY {
28 if (@_ % 2 != 0) {
29 croak "usage: tie \@array, $_[0], filename, [option => value]...";
30 }
31 my ($pack, $file, %opts) = @_;
32
33 # transform '-foo' keys into 'foo' keys
34 for my $key (keys %opts) {
b3fe5a4c
AMS
35 unless ($good_opt{$key}) {
36 croak("$pack: Unrecognized option '$key'\n");
37 }
b5aed31e
AMS
38 my $okey = $key;
39 if ($key =~ s/^-+//) {
40 $opts{$key} = delete $opts{$okey};
41 }
42 }
43
b3fe5a4c
AMS
44 unless (defined $opts{memory}) {
45 # default is the larger of the default cache size and the
46 # deferred-write buffer size (if specified)
47 $opts{memory} = $DEFAULT_MEMORY_SIZE;
48 $opts{memory} = $opts{dw_size}
49 if defined $opts{dw_size} && $opts{dw_size} > $DEFAULT_MEMORY_SIZE;
50 }
51 $opts{dw_size} = $opts{memory} unless defined $opts{dw_size};
52 if ($opts{dw_size} > $opts{memory}) {
53 croak("$pack: dw_size may not be larger than total memory allocation\n");
54 }
55 $opts{deferred} = {}; # no records presently deferred
56 $opts{deferred_s} = 0; # count of total bytes in ->{deferred}
b5aed31e
AMS
57
58 # the cache is a hash instead of an array because it is likely to be
59 # sparsely populated
60 $opts{cache} = {};
61 $opts{cached} = 0; # total size of cached data
62 $opts{lru} = []; # replace with heap in later version
63
64 $opts{offsets} = [0];
65 $opts{filename} = $file;
b3fe5a4c
AMS
66 unless (defined $opts{recsep}) {
67 $opts{recsep} = _default_recsep();
68 }
b5aed31e
AMS
69 $opts{recseplen} = length($opts{recsep});
70 if ($opts{recseplen} == 0) {
71 croak "Empty record separator not supported by $pack";
72 }
73
0b28bc9a
AMS
74 $opts{autochomp} = 1 unless defined $opts{autochomp};
75
b5aed31e 76 my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
fa408a35 77 my $fh;
b5aed31e 78
fa408a35
AMS
79 if (UNIVERSAL::isa($file, 'GLOB')) {
80 unless (seek $file, 0, SEEK_SET) {
81 croak "$pack: your filehandle does not appear to be seekable";
82 }
83 $fh = $file;
84 } elsif (ref $file) {
85 croak "usage: tie \@array, $pack, filename, [option => value]...";
86 } else {
87 $fh = \do { local *FH }; # only works in 5.005 and later
88 sysopen $fh, $file, $mode, 0666 or return;
89 binmode $fh;
90 }
b5aed31e 91 { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
b3fe5a4c
AMS
92 if (defined $opts{discipline} && $] >= 5.006) {
93 # This avoids a compile-time warning under 5.005
94 eval 'binmode($fh, $opts{discipline})';
95 croak $@ if $@ =~ /unknown discipline/i;
96 die if $@;
97 }
b5aed31e
AMS
98 $opts{fh} = $fh;
99
100 bless \%opts => $pack;
101}
102
103sub FETCH {
104 my ($self, $n) = @_;
0b28bc9a
AMS
105 $self->_chomp1($self->_fetch($n));
106}
107
108# Chomp many records in-place; return nothing useful
109sub _chomp {
110 my $self = shift;
111 return unless $self->{autochomp};
112 if ($self->{autochomp}) {
113 for (@_) {
114 next unless defined;
115 substr($_, - $self->{recseplen}) = "";
116 }
117 }
118}
119
120# Chomp one record in-place; return modified record
121sub _chomp1 {
122 my ($self, $rec) = @_;
123 return $rec unless $self->{autochomp};
124 return unless defined $rec;
125 substr($rec, - $self->{recseplen}) = "";
126 $rec;
127}
128
129sub _fetch {
130 my ($self, $n) = @_;
b5aed31e
AMS
131
132 # check the record cache
133 { my $cached = $self->_check_cache($n);
134 return $cached if defined $cached;
135 }
136
137 unless ($#{$self->{offsets}} >= $n) {
138 my $o = $self->_fill_offsets_to($n);
139 # If it's still undefined, there is no such record, so return 'undef'
140 return unless defined $o;
141 }
142
143 my $fh = $self->{FH};
144 $self->_seek($n); # we can do this now that offsets is populated
145 my $rec = $self->_read_record;
b3fe5a4c
AMS
146
147# If we happen to have just read the first record, check to see if
148# the length of the record matches what 'tell' says. If not, Tie::File
149# won't work, and should drop dead.
150#
151# if ($n == 0 && defined($rec) && tell($self->{fh}) != length($rec)) {
152# if (defined $self->{discipline}) {
153# croak "I/O discipline $self->{discipline} not supported";
154# } else {
155# croak "File encoding not supported";
156# }
157# }
158
b5aed31e
AMS
159 $self->_cache_insert($n, $rec) if defined $rec;
160 $rec;
161}
162
163sub STORE {
164 my ($self, $n, $rec) = @_;
165
166 $self->_fixrecs($rec);
167
b3fe5a4c 168 return $self->_store_deferred($n, $rec) if $self->{defer};
b5aed31e
AMS
169
170 # We need this to decide whether the new record will fit
171 # It incidentally populates the offsets table
172 # Note we have to do this before we alter the cache
0b28bc9a 173 my $oldrec = $self->_fetch($n);
b5aed31e
AMS
174
175 # _check_cache promotes record $n to MRU. Is this correct behavior?
fa408a35 176 if (my $cached = $self->_check_cache($n)) {
b3fe5a4c 177 my $len_diff = length($rec) - length($cached);
fa408a35 178 $self->{cache}{$n} = $rec;
b3fe5a4c
AMS
179 $self->{cached} += $len_diff;
180 $self->_cache_flush
181 if $len_diff > 0
182 && $self->{deferred_s} + $self->{cached} > $self->{memory};
fa408a35 183 }
b5aed31e
AMS
184
185 if (not defined $oldrec) {
186 # We're storing a record beyond the end of the file
51efdd02 187 $self->_extend_file_to($n+1);
b5aed31e
AMS
188 $oldrec = $self->{recsep};
189 }
190 my $len_diff = length($rec) - length($oldrec);
191
b3fe5a4c 192 # length($oldrec) here is not consistent with text mode TODO XXX BUG
b5aed31e
AMS
193 $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
194
195 # now update the offsets
196 # array slice goes from element $n+1 (the first one to move)
197 # to the end
198 for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
199 $_ += $len_diff;
200 }
201}
202
b3fe5a4c
AMS
203sub _store_deferred {
204 my ($self, $n, $rec) = @_;
205 $self->_uncache($n);
206 my $old_deferred = $self->{deferred}{$n};
207 $self->{deferred}{$n} = $rec;
208 $self->{deferred_s} += length($rec);
209 $self->{deferred_s} -= length($old_deferred) if defined $old_deferred;
210 if ($self->{deferred_s} > $self->{dw_size}) {
211 $self->flush;
212 $self->defer; # flush clears the 'defer' flag
213 } elsif ($self->{deferred_s} + $self->{cached} > $self->{memory}) {
214 $self->_cache_flush;
215 }
216}
217
b5aed31e
AMS
218sub FETCHSIZE {
219 my $self = shift;
220 my $n = $#{$self->{offsets}};
221 while (defined ($self->_fill_offsets_to($n+1))) {
222 ++$n;
223 }
224 $n;
225}
226
227sub STORESIZE {
228 my ($self, $len) = @_;
229 my $olen = $self->FETCHSIZE;
230 return if $len == $olen; # Woo-hoo!
231
232 # file gets longer
233 if ($len > $olen) {
51efdd02 234 $self->_extend_file_to($len);
b5aed31e
AMS
235 return;
236 }
237
238 # file gets shorter
239 $self->_seek($len);
240 $self->_chop_file;
836d9961 241 $#{$self->{offsets}} = $len;
b3fe5a4c 242# $self->{offsets}[0] = 0; # in case we just chopped this
836d9961
JH
243 my @cached = grep $_ >= $len, keys %{$self->{cache}};
244 $self->_uncache(@cached);
b5aed31e
AMS
245}
246
51efdd02
AMS
247sub PUSH {
248 my $self = shift;
249 $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
250 $self->FETCHSIZE;
251}
252
253sub POP {
254 my $self = shift;
7b6b3db1
JH
255 my $size = $self->FETCHSIZE;
256 return if $size == 0;
257# print STDERR "# POPPITY POP POP POP\n";
258 scalar $self->SPLICE($size-1, 1);
51efdd02
AMS
259}
260
261sub SHIFT {
262 my $self = shift;
263 scalar $self->SPLICE(0, 1);
264}
265
266sub UNSHIFT {
267 my $self = shift;
268 $self->SPLICE(0, 0, @_);
269 $self->FETCHSIZE;
270}
271
272sub CLEAR {
273 # And enable auto-defer mode, since it's likely that they just
274 # did @a = (...);
275 my $self = shift;
276 $self->_seekb(0);
277 $self->_chop_file;
278 %{$self->{cache}} = ();
279 $self->{cached} = 0;
280 @{$self->{lru}} = ();
281 @{$self->{offsets}} = (0);
282}
283
284sub EXTEND {
285 my ($self, $n) = @_;
286 $self->_fill_offsets_to($n);
287 $self->_extend_file_to($n);
288}
289
290sub DELETE {
291 my ($self, $n) = @_;
292 my $lastrec = $self->FETCHSIZE-1;
293 if ($n == $lastrec) {
294 $self->_seek($n);
295 $self->_chop_file;
fa408a35 296 $#{$self->{offsets}}--;
836d9961 297 $self->_uncache($n);
51efdd02
AMS
298 # perhaps in this case I should also remove trailing null records?
299 } else {
300 $self->STORE($n, "");
301 }
302}
303
304sub EXISTS {
305 my ($self, $n) = @_;
306 $self->_fill_offsets_to($n);
307 0 <= $n && $n < $self->FETCHSIZE;
308}
309
b5aed31e 310sub SPLICE {
b3fe5a4c
AMS
311 my $self = shift;
312 $self->_flush if $self->{defer};
0b28bc9a
AMS
313 if (wantarray) {
314 $self->_chomp(my @a = $self->_splice(@_));
315 @a;
316 } else {
317 $self->_chomp1(scalar $self->_splice(@_));
318 }
b3fe5a4c
AMS
319}
320
321sub DESTROY {
322 $self->flush if $self->{defer};
323}
324
325sub _splice {
b5aed31e
AMS
326 my ($self, $pos, $nrecs, @data) = @_;
327 my @result;
328
7b6b3db1
JH
329 $pos = 0 unless defined $pos;
330
331 # Deal with negative and other out-of-range positions
332 # Also set default for $nrecs
51efdd02
AMS
333 {
334 my $oldsize = $self->FETCHSIZE;
7b6b3db1 335 $nrecs = $oldsize unless defined $nrecs;
51efdd02
AMS
336 my $oldpos = $pos;
337
338 if ($pos < 0) {
339 $pos += $oldsize;
340 if ($pos < 0) {
341 croak "Modification of non-creatable array value attempted, subscript $oldpos";
342 }
343 }
344
345 if ($pos > $oldsize) {
346 return unless @data;
347 $pos = $oldsize; # This is what perl does for normal arrays
348 }
349 }
b5aed31e
AMS
350
351 $self->_fixrecs(@data);
352 my $data = join '', @data;
353 my $datalen = length $data;
354 my $oldlen = 0;
355
356 # compute length of data being removed
51efdd02 357 # Incidentally fills offsets table
b5aed31e 358 for ($pos .. $pos+$nrecs-1) {
0b28bc9a 359 my $rec = $self->_fetch($_);
b5aed31e
AMS
360 last unless defined $rec;
361 push @result, $rec;
362 $oldlen += length($rec);
363 }
364
51efdd02 365 # Modify the file
b5aed31e
AMS
366 $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
367
368 # update the offsets table part 1
369 # compute the offsets of the new records:
370 my @new_offsets;
371 if (@data) {
372 push @new_offsets, $self->{offsets}[$pos];
373 for (0 .. $#data-1) {
374 push @new_offsets, $new_offsets[-1] + length($data[$_]);
375 }
376 }
377 splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
378
379 # update the offsets table part 2
380 # adjust the offsets of the following old records
381 for ($pos+@data .. $#{$self->{offsets}}) {
382 $self->{offsets}[$_] += $datalen - $oldlen;
383 }
384 # If we scrubbed out all known offsets, regenerate the trivial table
385 # that knows that the file does indeed start at 0.
386 $self->{offsets}[0] = 0 unless @{$self->{offsets}};
387
51efdd02
AMS
388 # Perhaps the following cache foolery could be factored out
389 # into a bunch of mor opaque cache functions. For example,
390 # it's odd to delete a record from the cache and then remove
391 # it from the LRU queue later on; there should be a function to
392 # do both at once.
393
b5aed31e
AMS
394 # update the read cache, part 1
395 # modified records
396 # Consider this carefully for correctness
397 for ($pos .. $pos+$nrecs-1) {
398 my $cached = $self->{cache}{$_};
399 next unless defined $cached;
400 my $new = $data[$_-$pos];
401 if (defined $new) {
402 $self->{cached} += length($new) - length($cached);
403 $self->{cache}{$_} = $new;
404 } else {
836d9961 405 $self->_uncache($_);
b5aed31e
AMS
406 }
407 }
408 # update the read cache, part 2
409 # moved records - records past the site of the change
410 # need to be renumbered
411 # Maybe merge this with the previous block?
b3fe5a4c
AMS
412 {
413 my %adjusted;
414 for (keys %{$self->{cache}}) {
415 next unless $_ >= $pos + $nrecs;
416 $adjusted{$_-$nrecs+@data} = delete $self->{cache}{$_};
417 }
418 @{$self->{cache}}{keys %adjusted} = values %adjusted;
419# for (keys %{$self->{cache}}) {
420# next unless $_ >= $pos + $nrecs;
421# $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
422# }
b5aed31e 423 }
b3fe5a4c 424
b5aed31e
AMS
425 # fix the LRU queue
426 my(@new, @changed);
427 for (@{$self->{lru}}) {
428 if ($_ >= $pos + $nrecs) {
429 push @new, $_ + @data - $nrecs;
430 } elsif ($_ >= $pos) {
431 push @changed, $_ if $_ < $pos + @data;
432 } else {
433 push @new, $_;
434 }
435 }
436 @{$self->{lru}} = (@new, @changed);
437
b3fe5a4c
AMS
438 # Now there might be too much data in the cache, if we spliced out
439 # some short records and spliced in some long ones. If so, flush
440 # the cache.
441 $self->_cache_flush;
442
51efdd02
AMS
443 # Yes, the return value of 'splice' *is* actually this complicated
444 wantarray ? @result : @result ? $result[-1] : undef;
b5aed31e
AMS
445}
446
447# write data into the file
448# $data is the data to be written.
449# it should be written at position $pos, and should overwrite
450# exactly $len of the following bytes.
451# Note that if length($data) > $len, the subsequent bytes will have to
452# be moved up, and if length($data) < $len, they will have to
453# be moved down
454sub _twrite {
455 my ($self, $data, $pos, $len) = @_;
456
457 unless (defined $pos) {
458 die "\$pos was undefined in _twrite";
459 }
460
461 my $len_diff = length($data) - $len;
462
463 if ($len_diff == 0) { # Woo-hoo!
464 my $fh = $self->{fh};
465 $self->_seekb($pos);
466 $self->_write_record($data);
467 return; # well, that was easy.
468 }
469
470 # the two records are of different lengths
471 # our strategy here: rewrite the tail of the file,
472 # reading ahead one buffer at a time
473 # $bufsize is required to be at least as large as the data we're overwriting
474 my $bufsize = _bufsize($len_diff);
475 my ($writepos, $readpos) = ($pos, $pos+$len);
51efdd02 476 my $next_block;
b5aed31e
AMS
477
478 # Seems like there ought to be a way to avoid the repeated code
479 # and the special case here. The read(1) is also a little weird.
480 # Think about this.
481 do {
482 $self->_seekb($readpos);
51efdd02 483 my $br = read $self->{fh}, $next_block, $bufsize;
b5aed31e
AMS
484 my $more_data = read $self->{fh}, my($dummy), 1;
485 $self->_seekb($writepos);
486 $self->_write_record($data);
487 $readpos += $br;
488 $writepos += length $data;
489 $data = $next_block;
b5aed31e 490 } while $more_data;
51efdd02
AMS
491 $self->_seekb($writepos);
492 $self->_write_record($next_block);
b5aed31e
AMS
493
494 # There might be leftover data at the end of the file
495 $self->_chop_file if $len_diff < 0;
496}
497
498# If a record does not already end with the appropriate terminator
499# string, append one.
500sub _fixrecs {
501 my $self = shift;
502 for (@_) {
503 $_ .= $self->{recsep}
504 unless substr($_, - $self->{recseplen}) eq $self->{recsep};
505 }
506}
507
508# seek to the beginning of record #$n
509# Assumes that the offsets table is already correctly populated
510#
511# Note that $n=-1 has a special meaning here: It means the start of
512# the last known record; this may or may not be the very last record
513# in the file, depending on whether the offsets table is fully populated.
514#
515sub _seek {
516 my ($self, $n) = @_;
517 my $o = $self->{offsets}[$n];
518 defined($o)
519 or confess("logic error: undefined offset for record $n");
520 seek $self->{fh}, $o, SEEK_SET
521 or die "Couldn't seek filehandle: $!"; # "Should never happen."
522}
523
524sub _seekb {
525 my ($self, $b) = @_;
526 seek $self->{fh}, $b, SEEK_SET
527 or die "Couldn't seek filehandle: $!"; # "Should never happen."
528}
529
530# populate the offsets table up to the beginning of record $n
531# return the offset of record $n
532sub _fill_offsets_to {
533 my ($self, $n) = @_;
534 my $fh = $self->{fh};
535 local *OFF = $self->{offsets};
536 my $rec;
537
538 until ($#OFF >= $n) {
539 my $o = $OFF[-1];
540 $self->_seek(-1); # tricky -- see comment at _seek
541 $rec = $self->_read_record;
542 if (defined $rec) {
51efdd02 543 push @OFF, tell $fh;
b5aed31e
AMS
544 } else {
545 return; # It turns out there is no such record
546 }
547 }
548
549 # we have now read all the records up to record n-1,
550 # so we can return the offset of record n
551 return $OFF[$n];
552}
553
554# assumes that $rec is already suitably terminated
555sub _write_record {
556 my ($self, $rec) = @_;
557 my $fh = $self->{fh};
558 print $fh $rec
559 or die "Couldn't write record: $!"; # "Should never happen."
560
561}
562
563sub _read_record {
564 my $self = shift;
565 my $rec;
566 { local $/ = $self->{recsep};
567 my $fh = $self->{fh};
568 $rec = <$fh>;
569 }
570 $rec;
571}
572
573sub _cache_insert {
574 my ($self, $n, $rec) = @_;
575
576 # Do not cache records that are too big to fit in the cache.
b3fe5a4c 577 return unless length $rec <= $self->{memory};
b5aed31e
AMS
578
579 $self->{cache}{$n} = $rec;
580 $self->{cached} += length $rec;
581 push @{$self->{lru}}, $n; # most-recently-used is at the END
582
b3fe5a4c 583 $self->_cache_flush if $self->{cached} > $self->{memory};
b5aed31e
AMS
584}
585
836d9961
JH
586sub _uncache {
587 my $self = shift;
588 for my $n (@_) {
589 my $cached = delete $self->{cache}{$n};
590 next unless defined $cached;
591 @{$self->{lru}} = grep $_ != $n, @{$self->{lru}};
592 $self->{cached} -= length($cached);
593 }
594}
595
b5aed31e
AMS
596sub _check_cache {
597 my ($self, $n) = @_;
598 my $rec;
599 return unless defined($rec = $self->{cache}{$n});
600
601 # cache hit; update LRU queue and return $rec
602 # replace this with a heap in a later version
603 @{$self->{lru}} = ((grep $_ ne $n, @{$self->{lru}}), $n);
604 $rec;
605}
606
607sub _cache_flush {
608 my ($self) = @_;
b3fe5a4c 609 while ($self->{cached} + $self->{deferred_s} > $self->{memory}) {
b5aed31e 610 my $lru = shift @{$self->{lru}};
b3fe5a4c
AMS
611 my $rec = delete $self->{cache}{$lru};
612 $self->{cached} -= length $rec;
b5aed31e
AMS
613 }
614}
615
616# We have read to the end of the file and have the offsets table
617# entirely populated. Now we need to write a new record beyond
618# the end of the file. We prepare for this by writing
619# empty records into the file up to the position we want
51efdd02
AMS
620#
621# assumes that the offsets table already contains the offset of record $n,
622# if it exists, and extends to the end of the file if not.
b5aed31e
AMS
623sub _extend_file_to {
624 my ($self, $n) = @_;
625 $self->_seek(-1); # position after the end of the last record
626 my $pos = $self->{offsets}[-1];
627
628 # the offsets table has one entry more than the total number of records
51efdd02 629 $extras = $n - $#{$self->{offsets}};
b5aed31e
AMS
630
631 # Todo : just use $self->{recsep} x $extras here?
632 while ($extras-- > 0) {
633 $self->_write_record($self->{recsep});
fa408a35 634 push @{$self->{offsets}}, tell $self->{fh};
b5aed31e
AMS
635 }
636}
637
638# Truncate the file at the current position
639sub _chop_file {
640 my $self = shift;
641 truncate $self->{fh}, tell($self->{fh});
642}
643
644# compute the size of a buffer suitable for moving
645# all the data in a file forward $n bytes
646# ($n may be negative)
647# The result should be at least $n.
648sub _bufsize {
649 my $n = shift;
650 return 8192 if $n < 0;
651 my $b = $n & ~8191;
652 $b += 8192 if $n & 8191;
653 $b;
654}
655
51efdd02
AMS
656# Lock the file
657sub flock {
658 my ($self, $op) = @_;
659 unless (@_ <= 3) {
660 my $pack = ref $self;
661 croak "Usage: $pack\->flock([OPERATION])";
662 }
663 my $fh = $self->{fh};
664 $op = LOCK_EX unless defined $op;
665 flock $fh, $op;
666}
b5aed31e 667
b3fe5a4c
AMS
668# Defer writes
669sub defer {
670 my $self = shift;
671 $self->{defer} = 1;
672}
673
0b28bc9a
AMS
674# Get/set autochomp option
675sub autochomp {
676 my $self = shift;
677 if (@_) {
678 my $old = $self->{autochomp};
679 $self->{autochomp} = shift;
680 $old;
681 } else {
682 $self->{autochomp};
683 }
684}
685
b3fe5a4c
AMS
686# Flush deferred writes
687#
688# This could be better optimized to write the file in one pass, instead
689# of one pass per block of records. But that will require modifications
690# to _twrite, so I should have a good _twite test suite first.
691sub flush {
692 my $self = shift;
693
694 $self->_flush;
695 $self->{defer} = 0;
696}
697
698sub _flush {
699 my $self = shift;
700 my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});
701
702 while (@writable) {
703 # gather all consecutive records from the front of @writable
704 my $first_rec = shift @writable;
705 my $last_rec = $first_rec+1;
706 ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
707 --$last_rec;
708 $self->_fill_offsets_to($last_rec);
709 $self->_extend_file_to($last_rec);
710 $self->_splice($first_rec, $last_rec-$first_rec+1,
711 @{$self->{deferred}}{$first_rec .. $last_rec});
712 }
713
714 $self->discard; # clear out defered-write-cache
715}
716
717# Discard deferred writes
718sub discard {
719 my $self = shift;
720 undef $self->{deferred};
721 $self->{deferred_s} = 0;
722 $self->{defer} = 0;
723}
724
725# Not yet implemented
726sub autodefer { }
727
728sub _default_recsep {
729 my $recsep = $/;
730 if ($^O eq 'MSWin32') {
731 # Windows users expect files to be terminated with \r\n
732 # But $/ is set to \n instead
733 # Note that this also transforms \n\n into \r\n\r\n.
734 # That is a feature.
735 $recsep =~ s/\n/\r\n/g;
736 }
737 $recsep;
738}
739
b5aed31e
AMS
740# Given a file, make sure the cache is consistent with the
741# file contents
742sub _check_integrity {
743 my ($self, $file, $warn) = @_;
744 my $good = 1;
fa408a35 745
836d9961
JH
746 if (not defined $self->{offsets}[0]) {
747 $warn && print STDERR "# offset 0 is missing!\n";
748 $good = 0;
749 } elsif ($self->{offsets}[0] != 0) {
b5aed31e
AMS
750 $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
751 $good = 0;
752 }
fa408a35
AMS
753
754 local *F = $self->{fh};
755 seek F, 0, SEEK_SET;
756 local $/ = $self->{recsep};
757 $. = 0;
758
b5aed31e
AMS
759 while (<F>) {
760 my $n = $. - 1;
761 my $cached = $self->{cache}{$n};
762 my $offset = $self->{offsets}[$.];
763 my $ao = tell F;
764 if (defined $offset && $offset != $ao) {
765 $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
fa408a35 766 $good = 0;
b5aed31e
AMS
767 }
768 if (defined $cached && $_ ne $cached) {
769 $good = 0;
770 chomp $cached;
771 chomp;
772 $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
773 }
774 }
775
b3fe5a4c 776 my $memory = 0;
b5aed31e 777 while (my ($n, $r) = each %{$self->{cache}}) {
b3fe5a4c 778 $memory += length($r);
b5aed31e
AMS
779 next if $n+1 <= $.; # checked this already
780 $warn && print STDERR "# spurious caching of record $n\n";
781 $good = 0;
782 }
b3fe5a4c
AMS
783 if ($memory != $self->{cached}) {
784 $warn && print STDERR "# cache size is $self->{cached}, should be $memory\n";
b5aed31e
AMS
785 $good = 0;
786 }
787
788 my (%seen, @duplicate);
789 for (@{$self->{lru}}) {
790 $seen{$_}++;
791 if (not exists $self->{cache}{$_}) {
b3fe5a4c 792 $warn && print "# $_ is mentioned in the LRU queue, but not in the cache\n";
b5aed31e
AMS
793 $good = 0;
794 }
795 }
796 @duplicate = grep $seen{$_}>1, keys %seen;
797 if (@duplicate) {
798 my $records = @duplicate == 1 ? 'Record' : 'Records';
799 my $appear = @duplicate == 1 ? 'appears' : 'appear';
b3fe5a4c 800 $warn && print "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
b5aed31e
AMS
801 $good = 0;
802 }
803 for (keys %{$self->{cache}}) {
804 unless (exists $seen{$_}) {
836d9961 805 print "# record $_ is in the cache but not the LRU queue\n";
b5aed31e
AMS
806 $good = 0;
807 }
808 }
809
810 $good;
811}
812
fa408a35
AMS
813"Cogito, ergo sum."; # don't forget to return a true value from the file
814
b5aed31e
AMS
815=head1 NAME
816
817Tie::File - Access the lines of a disk file via a Perl array
818
819=head1 SYNOPSIS
820
0b28bc9a 821 # This file documents Tie::File version 0.20
b5aed31e
AMS
822
823 tie @array, 'Tie::File', filename or die ...;
824
825 $array[13] = 'blah'; # line 13 of the file is now 'blah'
826 print $array[42]; # display line 42 of the file
827
828 $n_recs = @array; # how many records are in the file?
829 $#array = $n_recs - 2; # chop records off the end
830
51efdd02
AMS
831 # As you would expect:
832
833 push @array, new recs...;
834 my $r1 = pop @array;
835 unshift @array, new recs...;
836 my $r1 = shift @array;
b5aed31e
AMS
837 @old_recs = splice @array, 3, 7, new recs...;
838
839 untie @array; # all finished
840
841=head1 DESCRIPTION
842
843C<Tie::File> represents a regular text file as a Perl array. Each
844element in the array corresponds to a record in the file. The first
845line of the file is element 0 of the array; the second line is element
8461, and so on.
847
848The file is I<not> loaded into memory, so this will work even for
849gigantic files.
850
851Changes to the array are reflected in the file immediately.
852
b3fe5a4c
AMS
853Lazy people may now stop reading the manual.
854
b5aed31e
AMS
855=head2 C<recsep>
856
857What is a 'record'? By default, the meaning is the same as for the
858C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
b3fe5a4c
AMS
859probably C<"\n">. (Minor exception: on dos and Win32 systems, a
860'record' is a string terminated by C<"\r\n">.) You may change the
861definition of "record" by supplying the C<recsep> option in the C<tie>
862call:
b5aed31e
AMS
863
864 tie @array, 'Tie::File', $file, recsep => 'es';
865
b3fe5a4c
AMS
866This says that records are delimited by the string C<es>. If the file
867contained the following data:
b5aed31e
AMS
868
869 Curse these pesky flies!\n
870
871then the C<@array> would appear to have four elements:
872
0b28bc9a
AMS
873 "Curse th"
874 "e p"
875 "ky fli"
b5aed31e
AMS
876 "!\n"
877
878An undefined value is not permitted as a record separator. Perl's
879special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
880emulated.
881
0b28bc9a
AMS
882Records read from the tied array do not have the record separator
883string on the end; this is to allow
884
885 $array[17] .= "extra";
886
887to work as expected.
888
889(See L<"autochomp">, below.) Records stored into the array will have
890the record separator string appended before they are written to the
891file, if they don't have one already. For example, if the record
892separator string is C<"\n">, then the following two lines do exactly
893the same thing:
b5aed31e
AMS
894
895 $array[17] = "Cherry pie";
896 $array[17] = "Cherry pie\n";
897
898The result is that the contents of line 17 of the file will be
899replaced with "Cherry pie"; a newline character will separate line 17
7b6b3db1 900from line 18. This means that in particular, this will do nothing:
b5aed31e
AMS
901
902 chomp $array[17];
903
904Because the C<chomp>ed value will have the separator reattached when
905it is written back to the file. There is no way to create a file
906whose trailing record separator string is missing.
907
908Inserting records that I<contain> the record separator string will
909produce a reasonable result, but if you can't foresee what this result
910will be, you'd better avoid doing this.
911
0b28bc9a
AMS
912=head2 C<autochomp>
913
914Normally, array elements have the record separator removed, so that if
915the file contains the text
916
917 Gold
918 Frankincense
919 Myrrh
920
921the tied array will appear to contain C<("Gold", "Frankincense", "Myrrh")>.
922If you set C<autochomp> to a false value, the record separator will not be removed. If the file above was tied with
923
924 tie @gifts, "Tie::File", $gifts, autochomp => 0;
925
926then the array C<@gifts> would appear to contain C<("Gold\n",
927"Frankincense\n", "Myrrh\n")>, or (on Win32 systems) C<("Gold\r\n",
928"Frankincense\r\n", "Myrrh\r\n")>.
929
b5aed31e
AMS
930=head2 C<mode>
931
932Normally, the specified file will be opened for read and write access,
933and will be created if it does not exist. (That is, the flags
934C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
935change this, you may supply alternative flags in the C<mode> option.
936See L<Fcntl> for a listing of available flags.
937For example:
938
939 # open the file if it exists, but fail if it does not exist
940 use Fcntl 'O_RDWR';
941 tie @array, 'Tie::File', $file, mode => O_RDWR;
942
943 # create the file if it does not exist
944 use Fcntl 'O_RDWR', 'O_CREAT';
945 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
946
947 # open an existing file in read-only mode
948 use Fcntl 'O_RDONLY';
949 tie @array, 'Tie::File', $file, mode => O_RDONLY;
950
951Opening the data file in write-only or append mode is not supported.
952
b3fe5a4c
AMS
953=head2 C<memory>
954
955This is an (inexact) upper limit on the amount of memory that
956C<Tie::File> will consume at any time while managing the file.
957At present, this is used as a bound on the size of the read cache.
b5aed31e
AMS
958
959Records read in from the file are cached, to avoid having to re-read
960them repeatedly. If you read the same record twice, the first time it
961will be stored in memory, and the second time it will be fetched from
b3fe5a4c
AMS
962the I<read cache>. The amount of data in the read cache will not
963exceed the value you specified for C<memory>. If C<Tie::File> wants
964to cache a new record, but the read cache is full, it will make room
965by expiring the least-recently visited records from the read cache.
b5aed31e 966
b3fe5a4c
AMS
967The default memory limit is 2Mib. You can adjust the maximum read
968cache size by supplying the C<memory> option. The argument is the
969desired cache size, in bytes.
b5aed31e
AMS
970
971 # I have a lot of memory, so use a large cache to speed up access
b3fe5a4c 972 tie @array, 'Tie::File', $file, memory => 20_000_000;
b5aed31e 973
b3fe5a4c 974Setting the memory limit to 0 will inhibit caching; records will be
b5aed31e
AMS
975fetched from disk every time you examine them.
976
977=head2 Option Format
978
979C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
b3fe5a4c 980C<recsep>. C<-memory> is a synonym for C<memory>. You get the
b5aed31e
AMS
981idea.
982
983=head1 Public Methods
984
985The C<tie> call returns an object, say C<$o>. You may call
986
987 $rec = $o->FETCH($n);
988 $o->STORE($n, $rec);
989
b3fe5a4c
AMS
990to fetch or store the record at line C<$n>, respectively; similarly
991the other tied array methods. (See L<perltie> for details.) You may
992also call the following methods on this object:
51efdd02
AMS
993
994=head2 C<flock>
995
996 $o->flock(MODE)
997
998will lock the tied file. C<MODE> has the same meaning as the second
999argument to the Perl built-in C<flock> function; for example
1000C<LOCK_SH> or C<LOCK_EX | LOCK_NB>. (These constants are provided by
1001the C<use Fcntl ':flock'> declaration.)
1002
fa408a35 1003C<MODE> is optional; C<$o-E<gt>flock> simply locks the file with
51efdd02
AMS
1004C<LOCK_EX>.
1005
1006The best way to unlock a file is to discard the object and untie the
1007array. It is probably unsafe to unlock the file without also untying
1008it, because if you do, changes may remain unwritten inside the object.
1009That is why there is no shortcut for unlocking. If you really want to
1010unlock the file prematurely, you know what to do; if you don't know
1011what to do, then don't do it.
1012
1013All the usual warnings about file locking apply here. In particular,
1014note that file locking in Perl is B<advisory>, which means that
1015holding a lock will not prevent anyone else from reading, writing, or
1016erasing the file; it only prevents them from getting another lock at
1017the same time. Locks are analogous to green traffic lights: If you
1018have a green light, that does not prevent the idiot coming the other
1019way from plowing into you sideways; it merely guarantees to you that
1020the idiot does not also have a green light at the same time.
b5aed31e 1021
0b28bc9a
AMS
1022=head2 C<autochomp>
1023
1024 my $old_value = $o->autochomp(0); # disable autochomp option
1025 my $old_value = $o->autochomp(1); # enable autochomp option
1026
1027 my $ac = $o->autochomp(); # recover current value
1028
1029See L<"autochomp">, above.
1030
1031=head1 Tying to an already-opened filehandle
fa408a35
AMS
1032
1033If C<$fh> is a filehandle, such as is returned by C<IO::File> or one
1034of the other C<IO> modules, you may use:
1035
1036 tie @array, 'Tie::File', $fh, ...;
1037
1038Similarly if you opened that handle C<FH> with regular C<open> or
1039C<sysopen>, you may use:
1040
1041 tie @array, 'Tie::File', \*FH, ...;
1042
1043Handles that were opened write-only won't work. Handles that were
1044opened read-only will work as long as you don't try to write to them.
1045Handles must be attached to seekable sources of data---that means no
b3fe5a4c
AMS
1046pipes or sockets. If you supply a non-seekable handle, the C<tie>
1047call will try to abort your program.
fa408a35 1048
b5aed31e
AMS
1049=head1 CAVEATS
1050
1051(That's Latin for 'warnings'.)
1052
b3fe5a4c
AMS
1053=over 4
1054
1055=item *
1056
1057This is BETA RELEASE SOFTWARE. It may have bugs. See the discussion
1058below about the (lack of any) warranty.
1059
1060=item *
b5aed31e
AMS
1061
1062Every effort was made to make this module efficient. Nevertheless,
1063changing the size of a record in the middle of a large file will
b3fe5a4c
AMS
1064always be fairly slow, because everything after the new record must be
1065moved.
b5aed31e 1066
b3fe5a4c
AMS
1067In particular, note that the following innocent-looking loop has very
1068bad behavior:
b5aed31e 1069
b3fe5a4c
AMS
1070 # million-line file
1071 for (@file_array) {
1072 $_ .= 'x';
1073 }
b5aed31e 1074
b3fe5a4c
AMS
1075This is likely to be very slow, because the first iteration must
1076relocate lines 1 through 999,999; the second iteration must relocate
1077lines 2 through 999,999, and so on. The relocation is done using
1078block writes, however, so it's not as slow as it might be.
b5aed31e 1079
836d9961
JH
1080A soon-to-be-released version of this module will provide a mechanism
1081for getting better performance in such cases, by deferring the writing
b3fe5a4c
AMS
1082until it can be done all at once. This deferred writing feature might
1083be enabled automagically if C<Tie::File> guesses that you are about to write many consecutive records. To disable this feature, use
1084
1085 (tied @o)->autodefer(0);
1086
1087(At present, this call does nothing.)
1088
1089=item *
1090
1091The behavior of tied arrays is not precisely the same as for regular
1092arrays. For example:
b5aed31e 1093
b3fe5a4c
AMS
1094 undef $a[10]; print "How unusual!\n" if $a[10];
1095
1096C<undef>-ing a C<Tie::File> array element just blanks out the
1097corresponding record in the file. When you read it back again, you'll
1098see the record separator (typically, $a[10] will appear to contain
1099"\n") so the supposedly-C<undef>'ed value will be true.
b5aed31e 1100
b3fe5a4c
AMS
1101There are other minor differences, but in general, the correspondence
1102is extremely close.
1103
1104=item *
1105
1106Not quite every effort was made to make this module as efficient as
b5aed31e
AMS
1107possible. C<FETCHSIZE> should use binary search instead of linear
1108search. The cache's LRU queue should be a heap instead of a list.
1109These defects are probably minor; in any event, they will be fixed in
1110a later version of the module.
1111
b3fe5a4c 1112=item *
b5aed31e
AMS
1113
1114The author has supposed that since this module is concerned with file
1115I/O, almost all normal use of it will be heavily I/O bound, and that
1116the time to maintain complicated data structures inside the module
1117will be dominated by the time to actually perform the I/O. This
fa408a35 1118suggests, for example, that an LRU read-cache is a good tradeoff,
b5aed31e
AMS
1119even if it requires substantial adjustment following a C<splice>
1120operation.
1121
b3fe5a4c 1122=back
51efdd02 1123
b3fe5a4c 1124=head1 WHAT ABOUT C<DB_File>?
51efdd02 1125
b3fe5a4c
AMS
1126C<DB_File>'s C<DB_RECNO> feature does something similar to
1127C<Tie::File>, but there are a number of reasons that you might prefer
1128C<Tie::File>. C<DB_File> is a great piece of software, but the
1129C<DB_RECNO> part is less great than the rest of it.
b5aed31e 1130
b3fe5a4c 1131=over 4
51efdd02 1132
b3fe5a4c 1133=item *
51efdd02 1134
b3fe5a4c
AMS
1135C<DB_File> reads your entire file into memory, modifies it in memory,
1136and the writes out the entire file again when you untie the file.
1137This is completely impractical for large files.
1138
1139C<Tie::File> does not do any of those things. It doesn't try to read
1140the entire file into memory; instead it uses a lazy approach and
1141caches recently-used records. The cache size is strictly bounded by
1142the C<memory> option. DB_File's C<-E<gt>{cachesize}> doesn't prevent
1143your process from blowing up when reading a big file.
1144
1145=item *
1146
1147C<DB_File> has an extremely poor writing strategy. If you have a
1148ten-megabyte file and tie it with C<DB_File>, and then use
1149
1150 $a[0] =~ s/PERL/Perl/;
1151
1152C<DB_file> will then read the entire ten-megabyte file into memory, do
1153the change, and write the entire file back to disk, reading ten
1154megabytes and writing ten megabytes. C<Tie::File> will read and write
1155only the first record.
1156
1157If you have a million-record file and tie it with C<DB_File>, and then
1158use
1159
1160 $a[999998] =~ s/Larry/Larry Wall/;
1161
1162C<DB_File> will read the entire million-record file into memory, do
1163the change, and write the entire file back to disk. C<Tie::File> will
1164only rewrite records 999998 and 999999. During the writing process,
1165it will never have more than a few kilobytes of data in memory at any
1166time, even if the two records are very large.
1167
1168=item *
1169
1170Since changes to C<DB_File> files only appear when you do C<untie>, it
1171can be inconvenient to arrange for concurrent access to the same file
1172by two or more processes. Each process needs to call C<$db-E<gt>sync>
1173after every write. When you change a C<Tie::File> array, the changes
1174are reflected in the file immediately; no explicit C<-E<gt>sync> call
1175is required. (The forthcoming "deferred writing" mode will allow you
1176to request that writes be held in memory until explicitly C<sync>'ed.)
1177
1178=item *
1179
1180C<DB_File> is only installed by default if you already have the C<db>
1181library on your system; C<Tie::File> is pure Perl and is installed by
1182default no matter what. Starting with Perl 5.7.3 you can be
1183absolutely sure it will be everywhere. You will never have that
1184surety with C<DB_File>. If you don't have C<DB_File> yet, it requires
1185a C compiler. You can install C<Tie::File> from CPAN in five minutes
1186with no compiler.
1187
1188=item *
1189
1190C<DB_File> is written in C, so if you aren't allowed to install
1191modules on your system, it is useless. C<Tie::File> is written in Perl,
1192so even if you aren't allowed to install modules, you can look into
1193the source code, see how it works, and copy the subroutines or the
1194ideas from the subroutines directly into your own Perl program.
1195
1196=item *
1197
1198Except in very old, unsupported versions, C<DB_File>'s free license
1199requires that you distribute the source code for your entire
1200application. If you are not able to distribute the source code for
1201your application, you must negotiate an alternative license from
1202Sleepycat, possibly for a fee. Tie::File is under the Perl Artistic
1203license and can be distributed free under the same terms as Perl
1204itself.
1205
1206=back
b5aed31e
AMS
1207
1208=head1 AUTHOR
1209
1210Mark Jason Dominus
1211
1212To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
1213
1214To receive an announcement whenever a new version of this module is
1215released, send a blank email message to
1216C<mjd-perl-tiefile-subscribe@plover.com>.
1217
1218=head1 LICENSE
1219
0b28bc9a 1220C<Tie::File> version 0.20 is copyright (C) 2002 Mark Jason Dominus.
7b6b3db1
JH
1221
1222This library is free software; you may redistribute it and/or modify
1223it under the same terms as Perl itself.
b5aed31e 1224
7b6b3db1
JH
1225These terms include your choice of (1) the Perl Artistic Licence, or
1226(2) version 2 of the GNU General Public License as published by the
1227Free Software Foundation, or (3) any later version of the GNU General
1228Public License.
b5aed31e 1229
7b6b3db1 1230This library is distributed in the hope that it will be useful,
b5aed31e
AMS
1231but WITHOUT ANY WARRANTY; without even the implied warranty of
1232MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1233GNU General Public License for more details.
1234
1235You should have received a copy of the GNU General Public License
7b6b3db1
JH
1236along with this library program; it should be in the file C<COPYING>.
1237If not, write to the Free Software Foundation, Inc., 59 Temple Place,
1238Suite 330, Boston, MA 02111 USA
b5aed31e
AMS
1239
1240For licensing inquiries, contact the author at:
1241
1242 Mark Jason Dominus
1243 255 S. Warnock St.
1244 Philadelphia, PA 19107
1245
1246=head1 WARRANTY
1247
0b28bc9a 1248C<Tie::File> version 0.20 comes with ABSOLUTELY NO WARRANTY.
b5aed31e
AMS
1249For details, see the license.
1250
fa408a35
AMS
1251=head1 THANKS
1252
1253Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the
1254core when I hadn't written it yet, and for generally being helpful,
1255supportive, and competent. (Usually the rule is "choose any one.")
1256Also big thanks to Abhijit Menon-Sen for all of the same things.
1257
1258Special thanks to Craig Berry (for VMS portability help), Randy Kobes
b3fe5a4c
AMS
1259(for Win32 portability help), Clinton Pierce and Autrijus Tang (for
1260heroic eleventh-hour Win32 testing above and beyond the call of duty),
1261and the rest of the CPAN testers (for testing generally).
b5aed31e 1262
fa408a35 1263More thanks to:
b3fe5a4c 1264Edward Avis /
fa408a35 1265Gerrit Haase /
b3fe5a4c 1266Nikola Knezevic /
836d9961 1267Nick Ing-Simmons /
fa408a35
AMS
1268Tassilo von Parseval /
1269H. Dieter Pearcey /
b3fe5a4c 1270Slaven Rezic /
fa408a35
AMS
1271Peter Somu /
1272Tels
7b6b3db1 1273
fa408a35
AMS
1274=head1 TODO
1275
1276Test DELETE machinery more carefully.
b5aed31e 1277
b3fe5a4c
AMS
1278More tests. (C<mode> option. _twrite should be tested separately,
1279because there are a lot of weird special cases lurking in there.)
b5aed31e
AMS
1280
1281More tests. (Stuff I didn't think of yet.)
1282
b5aed31e
AMS
1283Paragraph mode?
1284
1285More tests.
1286
1287Fixed-length mode.
1288
fa408a35
AMS
1289Maybe an autolocking mode?
1290
b3fe5a4c
AMS
1291Finish deferred writing.
1292
1293Autodeferment.
1294
1295Record locking with fcntl()? Then you might support an undo log and
1296get real transactions. What a coup that would be.
1297
1298Leave-blanks mode
1299
b5aed31e
AMS
1300=cut
1301