This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
s2p fix for VMS
[perl5.git] / x2p / s2p.PL
CommitLineData
86a59229 1#!/usr/bin/perl
4633a7c4
LW
2
3use Config;
4use File::Basename qw(&basename &dirname);
8a5546a1 5use Cwd;
4633a7c4
LW
6
7# List explicitly here the variables you want Configure to
8# generate. Metaconfig only looks for shell variables, so you
9# have to mention them as if they were shell variables, not
10# %Config entries. Thus you write
11# $startperl
12# to ensure Configure will look for $Config{startperl}.
13
14# This forces PL files to create target in same directory as PL file.
15# This is so that make depend always knows where to find PL derivatives.
8a5546a1 16$origdir = cwd;
44a8e56a
PP
17chdir dirname($0);
18$file = basename($0, '.PL');
774d564b 19$file .= '.com' if $^O eq 'VMS';
4633a7c4
LW
20
21open OUT,">$file" or die "Can't create $file: $!";
22
23print "Extracting $file (with variable substitutions)\n";
24
25# In this section, perl variables will be expanded during extraction.
26# You can use $Config{...} to use Configure variables.
27
28print OUT <<"!GROK!THIS!";
5f05dabc
PP
29$Config{startperl}
30 eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
31 if \$running_under_some_shell;
c0393c90
JH
32my \$startperl;
33my \$perlpath;
ed6d8ea1
JH
34(\$startperl = <<'/../') =~ s/\\s*\\z//;
35$Config{startperl}
36/../
37(\$perlpath = <<'/../') =~ s/\\s*\\z//;
38$Config{perlpath}
39/../
a687059c
LW
40!GROK!THIS!
41
4633a7c4
LW
42# In the following, perl variables are not expanded during extraction.
43
44print OUT <<'!NO!SUBS!';
a687059c 45
9ae2b9a0 46$0 =~ s/^.*?(\w+)[\.\w]*$/$1/;
86a59229
WL
47
48# (p)sed - a stream editor
49# History: Aug 12 2000: Original version.
d16f50bd 50# Mar 25 2002: Rearrange generated Perl program.
86a59229
WL
51
52use strict;
53use integer;
54use Symbol;
8d063cd8 55
d83e3bda
JM
56=head1 NAME
57
ba34211e 58psed - a stream editor
d83e3bda
JM
59
60=head1 SYNOPSIS
61
ba34211e
JH
62 psed [-an] script [file ...]
63 psed [-an] [-e script] [-f script-file] [file ...]
64
65 s2p [-an] [-e script] [-f script-file]
d83e3bda
JM
66
67=head1 DESCRIPTION
68
86a59229
WL
69A stream editor reads the input stream consisting of the specified files
70(or standard input, if none are given), processes is line by line by
71applying a script consisting of edit commands, and writes resulting lines
72to standard output. The filename `C<->' may be used to read standard input.
73
74The edit script is composed from arguments of B<-e> options and
75script-files, in the given order. A single script argument may be specified
76as the first parameter.
77
78If this program is invoked with the name F<s2p>, it will act as a
79sed-to-Perl translator. See L<"sed Script Translation">.
d83e3bda 80
86a59229 81B<sed> returns an exit code of 0 on success or >0 if an error occurred.
d83e3bda 82
86a59229 83=head1 OPTIONS
d83e3bda 84
86a59229 85=over 4
d83e3bda 86
86a59229 87=item B<-a>
d83e3bda 88
86a59229
WL
89A file specified as argument to the B<w> edit command is by default
90opened before input processing starts. Using B<-a>, opening of such
91files is delayed until the first line is actually written to the file.
92
93=item B<-e> I<script>
94
95The editing commands defined by I<script> are appended to the script.
96Multiple commands must be separated by newlines.
97
98=item B<-f> I<script-file>
99
100Editing commands from the specified I<script-file> are read and appended
101to the script.
d83e3bda
JM
102
103=item B<-n>
104
86a59229
WL
105By default, a line is written to standard output after the editing script
106has been applied to it. The B<-n> option suppresses automatic printing.
107
108=back
109
110=head1 COMMANDS
111
112B<sed> command syntax is defined as
113
114Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
115
116with whitespace being permitted before or after addresses, and between
117the function character and the argument. The I<address>es and the
118address inverter (C<!>) are used to restrict the application of a
119command to the selected line(s) of input.
120
121Each command must be on a line of its own, except where noted in
122the synopses below.
123
124The edit cycle performed on each input line consist of reading the line
125(without its trailing newline character) into the I<pattern space>,
126applying the applicable commands of the edit script, writing the final
127contents of the pattern space and a newline to the standard output.
128A I<hold space> is provided for saving the contents of the
129pattern space for later use.
130
131=head2 Addresses
132
133A sed address is either a line number or a pattern, which may be combined
134arbitrarily to construct ranges. Lines are numbered across all input files.
135
136Any address may be followed by an exclamation mark (`C<!>'), selecting
137all lines not matching that address.
d83e3bda 138
86a59229 139=over 4
d83e3bda 140
86a59229
WL
141=item I<number>
142
143The line with the given number is selected.
144
145=item B<$>
146
147A dollar sign (C<$>) is the line number of the last line of the input stream.
148
149=item B</>I<regular expression>B</>
150
151A pattern address is a basic regular expression (see
152L<"Basic Regular Expressions">), between the delimiting character C</>.
153Any other character except C<\> or newline may be used to delimit a
154pattern address when the initial delimiter is prefixed with a
155backslash (`C<\>').
d83e3bda
JM
156
157=back
158
86a59229 159If no address is given, the command selects every line.
d83e3bda 160
86a59229
WL
161If one address is given, it selects the line (or lines) matching the
162address.
d83e3bda 163
86a59229
WL
164Two addresses select a range that begins whenever the first address
165matches, and ends (including that line) when the second address matches.
166If the first (second) address is a matching pattern, the second
167address is not applied to the very same line to determine the end of
168the range. Likewise, if the second address is a matching pattern, the
169first address is not applied to the very same line to determine the
170begin of another range. If both addresses are line numbers,
171and the second line number is less than the first line number, then
172only the first line is selected.
d83e3bda 173
d83e3bda 174
86a59229 175=head2 Functions
d83e3bda 176
86a59229
WL
177The maximum permitted number of addresses is indicated with each
178function synopsis below.
d83e3bda 179
86a59229
WL
180The argument I<text> consists of one or more lines following the command.
181Embedded newlines in I<text> must be preceded with a backslash. Other
182backslashes in I<text> are deleted and the following character is taken
183literally.
d83e3bda 184
86a59229 185=over 4
d83e3bda 186
86a59229
WL
187=cut
188
189my %ComTab;
d16f50bd 190my %GenKey;
86a59229
WL
191#--------------------------------------------------------------------------
192$ComTab{'a'}=[ 1, 'txt', \&Emit, '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
193
194=item [1addr]B<a\> I<text>
195
196Write I<text> (which must start on the line following the command)
197to standard output immediately before reading the next line
198of input, either by executing the B<N> function or by beginning a new cycle.
199
200=cut
201
202#--------------------------------------------------------------------------
203$ComTab{'b'}=[ 2, 'str', \&Branch, '{ goto XXX; }' ]; #ok
204
205=item [2addr]B<b> [I<label>]
206
207Branch to the B<:> function with the specified I<label>. If no label
208is given, branch to the end of the script.
209
210=cut
211
212#--------------------------------------------------------------------------
213$ComTab{'c'}=[ 2, 'txt', \&Change, <<'-X-' ]; #ok
214{ print <<'TheEnd'; } $doPrint = 0; goto EOS;
215-X-
216### continue OK => next CYCLE;
217
218=item [2addr]B<c\> I<text>
219
220The line, or range of lines, selected by the address is deleted.
221The I<text> (which must start on the line following the command)
222is written to standard output. With an address range, this occurs at
223the end of the range.
224
225=cut
226
227#--------------------------------------------------------------------------
228$ComTab{'d'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
229{ $doPrint = 0;
230 goto EOS;
231}
232-X-
233### continue OK => next CYCLE;
234
235=item [2addr]B<d>
236
237Deletes the pattern space and starts the next cycle.
238
239=cut
240
241#--------------------------------------------------------------------------
242$ComTab{'D'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
243{ s/^.*\n?//;
244 if(length($_)){ goto BOS } else { goto EOS }
245}
246-X-
247### continue OK => next CYCLE;
248
249=item [2addr]B<D>
250
251Deletes the pattern space through the first embedded newline or to the end.
252If the pattern space becomes empty, a new cycle is started, otherwise
253execution of the script is restarted.
254
255=cut
256
257#--------------------------------------------------------------------------
258$ComTab{'g'}=[ 2, '', \&Emit, '{ $_ = $Hold };' ]; #ok
259
260=item [2addr]B<g>
261
262Replace the contents of the pattern space with the hold space.
263
264=cut
265
266#--------------------------------------------------------------------------
267$ComTab{'G'}=[ 2, '', \&Emit, '{ $_ .= "\n"; $_ .= $Hold };' ]; #ok
268
269=item [2addr]B<G>
270
271Append a newline and the contents of the hold space to the pattern space.
272
273=cut
274
275#--------------------------------------------------------------------------
276$ComTab{'h'}=[ 2, '', \&Emit, '{ $Hold = $_ }' ]; #ok
277
278=item [2addr]B<h>
279
280Replace the contents of the hold space with the pattern space.
281
282=cut
283
284#--------------------------------------------------------------------------
285$ComTab{'H'}=[ 2, '', \&Emit, '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
286
287=item [2addr]B<H>
288
289Append a newline and the contents of the pattern space to the hold space.
290
291=cut
292
293#--------------------------------------------------------------------------
294$ComTab{'i'}=[ 1, 'txt', \&Emit, '{ print <<'."'TheEnd' }\n" ]; #ok
295
296=item [1addr]B<i\> I<text>
297
298Write the I<text> (which must start on the line following the command)
299to standard output.
300
301=cut
302
303#--------------------------------------------------------------------------
304$ComTab{'l'}=[ 2, '', \&Emit, '{ _l() }' ]; #okUTF8
305
306=item [2addr]B<l>
307
308Print the contents of the pattern space: non-printable characters are
309shown in C-style escaped form; long lines are split and have a trailing
310`C<\>' at the point of the split; the true end of a line is marked with
311a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
312BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
313octal number for all other non-printable characters.
314
315=cut
316
317#--------------------------------------------------------------------------
318$ComTab{'n'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
319{ print $_, "\n" if $doPrint;
d16f50bd 320 printQ() if @Q;
86a59229
WL
321 $CondReg = 0;
322 last CYCLE unless getsARGV();
323 chomp();
324}
325-X-
326
327=item [2addr]B<n>
328
329If automatic printing is enabled, write the pattern space to the standard
330output. Replace the pattern space with the next line of input. If
331there is no more input, processing is terminated.
332
333=cut
334
335#--------------------------------------------------------------------------
336$ComTab{'N'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
d16f50bd 337{ printQ() if @Q;
86a59229
WL
338 $CondReg = 0;
339 last CYCLE unless getsARGV( $h );
340 chomp( $h );
341 $_ .= "\n$h";
342}
343-X-
344
345=item [2addr]B<N>
346
347Append a newline and the next line of input to the pattern space. If
348there is no more input, processing is terminated.
349
350=cut
351
352#--------------------------------------------------------------------------
353$ComTab{'p'}=[ 2, '', \&Emit, '{ print $_, "\n"; }' ]; #ok
354
355=item [2addr]B<p>
356
357Print the pattern space to the standard output. (Use the B<-n> option
358to suppress automatic printing at the end of a cycle if you want to
359avoid double printing of lines.)
360
361=cut
362
363#--------------------------------------------------------------------------
364$ComTab{'P'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
365{ if( /^(.*)/ ){ print $1, "\n"; } }
366-X-
367
368=item [2addr]B<P>
369
370Prints the pattern space through the first embedded newline or to the end.
371
372=cut
373
374#--------------------------------------------------------------------------
375$ComTab{'q'}=[ 1, '', \&Emit, <<'-X-' ]; #ok
376{ print $_, "\n" if $doPrint;
377 last CYCLE;
378}
379-X-
380
381=item [1addr]B<q>
382
383Branch to the end of the script and quit without starting a new cycle.
384
385=cut
386
387#--------------------------------------------------------------------------
388$ComTab{'r'}=[ 1, 'str', \&Emit, "{ _r( '-X-' ) }" ]; #ok
86a59229
WL
389
390=item [1addr]B<r> I<file>
391
392Copy the contents of the I<file> to standard output immediately before
393the next attempt to read a line of input. Any error encountered while
394reading I<file> is silently ignored.
395
396=cut
397
398#--------------------------------------------------------------------------
399$ComTab{'s'}=[ 2, 'sub', \&Emit, '' ]; #ok
400
401=item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
402
403Substitute the I<replacement> string for the first substring in
404the pattern space that matches the I<regular expression>.
405Any character other than backslash or newline can be used instead of a
406slash to delimit the regular expression and the replacement.
407To use the delimiter as a literal character within the regular expression
408and the replacement, precede the character by a backslash (`C<\>').
409
410Literal newlines may be embedded in the replacement string by
411preceding a newline with a backslash.
412
413Within the replacement, an ampersand (`C<&>') is replaced by the string
414matching the regular expression. The strings `C<\1>' through `C<\9>' are
415replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
416To get a literal `C<&>' or `C<\>' in the replacement text, precede it
417by a backslash.
418
419The following I<flags> modify the behaviour of the B<s> command:
420
421=over 8
422
423=item B<g>
424
425The replacement is performed for all matching, non-overlapping substrings
426of the pattern space.
427
428=item B<1>..B<9>
429
430Replace only the n-th matching substring of the pattern space.
431
432=item B<p>
433
434If the substitution was made, print the new value of the pattern space.
435
436=item B<w> I<file>
437
438If the substitution was made, write the new value of the pattern space
439to the specified file.
440
441=back
442
443=cut
444
445#--------------------------------------------------------------------------
446$ComTab{'t'}=[ 2, 'str', \&Branch, '{ goto XXX if _t() }' ]; #ok
447
448=item [2addr]B<t> [I<label>]
449
450Branch to the B<:> function with the specified I<label> if any B<s>
451substitutions have been made since the most recent reading of an input line
452or execution of a B<t> function. If no label is given, branch to the end of
453the script.
454
455
456=cut
457
458#--------------------------------------------------------------------------
459$ComTab{'w'}=[ 2, 'str', \&Write, "{ _w( '-X-' ) }" ]; #ok
460
461=item [2addr]B<w> I<file>
462
463The contents of the pattern space are written to the I<file>.
464
465=cut
466
467#--------------------------------------------------------------------------
468$ComTab{'x'}=[ 2, '', \&Emit, '{ ($Hold, $_) = ($_, $Hold) }' ]; #ok
469
470=item [2addr]B<x>
471
472Swap the contents of the pattern space and the hold space.
473
474=cut
475
476#--------------------------------------------------------------------------
477$ComTab{'y'}=[ 2, 'tra', \&Emit, '' ]; #ok
478=item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
479
480In the pattern space, replace all characters occuring in I<string1> by the
481character at the corresponding position in I<string2>. It is possible
482to use any character (other than a backslash or newline) instead of a
483slash to delimit the strings. Within I<string1> and I<string2>, a
484backslash followed by any character other than a newline is that literal
485character, and a backslash followed by an `n' is replaced by a newline
486character.
487
488=cut
d83e3bda 489
86a59229
WL
490#--------------------------------------------------------------------------
491$ComTab{'='}=[ 1, '', \&Emit, '{ print "$.\n" }' ]; #ok
492
493=item [1addr]B<=>
494
495Prints the current line number on the standard output.
496
497=cut
498
499#--------------------------------------------------------------------------
500$ComTab{':'}=[ 0, 'str', \&Label, '' ]; #ok
501
502=item [0addr]B<:> [I<label>]
503
504The command specifies the position of the I<label>. It has no other effect.
505
506=cut
507
508#--------------------------------------------------------------------------
509$ComTab{'{'}=[ 2, '', \&BeginBlock, '{' ]; #ok
510$ComTab{'}'}=[ 0, '', \&EndBlock, ';}' ]; #ok
511# ';' to avoid warning on empty {}-block
512
513=item [2addr]B<{> [I<command>]
514
515=item [0addr]B<}>
516
517These two commands begin and end a command list. The first command may
518be given on the same line as the opening B<{> command. The commands
519within the list are jointly selected by the address(es) given on the
520B<{> command (but may still have individual addresses).
521
522=cut
523
524#--------------------------------------------------------------------------
525$ComTab{'#'}=[ 0, 'str', \&Comment, '' ]; #ok
526
527=item [0addr]B<#> [I<comment>]
528
529The entire line is ignored (treated as a comment). If, however, the first
530two characters in the script are `C<#n>', automatic printing of output is
531suppressed, as if the B<-n> option were given on the command line.
532
533=back
534
535=cut
536
537use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
538
539my $useDEBUG = exists( $ENV{PSEDDEBUG} );
540my $useEXTBRE = $ENV{PSEDEXTBRE} || '';
541$useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
542
543my $doAutoPrint = 1; # automatic printing of pattern space (-n => 0)
544my $doOpenWrite = 1; # open w command output files at start (-a => 0)
545my $svOpenWrite = 0; # save $doOpenWrite
546my $doGenerate = $0 eq 's2p';
547
548# Collected and compiled script
549#
d16f50bd
WL
550my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code, $Func );
551$Code = '';
86a59229
WL
552
553##################
554# Compile Time
555#
556# Labels
557#
558# Error handling
559#
560sub Warn($;$){
561 my( $msg, $loc ) = @_;
562 $loc ||= '';
563 $loc .= ': ' if length( $loc );
564 warn( "$0: $loc$msg\n" );
565}
566
567$labNum = 0;
568sub newLabel(){
569 return 'L_'.++$labNum;
570}
571
572# safeHere: create safe here delimiter and modify opcode and argument
573#
574sub safeHere($$){
575 my( $codref, $argref ) = @_;
576 my $eod = 'EOD000';
577 while( $$argref =~ /^$eod$/m ){
578 $eod++;
579 }
580 $$codref =~ s/TheEnd/$eod/e;
581 $$argref .= "$eod\n";
582}
583
584# Emit: create address logic and emit command
585#
586sub Emit($$$$$$){
587 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
588 my $cond = '';
589 if( defined( $addr1 ) ){
590 if( defined( $addr2 ) ){
591 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
592 } else {
593 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
594 }
595 $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
596 }
597
598 if( $opcode eq '' ){
599 $Code .= "$cond$arg\n";
600
601 } elsif( $opcode =~ s/-X-/$arg/e ){
602 $Code .= "$cond$opcode\n";
603
604 } elsif( $opcode =~ /TheEnd/ ){
605 safeHere( \$opcode, \$arg );
606 $Code .= "$cond$opcode$arg";
607
608 } else {
609 $Code .= "$cond$opcode\n";
610 }
611 0;
612}
613
614# Write (w command, w flag): store pathname
615#
616sub Write($$$$$$){
617 my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
618 $wFiles{$path} = '';
619 Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
620}
621
622
623# Label (: command): label definition
624#
625sub Label($$$$$$){
626 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
627 my $rc = 0;
628 $lab =~ s/\s+//;
629 if( length( $lab ) ){
630 my $h;
631 if( ! exists( $Label{$lab} ) ){
632 $h = $Label{$lab}{name} = newLabel();
633 } else {
634 $h = $Label{$lab}{name};
635 if( exists( $Label{$lab}{defined} ) ){
636 my $dl = $Label{$lab}{defined};
637 Warn( "duplicate label $lab (first defined at $dl)", $fl );
638 $rc = 1;
639 }
640 }
641 $Label{$lab}{defined} = $fl;
642 $Code .= "$h:;\n";
643 }
644 $rc;
645}
646
647# BeginBlock ({ command): push block start
648#
649sub BeginBlock($$$$$$){
650 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
651 push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
652 Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
653}
654
655# EndBlock (} command): check proper nesting
656#
657sub EndBlock($$$$$$){
658 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
659 my $rc;
660 my $jcom = pop( @BlockStack );
661 if( defined( $jcom ) ){
662 $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
663 } else {
664 Warn( "unexpected `}'", $fl );
665 $rc = 1;
666 }
667 $rc;
668}
669
670# Branch (t, b commands): check or create label, substitute default
671#
672sub Branch($$$$$$){
673 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
674 $lab =~ s/\s+//; # no spaces at end
675 my $h;
676 if( length( $lab ) ){
677 if( ! exists( $Label{$lab} ) ){
678 $h = $Label{$lab}{name} = newLabel();
679 } else {
680 $h = $Label{$lab}{name};
681 }
682 push( @{$Label{$lab}{used}}, $fl );
683 } else {
684 $h = 'EOS';
685 }
686 $opcode =~ s/XXX/$h/e;
687 Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
688}
689
690# Change (c command): is special due to range end watching
691#
692sub Change($$$$$$){
693 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
694 my $kwd = $negated ? 'unless' : 'if';
695 if( defined( $addr2 ) ){
696 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
697 if( ! $negated ){
698 $addr1 = '$icnt = ('.$addr1.')';
699 $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
700 }
701 } else {
702 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
703 }
704 safeHere( \$opcode, \$arg );
705 $Code .= "$kwd( $addr1 ){\n $opcode$arg}\n";
706 0;
707}
708
709
710# Comment (# command): A no-op. Who would've thought that!
711#
712sub Comment($$$$$$){
713 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
714### $Code .= "# $arg\n";
715 0;
716}
717
718
719sub stripRegex($$){
720 my( $del, $sref ) = @_;
721 my $regex = $del;
722 print "stripRegex:$del:$$sref:\n" if $useDEBUG;
723 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
724 my $sl = $2;
725 $regex .= $1.$sl.$del;
726 if( length( $sl ) % 2 == 0 ){
727 return $regex;
728 }
729 $regex .= $3;
730 }
731 undef();
732}
733
734# stripTrans: take a <del> terminated string from y command
735# honoring and cleaning up of \-escaped <del>'s
736#
737sub stripTrans($$){
738 my( $del, $sref ) = @_;
739 my $t = '';
740 print "stripTrans:$del:$$sref:\n" if $useDEBUG;
741 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
742 my $sl = $2;
743 $t .= $1;
744 if( length( $sl ) % 2 == 0 ){
745 $t .= $sl;
746 $t =~ s/\\\\/\\/g;
747 return $t;
748 }
749 chop( $sl );
750 $t .= $sl.$del.$3;
751 }
752 undef();
753}
754
755# makey - construct Perl y/// from sed y///
756#
757sub makey($$$){
758 my( $fr, $to, $fl ) = @_;
759 my $error = 0;
760
761 # Ensure that any '-' is up front.
762 # Diagnose duplicate contradicting mappings
763 my %tr;
764 for( my $i = 0; $i < length($fr); $i++ ){
765 my $fc = substr($fr,$i,1);
766 my $tc = substr($to,$i,1);
767 if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
768 Warn( "ambiguos translation for character `$fc' in `y' command",
769 $fl );
770 $error++;
771 }
772 $tr{$fc} = $tc;
773 }
774 $fr = $to = '';
775 if( exists( $tr{'-'} ) ){
776 ( $fr, $to ) = ( '-', $tr{'-'} );
777 delete( $tr{'-'} );
778 } else {
779 $fr = $to = '';
780 }
781 # might just as well sort it...
782 for my $fc ( sort keys( %tr ) ){
783 $fr .= $fc;
784 $to .= $tr{$fc};
785 }
786 # make embedded delimiters and newlines safe
787 $fr =~ s/([{}])/\$1/g;
788 $to =~ s/([{}])/\$1/g;
789 $fr =~ s/\n/\\n/g;
790 $to =~ s/\n/\\n/g;
791 return $error ? undef() : "{ y{$fr}{$to}; }";
792}
793
794######
795# makes - construct Perl s/// from sed s///
796#
797sub makes($$$$$$$){
798 my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
799
800 # make embedded newlines safe
801 $regex =~ s/\n/\\n/g;
802 $subst =~ s/\n/\\n/g;
803
804 my $code;
805 # n-th occurrence
806 #
807 if( length( $nmatch ) ){
808 $code = <<TheEnd;
809{ \$n = $nmatch;
810 while( --\$n && ( \$s = m ${regex}g ) ){}
811 \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
812 \$CondReg ||= \$s;
813TheEnd
814 } else {
815 $code = <<TheEnd;
816{ \$s = s ${regex}${subst}s${global};
817 \$CondReg ||= \$s;
818TheEnd
819 }
820 if( $print ){
821 $code .= ' print $_, "\n" if $s;'."\n";
822 }
823 if( defined( $path ) ){
824 $wFiles{$path} = '';
825 $code .= " _w( '$path' ) if \$s;\n";
d16f50bd 826 $GenKey{'w'} = 1;
86a59229
WL
827 }
828 $code .= "}";
829}
830
831=head1 BASIC REGULAR EXPRESSIONS
832
833A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
834of I<atoms>, for matching parts of a string, and I<bounds>, specifying
835repetitions of a preceding atom.
836
837=head2 Atoms
838
839The possible atoms of a BRE are: B<.>, matching any single character;
840B<^> and B<$>, matching the null string at the beginning or end
841of a string, respectively; a I<bracket expressions>, enclosed
842in B<[> and B<]> (see below); and any single character with no
843other significance (matching that character). A B<\> before one
844of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
845after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
846becomes an atom and establishes the target for a I<backreference>,
847consisting of the substring that actually matches the enclosed atoms.
848Finally, B<\> followed by one of the digits B<0> through B<9> is a
849backreference.
850
851A B<^> that is not first, or a B<$> that is not last does not have
852a special significance and need not be preceded by a backslash to
853become literal. The same is true for a B<]>, that does not terminate
854a bracket expression.
855
856An unescaped backslash cannot be last in a BRE.
857
858=head2 Bounds
859
860The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
861atom; B<\{>I<count>B<\}>, specifying that many repetitions;
862B<\{>I<minimum>B<,\}>, giving a lower limit; and
863B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
864bound.
865
866A bound appearing as the first item in a BRE is taken literally.
867
868=head2 Bracket Expressions
869
870A I<bracket expression> is a list of characters, character ranges
871and character classes enclosed in B<[> and B<]> and matches any
872single character from the represented set of characters.
873
874A character range is written as two characters separated by B<-> and
875represents all characters (according to the character collating sequence)
876that are not less than the first and not greater than the second.
877(Ranges are very collating-sequence-dependent, and portable programs
878should avoid relying on them.)
879
880A character class is one of the class names
881
882 alnum digit punct
883 alpha graph space
884 blank lower upper
885 cntrl print xdigit
886
887enclosed in B<[:> and B<:]> and represents the set of characters
888as defined in ctype(3).
889
890If the first character after B<[> is B<^>, the sense of matching is
891inverted.
892
893To include a literal `C<^>', place it anywhere else but first. To
894include a literal 'C<]>' place it first or immediately after an
895initial B<^>. To include a literal `C<->' make it the first (or
896second after B<^>) or last character, or the second endpoint of
897a range.
898
899The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
900match the null string at the beginning and end of a word respectively.
901(Note that neither is identical to Perl's `\b' atom.)
902
903=head2 Additional Atoms
904
905Since some sed implementations provide additional regular expression
906atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
907the following backslash escapes:
908
909=over 4
910
911=item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
912
913=item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
914
915=item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
916
917=item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
918
919=item B<\y> Match the empty string at a word boundary.
920
921=item B<\B> Match the empty string between any two either word or non-word characters.
922
923=back
924
925To enable this feature, the environment variable PSEDEXTBRE must be set
926to a string containing the requested characters, e.g.:
927C<PSEDEXTBRE='E<lt>E<gt>wW'>.
928
929=cut
930
931#####
932# bre2p - convert BRE to Perl RE
933#
934sub peek(\$$){
935 my( $pref, $ic ) = @_;
936 $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
937}
938
939sub bre2p($$$){
940 my( $del, $pat, $fl ) = @_;
941 my $led = $del;
942 $led =~ tr/{([</})]>/;
943 $led = '' if $led eq $del;
944
945 $pat = substr( $pat, 1, length($pat) - 2 );
946 my $res = '';
947 my $bracklev = 0;
948 my $backref = 0;
949 my $parlev = 0;
950 for( my $ic = 0; $ic < length( $pat ); $ic++ ){
951 my $c = substr( $pat, $ic, 1 );
952 if( $c eq '\\' ){
953 ### backslash escapes
954 my $nc = peek($pat,$ic);
955 if( $nc eq '' ){
956 Warn( "`\\' cannot be last in pattern", $fl );
957 return undef();
958 }
959 $ic++;
960 if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
961 $res .= "\\$del";
962
963 } elsif( $nc =~ /([[.*\\n])/ ){
964 ## check for \-escaped magics and \n:
965 ## \[ \. \* \\ \n stay as they are
966 $res .= '\\'.$nc;
967
968 } elsif( $nc eq '(' ){ ## \( => (
969 $parlev++;
970 $res .= '(';
971
972 } elsif( $nc eq ')' ){ ## \) => )
973 $parlev--;
974 $backref++;
975 if( $parlev < 0 ){
976 Warn( "unmatched `\\)'", $fl );
977 return undef();
978 }
979 $res .= ')';
980
981 } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
982 my $endpos = index( $pat, '\\}', $ic );
983 if( $endpos < 0 ){
984 Warn( "unmatched `\\{'", $fl );
985 return undef();
986 }
987 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
988 $ic = $endpos + 1;
989
990 if( $res =~ /^\^?$/ ){
991 $res .= "\\{$rep\}";
992 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
993 my $min = $1;
994 my $com = $2 || '';
995 my $max = $3;
996 if( length( $max ) ){
997 if( $max < $min ){
998 Warn( "maximum less than minimum in `\\{$rep\\}'",
999 $fl );
1000 return undef();
1001 }
1002 } else {
1003 $max = '';
1004 }
1005 # simplify some
1006 if( $min == 0 && $max eq '1' ){
1007 $res .= '?';
1008 } elsif( $min == 1 && "$com$max" eq ',' ){
1009 $res .= '+';
1010 } elsif( $min == 0 && "$com$max" eq ',' ){
1011 $res .= '*';
1012 } else {
1013 $res .= "{$min$com$max}";
1014 }
1015 } else {
1016 Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1017 return undef();
1018 }
1019
1020 } elsif( $nc =~ /^[1-9]$/ ){
1021 ## \1 .. \9 => \1 .. \9, but check for a following digit
1022 if( $nc > $backref ){
1023 Warn( "invalid backreference ($nc)", $fl );
1024 return undef();
1025 }
1026 $res .= "\\$nc";
1027 if( peek($pat,$ic) =~ /[0-9]/ ){
1028 $res .= '(?:)';
1029 }
1030
1031 } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1032 ## extensions - at most <>wWyB - not in POSIX
1033 if( $nc eq '<' ){ ## \< => \b(?=\w), be precise
1034 $res .= '\\b(?<=\\W)';
1035 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1036 $res .= '\\b(?=\\W)';
1037 } elsif( $nc eq 'y' ){ ## \y => \b
1038 $res .= '\\b';
1039 } else { ## \B, \w, \W remain the same
1040 $res .= "\\$nc";
1041 }
1042 } elsif( $nc eq $led ){
1043 ## \<closing bracketing-delimiter> - keep '\'
1044 $res .= "\\$nc";
1045
1046 } else { ## \ <char> => <char> ("as if `\' were not present")
1047 $res .= $nc;
1048 }
1049
1050 } elsif( $c eq '.' ){ ## . => .
1051 $res .= $c;
1052
1053 } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1054 if( $res =~ /^\^?$/ ){
1055 $res .= '\\*';
1056 } elsif( substr( $res, -1, 1 ) ne '*' ){
1057 $res .= $c;
1058 }
1059
1060 } elsif( $c eq '[' ){
1061 ## parse []: [^...] [^]...] [-...]
1062 my $add = '[';
1063 if( peek($pat,$ic) eq '^' ){
1064 $ic++;
1065 $add .= '^';
1066 }
1067 my $nc = peek($pat,$ic);
1068 if( $nc eq ']' || $nc eq '-' ){
1069 $add .= $nc;
1070 $ic++;
1071 }
1072 # check that [ is not trailing
1073 if( $ic >= length( $pat ) - 1 ){
1074 Warn( "unmatched `['", $fl );
1075 return undef();
1076 }
1077 # look for [:...:] and x-y
1078 my $rstr = substr( $pat, $ic+1 );
1079 if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1080 my $cnt = $1;
1081 $ic += length( $cnt );
1082 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1083 # try some simplifications
1084 my $red = $cnt;
1085 if( $red =~ s/0-9// ){
1086 $cnt = $red.'\d';
1087 if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1088 $cnt = $red.'\w';
1089 }
1090 }
1091 $add .= $cnt;
1092
1093 # POSIX 1003.2 has this (optional) for begin/end word
1094 $add = '\\b(?=\\W)' if $add eq '[[:<:]]';
1095 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1096
1097 }
1098
1099 ## may have a trailing `-' before `]'
1100 if( $ic < length($pat) - 1 &&
1101 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1102 $ic += length( $1 );
1103 $add .= $1;
1104 # another simplification
1105 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1106 $res .= $add;
1107 } else {
1108 Warn( "unmatched `['", $fl );
1109 return undef();
1110 }
3cb6de81 1111
86a59229
WL
1112 } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1113 $res .= "\\$c";
d83e3bda 1114
86a59229
WL
1115 } elsif( $c eq ']' ){ ## unmatched ] is not magic
1116 $res .= ']';
d83e3bda 1117
86a59229
WL
1118 } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1119 $res .= "\\$c";
d83e3bda 1120
86a59229
WL
1121 } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1122 $res .= length( $res ) ? '\\^' : '^';
d83e3bda 1123
86a59229
WL
1124 } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1125 $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
8d063cd8 1126
86a59229
WL
1127 } else {
1128 $res .= $c;
1129 }
8d063cd8 1130 }
0a12ae7d 1131
86a59229
WL
1132 if( $parlev ){
1133 Warn( "unmatched `\\('", $fl );
1134 return undef();
8d063cd8 1135 }
0a12ae7d 1136
86a59229
WL
1137 # final cleanup: eliminate raw HTs
1138 $res =~ s/\t/\\t/g;
1139 return $del . $res . ( $led ? $led : $del );
1140}
0a12ae7d 1141
86a59229
WL
1142
1143#####
1144# sub2p - convert sed substitution to Perl substitution
1145#
1146sub sub2p($$$){
1147 my( $del, $subst, $fl ) = @_;
1148 my $led = $del;
1149 $led =~ tr/{([</})]>/;
1150 $led = '' if $led eq $del;
1151
1152 $subst = substr( $subst, 1, length($subst) - 2 );
1153 my $res = '';
1154
1155 for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1156 my $c = substr( $subst, $ic, 1 );
1157 if( $c eq '\\' ){
1158 ### backslash escapes
1159 my $nc = peek($subst,$ic);
1160 if( $nc eq '' ){
1161 Warn( "`\\' cannot be last in substitution", $fl );
1162 return undef();
1163 }
1164 $ic++;
1165 if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1166 $res .= '\\' . $nc;
1167 } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1168 $res .= '${' . $nc . '}';
1169 } else { ## everything else (includes &): omit \
1170 $res .= $nc;
1171 }
1172 } elsif( $c eq '&' ){ ## & => $&
1173 $res .= '$&';
1174 } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1175 $res .= '\\' . $c;
1176 } else {
1177 $res .= $c;
8d063cd8 1178 }
8d063cd8
LW
1179 }
1180
86a59229
WL
1181 # final cleanup: eliminate raw HTs
1182 $res =~ s/\t/\\t/g;
1183 return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1184}
0a12ae7d 1185
86a59229
WL
1186
1187sub Parse(){
1188 my $error = 0;
1189 my( $pdef, $pfil, $plin );
1190 for( my $icom = 0; $icom < @Commands; $icom++ ){
1191 my $cmd = $Commands[$icom];
1192 print "Parse:$cmd:\n" if $useDEBUG;
1193 $cmd =~ s/^\s+//;
1194 next unless length( $cmd );
1195 my $scom = $icom;
1196 if( exists( $Defined{$icom} ) ){
1197 $pdef = $Defined{$icom};
1198 if( $pdef =~ /^ #(\d+)/ ){
1199 $pfil = 'expression #';
1200 $plin = $1;
1201 } else {
1202 $pfil = "$pdef l.";
1203 $plin = 1;
1204 }
1205 } else {
1206 $plin++;
1207 }
1208 my $fl = "$pfil$plin";
1209
1210 # insert command as comment in gnerated code
1211 #
1212 $Code .= "# $cmd\n" if $doGenerate;
1213
1214 # The Address(es)
1215 #
1216 my( $negated, $naddr, $addr1, $addr2 );
1217 $naddr = 0;
1218 if( $cmd =~ s/^(\d+)\s*// ){
1219 $addr1 = "$1"; $naddr++;
1220 } elsif( $cmd =~ s/^\$\s*// ){
1221 $addr1 = 'eofARGV()'; $naddr++;
1222 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1223 my $del = $1;
1224 my $regex = stripRegex( $del, \$cmd );
1225 if( defined( $regex ) ){
1226 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1227 $naddr++;
1228 } else {
1229 Warn( "malformed regex, 1st address", $fl );
1230 $error++;
1231 next;
1232 }
1233 }
1234 if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1235 if( $cmd =~ s/^(\d+)\s*// ){
1236 $addr2 = "$1"; $naddr++;
1237 } elsif( $cmd =~ s/^\$\s*// ){
1238 $addr2 = 'eofARGV()'; $naddr++;
1239 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1240 my $del = $1;
1241 my $regex = stripRegex( $del, \$cmd );
1242 if( defined( $regex ) ){
1243 $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1244 $naddr++;
1245 } else {
1246 Warn( "malformed regex, 2nd address", $fl );
1247 $error++;
1248 next;
1249 }
1250 } else {
1251 Warn( "invalid address after `,'", $fl );
1252 $error++;
1253 next;
1254 }
1255 }
1256
1257 # address modifier `!'
1258 #
1259 $negated = $cmd =~ s/^!\s*//;
1260 if( defined( $addr1 ) ){
1261 print "Parse: addr1=$addr1" if $useDEBUG;
1262 if( defined( $addr2 ) ){
1263 print ", addr2=$addr2 " if $useDEBUG;
1264 # both numeric and addr1 > addr2 => eliminate addr2
1265 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1266 $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1267 }
9ef589d8 1268 }
86a59229
WL
1269 print 'negated' if $useDEBUG && $negated;
1270 print " command:$cmd\n" if $useDEBUG;
1271
1272 # The Command
1273 #
1274 if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1275 my $h = substr( $cmd, 0, 1 );
1276 Warn( "unknown command `$h'", $fl );
1277 $error++;
8d063cd8
LW
1278 next;
1279 }
86a59229 1280 my $key = $1;
8d063cd8 1281
86a59229 1282 my $tabref = $ComTab{$key};
d16f50bd 1283 $GenKey{$key} = 1;
86a59229
WL
1284 if( $naddr > $tabref->[0] ){
1285 Warn( "excess address(es)", $fl );
1286 $error++;
8d063cd8
LW
1287 next;
1288 }
1289
86a59229
WL
1290 my $arg = '';
1291 if( $tabref->[1] eq 'str' ){
1292 # take remainder - don't care if it is empty
1293 $arg = $cmd;
1294 $cmd = '';
1295
1296 } elsif( $tabref->[1] eq 'txt' ){
1297 # multi-line text
1298 my $goon = $cmd =~ /(.*)\\$/;
1299 if( length( $1 ) ){
1300 Warn( "extra characters after command ($cmd)", $fl );
1301 $error++;
8d063cd8 1302 }
86a59229
WL
1303 while( $goon ){
1304 $icom++;
1305 if( $icom > $#Commands ){
1306 Warn( "unexpected end of script", $fl );
1307 $error++;
1308 last;
1309 }
1310 $cmd = $Commands[$icom];
1311 $Code .= "# $cmd\n" if $doGenerate;
1312 $goon = $cmd =~ s/\\$//;
1313 $cmd =~ s/\\(.)/$1/g;
1314 $arg .= "\n" if length( $arg );
1315 $arg .= $cmd;
8d063cd8 1316 }
86a59229
WL
1317 $arg .= "\n" if length( $arg );
1318 $cmd = '';
8d063cd8 1319
86a59229
WL
1320 } elsif( $tabref->[1] eq 'sub' ){
1321 # s///
1322 if( ! length( $cmd ) ){
1323 Warn( "`s' command requires argument", $fl );
1324 $error++;
1325 next;
1326 }
1327 if( $cmd =~ s{^([^\\\n])}{} ){
1328 my $del = $1;
1329 my $regex = stripRegex( $del, \$cmd );
1330 if( ! defined( $regex ) ){
1331 Warn( "malformed regular expression", $fl );
1332 $error++;
1333 next;
a687059c 1334 }
86a59229
WL
1335 $regex = bre2p( $del, $regex, $fl );
1336
1337 # a trailing \ indicates embedded NL (in replacement string)
1338 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1339 $icom++;
1340 if( $icom > $#Commands ){
1341 Warn( "unexpected end of script", $fl );
1342 $error++;
1343 last;
8d063cd8 1344 }
86a59229
WL
1345 $cmd .= $Commands[$icom];
1346 $Code .= "# $Commands[$icom]\n" if $doGenerate;
9ef589d8 1347 }
86a59229
WL
1348
1349 my $subst = stripRegex( $del, \$cmd );
1350 if( ! defined( $regex ) ){
1351 Warn( "malformed substitution expression", $fl );
1352 $error++;
0a12ae7d
LW
1353 next;
1354 }
86a59229
WL
1355 $subst = sub2p( $del, $subst, $fl );
1356
1357 # parse s/// modifier: g|p|0-9|w <file>
1358 my( $global, $nmatch, $print, $write ) =
1359 ( '', '', 0, undef );
1360 while( $cmd =~ s/^([gp0-9])// ){
1361 $1 eq 'g' ? ( $global = 'g' ) :
1362 $1 eq 'p' ? ( $print = $1 ) : ( $nmatch .= $1 );
1363 }
1364 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1365 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1366 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1367 Warn( "conflicting flags `$global$nmatch'", $fl );
1368 $error++;
0a12ae7d
LW
1369 next;
1370 }
86a59229
WL
1371
1372 $arg = makes( $regex, $subst,
1373 $write, $global, $print, $nmatch, $fl );
1374 if( ! defined( $arg ) ){
1375 $error++;
8d063cd8
LW
1376 next;
1377 }
86a59229
WL
1378
1379 } else {
1380 Warn( "improper delimiter in s command", $fl );
1381 $error++;
1382 next;
1383 }
1384
1385 } elsif( $tabref->[1] eq 'tra' ){
1386 # y///
1387 # a trailing \ indicates embedded newline
1388 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1389 $icom++;
1390 if( $icom > $#Commands ){
1391 Warn( "unexpected end of script", $fl );
1392 $error++;
1393 last;
1394 }
1395 $cmd .= $Commands[$icom];
1396 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1397 }
1398 if( ! length( $cmd ) ){
1399 Warn( "`y' command requires argument", $fl );
1400 $error++;
1401 next;
1402 }
1403 my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1404 if( $d eq '\\' ){
1405 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1406 $error++;
1407 next;
1408 }
1409 my $fr = stripTrans( $d, \$cmd );
1410 if( ! defined( $fr ) || ! length( $cmd ) ){
1411 Warn( "malformed `y' command argument", $fl );
1412 $error++;
1413 next;
1414 }
1415 my $to = stripTrans( $d, \$cmd );
1416 if( ! defined( $to ) ){
1417 Warn( "malformed `y' command argument", $fl );
1418 $error++;
1419 next;
1420 }
1421 if( length($fr) != length($to) ){
1422 Warn( "string lengths in `y' command differ", $fl );
1423 $error++;
1424 next;
1425 }
1426 if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1427 $error++;
1428 next;
8d063cd8 1429 }
8d063cd8 1430
8d063cd8
LW
1431 }
1432
86a59229
WL
1433 # $cmd must be now empty - exception is {
1434 if( $cmd !~ /^\s*$/ ){
1435 if( $key eq '{' ){
1436 # dirty hack to process command on '{' line
1437 $Commands[$icom--] = $cmd;
1438 } else {
1439 Warn( "extra characters after command ($cmd)", $fl );
1440 $error++;
1441 next;
1442 }
8d063cd8
LW
1443 }
1444
86a59229
WL
1445 # Make Code
1446 #
1447 if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1448 $tabref->[3], $arg, $fl ) ){
1449 $error++;
8d063cd8 1450 }
86a59229 1451 }
8d063cd8 1452
86a59229
WL
1453 while( @BlockStack ){
1454 my $bl = pop( @BlockStack );
1455 Warn( "start of unterminated `{'", $bl );
1456 $error++;
1457 }
8d063cd8 1458
86a59229
WL
1459 for my $lab ( keys( %Label ) ){
1460 if( ! exists( $Label{$lab}{defined} ) ){
1461 for my $used ( @{$Label{$lab}{used}} ){
1462 Warn( "undefined label `$lab'", $used );
1463 $error++;
1464 }
8d063cd8 1465 }
86a59229 1466 }
8d063cd8 1467
86a59229
WL
1468 exit( 1 ) if $error;
1469}
8d063cd8 1470
8d063cd8 1471
86a59229
WL
1472##############
1473#### MAIN ####
1474##############
8d063cd8 1475
86a59229
WL
1476sub usage(){
1477 print STDERR "Usage: sed [-an] command [file...]\n";
1478 print STDERR " [-an] [-e command] [-f script-file] [file...]\n";
1479}
8d063cd8 1480
86a59229
WL
1481###################
1482# Here we go again...
1483#
1484my $expr = 0;
1485while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1486 my $opt = $1;
1487 my $arg = $2;
1488 shift( @ARGV );
1489 if( $opt eq 'e' ){
1490 if( length( $arg ) ){
1491 push( @Commands, split( "\n", $arg ) );
1492 } elsif( @ARGV ){
1493 push( @Commands, shift( @ARGV ) );
1494 } else {
1495 Warn( "option -e requires an argument" );
1496 usage();
1497 exit( 1 );
1498 }
1499 $expr++;
1500 $Defined{$#Commands} = " #$expr";
1501 next;
1502 }
1503 if( $opt eq 'f' ){
1504 my $path;
1505 if( length( $arg ) ){
1506 $path = $arg;
1507 } elsif( @ARGV ){
1508 $path = shift( @ARGV );
1509 } else {
1510 Warn( "option -f requires an argument" );
1511 usage();
1512 exit( 1 );
1513 }
1514 my $fst = $#Commands + 1;
1515 open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1516 my $cmd;
1517 while( defined( $cmd = <SCRIPT> ) ){
1518 chomp( $cmd );
1519 push( @Commands, $cmd );
1520 }
1521 close( SCRIPT );
1522 if( $#Commands >= $fst ){
1523 $Defined{$fst} = "$path";
8d063cd8 1524 }
86a59229
WL
1525 next;
1526 }
1527 if( $opt eq '-' && $arg eq '' ){
1528 last;
1529 }
1530 if( $opt eq 'h' || $opt eq '?' ){
1531 usage();
1532 exit( 0 );
1533 }
1534 if( $opt eq 'n' ){
1535 $doAutoPrint = 0;
1536 } elsif( $opt eq 'a' ){
1537 $doOpenWrite = 0;
1538 } else {
1539 Warn( "illegal option `$opt'" );
1540 usage();
1541 exit( 1 );
1542 }
1543 if( length( $arg ) ){
1544 unshift( @ARGV, "-$arg" );
1545 }
1546}
8d063cd8 1547
86a59229
WL
1548# A singleton command may be the 1st argument when there are no options.
1549#
1550if( @Commands == 0 ){
1551 if( @ARGV == 0 ){
1552 Warn( "no script command given" );
1553 usage();
1554 exit( 1 );
1555 }
1556 push( @Commands, split( "\n", shift( @ARGV ) ) );
1557 $Defined{0} = ' #1';
1558}
8d063cd8 1559
86a59229 1560print STDERR "Files: @ARGV\n" if $useDEBUG;
8d063cd8 1561
86a59229
WL
1562# generate leading code
1563#
d16f50bd 1564$Func = <<'[TheEnd]';
86a59229 1565
d16f50bd
WL
1566# openARGV: open 1st input file
1567#
86a59229
WL
1568sub openARGV(){
1569 unshift( @ARGV, '-' ) unless @ARGV;
1570 my $file = shift( @ARGV );
1571 open( ARG, "<$file" )
1572 || die( "$0: can't open $file for reading ($!)\n" );
1573 $isEOF = 0;
1574}
8d063cd8 1575
d16f50bd
WL
1576# getsARGV: Read another input line into argument (default: $_).
1577# Move on to next input file, and reset EOF flag $isEOF.
86a59229
WL
1578sub getsARGV(;\$){
1579 my $argref = @_ ? shift() : \$_;
1580 while( $isEOF || ! defined( $$argref = <ARG> ) ){
1581 close( ARG );
1582 return 0 unless @ARGV;
1583 my $file = shift( @ARGV );
1584 open( ARG, "<$file" )
1585 || die( "$0: can't open $file for reading ($!)\n" );
1586 $isEOF = 0;
1587 }
1588 1;
1589}
8d063cd8 1590
d16f50bd
WL
1591# eofARGV: end-of-file test
1592#
86a59229
WL
1593sub eofARGV(){
1594 return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1595}
8d063cd8 1596
d16f50bd
WL
1597# makeHandle: Generates another file handle for some file (given by its path)
1598# to be written due to a w command or an s command's w flag.
86a59229
WL
1599sub makeHandle($){
1600 my( $path ) = @_;
1601 my $handle;
1602 if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1603 $handle = $wFiles{$path} = gensym();
1604 if( $doOpenWrite ){
1605 if( ! open( $handle, ">$path" ) ){
1606 die( "$0: can't open $path for writing: ($!)\n" );
1607 }
9ef589d8 1608 }
86a59229
WL
1609 } else {
1610 $handle = $wFiles{$path};
1611 }
1612 return $handle;
1613}
9ef589d8 1614
d16f50bd
WL
1615# printQ: Print queued output which is either a string or a reference
1616# to a pathname.
86a59229
WL
1617sub printQ(){
1618 for my $q ( @Q ){
1619 if( ref( $q ) ){
d16f50bd 1620 # flush open w files so that reading this file gets it all
86a59229
WL
1621 if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1622 open( $wFiles{$$q}, ">>$$q" );
1623 }
d16f50bd 1624 # copy file to stdout: slow, but safe
86a59229 1625 if( open( RF, "<$$q" ) ){
d16f50bd 1626 while( defined( my $line = <RF> ) ){
86a59229
WL
1627 print $line;
1628 }
1629 close( RF );
1630 }
1631 } else {
1632 print $q;
a687059c 1633 }
86a59229
WL
1634 }
1635 undef( @Q );
1636}
1637
d16f50bd
WL
1638[TheEnd]
1639
1640# generate the sed loop
1641#
1642$Code .= <<'[TheEnd]';
1643sub openARGV();
1644sub getsARGV(;\$);
1645sub eofARGV();
1646sub printQ();
1647
1648# Run: the sed loop reading input and applying the script
1649#
86a59229
WL
1650sub Run(){
1651 my( $h, $icnt, $s, $n );
1652 # hack (not unbreakable :-/) to avoid // matching an empty string
1653 my $z = "\000"; $z =~ /$z/;
1654 # Initialize.
1655 openARGV();
1656 $Hold = '';
1657 $CondReg = 0;
1658 $doPrint = $doAutoPrint;
1659CYCLE:
1660 while( getsARGV() ){
1661 chomp();
1662 $CondReg = 0; # cleared on t
1663BOS:;
1664[TheEnd]
1665
1666 # parse - avoid opening files when doing s2p
1667 #
1668 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1669 if $doGenerate;
1670 Parse();
1671 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1672 if $doGenerate;
1673
1674 # append trailing code
1675 #
1676 $Code .= <<'[TheEnd]';
1677EOS: if( $doPrint ){
1678 print $_, "\n";
1679 } else {
1680 $doPrint = $doAutoPrint;
a687059c 1681 }
86a59229 1682 printQ() if @Q;
a687059c 1683 }
86a59229
WL
1684
1685 exit( 0 );
a687059c 1686}
86a59229
WL
1687[TheEnd]
1688
d16f50bd
WL
1689
1690# append optional functions, prepend prototypes
1691#
1692my $Proto = "# prototypes\n";
1693if( $GenKey{'l'} ){
1694 $Proto .= "sub _l();\n";
1695 $Func .= <<'[TheEnd]';
1696# _l: l command processing
1697#
1698sub _l(){
1699 my $h = $_;
1700 my $mcpl = 70;
1701 # transform non printing chars into escape notation
1702 $h =~ s/\\/\\\\/g;
1703 if( $h =~ /[^[:print:]]/ ){
1704 $h =~ s/\a/\\a/g;
1705 $h =~ s/\f/\\f/g;
1706 $h =~ s/\n/\\n/g;
1707 $h =~ s/\t/\\t/g;
1708 $h =~ s/\r/\\r/g;
1709 $h =~ s/\e/\\e/g;
1710 $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1711 }
1712 # split into lines of length $mcpl
1713 while( length( $h ) > $mcpl ){
1714 my $l = substr( $h, 0, $mcpl-1 );
1715 $h = substr( $h, $mcpl );
1716 # remove incomplete \-escape from end of line
1717 if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1718 $h = $1 . $h;
1719 }
1720 print $l, "\\\n";
1721 }
1722 print "$h\$\n";
1723}
1724
1725[TheEnd]
1726}
1727
1728if( $GenKey{'r'} ){
1729 $Proto .= "sub _r(\$);\n";
1730 $Func .= <<'[TheEnd]';
1731# _r: r command processing: Save a reference to the pathname.
1732#
1733sub _r($){
1734 my $path = shift();
1735 push( @Q, \$path );
1736}
1737
1738[TheEnd]
1739}
1740
1741if( $GenKey{'t'} ){
1742 $Proto .= "sub _t();\n";
1743 $Func .= <<'[TheEnd]';
1744# _t: t command - condition register test/reset
1745#
1746sub _t(){
1747 my $res = $CondReg;
1748 $CondReg = 0;
1749 $res;
1750}
1751
1752[TheEnd]
1753}
1754
1755if( $GenKey{'w'} ){
1756 $Proto .= "sub _w(\$);\n";
1757 $Func .= <<'[TheEnd]';
1758# _w: w command and s command's w flag - write to file
1759#
1760sub _w($){
1761 my $path = shift();
1762 my $handle = $wFiles{$path};
1763 if( ! $doOpenWrite && ! defined( fileno( $handle ) ) ){
1764 open( $handle, ">$path" )
1765 || die( "$0: $path: cannot open ($!)\n" );
1766 }
1767 print $handle $_, "\n";
1768}
1769
1770[TheEnd]
1771}
1772
1773$Code = $Proto . $Code;
1774
86a59229
WL
1775# magic "#n" - same as -n option
1776#
1777$doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
a687059c 1778
86a59229
WL
1779# eval code - check for errors
1780#
d16f50bd
WL
1781print "Code:\n$Code$Func" if $useDEBUG;
1782eval $Code . $Func;
86a59229 1783if( $@ ){
d16f50bd 1784 print "Code:\n$Code$Func";
86a59229 1785 die( "$0: internal error - generated incorrect Perl code: $@\n" );
9ef589d8
LW
1786}
1787
86a59229
WL
1788if( $doGenerate ){
1789
1790 # write full Perl program
1791 #
1792
d16f50bd 1793 # bang line, declarations, prototypes
86a59229
WL
1794 print <<TheEnd;
1795#!$perlpath -w
1796eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1797 if 0;
9ae2b9a0 1798\$0 =~ s/^.*?(\\w+)\[\\.\\w+\]*\$/\$1/;
86a59229
WL
1799
1800use strict;
1801use Symbol;
1802use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1803 \$doAutoPrint \$doOpenWrite \$doPrint };
1804\$doAutoPrint = $doAutoPrint;
1805\$doOpenWrite = $doOpenWrite;
1806TheEnd
1807
1808 my $wf = "'" . join( "', '", keys( %wFiles ) ) . "'";
1809 if( $wf ne "''" ){
1810 print <<TheEnd;
1811sub makeHandle(\$);
1812for my \$p ( $wf ){
1813 exit( 1 ) unless makeHandle( \$p );
9ef589d8 1814}
86a59229
WL
1815TheEnd
1816 }
9ef589d8 1817
86a59229 1818 print $Code;
d16f50bd
WL
1819 print "Run();\n";
1820 print $Func;
86a59229 1821 exit( 0 );
1aa91729 1822
86a59229
WL
1823} else {
1824
1825 # execute: make handles (and optionally open) all w files; run!
86a59229
WL
1826 for my $p ( keys( %wFiles ) ){
1827 exit( 1 ) unless makeHandle( $p );
1828 }
d16f50bd 1829 Run();
1aa91729 1830}
86a59229
WL
1831
1832
1833=head1 ENVIRONMENT
1834
1835The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1836See L<"Additional Atoms">.
1837
1838=head1 DIAGNOSTICS
1839
1840=over 4
1841
1842=item ambiguos translation for character `%s' in `y' command
1843
1844The indicated character appears twice, with different translations.
1845
1846=item `[' cannot be last in pattern
1847
1848A `[' in a BRE indicates the beginning of a I<bracket expression>.
1849
1850=item `\' cannot be last in pattern
1851
1852A `\' in a BRE is used to make the subsequent character literal.
1853
1854=item `\' cannot be last in substitution
1855
1856A `\' in a subsitution string is used to make the subsequent character literal.
1857
1858=item conflicting flags `%s'
1859
1860In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1861multiple n-th occurrence flags are specified. Note that only the digits
1862`1' through `9' are permitted.
1863
1864=item duplicate label %s (first defined at %s)
1865
1866=item excess address(es)
1867
1868The command has more than the permitted number of addresses.
1869
1870=item extra characters after command (%s)
1871
1872=item illegal option `%s'
1873
1874=item improper delimiter in s command
1875
1876The BRE and substitution may not be delimited with `\' or newline.
1877
1878=item invalid address after `,'
1879
1880=item invalid backreference (%s)
1881
1882The specified backreference number exceeds the number of backreferences
1883in the BRE.
1884
1885=item invalid repeat clause `\{%s\}'
1886
1887The repeat clause does not contain a valid integer value, or pair of
1888values.
1889
1890=item malformed regex, 1st address
1891
1892=item malformed regex, 2nd address
1893
1894=item malformed regular expression
1895
1896=item malformed substitution expression
1897
1898=item malformed `y' command argument
1899
1900The first or second string of a B<y> command is syntactically incorrect.
1901
1902=item maximum less than minimum in `\{%s\}'
1903
1904=item no script command given
1905
1906There must be at least one B<-e> or one B<-f> option specifying a
1907script or script file.
1908
1909=item `\' not valid as delimiter in `y' command
1910
1911=item option -e requires an argument
1912
1913=item option -f requires an argument
1914
1915=item `s' command requires argument
1916
1917=item start of unterminated `{'
1918
1919=item string lengths in `y' command differ
1920
1921The translation table strings in a B<y> commanf must have equal lengths.
1922
1923=item undefined label `%s'
1924
1925=item unexpected `}'
1926
1927A B<}> command without a preceding B<{> command was encountered.
1928
1929=item unexpected end of script
1930
1931The end of the script was reached although a text line after a
1932B<a>, B<c> or B<i> command indicated another line.
1933
1934=item unknown command `%s'
1935
1936=item unterminated `['
1937
1938A BRE contains an unterminated bracket expression.
1939
1940=item unterminated `\('
1941
1942A BRE contains an unterminated backreference.
1943
1944=item `\{' without closing `\}'
1945
1946A BRE contains an unterminated bounds specification.
1947
1948=item `\)' without preceding `\('
1949
1950=item `y' command requires argument
1951
1952=back
1953
1954=head1 EXAMPLE
1955
1956The basic material for the preceding section was generated by running
1957the sed script
1958
1959 #no autoprint
1960 s/^.*Warn( *"\([^"]*\)".*$/\1/
1961 t process
1962 b
1963 :process
1964 s/$!/%s/g
1965 s/$[_[:alnum:]]\{1,\}/%s/g
1966 s/\\\\/\\/g
1967 s/^/=item /
1968 p
1969
1970on the program's own text, and piping the output into C<sort -u>.
1971
1972
1973=head1 SED SCRIPT TRANSLATION
1974
1975If this program is invoked with the name F<s2p> it will act as a
1976sed-to-Perl translator. After option processing (all other
1977arguments are ignored), a Perl program is printed on standard
1978output, which will process the input stream (as read from all
1979arguments) in the way defined by the sed script and the option setting
1980used for the translation.
1981
1982=head1 SEE ALSO
1983
1984perl(1), re_format(7)
1985
1986=head1 BUGS
1987
1988The B<l> command will show escape characters (ESC) as `C<\e>', but
1989a vertical tab (VT) in octal.
1990
1991Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
1992
1993The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
1994is "the last pattern used, at run time". This deviates from the Perl
1995interpretation, which will re-use the "last last successfully executed
1996regular expression". Since keeping track of pattern usage would create
1997terribly cluttered code, and differences would only appear in obscure
1998context (where other B<sed> implementations appear to deviate, too),
1999the Perl semantics was adopted. Note that common usage of this feature,
2000such as in C</abc/s//xyz/>, will work as expected.
2001
2002Collating elements (of bracket expressions in BREs) are not implemented.
2003
2004=head1 STANDARDS
2005
2006This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
2007definition of B<sed>, and is compatible with the I<OpenBSD>
2008implementation, except where otherwise noted (see L<"BUGS">).
2009
2010=head1 AUTHOR
2011
2012This Perl implementation of I<sed> was written by Wolfgang Laun,
2013I<Wolfgang.Laun@alcatel.at>.
2014
2015=head1 COPYRIGHT and LICENSE
2016
2017This program is free and open software. You may use, modify,
2018distribute, and sell this program (and any modified variants) in any
2019way you wish, provided you do not restrict others from doing the same.
2020
2021=cut
2022
a687059c 2023!NO!SUBS!
4633a7c4
LW
2024
2025close OUT or die "Can't close $file: $!";
2026chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
d38b618a
A
2027unlink 'psed';
2028print "Linking s2p to psed.\n";
2029if (defined $Config{d_link}) {
2030 link 's2p', 'psed';
2031} else {
2032 unshift @INC, '../lib';
2033 require File::Copy;
2034 File::Copy::syscopy('s2p', 'psed');
2035}
4633a7c4 2036exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
8a5546a1 2037chdir $origdir;