This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
GNU Public General Public General GNU Public License License
[perl5.git] / x2p / s2p.PL
CommitLineData
86a59229 1#!/usr/bin/perl
4633a7c4
LW
2
3use Config;
4use File::Basename qw(&basename &dirname);
8a5546a1 5use Cwd;
231bc313
JH
6use subs qw(link);
7
6ef9d486 8sub link { # This is a cut-down version of installperl:link().
231bc313
JH
9 my($from,$to) = @_;
10 my($success) = 0;
11
12 eval {
13 CORE::link($from, $to)
14 ? $success++
15 : ($from =~ m#^/afs/# || $to =~ m#^/afs/#)
16 ? die "AFS" # okay inside eval {}
17 : die "Couldn't link $from to $to: $!\n";
18 };
19 if ($@) {
231bc313
JH
20 require File::Copy;
21 File::Copy::copy($from, $to)
22 ? $success++
23 : warn "Couldn't copy $from to $to: $!\n";
24 }
25 $success;
26}
4633a7c4
LW
27
28# List explicitly here the variables you want Configure to
29# generate. Metaconfig only looks for shell variables, so you
30# have to mention them as if they were shell variables, not
31# %Config entries. Thus you write
32# $startperl
33# to ensure Configure will look for $Config{startperl}.
34
35# This forces PL files to create target in same directory as PL file.
36# This is so that make depend always knows where to find PL derivatives.
8a5546a1 37$origdir = cwd;
44a8e56a 38chdir dirname($0);
39$file = basename($0, '.PL');
774d564b 40$file .= '.com' if $^O eq 'VMS';
4633a7c4
LW
41
42open OUT,">$file" or die "Can't create $file: $!";
43
44print "Extracting $file (with variable substitutions)\n";
45
46# In this section, perl variables will be expanded during extraction.
47# You can use $Config{...} to use Configure variables.
48
49print OUT <<"!GROK!THIS!";
5f05dabc 50$Config{startperl}
51 eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
52 if \$running_under_some_shell;
c0393c90
JH
53my \$startperl;
54my \$perlpath;
ed6d8ea1
JH
55(\$startperl = <<'/../') =~ s/\\s*\\z//;
56$Config{startperl}
57/../
58(\$perlpath = <<'/../') =~ s/\\s*\\z//;
59$Config{perlpath}
60/../
a687059c
LW
61!GROK!THIS!
62
4633a7c4
LW
63# In the following, perl variables are not expanded during extraction.
64
65print OUT <<'!NO!SUBS!';
a687059c 66
9ae2b9a0 67$0 =~ s/^.*?(\w+)[\.\w]*$/$1/;
86a59229
WL
68
69# (p)sed - a stream editor
70# History: Aug 12 2000: Original version.
d16f50bd 71# Mar 25 2002: Rearrange generated Perl program.
86a59229
WL
72
73use strict;
74use integer;
75use Symbol;
8d063cd8 76
d83e3bda
JM
77=head1 NAME
78
ba34211e 79psed - a stream editor
d83e3bda
JM
80
81=head1 SYNOPSIS
82
ba34211e
JH
83 psed [-an] script [file ...]
84 psed [-an] [-e script] [-f script-file] [file ...]
85
86 s2p [-an] [-e script] [-f script-file]
d83e3bda
JM
87
88=head1 DESCRIPTION
89
86a59229
WL
90A stream editor reads the input stream consisting of the specified files
91(or standard input, if none are given), processes is line by line by
92applying a script consisting of edit commands, and writes resulting lines
93to standard output. The filename `C<->' may be used to read standard input.
94
95The edit script is composed from arguments of B<-e> options and
96script-files, in the given order. A single script argument may be specified
97as the first parameter.
98
99If this program is invoked with the name F<s2p>, it will act as a
100sed-to-Perl translator. See L<"sed Script Translation">.
d83e3bda 101
86a59229 102B<sed> returns an exit code of 0 on success or >0 if an error occurred.
d83e3bda 103
86a59229 104=head1 OPTIONS
d83e3bda 105
86a59229 106=over 4
d83e3bda 107
86a59229 108=item B<-a>
d83e3bda 109
86a59229
WL
110A file specified as argument to the B<w> edit command is by default
111opened before input processing starts. Using B<-a>, opening of such
112files is delayed until the first line is actually written to the file.
113
114=item B<-e> I<script>
115
116The editing commands defined by I<script> are appended to the script.
117Multiple commands must be separated by newlines.
118
119=item B<-f> I<script-file>
120
121Editing commands from the specified I<script-file> are read and appended
122to the script.
d83e3bda
JM
123
124=item B<-n>
125
86a59229
WL
126By default, a line is written to standard output after the editing script
127has been applied to it. The B<-n> option suppresses automatic printing.
128
129=back
130
131=head1 COMMANDS
132
133B<sed> command syntax is defined as
134
135Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
136
137with whitespace being permitted before or after addresses, and between
138the function character and the argument. The I<address>es and the
139address inverter (C<!>) are used to restrict the application of a
140command to the selected line(s) of input.
141
142Each command must be on a line of its own, except where noted in
143the synopses below.
144
145The edit cycle performed on each input line consist of reading the line
146(without its trailing newline character) into the I<pattern space>,
147applying the applicable commands of the edit script, writing the final
148contents of the pattern space and a newline to the standard output.
149A I<hold space> is provided for saving the contents of the
150pattern space for later use.
151
152=head2 Addresses
153
154A sed address is either a line number or a pattern, which may be combined
155arbitrarily to construct ranges. Lines are numbered across all input files.
156
157Any address may be followed by an exclamation mark (`C<!>'), selecting
158all lines not matching that address.
d83e3bda 159
86a59229 160=over 4
d83e3bda 161
86a59229
WL
162=item I<number>
163
164The line with the given number is selected.
165
166=item B<$>
167
168A dollar sign (C<$>) is the line number of the last line of the input stream.
169
170=item B</>I<regular expression>B</>
171
172A pattern address is a basic regular expression (see
173L<"Basic Regular Expressions">), between the delimiting character C</>.
174Any other character except C<\> or newline may be used to delimit a
175pattern address when the initial delimiter is prefixed with a
176backslash (`C<\>').
d83e3bda
JM
177
178=back
179
86a59229 180If no address is given, the command selects every line.
d83e3bda 181
86a59229
WL
182If one address is given, it selects the line (or lines) matching the
183address.
d83e3bda 184
86a59229
WL
185Two addresses select a range that begins whenever the first address
186matches, and ends (including that line) when the second address matches.
187If the first (second) address is a matching pattern, the second
188address is not applied to the very same line to determine the end of
189the range. Likewise, if the second address is a matching pattern, the
190first address is not applied to the very same line to determine the
191begin of another range. If both addresses are line numbers,
192and the second line number is less than the first line number, then
193only the first line is selected.
d83e3bda 194
d83e3bda 195
86a59229 196=head2 Functions
d83e3bda 197
86a59229
WL
198The maximum permitted number of addresses is indicated with each
199function synopsis below.
d83e3bda 200
86a59229
WL
201The argument I<text> consists of one or more lines following the command.
202Embedded newlines in I<text> must be preceded with a backslash. Other
203backslashes in I<text> are deleted and the following character is taken
204literally.
d83e3bda 205
86a59229 206=over 4
d83e3bda 207
86a59229
WL
208=cut
209
210my %ComTab;
d16f50bd 211my %GenKey;
86a59229
WL
212#--------------------------------------------------------------------------
213$ComTab{'a'}=[ 1, 'txt', \&Emit, '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
214
215=item [1addr]B<a\> I<text>
216
217Write I<text> (which must start on the line following the command)
218to standard output immediately before reading the next line
219of input, either by executing the B<N> function or by beginning a new cycle.
220
221=cut
222
223#--------------------------------------------------------------------------
224$ComTab{'b'}=[ 2, 'str', \&Branch, '{ goto XXX; }' ]; #ok
225
226=item [2addr]B<b> [I<label>]
227
228Branch to the B<:> function with the specified I<label>. If no label
229is given, branch to the end of the script.
230
231=cut
232
233#--------------------------------------------------------------------------
234$ComTab{'c'}=[ 2, 'txt', \&Change, <<'-X-' ]; #ok
235{ print <<'TheEnd'; } $doPrint = 0; goto EOS;
236-X-
237### continue OK => next CYCLE;
238
239=item [2addr]B<c\> I<text>
240
241The line, or range of lines, selected by the address is deleted.
242The I<text> (which must start on the line following the command)
243is written to standard output. With an address range, this occurs at
244the end of the range.
245
246=cut
247
248#--------------------------------------------------------------------------
249$ComTab{'d'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
250{ $doPrint = 0;
251 goto EOS;
252}
253-X-
254### continue OK => next CYCLE;
255
256=item [2addr]B<d>
257
258Deletes the pattern space and starts the next cycle.
259
260=cut
261
262#--------------------------------------------------------------------------
263$ComTab{'D'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
264{ s/^.*\n?//;
265 if(length($_)){ goto BOS } else { goto EOS }
266}
267-X-
268### continue OK => next CYCLE;
269
270=item [2addr]B<D>
271
272Deletes the pattern space through the first embedded newline or to the end.
273If the pattern space becomes empty, a new cycle is started, otherwise
274execution of the script is restarted.
275
276=cut
277
278#--------------------------------------------------------------------------
279$ComTab{'g'}=[ 2, '', \&Emit, '{ $_ = $Hold };' ]; #ok
280
281=item [2addr]B<g>
282
283Replace the contents of the pattern space with the hold space.
284
285=cut
286
287#--------------------------------------------------------------------------
288$ComTab{'G'}=[ 2, '', \&Emit, '{ $_ .= "\n"; $_ .= $Hold };' ]; #ok
289
290=item [2addr]B<G>
291
292Append a newline and the contents of the hold space to the pattern space.
293
294=cut
295
296#--------------------------------------------------------------------------
297$ComTab{'h'}=[ 2, '', \&Emit, '{ $Hold = $_ }' ]; #ok
298
299=item [2addr]B<h>
300
301Replace the contents of the hold space with the pattern space.
302
303=cut
304
305#--------------------------------------------------------------------------
306$ComTab{'H'}=[ 2, '', \&Emit, '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
307
308=item [2addr]B<H>
309
310Append a newline and the contents of the pattern space to the hold space.
311
312=cut
313
314#--------------------------------------------------------------------------
315$ComTab{'i'}=[ 1, 'txt', \&Emit, '{ print <<'."'TheEnd' }\n" ]; #ok
316
317=item [1addr]B<i\> I<text>
318
319Write the I<text> (which must start on the line following the command)
320to standard output.
321
322=cut
323
324#--------------------------------------------------------------------------
325$ComTab{'l'}=[ 2, '', \&Emit, '{ _l() }' ]; #okUTF8
326
327=item [2addr]B<l>
328
329Print the contents of the pattern space: non-printable characters are
330shown in C-style escaped form; long lines are split and have a trailing
331`C<\>' at the point of the split; the true end of a line is marked with
332a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
333BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
334octal number for all other non-printable characters.
335
336=cut
337
338#--------------------------------------------------------------------------
339$ComTab{'n'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
340{ print $_, "\n" if $doPrint;
d16f50bd 341 printQ() if @Q;
86a59229
WL
342 $CondReg = 0;
343 last CYCLE unless getsARGV();
344 chomp();
345}
346-X-
347
348=item [2addr]B<n>
349
350If automatic printing is enabled, write the pattern space to the standard
351output. Replace the pattern space with the next line of input. If
352there is no more input, processing is terminated.
353
354=cut
355
356#--------------------------------------------------------------------------
357$ComTab{'N'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
d16f50bd 358{ printQ() if @Q;
86a59229
WL
359 $CondReg = 0;
360 last CYCLE unless getsARGV( $h );
361 chomp( $h );
362 $_ .= "\n$h";
363}
364-X-
365
366=item [2addr]B<N>
367
368Append a newline and the next line of input to the pattern space. If
369there is no more input, processing is terminated.
370
371=cut
372
373#--------------------------------------------------------------------------
374$ComTab{'p'}=[ 2, '', \&Emit, '{ print $_, "\n"; }' ]; #ok
375
376=item [2addr]B<p>
377
378Print the pattern space to the standard output. (Use the B<-n> option
379to suppress automatic printing at the end of a cycle if you want to
380avoid double printing of lines.)
381
382=cut
383
384#--------------------------------------------------------------------------
385$ComTab{'P'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
386{ if( /^(.*)/ ){ print $1, "\n"; } }
387-X-
388
389=item [2addr]B<P>
390
391Prints the pattern space through the first embedded newline or to the end.
392
393=cut
394
395#--------------------------------------------------------------------------
396$ComTab{'q'}=[ 1, '', \&Emit, <<'-X-' ]; #ok
397{ print $_, "\n" if $doPrint;
398 last CYCLE;
399}
400-X-
401
402=item [1addr]B<q>
403
404Branch to the end of the script and quit without starting a new cycle.
405
406=cut
407
408#--------------------------------------------------------------------------
409$ComTab{'r'}=[ 1, 'str', \&Emit, "{ _r( '-X-' ) }" ]; #ok
86a59229
WL
410
411=item [1addr]B<r> I<file>
412
413Copy the contents of the I<file> to standard output immediately before
414the next attempt to read a line of input. Any error encountered while
415reading I<file> is silently ignored.
416
417=cut
418
419#--------------------------------------------------------------------------
420$ComTab{'s'}=[ 2, 'sub', \&Emit, '' ]; #ok
421
422=item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
423
424Substitute the I<replacement> string for the first substring in
425the pattern space that matches the I<regular expression>.
426Any character other than backslash or newline can be used instead of a
427slash to delimit the regular expression and the replacement.
428To use the delimiter as a literal character within the regular expression
429and the replacement, precede the character by a backslash (`C<\>').
430
431Literal newlines may be embedded in the replacement string by
432preceding a newline with a backslash.
433
434Within the replacement, an ampersand (`C<&>') is replaced by the string
435matching the regular expression. The strings `C<\1>' through `C<\9>' are
436replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
437To get a literal `C<&>' or `C<\>' in the replacement text, precede it
438by a backslash.
439
440The following I<flags> modify the behaviour of the B<s> command:
441
442=over 8
443
444=item B<g>
445
446The replacement is performed for all matching, non-overlapping substrings
447of the pattern space.
448
449=item B<1>..B<9>
450
451Replace only the n-th matching substring of the pattern space.
452
453=item B<p>
454
455If the substitution was made, print the new value of the pattern space.
456
457=item B<w> I<file>
458
459If the substitution was made, write the new value of the pattern space
460to the specified file.
461
462=back
463
464=cut
465
466#--------------------------------------------------------------------------
467$ComTab{'t'}=[ 2, 'str', \&Branch, '{ goto XXX if _t() }' ]; #ok
468
469=item [2addr]B<t> [I<label>]
470
471Branch to the B<:> function with the specified I<label> if any B<s>
472substitutions have been made since the most recent reading of an input line
473or execution of a B<t> function. If no label is given, branch to the end of
474the script.
475
476
477=cut
478
479#--------------------------------------------------------------------------
480$ComTab{'w'}=[ 2, 'str', \&Write, "{ _w( '-X-' ) }" ]; #ok
481
482=item [2addr]B<w> I<file>
483
484The contents of the pattern space are written to the I<file>.
485
486=cut
487
488#--------------------------------------------------------------------------
489$ComTab{'x'}=[ 2, '', \&Emit, '{ ($Hold, $_) = ($_, $Hold) }' ]; #ok
490
491=item [2addr]B<x>
492
493Swap the contents of the pattern space and the hold space.
494
495=cut
496
497#--------------------------------------------------------------------------
498$ComTab{'y'}=[ 2, 'tra', \&Emit, '' ]; #ok
499=item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
500
501In the pattern space, replace all characters occuring in I<string1> by the
502character at the corresponding position in I<string2>. It is possible
503to use any character (other than a backslash or newline) instead of a
504slash to delimit the strings. Within I<string1> and I<string2>, a
505backslash followed by any character other than a newline is that literal
506character, and a backslash followed by an `n' is replaced by a newline
507character.
508
509=cut
d83e3bda 510
86a59229
WL
511#--------------------------------------------------------------------------
512$ComTab{'='}=[ 1, '', \&Emit, '{ print "$.\n" }' ]; #ok
513
514=item [1addr]B<=>
515
516Prints the current line number on the standard output.
517
518=cut
519
520#--------------------------------------------------------------------------
521$ComTab{':'}=[ 0, 'str', \&Label, '' ]; #ok
522
523=item [0addr]B<:> [I<label>]
524
525The command specifies the position of the I<label>. It has no other effect.
526
527=cut
528
529#--------------------------------------------------------------------------
530$ComTab{'{'}=[ 2, '', \&BeginBlock, '{' ]; #ok
531$ComTab{'}'}=[ 0, '', \&EndBlock, ';}' ]; #ok
532# ';' to avoid warning on empty {}-block
533
534=item [2addr]B<{> [I<command>]
535
536=item [0addr]B<}>
537
538These two commands begin and end a command list. The first command may
539be given on the same line as the opening B<{> command. The commands
540within the list are jointly selected by the address(es) given on the
541B<{> command (but may still have individual addresses).
542
543=cut
544
545#--------------------------------------------------------------------------
546$ComTab{'#'}=[ 0, 'str', \&Comment, '' ]; #ok
547
548=item [0addr]B<#> [I<comment>]
549
550The entire line is ignored (treated as a comment). If, however, the first
551two characters in the script are `C<#n>', automatic printing of output is
552suppressed, as if the B<-n> option were given on the command line.
553
554=back
555
556=cut
557
558use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
559
560my $useDEBUG = exists( $ENV{PSEDDEBUG} );
561my $useEXTBRE = $ENV{PSEDEXTBRE} || '';
562$useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
563
564my $doAutoPrint = 1; # automatic printing of pattern space (-n => 0)
565my $doOpenWrite = 1; # open w command output files at start (-a => 0)
566my $svOpenWrite = 0; # save $doOpenWrite
567my $doGenerate = $0 eq 's2p';
568
569# Collected and compiled script
570#
d16f50bd
WL
571my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code, $Func );
572$Code = '';
86a59229
WL
573
574##################
575# Compile Time
576#
577# Labels
578#
579# Error handling
580#
581sub Warn($;$){
582 my( $msg, $loc ) = @_;
583 $loc ||= '';
584 $loc .= ': ' if length( $loc );
585 warn( "$0: $loc$msg\n" );
586}
587
588$labNum = 0;
589sub newLabel(){
590 return 'L_'.++$labNum;
591}
592
593# safeHere: create safe here delimiter and modify opcode and argument
594#
595sub safeHere($$){
596 my( $codref, $argref ) = @_;
597 my $eod = 'EOD000';
598 while( $$argref =~ /^$eod$/m ){
599 $eod++;
600 }
601 $$codref =~ s/TheEnd/$eod/e;
602 $$argref .= "$eod\n";
603}
604
605# Emit: create address logic and emit command
606#
607sub Emit($$$$$$){
608 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
609 my $cond = '';
610 if( defined( $addr1 ) ){
611 if( defined( $addr2 ) ){
612 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
613 } else {
614 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
615 }
616 $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
617 }
618
619 if( $opcode eq '' ){
620 $Code .= "$cond$arg\n";
621
622 } elsif( $opcode =~ s/-X-/$arg/e ){
623 $Code .= "$cond$opcode\n";
624
625 } elsif( $opcode =~ /TheEnd/ ){
626 safeHere( \$opcode, \$arg );
627 $Code .= "$cond$opcode$arg";
628
629 } else {
630 $Code .= "$cond$opcode\n";
631 }
632 0;
633}
634
635# Write (w command, w flag): store pathname
636#
637sub Write($$$$$$){
638 my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
639 $wFiles{$path} = '';
640 Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
641}
642
643
644# Label (: command): label definition
645#
646sub Label($$$$$$){
647 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
648 my $rc = 0;
649 $lab =~ s/\s+//;
650 if( length( $lab ) ){
651 my $h;
652 if( ! exists( $Label{$lab} ) ){
653 $h = $Label{$lab}{name} = newLabel();
654 } else {
655 $h = $Label{$lab}{name};
656 if( exists( $Label{$lab}{defined} ) ){
657 my $dl = $Label{$lab}{defined};
658 Warn( "duplicate label $lab (first defined at $dl)", $fl );
659 $rc = 1;
660 }
661 }
662 $Label{$lab}{defined} = $fl;
663 $Code .= "$h:;\n";
664 }
665 $rc;
666}
667
668# BeginBlock ({ command): push block start
669#
670sub BeginBlock($$$$$$){
671 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
672 push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
673 Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
674}
675
676# EndBlock (} command): check proper nesting
677#
678sub EndBlock($$$$$$){
679 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
680 my $rc;
681 my $jcom = pop( @BlockStack );
682 if( defined( $jcom ) ){
683 $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
684 } else {
685 Warn( "unexpected `}'", $fl );
686 $rc = 1;
687 }
688 $rc;
689}
690
691# Branch (t, b commands): check or create label, substitute default
692#
693sub Branch($$$$$$){
694 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
695 $lab =~ s/\s+//; # no spaces at end
696 my $h;
697 if( length( $lab ) ){
698 if( ! exists( $Label{$lab} ) ){
699 $h = $Label{$lab}{name} = newLabel();
700 } else {
701 $h = $Label{$lab}{name};
702 }
703 push( @{$Label{$lab}{used}}, $fl );
704 } else {
705 $h = 'EOS';
706 }
707 $opcode =~ s/XXX/$h/e;
708 Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
709}
710
711# Change (c command): is special due to range end watching
712#
713sub Change($$$$$$){
714 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
715 my $kwd = $negated ? 'unless' : 'if';
716 if( defined( $addr2 ) ){
717 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
718 if( ! $negated ){
719 $addr1 = '$icnt = ('.$addr1.')';
720 $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
721 }
722 } else {
723 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
724 }
725 safeHere( \$opcode, \$arg );
726 $Code .= "$kwd( $addr1 ){\n $opcode$arg}\n";
727 0;
728}
729
730
731# Comment (# command): A no-op. Who would've thought that!
732#
733sub Comment($$$$$$){
734 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
735### $Code .= "# $arg\n";
736 0;
737}
738
739
740sub stripRegex($$){
741 my( $del, $sref ) = @_;
742 my $regex = $del;
743 print "stripRegex:$del:$$sref:\n" if $useDEBUG;
744 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
745 my $sl = $2;
746 $regex .= $1.$sl.$del;
747 if( length( $sl ) % 2 == 0 ){
748 return $regex;
749 }
750 $regex .= $3;
751 }
752 undef();
753}
754
755# stripTrans: take a <del> terminated string from y command
756# honoring and cleaning up of \-escaped <del>'s
757#
758sub stripTrans($$){
759 my( $del, $sref ) = @_;
760 my $t = '';
761 print "stripTrans:$del:$$sref:\n" if $useDEBUG;
762 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
763 my $sl = $2;
764 $t .= $1;
765 if( length( $sl ) % 2 == 0 ){
766 $t .= $sl;
767 $t =~ s/\\\\/\\/g;
768 return $t;
769 }
770 chop( $sl );
771 $t .= $sl.$del.$3;
772 }
773 undef();
774}
775
776# makey - construct Perl y/// from sed y///
777#
778sub makey($$$){
779 my( $fr, $to, $fl ) = @_;
780 my $error = 0;
781
782 # Ensure that any '-' is up front.
783 # Diagnose duplicate contradicting mappings
784 my %tr;
785 for( my $i = 0; $i < length($fr); $i++ ){
786 my $fc = substr($fr,$i,1);
787 my $tc = substr($to,$i,1);
788 if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
3484938b 789 Warn( "ambiguous translation for character `$fc' in `y' command",
86a59229
WL
790 $fl );
791 $error++;
792 }
793 $tr{$fc} = $tc;
794 }
795 $fr = $to = '';
796 if( exists( $tr{'-'} ) ){
797 ( $fr, $to ) = ( '-', $tr{'-'} );
798 delete( $tr{'-'} );
799 } else {
800 $fr = $to = '';
801 }
802 # might just as well sort it...
803 for my $fc ( sort keys( %tr ) ){
804 $fr .= $fc;
805 $to .= $tr{$fc};
806 }
807 # make embedded delimiters and newlines safe
808 $fr =~ s/([{}])/\$1/g;
809 $to =~ s/([{}])/\$1/g;
810 $fr =~ s/\n/\\n/g;
811 $to =~ s/\n/\\n/g;
812 return $error ? undef() : "{ y{$fr}{$to}; }";
813}
814
815######
816# makes - construct Perl s/// from sed s///
817#
818sub makes($$$$$$$){
819 my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
820
821 # make embedded newlines safe
822 $regex =~ s/\n/\\n/g;
823 $subst =~ s/\n/\\n/g;
824
825 my $code;
826 # n-th occurrence
827 #
828 if( length( $nmatch ) ){
829 $code = <<TheEnd;
830{ \$n = $nmatch;
831 while( --\$n && ( \$s = m ${regex}g ) ){}
832 \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
833 \$CondReg ||= \$s;
834TheEnd
835 } else {
836 $code = <<TheEnd;
837{ \$s = s ${regex}${subst}s${global};
838 \$CondReg ||= \$s;
839TheEnd
840 }
841 if( $print ){
842 $code .= ' print $_, "\n" if $s;'."\n";
843 }
844 if( defined( $path ) ){
845 $wFiles{$path} = '';
846 $code .= " _w( '$path' ) if \$s;\n";
d16f50bd 847 $GenKey{'w'} = 1;
86a59229
WL
848 }
849 $code .= "}";
850}
851
852=head1 BASIC REGULAR EXPRESSIONS
853
854A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
855of I<atoms>, for matching parts of a string, and I<bounds>, specifying
856repetitions of a preceding atom.
857
858=head2 Atoms
859
860The possible atoms of a BRE are: B<.>, matching any single character;
861B<^> and B<$>, matching the null string at the beginning or end
862of a string, respectively; a I<bracket expressions>, enclosed
863in B<[> and B<]> (see below); and any single character with no
864other significance (matching that character). A B<\> before one
865of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
866after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
867becomes an atom and establishes the target for a I<backreference>,
868consisting of the substring that actually matches the enclosed atoms.
869Finally, B<\> followed by one of the digits B<0> through B<9> is a
870backreference.
871
872A B<^> that is not first, or a B<$> that is not last does not have
873a special significance and need not be preceded by a backslash to
874become literal. The same is true for a B<]>, that does not terminate
875a bracket expression.
876
877An unescaped backslash cannot be last in a BRE.
878
879=head2 Bounds
880
881The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
882atom; B<\{>I<count>B<\}>, specifying that many repetitions;
883B<\{>I<minimum>B<,\}>, giving a lower limit; and
884B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
885bound.
886
887A bound appearing as the first item in a BRE is taken literally.
888
889=head2 Bracket Expressions
890
891A I<bracket expression> is a list of characters, character ranges
892and character classes enclosed in B<[> and B<]> and matches any
893single character from the represented set of characters.
894
895A character range is written as two characters separated by B<-> and
896represents all characters (according to the character collating sequence)
897that are not less than the first and not greater than the second.
898(Ranges are very collating-sequence-dependent, and portable programs
899should avoid relying on them.)
900
901A character class is one of the class names
902
903 alnum digit punct
904 alpha graph space
905 blank lower upper
906 cntrl print xdigit
907
908enclosed in B<[:> and B<:]> and represents the set of characters
909as defined in ctype(3).
910
911If the first character after B<[> is B<^>, the sense of matching is
912inverted.
913
914To include a literal `C<^>', place it anywhere else but first. To
915include a literal 'C<]>' place it first or immediately after an
916initial B<^>. To include a literal `C<->' make it the first (or
917second after B<^>) or last character, or the second endpoint of
918a range.
919
920The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
921match the null string at the beginning and end of a word respectively.
922(Note that neither is identical to Perl's `\b' atom.)
923
924=head2 Additional Atoms
925
926Since some sed implementations provide additional regular expression
927atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
928the following backslash escapes:
929
930=over 4
931
932=item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
933
934=item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
935
936=item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
937
938=item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
939
940=item B<\y> Match the empty string at a word boundary.
941
942=item B<\B> Match the empty string between any two either word or non-word characters.
943
944=back
945
946To enable this feature, the environment variable PSEDEXTBRE must be set
947to a string containing the requested characters, e.g.:
948C<PSEDEXTBRE='E<lt>E<gt>wW'>.
949
950=cut
951
952#####
953# bre2p - convert BRE to Perl RE
954#
955sub peek(\$$){
956 my( $pref, $ic ) = @_;
957 $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
958}
959
960sub bre2p($$$){
961 my( $del, $pat, $fl ) = @_;
962 my $led = $del;
963 $led =~ tr/{([</})]>/;
964 $led = '' if $led eq $del;
965
966 $pat = substr( $pat, 1, length($pat) - 2 );
967 my $res = '';
968 my $bracklev = 0;
969 my $backref = 0;
970 my $parlev = 0;
971 for( my $ic = 0; $ic < length( $pat ); $ic++ ){
972 my $c = substr( $pat, $ic, 1 );
973 if( $c eq '\\' ){
974 ### backslash escapes
975 my $nc = peek($pat,$ic);
976 if( $nc eq '' ){
977 Warn( "`\\' cannot be last in pattern", $fl );
978 return undef();
979 }
980 $ic++;
981 if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
982 $res .= "\\$del";
983
984 } elsif( $nc =~ /([[.*\\n])/ ){
985 ## check for \-escaped magics and \n:
986 ## \[ \. \* \\ \n stay as they are
987 $res .= '\\'.$nc;
988
989 } elsif( $nc eq '(' ){ ## \( => (
990 $parlev++;
991 $res .= '(';
992
993 } elsif( $nc eq ')' ){ ## \) => )
994 $parlev--;
995 $backref++;
996 if( $parlev < 0 ){
997 Warn( "unmatched `\\)'", $fl );
998 return undef();
999 }
1000 $res .= ')';
1001
1002 } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
1003 my $endpos = index( $pat, '\\}', $ic );
1004 if( $endpos < 0 ){
1005 Warn( "unmatched `\\{'", $fl );
1006 return undef();
1007 }
1008 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
1009 $ic = $endpos + 1;
1010
1011 if( $res =~ /^\^?$/ ){
1012 $res .= "\\{$rep\}";
1013 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
1014 my $min = $1;
1015 my $com = $2 || '';
1016 my $max = $3;
1017 if( length( $max ) ){
1018 if( $max < $min ){
1019 Warn( "maximum less than minimum in `\\{$rep\\}'",
1020 $fl );
1021 return undef();
1022 }
1023 } else {
1024 $max = '';
1025 }
1026 # simplify some
1027 if( $min == 0 && $max eq '1' ){
1028 $res .= '?';
1029 } elsif( $min == 1 && "$com$max" eq ',' ){
1030 $res .= '+';
1031 } elsif( $min == 0 && "$com$max" eq ',' ){
1032 $res .= '*';
1033 } else {
1034 $res .= "{$min$com$max}";
1035 }
1036 } else {
1037 Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1038 return undef();
1039 }
1040
1041 } elsif( $nc =~ /^[1-9]$/ ){
1042 ## \1 .. \9 => \1 .. \9, but check for a following digit
1043 if( $nc > $backref ){
1044 Warn( "invalid backreference ($nc)", $fl );
1045 return undef();
1046 }
1047 $res .= "\\$nc";
1048 if( peek($pat,$ic) =~ /[0-9]/ ){
1049 $res .= '(?:)';
1050 }
1051
1052 } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1053 ## extensions - at most <>wWyB - not in POSIX
1054 if( $nc eq '<' ){ ## \< => \b(?=\w), be precise
1055 $res .= '\\b(?<=\\W)';
1056 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1057 $res .= '\\b(?=\\W)';
1058 } elsif( $nc eq 'y' ){ ## \y => \b
1059 $res .= '\\b';
1060 } else { ## \B, \w, \W remain the same
1061 $res .= "\\$nc";
1062 }
1063 } elsif( $nc eq $led ){
1064 ## \<closing bracketing-delimiter> - keep '\'
1065 $res .= "\\$nc";
1066
1067 } else { ## \ <char> => <char> ("as if `\' were not present")
1068 $res .= $nc;
1069 }
1070
1071 } elsif( $c eq '.' ){ ## . => .
1072 $res .= $c;
1073
1074 } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1075 if( $res =~ /^\^?$/ ){
1076 $res .= '\\*';
1077 } elsif( substr( $res, -1, 1 ) ne '*' ){
1078 $res .= $c;
1079 }
1080
1081 } elsif( $c eq '[' ){
1082 ## parse []: [^...] [^]...] [-...]
1083 my $add = '[';
1084 if( peek($pat,$ic) eq '^' ){
1085 $ic++;
1086 $add .= '^';
1087 }
1088 my $nc = peek($pat,$ic);
1089 if( $nc eq ']' || $nc eq '-' ){
1090 $add .= $nc;
1091 $ic++;
1092 }
1093 # check that [ is not trailing
1094 if( $ic >= length( $pat ) - 1 ){
1095 Warn( "unmatched `['", $fl );
1096 return undef();
1097 }
1098 # look for [:...:] and x-y
1099 my $rstr = substr( $pat, $ic+1 );
1100 if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1101 my $cnt = $1;
1102 $ic += length( $cnt );
1103 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1104 # try some simplifications
1105 my $red = $cnt;
1106 if( $red =~ s/0-9// ){
1107 $cnt = $red.'\d';
1108 if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1109 $cnt = $red.'\w';
1110 }
1111 }
1112 $add .= $cnt;
1113
1114 # POSIX 1003.2 has this (optional) for begin/end word
1115 $add = '\\b(?=\\W)' if $add eq '[[:<:]]';
1116 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1117
1118 }
1119
1120 ## may have a trailing `-' before `]'
1121 if( $ic < length($pat) - 1 &&
1122 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1123 $ic += length( $1 );
1124 $add .= $1;
1125 # another simplification
1126 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1127 $res .= $add;
1128 } else {
1129 Warn( "unmatched `['", $fl );
1130 return undef();
1131 }
3cb6de81 1132
86a59229
WL
1133 } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1134 $res .= "\\$c";
d83e3bda 1135
86a59229
WL
1136 } elsif( $c eq ']' ){ ## unmatched ] is not magic
1137 $res .= ']';
d83e3bda 1138
86a59229
WL
1139 } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1140 $res .= "\\$c";
d83e3bda 1141
86a59229
WL
1142 } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1143 $res .= length( $res ) ? '\\^' : '^';
d83e3bda 1144
86a59229
WL
1145 } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1146 $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
8d063cd8 1147
86a59229
WL
1148 } else {
1149 $res .= $c;
1150 }
8d063cd8 1151 }
0a12ae7d 1152
86a59229
WL
1153 if( $parlev ){
1154 Warn( "unmatched `\\('", $fl );
1155 return undef();
8d063cd8 1156 }
0a12ae7d 1157
86a59229
WL
1158 # final cleanup: eliminate raw HTs
1159 $res =~ s/\t/\\t/g;
1160 return $del . $res . ( $led ? $led : $del );
1161}
0a12ae7d 1162
86a59229
WL
1163
1164#####
1165# sub2p - convert sed substitution to Perl substitution
1166#
1167sub sub2p($$$){
1168 my( $del, $subst, $fl ) = @_;
1169 my $led = $del;
1170 $led =~ tr/{([</})]>/;
1171 $led = '' if $led eq $del;
1172
1173 $subst = substr( $subst, 1, length($subst) - 2 );
1174 my $res = '';
1175
1176 for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1177 my $c = substr( $subst, $ic, 1 );
1178 if( $c eq '\\' ){
1179 ### backslash escapes
1180 my $nc = peek($subst,$ic);
1181 if( $nc eq '' ){
1182 Warn( "`\\' cannot be last in substitution", $fl );
1183 return undef();
1184 }
1185 $ic++;
1186 if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1187 $res .= '\\' . $nc;
1188 } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1189 $res .= '${' . $nc . '}';
1190 } else { ## everything else (includes &): omit \
1191 $res .= $nc;
1192 }
1193 } elsif( $c eq '&' ){ ## & => $&
1194 $res .= '$&';
1195 } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1196 $res .= '\\' . $c;
1197 } else {
1198 $res .= $c;
8d063cd8 1199 }
8d063cd8
LW
1200 }
1201
86a59229
WL
1202 # final cleanup: eliminate raw HTs
1203 $res =~ s/\t/\\t/g;
1204 return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1205}
0a12ae7d 1206
86a59229
WL
1207
1208sub Parse(){
1209 my $error = 0;
1210 my( $pdef, $pfil, $plin );
1211 for( my $icom = 0; $icom < @Commands; $icom++ ){
1212 my $cmd = $Commands[$icom];
1213 print "Parse:$cmd:\n" if $useDEBUG;
1214 $cmd =~ s/^\s+//;
1215 next unless length( $cmd );
1216 my $scom = $icom;
1217 if( exists( $Defined{$icom} ) ){
1218 $pdef = $Defined{$icom};
1219 if( $pdef =~ /^ #(\d+)/ ){
1220 $pfil = 'expression #';
1221 $plin = $1;
1222 } else {
1223 $pfil = "$pdef l.";
1224 $plin = 1;
1225 }
1226 } else {
1227 $plin++;
1228 }
1229 my $fl = "$pfil$plin";
1230
1231 # insert command as comment in gnerated code
1232 #
1233 $Code .= "# $cmd\n" if $doGenerate;
1234
1235 # The Address(es)
1236 #
1237 my( $negated, $naddr, $addr1, $addr2 );
1238 $naddr = 0;
1239 if( $cmd =~ s/^(\d+)\s*// ){
1240 $addr1 = "$1"; $naddr++;
1241 } elsif( $cmd =~ s/^\$\s*// ){
1242 $addr1 = 'eofARGV()'; $naddr++;
1243 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1244 my $del = $1;
1245 my $regex = stripRegex( $del, \$cmd );
1246 if( defined( $regex ) ){
1247 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1248 $naddr++;
1249 } else {
1250 Warn( "malformed regex, 1st address", $fl );
1251 $error++;
1252 next;
1253 }
1254 }
1255 if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1256 if( $cmd =~ s/^(\d+)\s*// ){
1257 $addr2 = "$1"; $naddr++;
1258 } elsif( $cmd =~ s/^\$\s*// ){
1259 $addr2 = 'eofARGV()'; $naddr++;
1260 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1261 my $del = $1;
1262 my $regex = stripRegex( $del, \$cmd );
1263 if( defined( $regex ) ){
1264 $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1265 $naddr++;
1266 } else {
1267 Warn( "malformed regex, 2nd address", $fl );
1268 $error++;
1269 next;
1270 }
1271 } else {
1272 Warn( "invalid address after `,'", $fl );
1273 $error++;
1274 next;
1275 }
1276 }
1277
1278 # address modifier `!'
1279 #
1280 $negated = $cmd =~ s/^!\s*//;
1281 if( defined( $addr1 ) ){
1282 print "Parse: addr1=$addr1" if $useDEBUG;
1283 if( defined( $addr2 ) ){
1284 print ", addr2=$addr2 " if $useDEBUG;
1285 # both numeric and addr1 > addr2 => eliminate addr2
1286 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1287 $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1288 }
9ef589d8 1289 }
86a59229
WL
1290 print 'negated' if $useDEBUG && $negated;
1291 print " command:$cmd\n" if $useDEBUG;
1292
1293 # The Command
1294 #
1295 if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1296 my $h = substr( $cmd, 0, 1 );
1297 Warn( "unknown command `$h'", $fl );
1298 $error++;
8d063cd8
LW
1299 next;
1300 }
86a59229 1301 my $key = $1;
8d063cd8 1302
86a59229 1303 my $tabref = $ComTab{$key};
d16f50bd 1304 $GenKey{$key} = 1;
86a59229
WL
1305 if( $naddr > $tabref->[0] ){
1306 Warn( "excess address(es)", $fl );
1307 $error++;
8d063cd8
LW
1308 next;
1309 }
1310
86a59229
WL
1311 my $arg = '';
1312 if( $tabref->[1] eq 'str' ){
1313 # take remainder - don't care if it is empty
1314 $arg = $cmd;
1315 $cmd = '';
1316
1317 } elsif( $tabref->[1] eq 'txt' ){
1318 # multi-line text
1319 my $goon = $cmd =~ /(.*)\\$/;
1320 if( length( $1 ) ){
1321 Warn( "extra characters after command ($cmd)", $fl );
1322 $error++;
8d063cd8 1323 }
86a59229
WL
1324 while( $goon ){
1325 $icom++;
1326 if( $icom > $#Commands ){
1327 Warn( "unexpected end of script", $fl );
1328 $error++;
1329 last;
1330 }
1331 $cmd = $Commands[$icom];
1332 $Code .= "# $cmd\n" if $doGenerate;
1333 $goon = $cmd =~ s/\\$//;
1334 $cmd =~ s/\\(.)/$1/g;
1335 $arg .= "\n" if length( $arg );
1336 $arg .= $cmd;
8d063cd8 1337 }
86a59229
WL
1338 $arg .= "\n" if length( $arg );
1339 $cmd = '';
8d063cd8 1340
86a59229
WL
1341 } elsif( $tabref->[1] eq 'sub' ){
1342 # s///
1343 if( ! length( $cmd ) ){
1344 Warn( "`s' command requires argument", $fl );
1345 $error++;
1346 next;
1347 }
1348 if( $cmd =~ s{^([^\\\n])}{} ){
1349 my $del = $1;
1350 my $regex = stripRegex( $del, \$cmd );
1351 if( ! defined( $regex ) ){
1352 Warn( "malformed regular expression", $fl );
1353 $error++;
1354 next;
a687059c 1355 }
86a59229
WL
1356 $regex = bre2p( $del, $regex, $fl );
1357
1358 # a trailing \ indicates embedded NL (in replacement string)
1359 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1360 $icom++;
1361 if( $icom > $#Commands ){
1362 Warn( "unexpected end of script", $fl );
1363 $error++;
1364 last;
8d063cd8 1365 }
86a59229
WL
1366 $cmd .= $Commands[$icom];
1367 $Code .= "# $Commands[$icom]\n" if $doGenerate;
9ef589d8 1368 }
86a59229
WL
1369
1370 my $subst = stripRegex( $del, \$cmd );
1371 if( ! defined( $regex ) ){
1372 Warn( "malformed substitution expression", $fl );
1373 $error++;
0a12ae7d
LW
1374 next;
1375 }
86a59229
WL
1376 $subst = sub2p( $del, $subst, $fl );
1377
1378 # parse s/// modifier: g|p|0-9|w <file>
1379 my( $global, $nmatch, $print, $write ) =
1380 ( '', '', 0, undef );
1381 while( $cmd =~ s/^([gp0-9])// ){
1382 $1 eq 'g' ? ( $global = 'g' ) :
1383 $1 eq 'p' ? ( $print = $1 ) : ( $nmatch .= $1 );
1384 }
1385 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1386 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1387 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1388 Warn( "conflicting flags `$global$nmatch'", $fl );
1389 $error++;
0a12ae7d
LW
1390 next;
1391 }
86a59229
WL
1392
1393 $arg = makes( $regex, $subst,
1394 $write, $global, $print, $nmatch, $fl );
1395 if( ! defined( $arg ) ){
1396 $error++;
8d063cd8
LW
1397 next;
1398 }
86a59229
WL
1399
1400 } else {
1401 Warn( "improper delimiter in s command", $fl );
1402 $error++;
1403 next;
1404 }
1405
1406 } elsif( $tabref->[1] eq 'tra' ){
1407 # y///
1408 # a trailing \ indicates embedded newline
1409 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1410 $icom++;
1411 if( $icom > $#Commands ){
1412 Warn( "unexpected end of script", $fl );
1413 $error++;
1414 last;
1415 }
1416 $cmd .= $Commands[$icom];
1417 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1418 }
1419 if( ! length( $cmd ) ){
1420 Warn( "`y' command requires argument", $fl );
1421 $error++;
1422 next;
1423 }
1424 my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1425 if( $d eq '\\' ){
1426 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1427 $error++;
1428 next;
1429 }
1430 my $fr = stripTrans( $d, \$cmd );
1431 if( ! defined( $fr ) || ! length( $cmd ) ){
1432 Warn( "malformed `y' command argument", $fl );
1433 $error++;
1434 next;
1435 }
1436 my $to = stripTrans( $d, \$cmd );
1437 if( ! defined( $to ) ){
1438 Warn( "malformed `y' command argument", $fl );
1439 $error++;
1440 next;
1441 }
1442 if( length($fr) != length($to) ){
1443 Warn( "string lengths in `y' command differ", $fl );
1444 $error++;
1445 next;
1446 }
1447 if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1448 $error++;
1449 next;
8d063cd8 1450 }
8d063cd8 1451
8d063cd8
LW
1452 }
1453
86a59229
WL
1454 # $cmd must be now empty - exception is {
1455 if( $cmd !~ /^\s*$/ ){
1456 if( $key eq '{' ){
1457 # dirty hack to process command on '{' line
1458 $Commands[$icom--] = $cmd;
1459 } else {
1460 Warn( "extra characters after command ($cmd)", $fl );
1461 $error++;
1462 next;
1463 }
8d063cd8
LW
1464 }
1465
86a59229
WL
1466 # Make Code
1467 #
1468 if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1469 $tabref->[3], $arg, $fl ) ){
1470 $error++;
8d063cd8 1471 }
86a59229 1472 }
8d063cd8 1473
86a59229
WL
1474 while( @BlockStack ){
1475 my $bl = pop( @BlockStack );
1476 Warn( "start of unterminated `{'", $bl );
1477 $error++;
1478 }
8d063cd8 1479
86a59229
WL
1480 for my $lab ( keys( %Label ) ){
1481 if( ! exists( $Label{$lab}{defined} ) ){
1482 for my $used ( @{$Label{$lab}{used}} ){
1483 Warn( "undefined label `$lab'", $used );
1484 $error++;
1485 }
8d063cd8 1486 }
86a59229 1487 }
8d063cd8 1488
86a59229
WL
1489 exit( 1 ) if $error;
1490}
8d063cd8 1491
8d063cd8 1492
86a59229
WL
1493##############
1494#### MAIN ####
1495##############
8d063cd8 1496
86a59229
WL
1497sub usage(){
1498 print STDERR "Usage: sed [-an] command [file...]\n";
1499 print STDERR " [-an] [-e command] [-f script-file] [file...]\n";
1500}
8d063cd8 1501
86a59229
WL
1502###################
1503# Here we go again...
1504#
1505my $expr = 0;
1506while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1507 my $opt = $1;
1508 my $arg = $2;
1509 shift( @ARGV );
1510 if( $opt eq 'e' ){
1511 if( length( $arg ) ){
1512 push( @Commands, split( "\n", $arg ) );
1513 } elsif( @ARGV ){
1514 push( @Commands, shift( @ARGV ) );
1515 } else {
1516 Warn( "option -e requires an argument" );
1517 usage();
1518 exit( 1 );
1519 }
1520 $expr++;
1521 $Defined{$#Commands} = " #$expr";
1522 next;
1523 }
1524 if( $opt eq 'f' ){
1525 my $path;
1526 if( length( $arg ) ){
1527 $path = $arg;
1528 } elsif( @ARGV ){
1529 $path = shift( @ARGV );
1530 } else {
1531 Warn( "option -f requires an argument" );
1532 usage();
1533 exit( 1 );
1534 }
1535 my $fst = $#Commands + 1;
1536 open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1537 my $cmd;
1538 while( defined( $cmd = <SCRIPT> ) ){
1539 chomp( $cmd );
1540 push( @Commands, $cmd );
1541 }
1542 close( SCRIPT );
1543 if( $#Commands >= $fst ){
1544 $Defined{$fst} = "$path";
8d063cd8 1545 }
86a59229
WL
1546 next;
1547 }
1548 if( $opt eq '-' && $arg eq '' ){
1549 last;
1550 }
1551 if( $opt eq 'h' || $opt eq '?' ){
1552 usage();
1553 exit( 0 );
1554 }
1555 if( $opt eq 'n' ){
1556 $doAutoPrint = 0;
1557 } elsif( $opt eq 'a' ){
1558 $doOpenWrite = 0;
1559 } else {
1560 Warn( "illegal option `$opt'" );
1561 usage();
1562 exit( 1 );
1563 }
1564 if( length( $arg ) ){
1565 unshift( @ARGV, "-$arg" );
1566 }
1567}
8d063cd8 1568
86a59229
WL
1569# A singleton command may be the 1st argument when there are no options.
1570#
1571if( @Commands == 0 ){
1572 if( @ARGV == 0 ){
1573 Warn( "no script command given" );
1574 usage();
1575 exit( 1 );
1576 }
1577 push( @Commands, split( "\n", shift( @ARGV ) ) );
1578 $Defined{0} = ' #1';
1579}
8d063cd8 1580
86a59229 1581print STDERR "Files: @ARGV\n" if $useDEBUG;
8d063cd8 1582
86a59229
WL
1583# generate leading code
1584#
d16f50bd 1585$Func = <<'[TheEnd]';
86a59229 1586
d16f50bd
WL
1587# openARGV: open 1st input file
1588#
86a59229
WL
1589sub openARGV(){
1590 unshift( @ARGV, '-' ) unless @ARGV;
1591 my $file = shift( @ARGV );
1592 open( ARG, "<$file" )
1593 || die( "$0: can't open $file for reading ($!)\n" );
1594 $isEOF = 0;
1595}
8d063cd8 1596
d16f50bd
WL
1597# getsARGV: Read another input line into argument (default: $_).
1598# Move on to next input file, and reset EOF flag $isEOF.
86a59229
WL
1599sub getsARGV(;\$){
1600 my $argref = @_ ? shift() : \$_;
1601 while( $isEOF || ! defined( $$argref = <ARG> ) ){
1602 close( ARG );
1603 return 0 unless @ARGV;
1604 my $file = shift( @ARGV );
1605 open( ARG, "<$file" )
1606 || die( "$0: can't open $file for reading ($!)\n" );
1607 $isEOF = 0;
1608 }
1609 1;
1610}
8d063cd8 1611
d16f50bd
WL
1612# eofARGV: end-of-file test
1613#
86a59229
WL
1614sub eofARGV(){
1615 return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1616}
8d063cd8 1617
d16f50bd
WL
1618# makeHandle: Generates another file handle for some file (given by its path)
1619# to be written due to a w command or an s command's w flag.
86a59229
WL
1620sub makeHandle($){
1621 my( $path ) = @_;
1622 my $handle;
1623 if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1624 $handle = $wFiles{$path} = gensym();
1625 if( $doOpenWrite ){
1626 if( ! open( $handle, ">$path" ) ){
1627 die( "$0: can't open $path for writing: ($!)\n" );
1628 }
9ef589d8 1629 }
86a59229
WL
1630 } else {
1631 $handle = $wFiles{$path};
1632 }
1633 return $handle;
1634}
9ef589d8 1635
d16f50bd
WL
1636# printQ: Print queued output which is either a string or a reference
1637# to a pathname.
86a59229
WL
1638sub printQ(){
1639 for my $q ( @Q ){
1640 if( ref( $q ) ){
d16f50bd 1641 # flush open w files so that reading this file gets it all
86a59229
WL
1642 if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1643 open( $wFiles{$$q}, ">>$$q" );
1644 }
d16f50bd 1645 # copy file to stdout: slow, but safe
86a59229 1646 if( open( RF, "<$$q" ) ){
d16f50bd 1647 while( defined( my $line = <RF> ) ){
86a59229
WL
1648 print $line;
1649 }
1650 close( RF );
1651 }
1652 } else {
1653 print $q;
a687059c 1654 }
86a59229
WL
1655 }
1656 undef( @Q );
1657}
1658
d16f50bd
WL
1659[TheEnd]
1660
1661# generate the sed loop
1662#
1663$Code .= <<'[TheEnd]';
1664sub openARGV();
1665sub getsARGV(;\$);
1666sub eofARGV();
1667sub printQ();
1668
1669# Run: the sed loop reading input and applying the script
1670#
86a59229
WL
1671sub Run(){
1672 my( $h, $icnt, $s, $n );
1673 # hack (not unbreakable :-/) to avoid // matching an empty string
1674 my $z = "\000"; $z =~ /$z/;
1675 # Initialize.
1676 openARGV();
1677 $Hold = '';
1678 $CondReg = 0;
1679 $doPrint = $doAutoPrint;
1680CYCLE:
1681 while( getsARGV() ){
1682 chomp();
1683 $CondReg = 0; # cleared on t
1684BOS:;
1685[TheEnd]
1686
1687 # parse - avoid opening files when doing s2p
1688 #
1689 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1690 if $doGenerate;
1691 Parse();
1692 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1693 if $doGenerate;
1694
1695 # append trailing code
1696 #
1697 $Code .= <<'[TheEnd]';
1698EOS: if( $doPrint ){
1699 print $_, "\n";
1700 } else {
1701 $doPrint = $doAutoPrint;
a687059c 1702 }
86a59229 1703 printQ() if @Q;
a687059c 1704 }
86a59229
WL
1705
1706 exit( 0 );
a687059c 1707}
86a59229
WL
1708[TheEnd]
1709
d16f50bd
WL
1710
1711# append optional functions, prepend prototypes
1712#
1713my $Proto = "# prototypes\n";
1714if( $GenKey{'l'} ){
1715 $Proto .= "sub _l();\n";
1716 $Func .= <<'[TheEnd]';
1717# _l: l command processing
1718#
1719sub _l(){
1720 my $h = $_;
1721 my $mcpl = 70;
1722 # transform non printing chars into escape notation
1723 $h =~ s/\\/\\\\/g;
1724 if( $h =~ /[^[:print:]]/ ){
1725 $h =~ s/\a/\\a/g;
1726 $h =~ s/\f/\\f/g;
1727 $h =~ s/\n/\\n/g;
1728 $h =~ s/\t/\\t/g;
1729 $h =~ s/\r/\\r/g;
1730 $h =~ s/\e/\\e/g;
1731 $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1732 }
1733 # split into lines of length $mcpl
1734 while( length( $h ) > $mcpl ){
1735 my $l = substr( $h, 0, $mcpl-1 );
1736 $h = substr( $h, $mcpl );
1737 # remove incomplete \-escape from end of line
1738 if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1739 $h = $1 . $h;
1740 }
1741 print $l, "\\\n";
1742 }
1743 print "$h\$\n";
1744}
1745
1746[TheEnd]
1747}
1748
1749if( $GenKey{'r'} ){
1750 $Proto .= "sub _r(\$);\n";
1751 $Func .= <<'[TheEnd]';
1752# _r: r command processing: Save a reference to the pathname.
1753#
1754sub _r($){
1755 my $path = shift();
1756 push( @Q, \$path );
1757}
1758
1759[TheEnd]
1760}
1761
1762if( $GenKey{'t'} ){
1763 $Proto .= "sub _t();\n";
1764 $Func .= <<'[TheEnd]';
1765# _t: t command - condition register test/reset
1766#
1767sub _t(){
1768 my $res = $CondReg;
1769 $CondReg = 0;
1770 $res;
1771}
1772
1773[TheEnd]
1774}
1775
1776if( $GenKey{'w'} ){
1777 $Proto .= "sub _w(\$);\n";
1778 $Func .= <<'[TheEnd]';
1779# _w: w command and s command's w flag - write to file
1780#
1781sub _w($){
1782 my $path = shift();
1783 my $handle = $wFiles{$path};
1784 if( ! $doOpenWrite && ! defined( fileno( $handle ) ) ){
1785 open( $handle, ">$path" )
1786 || die( "$0: $path: cannot open ($!)\n" );
1787 }
1788 print $handle $_, "\n";
1789}
1790
1791[TheEnd]
1792}
1793
1794$Code = $Proto . $Code;
1795
86a59229
WL
1796# magic "#n" - same as -n option
1797#
1798$doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
a687059c 1799
86a59229
WL
1800# eval code - check for errors
1801#
d16f50bd
WL
1802print "Code:\n$Code$Func" if $useDEBUG;
1803eval $Code . $Func;
86a59229 1804if( $@ ){
d16f50bd 1805 print "Code:\n$Code$Func";
86a59229 1806 die( "$0: internal error - generated incorrect Perl code: $@\n" );
9ef589d8
LW
1807}
1808
86a59229
WL
1809if( $doGenerate ){
1810
1811 # write full Perl program
1812 #
1813
d16f50bd 1814 # bang line, declarations, prototypes
86a59229
WL
1815 print <<TheEnd;
1816#!$perlpath -w
1817eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1818 if 0;
9ae2b9a0 1819\$0 =~ s/^.*?(\\w+)\[\\.\\w+\]*\$/\$1/;
86a59229
WL
1820
1821use strict;
1822use Symbol;
1823use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1824 \$doAutoPrint \$doOpenWrite \$doPrint };
1825\$doAutoPrint = $doAutoPrint;
1826\$doOpenWrite = $doOpenWrite;
1827TheEnd
1828
1829 my $wf = "'" . join( "', '", keys( %wFiles ) ) . "'";
1830 if( $wf ne "''" ){
1831 print <<TheEnd;
1832sub makeHandle(\$);
1833for my \$p ( $wf ){
1834 exit( 1 ) unless makeHandle( \$p );
9ef589d8 1835}
86a59229
WL
1836TheEnd
1837 }
9ef589d8 1838
86a59229 1839 print $Code;
d16f50bd
WL
1840 print "Run();\n";
1841 print $Func;
86a59229 1842 exit( 0 );
1aa91729 1843
86a59229
WL
1844} else {
1845
1846 # execute: make handles (and optionally open) all w files; run!
86a59229
WL
1847 for my $p ( keys( %wFiles ) ){
1848 exit( 1 ) unless makeHandle( $p );
1849 }
d16f50bd 1850 Run();
1aa91729 1851}
86a59229
WL
1852
1853
1854=head1 ENVIRONMENT
1855
1856The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1857See L<"Additional Atoms">.
1858
1859=head1 DIAGNOSTICS
1860
1861=over 4
1862
3484938b 1863=item ambiguous translation for character `%s' in `y' command
86a59229
WL
1864
1865The indicated character appears twice, with different translations.
1866
1867=item `[' cannot be last in pattern
1868
1869A `[' in a BRE indicates the beginning of a I<bracket expression>.
1870
1871=item `\' cannot be last in pattern
1872
1873A `\' in a BRE is used to make the subsequent character literal.
1874
1875=item `\' cannot be last in substitution
1876
1877A `\' in a subsitution string is used to make the subsequent character literal.
1878
1879=item conflicting flags `%s'
1880
1881In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1882multiple n-th occurrence flags are specified. Note that only the digits
1883`1' through `9' are permitted.
1884
1885=item duplicate label %s (first defined at %s)
1886
1887=item excess address(es)
1888
1889The command has more than the permitted number of addresses.
1890
1891=item extra characters after command (%s)
1892
1893=item illegal option `%s'
1894
1895=item improper delimiter in s command
1896
1897The BRE and substitution may not be delimited with `\' or newline.
1898
1899=item invalid address after `,'
1900
1901=item invalid backreference (%s)
1902
1903The specified backreference number exceeds the number of backreferences
1904in the BRE.
1905
1906=item invalid repeat clause `\{%s\}'
1907
1908The repeat clause does not contain a valid integer value, or pair of
1909values.
1910
1911=item malformed regex, 1st address
1912
1913=item malformed regex, 2nd address
1914
1915=item malformed regular expression
1916
1917=item malformed substitution expression
1918
1919=item malformed `y' command argument
1920
1921The first or second string of a B<y> command is syntactically incorrect.
1922
1923=item maximum less than minimum in `\{%s\}'
1924
1925=item no script command given
1926
1927There must be at least one B<-e> or one B<-f> option specifying a
1928script or script file.
1929
1930=item `\' not valid as delimiter in `y' command
1931
1932=item option -e requires an argument
1933
1934=item option -f requires an argument
1935
1936=item `s' command requires argument
1937
1938=item start of unterminated `{'
1939
1940=item string lengths in `y' command differ
1941
1942The translation table strings in a B<y> commanf must have equal lengths.
1943
1944=item undefined label `%s'
1945
1946=item unexpected `}'
1947
1948A B<}> command without a preceding B<{> command was encountered.
1949
1950=item unexpected end of script
1951
1952The end of the script was reached although a text line after a
1953B<a>, B<c> or B<i> command indicated another line.
1954
1955=item unknown command `%s'
1956
1957=item unterminated `['
1958
1959A BRE contains an unterminated bracket expression.
1960
1961=item unterminated `\('
1962
1963A BRE contains an unterminated backreference.
1964
1965=item `\{' without closing `\}'
1966
1967A BRE contains an unterminated bounds specification.
1968
1969=item `\)' without preceding `\('
1970
1971=item `y' command requires argument
1972
1973=back
1974
1975=head1 EXAMPLE
1976
1977The basic material for the preceding section was generated by running
1978the sed script
1979
1980 #no autoprint
1981 s/^.*Warn( *"\([^"]*\)".*$/\1/
1982 t process
1983 b
1984 :process
1985 s/$!/%s/g
1986 s/$[_[:alnum:]]\{1,\}/%s/g
1987 s/\\\\/\\/g
1988 s/^/=item /
1989 p
1990
1991on the program's own text, and piping the output into C<sort -u>.
1992
1993
1994=head1 SED SCRIPT TRANSLATION
1995
1996If this program is invoked with the name F<s2p> it will act as a
1997sed-to-Perl translator. After option processing (all other
1998arguments are ignored), a Perl program is printed on standard
1999output, which will process the input stream (as read from all
2000arguments) in the way defined by the sed script and the option setting
2001used for the translation.
2002
2003=head1 SEE ALSO
2004
2005perl(1), re_format(7)
2006
2007=head1 BUGS
2008
2009The B<l> command will show escape characters (ESC) as `C<\e>', but
2010a vertical tab (VT) in octal.
2011
2012Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
2013
2014The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
2015is "the last pattern used, at run time". This deviates from the Perl
2016interpretation, which will re-use the "last last successfully executed
2017regular expression". Since keeping track of pattern usage would create
2018terribly cluttered code, and differences would only appear in obscure
2019context (where other B<sed> implementations appear to deviate, too),
2020the Perl semantics was adopted. Note that common usage of this feature,
2021such as in C</abc/s//xyz/>, will work as expected.
2022
2023Collating elements (of bracket expressions in BREs) are not implemented.
2024
2025=head1 STANDARDS
2026
2027This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
2028definition of B<sed>, and is compatible with the I<OpenBSD>
2029implementation, except where otherwise noted (see L<"BUGS">).
2030
2031=head1 AUTHOR
2032
2033This Perl implementation of I<sed> was written by Wolfgang Laun,
2034I<Wolfgang.Laun@alcatel.at>.
2035
2036=head1 COPYRIGHT and LICENSE
2037
2038This program is free and open software. You may use, modify,
2039distribute, and sell this program (and any modified variants) in any
2040way you wish, provided you do not restrict others from doing the same.
2041
2042=cut
2043
a687059c 2044!NO!SUBS!
4633a7c4
LW
2045
2046close OUT or die "Can't close $file: $!";
2047chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
d38b618a
A
2048unlink 'psed';
2049print "Linking s2p to psed.\n";
2050if (defined $Config{d_link}) {
2051 link 's2p', 'psed';
2052} else {
2053 unshift @INC, '../lib';
2054 require File::Copy;
2055 File::Copy::syscopy('s2p', 'psed');
2056}
4633a7c4 2057exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
8a5546a1 2058chdir $origdir;