This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Exterminate PL_na! Exterminate! Exterminate! Exterminate!
[perl5.git] / x2p / s2p.PL
CommitLineData
86a59229 1#!/usr/bin/perl
4633a7c4
LW
2
3use Config;
4use File::Basename qw(&basename &dirname);
8a5546a1 5use Cwd;
231bc313
JH
6use subs qw(link);
7
6ef9d486 8sub link { # This is a cut-down version of installperl:link().
231bc313
JH
9 my($from,$to) = @_;
10 my($success) = 0;
11
12 eval {
13 CORE::link($from, $to)
14 ? $success++
15 : ($from =~ m#^/afs/# || $to =~ m#^/afs/#)
16 ? die "AFS" # okay inside eval {}
17 : die "Couldn't link $from to $to: $!\n";
18 };
19 if ($@) {
231bc313
JH
20 require File::Copy;
21 File::Copy::copy($from, $to)
22 ? $success++
23 : warn "Couldn't copy $from to $to: $!\n";
24 }
25 $success;
26}
4633a7c4
LW
27
28# List explicitly here the variables you want Configure to
29# generate. Metaconfig only looks for shell variables, so you
30# have to mention them as if they were shell variables, not
31# %Config entries. Thus you write
32# $startperl
33# to ensure Configure will look for $Config{startperl}.
34
35# This forces PL files to create target in same directory as PL file.
36# This is so that make depend always knows where to find PL derivatives.
8a5546a1 37$origdir = cwd;
44a8e56a 38chdir dirname($0);
39$file = basename($0, '.PL');
774d564b 40$file .= '.com' if $^O eq 'VMS';
4633a7c4
LW
41
42open OUT,">$file" or die "Can't create $file: $!";
43
44print "Extracting $file (with variable substitutions)\n";
45
46# In this section, perl variables will be expanded during extraction.
47# You can use $Config{...} to use Configure variables.
48
49print OUT <<"!GROK!THIS!";
5f05dabc 50$Config{startperl}
51 eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
52 if \$running_under_some_shell;
c0393c90
JH
53my \$startperl;
54my \$perlpath;
ed6d8ea1
JH
55(\$startperl = <<'/../') =~ s/\\s*\\z//;
56$Config{startperl}
57/../
58(\$perlpath = <<'/../') =~ s/\\s*\\z//;
59$Config{perlpath}
60/../
a687059c
LW
61!GROK!THIS!
62
4633a7c4
LW
63# In the following, perl variables are not expanded during extraction.
64
65print OUT <<'!NO!SUBS!';
a687059c 66
9ae2b9a0 67$0 =~ s/^.*?(\w+)[\.\w]*$/$1/;
86a59229
WL
68
69# (p)sed - a stream editor
70# History: Aug 12 2000: Original version.
d16f50bd 71# Mar 25 2002: Rearrange generated Perl program.
e2f73067 72# Jul 23 2007: Fix bug in regex stripping (M.Thorland)
86a59229
WL
73
74use strict;
75use integer;
76use Symbol;
8d063cd8 77
d83e3bda
JM
78=head1 NAME
79
ba34211e 80psed - a stream editor
d83e3bda
JM
81
82=head1 SYNOPSIS
83
ba34211e
JH
84 psed [-an] script [file ...]
85 psed [-an] [-e script] [-f script-file] [file ...]
86
87 s2p [-an] [-e script] [-f script-file]
d83e3bda
JM
88
89=head1 DESCRIPTION
90
86a59229
WL
91A stream editor reads the input stream consisting of the specified files
92(or standard input, if none are given), processes is line by line by
93applying a script consisting of edit commands, and writes resulting lines
94to standard output. The filename `C<->' may be used to read standard input.
95
96The edit script is composed from arguments of B<-e> options and
97script-files, in the given order. A single script argument may be specified
98as the first parameter.
99
100If this program is invoked with the name F<s2p>, it will act as a
101sed-to-Perl translator. See L<"sed Script Translation">.
d83e3bda 102
86a59229 103B<sed> returns an exit code of 0 on success or >0 if an error occurred.
d83e3bda 104
86a59229 105=head1 OPTIONS
d83e3bda 106
86a59229 107=over 4
d83e3bda 108
86a59229 109=item B<-a>
d83e3bda 110
86a59229
WL
111A file specified as argument to the B<w> edit command is by default
112opened before input processing starts. Using B<-a>, opening of such
113files is delayed until the first line is actually written to the file.
114
115=item B<-e> I<script>
116
117The editing commands defined by I<script> are appended to the script.
118Multiple commands must be separated by newlines.
119
120=item B<-f> I<script-file>
121
122Editing commands from the specified I<script-file> are read and appended
123to the script.
d83e3bda
JM
124
125=item B<-n>
126
86a59229
WL
127By default, a line is written to standard output after the editing script
128has been applied to it. The B<-n> option suppresses automatic printing.
129
130=back
131
132=head1 COMMANDS
133
134B<sed> command syntax is defined as
135
136Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
137
138with whitespace being permitted before or after addresses, and between
139the function character and the argument. The I<address>es and the
140address inverter (C<!>) are used to restrict the application of a
141command to the selected line(s) of input.
142
143Each command must be on a line of its own, except where noted in
144the synopses below.
145
146The edit cycle performed on each input line consist of reading the line
147(without its trailing newline character) into the I<pattern space>,
148applying the applicable commands of the edit script, writing the final
149contents of the pattern space and a newline to the standard output.
150A I<hold space> is provided for saving the contents of the
151pattern space for later use.
152
153=head2 Addresses
154
155A sed address is either a line number or a pattern, which may be combined
156arbitrarily to construct ranges. Lines are numbered across all input files.
157
158Any address may be followed by an exclamation mark (`C<!>'), selecting
159all lines not matching that address.
d83e3bda 160
86a59229 161=over 4
d83e3bda 162
86a59229
WL
163=item I<number>
164
165The line with the given number is selected.
166
167=item B<$>
168
169A dollar sign (C<$>) is the line number of the last line of the input stream.
170
171=item B</>I<regular expression>B</>
172
173A pattern address is a basic regular expression (see
174L<"Basic Regular Expressions">), between the delimiting character C</>.
175Any other character except C<\> or newline may be used to delimit a
176pattern address when the initial delimiter is prefixed with a
177backslash (`C<\>').
d83e3bda
JM
178
179=back
180
86a59229 181If no address is given, the command selects every line.
d83e3bda 182
86a59229
WL
183If one address is given, it selects the line (or lines) matching the
184address.
d83e3bda 185
86a59229
WL
186Two addresses select a range that begins whenever the first address
187matches, and ends (including that line) when the second address matches.
188If the first (second) address is a matching pattern, the second
189address is not applied to the very same line to determine the end of
190the range. Likewise, if the second address is a matching pattern, the
191first address is not applied to the very same line to determine the
192begin of another range. If both addresses are line numbers,
193and the second line number is less than the first line number, then
194only the first line is selected.
d83e3bda 195
d83e3bda 196
86a59229 197=head2 Functions
d83e3bda 198
86a59229
WL
199The maximum permitted number of addresses is indicated with each
200function synopsis below.
d83e3bda 201
86a59229
WL
202The argument I<text> consists of one or more lines following the command.
203Embedded newlines in I<text> must be preceded with a backslash. Other
204backslashes in I<text> are deleted and the following character is taken
205literally.
d83e3bda 206
86a59229 207=over 4
d83e3bda 208
86a59229
WL
209=cut
210
211my %ComTab;
d16f50bd 212my %GenKey;
86a59229
WL
213#--------------------------------------------------------------------------
214$ComTab{'a'}=[ 1, 'txt', \&Emit, '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
215
216=item [1addr]B<a\> I<text>
217
218Write I<text> (which must start on the line following the command)
219to standard output immediately before reading the next line
220of input, either by executing the B<N> function or by beginning a new cycle.
221
222=cut
223
224#--------------------------------------------------------------------------
225$ComTab{'b'}=[ 2, 'str', \&Branch, '{ goto XXX; }' ]; #ok
226
227=item [2addr]B<b> [I<label>]
228
229Branch to the B<:> function with the specified I<label>. If no label
230is given, branch to the end of the script.
231
232=cut
233
234#--------------------------------------------------------------------------
235$ComTab{'c'}=[ 2, 'txt', \&Change, <<'-X-' ]; #ok
236{ print <<'TheEnd'; } $doPrint = 0; goto EOS;
237-X-
238### continue OK => next CYCLE;
239
240=item [2addr]B<c\> I<text>
241
242The line, or range of lines, selected by the address is deleted.
243The I<text> (which must start on the line following the command)
244is written to standard output. With an address range, this occurs at
245the end of the range.
246
247=cut
248
249#--------------------------------------------------------------------------
250$ComTab{'d'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
251{ $doPrint = 0;
252 goto EOS;
253}
254-X-
255### continue OK => next CYCLE;
256
257=item [2addr]B<d>
258
259Deletes the pattern space and starts the next cycle.
260
261=cut
262
263#--------------------------------------------------------------------------
264$ComTab{'D'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
265{ s/^.*\n?//;
266 if(length($_)){ goto BOS } else { goto EOS }
267}
268-X-
269### continue OK => next CYCLE;
270
271=item [2addr]B<D>
272
273Deletes the pattern space through the first embedded newline or to the end.
274If the pattern space becomes empty, a new cycle is started, otherwise
275execution of the script is restarted.
276
277=cut
278
279#--------------------------------------------------------------------------
280$ComTab{'g'}=[ 2, '', \&Emit, '{ $_ = $Hold };' ]; #ok
281
282=item [2addr]B<g>
283
284Replace the contents of the pattern space with the hold space.
285
286=cut
287
288#--------------------------------------------------------------------------
289$ComTab{'G'}=[ 2, '', \&Emit, '{ $_ .= "\n"; $_ .= $Hold };' ]; #ok
290
291=item [2addr]B<G>
292
293Append a newline and the contents of the hold space to the pattern space.
294
295=cut
296
297#--------------------------------------------------------------------------
298$ComTab{'h'}=[ 2, '', \&Emit, '{ $Hold = $_ }' ]; #ok
299
300=item [2addr]B<h>
301
302Replace the contents of the hold space with the pattern space.
303
304=cut
305
306#--------------------------------------------------------------------------
307$ComTab{'H'}=[ 2, '', \&Emit, '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
308
309=item [2addr]B<H>
310
311Append a newline and the contents of the pattern space to the hold space.
312
313=cut
314
315#--------------------------------------------------------------------------
316$ComTab{'i'}=[ 1, 'txt', \&Emit, '{ print <<'."'TheEnd' }\n" ]; #ok
317
318=item [1addr]B<i\> I<text>
319
320Write the I<text> (which must start on the line following the command)
321to standard output.
322
323=cut
324
325#--------------------------------------------------------------------------
326$ComTab{'l'}=[ 2, '', \&Emit, '{ _l() }' ]; #okUTF8
327
328=item [2addr]B<l>
329
330Print the contents of the pattern space: non-printable characters are
331shown in C-style escaped form; long lines are split and have a trailing
332`C<\>' at the point of the split; the true end of a line is marked with
333a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
334BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
335octal number for all other non-printable characters.
336
337=cut
338
339#--------------------------------------------------------------------------
340$ComTab{'n'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
341{ print $_, "\n" if $doPrint;
d16f50bd 342 printQ() if @Q;
86a59229
WL
343 $CondReg = 0;
344 last CYCLE unless getsARGV();
345 chomp();
346}
347-X-
348
349=item [2addr]B<n>
350
351If automatic printing is enabled, write the pattern space to the standard
352output. Replace the pattern space with the next line of input. If
353there is no more input, processing is terminated.
354
355=cut
356
357#--------------------------------------------------------------------------
358$ComTab{'N'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
d16f50bd 359{ printQ() if @Q;
86a59229
WL
360 $CondReg = 0;
361 last CYCLE unless getsARGV( $h );
362 chomp( $h );
363 $_ .= "\n$h";
364}
365-X-
366
367=item [2addr]B<N>
368
369Append a newline and the next line of input to the pattern space. If
370there is no more input, processing is terminated.
371
372=cut
373
374#--------------------------------------------------------------------------
375$ComTab{'p'}=[ 2, '', \&Emit, '{ print $_, "\n"; }' ]; #ok
376
377=item [2addr]B<p>
378
379Print the pattern space to the standard output. (Use the B<-n> option
380to suppress automatic printing at the end of a cycle if you want to
381avoid double printing of lines.)
382
383=cut
384
385#--------------------------------------------------------------------------
386$ComTab{'P'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
387{ if( /^(.*)/ ){ print $1, "\n"; } }
388-X-
389
390=item [2addr]B<P>
391
392Prints the pattern space through the first embedded newline or to the end.
393
394=cut
395
396#--------------------------------------------------------------------------
397$ComTab{'q'}=[ 1, '', \&Emit, <<'-X-' ]; #ok
398{ print $_, "\n" if $doPrint;
399 last CYCLE;
400}
401-X-
402
403=item [1addr]B<q>
404
405Branch to the end of the script and quit without starting a new cycle.
406
407=cut
408
409#--------------------------------------------------------------------------
410$ComTab{'r'}=[ 1, 'str', \&Emit, "{ _r( '-X-' ) }" ]; #ok
86a59229
WL
411
412=item [1addr]B<r> I<file>
413
414Copy the contents of the I<file> to standard output immediately before
415the next attempt to read a line of input. Any error encountered while
416reading I<file> is silently ignored.
417
418=cut
419
420#--------------------------------------------------------------------------
421$ComTab{'s'}=[ 2, 'sub', \&Emit, '' ]; #ok
422
423=item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
424
425Substitute the I<replacement> string for the first substring in
426the pattern space that matches the I<regular expression>.
427Any character other than backslash or newline can be used instead of a
428slash to delimit the regular expression and the replacement.
429To use the delimiter as a literal character within the regular expression
430and the replacement, precede the character by a backslash (`C<\>').
431
432Literal newlines may be embedded in the replacement string by
433preceding a newline with a backslash.
434
435Within the replacement, an ampersand (`C<&>') is replaced by the string
436matching the regular expression. The strings `C<\1>' through `C<\9>' are
437replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
438To get a literal `C<&>' or `C<\>' in the replacement text, precede it
439by a backslash.
440
441The following I<flags> modify the behaviour of the B<s> command:
442
443=over 8
444
445=item B<g>
446
447The replacement is performed for all matching, non-overlapping substrings
448of the pattern space.
449
450=item B<1>..B<9>
451
452Replace only the n-th matching substring of the pattern space.
453
454=item B<p>
455
456If the substitution was made, print the new value of the pattern space.
457
458=item B<w> I<file>
459
460If the substitution was made, write the new value of the pattern space
461to the specified file.
462
463=back
464
465=cut
466
467#--------------------------------------------------------------------------
468$ComTab{'t'}=[ 2, 'str', \&Branch, '{ goto XXX if _t() }' ]; #ok
469
470=item [2addr]B<t> [I<label>]
471
472Branch to the B<:> function with the specified I<label> if any B<s>
473substitutions have been made since the most recent reading of an input line
474or execution of a B<t> function. If no label is given, branch to the end of
475the script.
476
477
478=cut
479
480#--------------------------------------------------------------------------
481$ComTab{'w'}=[ 2, 'str', \&Write, "{ _w( '-X-' ) }" ]; #ok
482
483=item [2addr]B<w> I<file>
484
485The contents of the pattern space are written to the I<file>.
486
487=cut
488
489#--------------------------------------------------------------------------
490$ComTab{'x'}=[ 2, '', \&Emit, '{ ($Hold, $_) = ($_, $Hold) }' ]; #ok
491
492=item [2addr]B<x>
493
494Swap the contents of the pattern space and the hold space.
495
496=cut
497
498#--------------------------------------------------------------------------
499$ComTab{'y'}=[ 2, 'tra', \&Emit, '' ]; #ok
500=item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
501
502In the pattern space, replace all characters occuring in I<string1> by the
503character at the corresponding position in I<string2>. It is possible
504to use any character (other than a backslash or newline) instead of a
505slash to delimit the strings. Within I<string1> and I<string2>, a
506backslash followed by any character other than a newline is that literal
507character, and a backslash followed by an `n' is replaced by a newline
508character.
509
510=cut
d83e3bda 511
86a59229
WL
512#--------------------------------------------------------------------------
513$ComTab{'='}=[ 1, '', \&Emit, '{ print "$.\n" }' ]; #ok
514
515=item [1addr]B<=>
516
517Prints the current line number on the standard output.
518
519=cut
520
521#--------------------------------------------------------------------------
522$ComTab{':'}=[ 0, 'str', \&Label, '' ]; #ok
523
524=item [0addr]B<:> [I<label>]
525
526The command specifies the position of the I<label>. It has no other effect.
527
528=cut
529
530#--------------------------------------------------------------------------
531$ComTab{'{'}=[ 2, '', \&BeginBlock, '{' ]; #ok
532$ComTab{'}'}=[ 0, '', \&EndBlock, ';}' ]; #ok
533# ';' to avoid warning on empty {}-block
534
535=item [2addr]B<{> [I<command>]
536
537=item [0addr]B<}>
538
539These two commands begin and end a command list. The first command may
540be given on the same line as the opening B<{> command. The commands
541within the list are jointly selected by the address(es) given on the
542B<{> command (but may still have individual addresses).
543
544=cut
545
546#--------------------------------------------------------------------------
547$ComTab{'#'}=[ 0, 'str', \&Comment, '' ]; #ok
548
549=item [0addr]B<#> [I<comment>]
550
551The entire line is ignored (treated as a comment). If, however, the first
552two characters in the script are `C<#n>', automatic printing of output is
553suppressed, as if the B<-n> option were given on the command line.
554
555=back
556
557=cut
558
559use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
560
561my $useDEBUG = exists( $ENV{PSEDDEBUG} );
562my $useEXTBRE = $ENV{PSEDEXTBRE} || '';
563$useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
564
565my $doAutoPrint = 1; # automatic printing of pattern space (-n => 0)
566my $doOpenWrite = 1; # open w command output files at start (-a => 0)
567my $svOpenWrite = 0; # save $doOpenWrite
96021e87
JM
568
569# lower case $0 below as a VMSism. The VMS build procedure creates the
570# s2p file traditionally in upper case on the disk. When VMS is in a
571# case preserved or case sensitive mode, $0 will be returned in the exact
572# case which will be on the disk, and that is not predictable at this time.
573
574my $doGenerate = lc($0) eq 's2p';
86a59229
WL
575
576# Collected and compiled script
577#
d16f50bd
WL
578my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code, $Func );
579$Code = '';
86a59229
WL
580
581##################
582# Compile Time
583#
584# Labels
585#
586# Error handling
587#
588sub Warn($;$){
589 my( $msg, $loc ) = @_;
590 $loc ||= '';
591 $loc .= ': ' if length( $loc );
592 warn( "$0: $loc$msg\n" );
593}
594
595$labNum = 0;
596sub newLabel(){
597 return 'L_'.++$labNum;
598}
599
600# safeHere: create safe here delimiter and modify opcode and argument
601#
602sub safeHere($$){
603 my( $codref, $argref ) = @_;
604 my $eod = 'EOD000';
605 while( $$argref =~ /^$eod$/m ){
606 $eod++;
607 }
608 $$codref =~ s/TheEnd/$eod/e;
609 $$argref .= "$eod\n";
610}
611
612# Emit: create address logic and emit command
613#
614sub Emit($$$$$$){
615 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
616 my $cond = '';
617 if( defined( $addr1 ) ){
618 if( defined( $addr2 ) ){
619 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
620 } else {
621 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
622 }
623 $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
624 }
625
626 if( $opcode eq '' ){
627 $Code .= "$cond$arg\n";
628
629 } elsif( $opcode =~ s/-X-/$arg/e ){
630 $Code .= "$cond$opcode\n";
631
632 } elsif( $opcode =~ /TheEnd/ ){
633 safeHere( \$opcode, \$arg );
634 $Code .= "$cond$opcode$arg";
635
636 } else {
637 $Code .= "$cond$opcode\n";
638 }
639 0;
640}
641
642# Write (w command, w flag): store pathname
643#
644sub Write($$$$$$){
645 my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
646 $wFiles{$path} = '';
647 Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
648}
649
650
651# Label (: command): label definition
652#
653sub Label($$$$$$){
654 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
655 my $rc = 0;
656 $lab =~ s/\s+//;
657 if( length( $lab ) ){
658 my $h;
659 if( ! exists( $Label{$lab} ) ){
660 $h = $Label{$lab}{name} = newLabel();
661 } else {
662 $h = $Label{$lab}{name};
663 if( exists( $Label{$lab}{defined} ) ){
664 my $dl = $Label{$lab}{defined};
665 Warn( "duplicate label $lab (first defined at $dl)", $fl );
666 $rc = 1;
667 }
668 }
669 $Label{$lab}{defined} = $fl;
670 $Code .= "$h:;\n";
671 }
672 $rc;
673}
674
675# BeginBlock ({ command): push block start
676#
677sub BeginBlock($$$$$$){
678 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
679 push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
680 Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
681}
682
683# EndBlock (} command): check proper nesting
684#
685sub EndBlock($$$$$$){
686 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
687 my $rc;
688 my $jcom = pop( @BlockStack );
689 if( defined( $jcom ) ){
690 $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
691 } else {
692 Warn( "unexpected `}'", $fl );
693 $rc = 1;
694 }
695 $rc;
696}
697
698# Branch (t, b commands): check or create label, substitute default
699#
700sub Branch($$$$$$){
701 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
702 $lab =~ s/\s+//; # no spaces at end
703 my $h;
704 if( length( $lab ) ){
705 if( ! exists( $Label{$lab} ) ){
706 $h = $Label{$lab}{name} = newLabel();
707 } else {
708 $h = $Label{$lab}{name};
709 }
710 push( @{$Label{$lab}{used}}, $fl );
711 } else {
712 $h = 'EOS';
713 }
714 $opcode =~ s/XXX/$h/e;
715 Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
716}
717
718# Change (c command): is special due to range end watching
719#
720sub Change($$$$$$){
721 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
722 my $kwd = $negated ? 'unless' : 'if';
723 if( defined( $addr2 ) ){
724 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
725 if( ! $negated ){
726 $addr1 = '$icnt = ('.$addr1.')';
727 $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
728 }
729 } else {
730 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
731 }
732 safeHere( \$opcode, \$arg );
733 $Code .= "$kwd( $addr1 ){\n $opcode$arg}\n";
734 0;
735}
736
737
738# Comment (# command): A no-op. Who would've thought that!
739#
740sub Comment($$$$$$){
741 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
742### $Code .= "# $arg\n";
743 0;
744}
745
e2f73067
WL
746# stripRegex from the current command. If we're in the first
747# part of s///, trailing spaces have to be kept as the initial
748# part of the replacement string.
749#
b588327e 750sub stripRegex($$;$){
e2f73067 751 my( $del, $sref, $sub ) = @_;
86a59229
WL
752 my $regex = $del;
753 print "stripRegex:$del:$$sref:\n" if $useDEBUG;
754 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
755 my $sl = $2;
756 $regex .= $1.$sl.$del;
757 if( length( $sl ) % 2 == 0 ){
e2f73067
WL
758 if( $sub && (length( $3 ) > 0) ){
759 $$sref = $3 . $$sref;
760 }
86a59229
WL
761 return $regex;
762 }
763 $regex .= $3;
764 }
765 undef();
766}
767
768# stripTrans: take a <del> terminated string from y command
769# honoring and cleaning up of \-escaped <del>'s
770#
771sub stripTrans($$){
772 my( $del, $sref ) = @_;
773 my $t = '';
774 print "stripTrans:$del:$$sref:\n" if $useDEBUG;
775 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
776 my $sl = $2;
777 $t .= $1;
778 if( length( $sl ) % 2 == 0 ){
779 $t .= $sl;
780 $t =~ s/\\\\/\\/g;
781 return $t;
782 }
783 chop( $sl );
784 $t .= $sl.$del.$3;
785 }
786 undef();
787}
788
789# makey - construct Perl y/// from sed y///
790#
791sub makey($$$){
792 my( $fr, $to, $fl ) = @_;
793 my $error = 0;
794
795 # Ensure that any '-' is up front.
796 # Diagnose duplicate contradicting mappings
797 my %tr;
798 for( my $i = 0; $i < length($fr); $i++ ){
799 my $fc = substr($fr,$i,1);
800 my $tc = substr($to,$i,1);
801 if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
3484938b 802 Warn( "ambiguous translation for character `$fc' in `y' command",
86a59229
WL
803 $fl );
804 $error++;
805 }
806 $tr{$fc} = $tc;
807 }
808 $fr = $to = '';
809 if( exists( $tr{'-'} ) ){
810 ( $fr, $to ) = ( '-', $tr{'-'} );
811 delete( $tr{'-'} );
812 } else {
813 $fr = $to = '';
814 }
815 # might just as well sort it...
816 for my $fc ( sort keys( %tr ) ){
817 $fr .= $fc;
818 $to .= $tr{$fc};
819 }
820 # make embedded delimiters and newlines safe
821 $fr =~ s/([{}])/\$1/g;
822 $to =~ s/([{}])/\$1/g;
823 $fr =~ s/\n/\\n/g;
824 $to =~ s/\n/\\n/g;
825 return $error ? undef() : "{ y{$fr}{$to}; }";
826}
827
828######
829# makes - construct Perl s/// from sed s///
830#
831sub makes($$$$$$$){
832 my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
833
834 # make embedded newlines safe
835 $regex =~ s/\n/\\n/g;
836 $subst =~ s/\n/\\n/g;
837
838 my $code;
839 # n-th occurrence
840 #
841 if( length( $nmatch ) ){
842 $code = <<TheEnd;
843{ \$n = $nmatch;
844 while( --\$n && ( \$s = m ${regex}g ) ){}
845 \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
846 \$CondReg ||= \$s;
847TheEnd
848 } else {
849 $code = <<TheEnd;
850{ \$s = s ${regex}${subst}s${global};
851 \$CondReg ||= \$s;
852TheEnd
853 }
854 if( $print ){
855 $code .= ' print $_, "\n" if $s;'."\n";
856 }
857 if( defined( $path ) ){
858 $wFiles{$path} = '';
859 $code .= " _w( '$path' ) if \$s;\n";
d16f50bd 860 $GenKey{'w'} = 1;
86a59229
WL
861 }
862 $code .= "}";
863}
864
865=head1 BASIC REGULAR EXPRESSIONS
866
867A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
868of I<atoms>, for matching parts of a string, and I<bounds>, specifying
869repetitions of a preceding atom.
870
871=head2 Atoms
872
873The possible atoms of a BRE are: B<.>, matching any single character;
874B<^> and B<$>, matching the null string at the beginning or end
875of a string, respectively; a I<bracket expressions>, enclosed
876in B<[> and B<]> (see below); and any single character with no
877other significance (matching that character). A B<\> before one
878of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
879after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
880becomes an atom and establishes the target for a I<backreference>,
881consisting of the substring that actually matches the enclosed atoms.
882Finally, B<\> followed by one of the digits B<0> through B<9> is a
883backreference.
884
885A B<^> that is not first, or a B<$> that is not last does not have
886a special significance and need not be preceded by a backslash to
887become literal. The same is true for a B<]>, that does not terminate
888a bracket expression.
889
890An unescaped backslash cannot be last in a BRE.
891
892=head2 Bounds
893
894The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
895atom; B<\{>I<count>B<\}>, specifying that many repetitions;
896B<\{>I<minimum>B<,\}>, giving a lower limit; and
897B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
898bound.
899
900A bound appearing as the first item in a BRE is taken literally.
901
902=head2 Bracket Expressions
903
904A I<bracket expression> is a list of characters, character ranges
905and character classes enclosed in B<[> and B<]> and matches any
906single character from the represented set of characters.
907
908A character range is written as two characters separated by B<-> and
909represents all characters (according to the character collating sequence)
910that are not less than the first and not greater than the second.
911(Ranges are very collating-sequence-dependent, and portable programs
912should avoid relying on them.)
913
914A character class is one of the class names
915
916 alnum digit punct
917 alpha graph space
918 blank lower upper
919 cntrl print xdigit
920
921enclosed in B<[:> and B<:]> and represents the set of characters
922as defined in ctype(3).
923
924If the first character after B<[> is B<^>, the sense of matching is
925inverted.
926
927To include a literal `C<^>', place it anywhere else but first. To
928include a literal 'C<]>' place it first or immediately after an
929initial B<^>. To include a literal `C<->' make it the first (or
930second after B<^>) or last character, or the second endpoint of
931a range.
932
933The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
934match the null string at the beginning and end of a word respectively.
935(Note that neither is identical to Perl's `\b' atom.)
936
937=head2 Additional Atoms
938
939Since some sed implementations provide additional regular expression
940atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
941the following backslash escapes:
942
943=over 4
944
945=item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
946
947=item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
948
949=item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
950
951=item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
952
953=item B<\y> Match the empty string at a word boundary.
954
955=item B<\B> Match the empty string between any two either word or non-word characters.
956
957=back
958
959To enable this feature, the environment variable PSEDEXTBRE must be set
960to a string containing the requested characters, e.g.:
961C<PSEDEXTBRE='E<lt>E<gt>wW'>.
962
963=cut
964
965#####
966# bre2p - convert BRE to Perl RE
967#
968sub peek(\$$){
969 my( $pref, $ic ) = @_;
970 $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
971}
972
973sub bre2p($$$){
974 my( $del, $pat, $fl ) = @_;
975 my $led = $del;
976 $led =~ tr/{([</})]>/;
977 $led = '' if $led eq $del;
978
979 $pat = substr( $pat, 1, length($pat) - 2 );
980 my $res = '';
981 my $bracklev = 0;
982 my $backref = 0;
983 my $parlev = 0;
984 for( my $ic = 0; $ic < length( $pat ); $ic++ ){
985 my $c = substr( $pat, $ic, 1 );
986 if( $c eq '\\' ){
987 ### backslash escapes
988 my $nc = peek($pat,$ic);
989 if( $nc eq '' ){
990 Warn( "`\\' cannot be last in pattern", $fl );
991 return undef();
992 }
993 $ic++;
994 if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
995 $res .= "\\$del";
996
997 } elsif( $nc =~ /([[.*\\n])/ ){
998 ## check for \-escaped magics and \n:
999 ## \[ \. \* \\ \n stay as they are
1000 $res .= '\\'.$nc;
1001
1002 } elsif( $nc eq '(' ){ ## \( => (
1003 $parlev++;
1004 $res .= '(';
1005
1006 } elsif( $nc eq ')' ){ ## \) => )
1007 $parlev--;
1008 $backref++;
1009 if( $parlev < 0 ){
1010 Warn( "unmatched `\\)'", $fl );
1011 return undef();
1012 }
1013 $res .= ')';
1014
1015 } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
1016 my $endpos = index( $pat, '\\}', $ic );
1017 if( $endpos < 0 ){
1018 Warn( "unmatched `\\{'", $fl );
1019 return undef();
1020 }
1021 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
1022 $ic = $endpos + 1;
1023
1024 if( $res =~ /^\^?$/ ){
1025 $res .= "\\{$rep\}";
1026 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
1027 my $min = $1;
1028 my $com = $2 || '';
1029 my $max = $3;
1030 if( length( $max ) ){
1031 if( $max < $min ){
1032 Warn( "maximum less than minimum in `\\{$rep\\}'",
1033 $fl );
1034 return undef();
1035 }
1036 } else {
1037 $max = '';
1038 }
1039 # simplify some
1040 if( $min == 0 && $max eq '1' ){
1041 $res .= '?';
1042 } elsif( $min == 1 && "$com$max" eq ',' ){
1043 $res .= '+';
1044 } elsif( $min == 0 && "$com$max" eq ',' ){
1045 $res .= '*';
1046 } else {
1047 $res .= "{$min$com$max}";
1048 }
1049 } else {
1050 Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1051 return undef();
1052 }
1053
1054 } elsif( $nc =~ /^[1-9]$/ ){
1055 ## \1 .. \9 => \1 .. \9, but check for a following digit
1056 if( $nc > $backref ){
1057 Warn( "invalid backreference ($nc)", $fl );
1058 return undef();
1059 }
1060 $res .= "\\$nc";
1061 if( peek($pat,$ic) =~ /[0-9]/ ){
1062 $res .= '(?:)';
1063 }
1064
1065 } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1066 ## extensions - at most <>wWyB - not in POSIX
1067 if( $nc eq '<' ){ ## \< => \b(?=\w), be precise
1068 $res .= '\\b(?<=\\W)';
1069 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1070 $res .= '\\b(?=\\W)';
1071 } elsif( $nc eq 'y' ){ ## \y => \b
1072 $res .= '\\b';
1073 } else { ## \B, \w, \W remain the same
1074 $res .= "\\$nc";
1075 }
1076 } elsif( $nc eq $led ){
1077 ## \<closing bracketing-delimiter> - keep '\'
1078 $res .= "\\$nc";
1079
1080 } else { ## \ <char> => <char> ("as if `\' were not present")
1081 $res .= $nc;
1082 }
1083
1084 } elsif( $c eq '.' ){ ## . => .
1085 $res .= $c;
1086
1087 } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1088 if( $res =~ /^\^?$/ ){
1089 $res .= '\\*';
1090 } elsif( substr( $res, -1, 1 ) ne '*' ){
1091 $res .= $c;
1092 }
1093
1094 } elsif( $c eq '[' ){
1095 ## parse []: [^...] [^]...] [-...]
1096 my $add = '[';
1097 if( peek($pat,$ic) eq '^' ){
1098 $ic++;
1099 $add .= '^';
1100 }
1101 my $nc = peek($pat,$ic);
1102 if( $nc eq ']' || $nc eq '-' ){
1103 $add .= $nc;
1104 $ic++;
1105 }
1106 # check that [ is not trailing
1107 if( $ic >= length( $pat ) - 1 ){
1108 Warn( "unmatched `['", $fl );
1109 return undef();
1110 }
1111 # look for [:...:] and x-y
1112 my $rstr = substr( $pat, $ic+1 );
1113 if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1114 my $cnt = $1;
1115 $ic += length( $cnt );
1116 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1117 # try some simplifications
1118 my $red = $cnt;
1119 if( $red =~ s/0-9// ){
1120 $cnt = $red.'\d';
1121 if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1122 $cnt = $red.'\w';
1123 }
1124 }
1125 $add .= $cnt;
1126
1127 # POSIX 1003.2 has this (optional) for begin/end word
1128 $add = '\\b(?=\\W)' if $add eq '[[:<:]]';
1129 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1130
1131 }
1132
1133 ## may have a trailing `-' before `]'
1134 if( $ic < length($pat) - 1 &&
1135 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1136 $ic += length( $1 );
1137 $add .= $1;
1138 # another simplification
1139 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1140 $res .= $add;
1141 } else {
1142 Warn( "unmatched `['", $fl );
1143 return undef();
1144 }
3cb6de81 1145
86a59229
WL
1146 } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1147 $res .= "\\$c";
d83e3bda 1148
86a59229
WL
1149 } elsif( $c eq ']' ){ ## unmatched ] is not magic
1150 $res .= ']';
d83e3bda 1151
86a59229
WL
1152 } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1153 $res .= "\\$c";
d83e3bda 1154
86a59229
WL
1155 } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1156 $res .= length( $res ) ? '\\^' : '^';
d83e3bda 1157
86a59229
WL
1158 } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1159 $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
8d063cd8 1160
86a59229
WL
1161 } else {
1162 $res .= $c;
1163 }
8d063cd8 1164 }
0a12ae7d 1165
86a59229
WL
1166 if( $parlev ){
1167 Warn( "unmatched `\\('", $fl );
1168 return undef();
8d063cd8 1169 }
0a12ae7d 1170
86a59229
WL
1171 # final cleanup: eliminate raw HTs
1172 $res =~ s/\t/\\t/g;
1173 return $del . $res . ( $led ? $led : $del );
1174}
0a12ae7d 1175
86a59229
WL
1176
1177#####
1178# sub2p - convert sed substitution to Perl substitution
1179#
1180sub sub2p($$$){
1181 my( $del, $subst, $fl ) = @_;
1182 my $led = $del;
1183 $led =~ tr/{([</})]>/;
1184 $led = '' if $led eq $del;
1185
1186 $subst = substr( $subst, 1, length($subst) - 2 );
1187 my $res = '';
1188
1189 for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1190 my $c = substr( $subst, $ic, 1 );
1191 if( $c eq '\\' ){
1192 ### backslash escapes
1193 my $nc = peek($subst,$ic);
1194 if( $nc eq '' ){
1195 Warn( "`\\' cannot be last in substitution", $fl );
1196 return undef();
1197 }
1198 $ic++;
1199 if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1200 $res .= '\\' . $nc;
1201 } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1202 $res .= '${' . $nc . '}';
1203 } else { ## everything else (includes &): omit \
1204 $res .= $nc;
1205 }
1206 } elsif( $c eq '&' ){ ## & => $&
1207 $res .= '$&';
1208 } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1209 $res .= '\\' . $c;
1210 } else {
1211 $res .= $c;
8d063cd8 1212 }
8d063cd8
LW
1213 }
1214
86a59229
WL
1215 # final cleanup: eliminate raw HTs
1216 $res =~ s/\t/\\t/g;
1217 return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1218}
0a12ae7d 1219
86a59229
WL
1220
1221sub Parse(){
1222 my $error = 0;
1223 my( $pdef, $pfil, $plin );
1224 for( my $icom = 0; $icom < @Commands; $icom++ ){
1225 my $cmd = $Commands[$icom];
1226 print "Parse:$cmd:\n" if $useDEBUG;
1227 $cmd =~ s/^\s+//;
1228 next unless length( $cmd );
1229 my $scom = $icom;
1230 if( exists( $Defined{$icom} ) ){
1231 $pdef = $Defined{$icom};
1232 if( $pdef =~ /^ #(\d+)/ ){
1233 $pfil = 'expression #';
1234 $plin = $1;
1235 } else {
1236 $pfil = "$pdef l.";
1237 $plin = 1;
1238 }
1239 } else {
1240 $plin++;
1241 }
1242 my $fl = "$pfil$plin";
1243
1244 # insert command as comment in gnerated code
1245 #
1246 $Code .= "# $cmd\n" if $doGenerate;
1247
1248 # The Address(es)
1249 #
1250 my( $negated, $naddr, $addr1, $addr2 );
1251 $naddr = 0;
1252 if( $cmd =~ s/^(\d+)\s*// ){
1253 $addr1 = "$1"; $naddr++;
1254 } elsif( $cmd =~ s/^\$\s*// ){
1255 $addr1 = 'eofARGV()'; $naddr++;
1256 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1257 my $del = $1;
1258 my $regex = stripRegex( $del, \$cmd );
1259 if( defined( $regex ) ){
1260 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1261 $naddr++;
1262 } else {
1263 Warn( "malformed regex, 1st address", $fl );
1264 $error++;
1265 next;
1266 }
1267 }
1268 if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1269 if( $cmd =~ s/^(\d+)\s*// ){
1270 $addr2 = "$1"; $naddr++;
1271 } elsif( $cmd =~ s/^\$\s*// ){
1272 $addr2 = 'eofARGV()'; $naddr++;
1273 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1274 my $del = $1;
1275 my $regex = stripRegex( $del, \$cmd );
1276 if( defined( $regex ) ){
1277 $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1278 $naddr++;
1279 } else {
1280 Warn( "malformed regex, 2nd address", $fl );
1281 $error++;
1282 next;
1283 }
1284 } else {
1285 Warn( "invalid address after `,'", $fl );
1286 $error++;
1287 next;
1288 }
1289 }
1290
1291 # address modifier `!'
1292 #
1293 $negated = $cmd =~ s/^!\s*//;
1294 if( defined( $addr1 ) ){
1295 print "Parse: addr1=$addr1" if $useDEBUG;
1296 if( defined( $addr2 ) ){
1297 print ", addr2=$addr2 " if $useDEBUG;
1298 # both numeric and addr1 > addr2 => eliminate addr2
1299 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1300 $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1301 }
9ef589d8 1302 }
86a59229
WL
1303 print 'negated' if $useDEBUG && $negated;
1304 print " command:$cmd\n" if $useDEBUG;
1305
1306 # The Command
1307 #
1308 if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1309 my $h = substr( $cmd, 0, 1 );
1310 Warn( "unknown command `$h'", $fl );
1311 $error++;
8d063cd8
LW
1312 next;
1313 }
86a59229 1314 my $key = $1;
8d063cd8 1315
86a59229 1316 my $tabref = $ComTab{$key};
d16f50bd 1317 $GenKey{$key} = 1;
86a59229
WL
1318 if( $naddr > $tabref->[0] ){
1319 Warn( "excess address(es)", $fl );
1320 $error++;
8d063cd8
LW
1321 next;
1322 }
1323
86a59229
WL
1324 my $arg = '';
1325 if( $tabref->[1] eq 'str' ){
1326 # take remainder - don't care if it is empty
1327 $arg = $cmd;
1328 $cmd = '';
1329
1330 } elsif( $tabref->[1] eq 'txt' ){
1331 # multi-line text
1332 my $goon = $cmd =~ /(.*)\\$/;
1333 if( length( $1 ) ){
1334 Warn( "extra characters after command ($cmd)", $fl );
1335 $error++;
8d063cd8 1336 }
86a59229
WL
1337 while( $goon ){
1338 $icom++;
1339 if( $icom > $#Commands ){
1340 Warn( "unexpected end of script", $fl );
1341 $error++;
1342 last;
1343 }
1344 $cmd = $Commands[$icom];
1345 $Code .= "# $cmd\n" if $doGenerate;
1346 $goon = $cmd =~ s/\\$//;
1347 $cmd =~ s/\\(.)/$1/g;
1348 $arg .= "\n" if length( $arg );
1349 $arg .= $cmd;
8d063cd8 1350 }
86a59229
WL
1351 $arg .= "\n" if length( $arg );
1352 $cmd = '';
8d063cd8 1353
86a59229
WL
1354 } elsif( $tabref->[1] eq 'sub' ){
1355 # s///
1356 if( ! length( $cmd ) ){
1357 Warn( "`s' command requires argument", $fl );
1358 $error++;
1359 next;
1360 }
1361 if( $cmd =~ s{^([^\\\n])}{} ){
1362 my $del = $1;
e2f73067 1363 my $regex = stripRegex( $del, \$cmd, "s" );
86a59229
WL
1364 if( ! defined( $regex ) ){
1365 Warn( "malformed regular expression", $fl );
1366 $error++;
1367 next;
a687059c 1368 }
86a59229
WL
1369 $regex = bre2p( $del, $regex, $fl );
1370
1371 # a trailing \ indicates embedded NL (in replacement string)
1372 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1373 $icom++;
1374 if( $icom > $#Commands ){
1375 Warn( "unexpected end of script", $fl );
1376 $error++;
1377 last;
8d063cd8 1378 }
86a59229
WL
1379 $cmd .= $Commands[$icom];
1380 $Code .= "# $Commands[$icom]\n" if $doGenerate;
9ef589d8 1381 }
86a59229
WL
1382
1383 my $subst = stripRegex( $del, \$cmd );
1384 if( ! defined( $regex ) ){
1385 Warn( "malformed substitution expression", $fl );
1386 $error++;
0a12ae7d
LW
1387 next;
1388 }
86a59229
WL
1389 $subst = sub2p( $del, $subst, $fl );
1390
1391 # parse s/// modifier: g|p|0-9|w <file>
1392 my( $global, $nmatch, $print, $write ) =
1393 ( '', '', 0, undef );
1394 while( $cmd =~ s/^([gp0-9])// ){
1395 $1 eq 'g' ? ( $global = 'g' ) :
1396 $1 eq 'p' ? ( $print = $1 ) : ( $nmatch .= $1 );
1397 }
1398 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1399 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1400 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1401 Warn( "conflicting flags `$global$nmatch'", $fl );
1402 $error++;
0a12ae7d
LW
1403 next;
1404 }
86a59229
WL
1405
1406 $arg = makes( $regex, $subst,
1407 $write, $global, $print, $nmatch, $fl );
1408 if( ! defined( $arg ) ){
1409 $error++;
8d063cd8
LW
1410 next;
1411 }
86a59229
WL
1412
1413 } else {
1414 Warn( "improper delimiter in s command", $fl );
1415 $error++;
1416 next;
1417 }
1418
1419 } elsif( $tabref->[1] eq 'tra' ){
1420 # y///
1421 # a trailing \ indicates embedded newline
1422 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1423 $icom++;
1424 if( $icom > $#Commands ){
1425 Warn( "unexpected end of script", $fl );
1426 $error++;
1427 last;
1428 }
1429 $cmd .= $Commands[$icom];
1430 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1431 }
1432 if( ! length( $cmd ) ){
1433 Warn( "`y' command requires argument", $fl );
1434 $error++;
1435 next;
1436 }
1437 my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1438 if( $d eq '\\' ){
1439 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1440 $error++;
1441 next;
1442 }
1443 my $fr = stripTrans( $d, \$cmd );
1444 if( ! defined( $fr ) || ! length( $cmd ) ){
1445 Warn( "malformed `y' command argument", $fl );
1446 $error++;
1447 next;
1448 }
1449 my $to = stripTrans( $d, \$cmd );
1450 if( ! defined( $to ) ){
1451 Warn( "malformed `y' command argument", $fl );
1452 $error++;
1453 next;
1454 }
1455 if( length($fr) != length($to) ){
1456 Warn( "string lengths in `y' command differ", $fl );
1457 $error++;
1458 next;
1459 }
1460 if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1461 $error++;
1462 next;
8d063cd8 1463 }
8d063cd8 1464
8d063cd8
LW
1465 }
1466
86a59229
WL
1467 # $cmd must be now empty - exception is {
1468 if( $cmd !~ /^\s*$/ ){
1469 if( $key eq '{' ){
1470 # dirty hack to process command on '{' line
1471 $Commands[$icom--] = $cmd;
1472 } else {
1473 Warn( "extra characters after command ($cmd)", $fl );
1474 $error++;
1475 next;
1476 }
8d063cd8
LW
1477 }
1478
86a59229
WL
1479 # Make Code
1480 #
1481 if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1482 $tabref->[3], $arg, $fl ) ){
1483 $error++;
8d063cd8 1484 }
86a59229 1485 }
8d063cd8 1486
86a59229
WL
1487 while( @BlockStack ){
1488 my $bl = pop( @BlockStack );
1489 Warn( "start of unterminated `{'", $bl );
1490 $error++;
1491 }
8d063cd8 1492
86a59229
WL
1493 for my $lab ( keys( %Label ) ){
1494 if( ! exists( $Label{$lab}{defined} ) ){
1495 for my $used ( @{$Label{$lab}{used}} ){
1496 Warn( "undefined label `$lab'", $used );
1497 $error++;
1498 }
8d063cd8 1499 }
86a59229 1500 }
8d063cd8 1501
86a59229
WL
1502 exit( 1 ) if $error;
1503}
8d063cd8 1504
8d063cd8 1505
86a59229
WL
1506##############
1507#### MAIN ####
1508##############
8d063cd8 1509
86a59229
WL
1510sub usage(){
1511 print STDERR "Usage: sed [-an] command [file...]\n";
1512 print STDERR " [-an] [-e command] [-f script-file] [file...]\n";
1513}
8d063cd8 1514
86a59229
WL
1515###################
1516# Here we go again...
1517#
1518my $expr = 0;
1519while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1520 my $opt = $1;
1521 my $arg = $2;
1522 shift( @ARGV );
1523 if( $opt eq 'e' ){
1524 if( length( $arg ) ){
1525 push( @Commands, split( "\n", $arg ) );
1526 } elsif( @ARGV ){
1527 push( @Commands, shift( @ARGV ) );
1528 } else {
1529 Warn( "option -e requires an argument" );
1530 usage();
1531 exit( 1 );
1532 }
1533 $expr++;
1534 $Defined{$#Commands} = " #$expr";
1535 next;
1536 }
1537 if( $opt eq 'f' ){
1538 my $path;
1539 if( length( $arg ) ){
1540 $path = $arg;
1541 } elsif( @ARGV ){
1542 $path = shift( @ARGV );
1543 } else {
1544 Warn( "option -f requires an argument" );
1545 usage();
1546 exit( 1 );
1547 }
1548 my $fst = $#Commands + 1;
1549 open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1550 my $cmd;
1551 while( defined( $cmd = <SCRIPT> ) ){
1552 chomp( $cmd );
1553 push( @Commands, $cmd );
1554 }
1555 close( SCRIPT );
1556 if( $#Commands >= $fst ){
1557 $Defined{$fst} = "$path";
8d063cd8 1558 }
86a59229
WL
1559 next;
1560 }
1561 if( $opt eq '-' && $arg eq '' ){
1562 last;
1563 }
1564 if( $opt eq 'h' || $opt eq '?' ){
1565 usage();
1566 exit( 0 );
1567 }
1568 if( $opt eq 'n' ){
1569 $doAutoPrint = 0;
1570 } elsif( $opt eq 'a' ){
1571 $doOpenWrite = 0;
1572 } else {
1573 Warn( "illegal option `$opt'" );
1574 usage();
1575 exit( 1 );
1576 }
1577 if( length( $arg ) ){
1578 unshift( @ARGV, "-$arg" );
1579 }
1580}
8d063cd8 1581
86a59229
WL
1582# A singleton command may be the 1st argument when there are no options.
1583#
1584if( @Commands == 0 ){
1585 if( @ARGV == 0 ){
1586 Warn( "no script command given" );
1587 usage();
1588 exit( 1 );
1589 }
1590 push( @Commands, split( "\n", shift( @ARGV ) ) );
1591 $Defined{0} = ' #1';
1592}
8d063cd8 1593
86a59229 1594print STDERR "Files: @ARGV\n" if $useDEBUG;
8d063cd8 1595
86a59229
WL
1596# generate leading code
1597#
d16f50bd 1598$Func = <<'[TheEnd]';
86a59229 1599
d16f50bd
WL
1600# openARGV: open 1st input file
1601#
86a59229
WL
1602sub openARGV(){
1603 unshift( @ARGV, '-' ) unless @ARGV;
1604 my $file = shift( @ARGV );
1605 open( ARG, "<$file" )
1606 || die( "$0: can't open $file for reading ($!)\n" );
1607 $isEOF = 0;
1608}
8d063cd8 1609
d16f50bd
WL
1610# getsARGV: Read another input line into argument (default: $_).
1611# Move on to next input file, and reset EOF flag $isEOF.
86a59229
WL
1612sub getsARGV(;\$){
1613 my $argref = @_ ? shift() : \$_;
1614 while( $isEOF || ! defined( $$argref = <ARG> ) ){
1615 close( ARG );
1616 return 0 unless @ARGV;
1617 my $file = shift( @ARGV );
1618 open( ARG, "<$file" )
1619 || die( "$0: can't open $file for reading ($!)\n" );
1620 $isEOF = 0;
1621 }
1622 1;
1623}
8d063cd8 1624
d16f50bd
WL
1625# eofARGV: end-of-file test
1626#
86a59229
WL
1627sub eofARGV(){
1628 return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1629}
8d063cd8 1630
d16f50bd
WL
1631# makeHandle: Generates another file handle for some file (given by its path)
1632# to be written due to a w command or an s command's w flag.
86a59229
WL
1633sub makeHandle($){
1634 my( $path ) = @_;
1635 my $handle;
1636 if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1637 $handle = $wFiles{$path} = gensym();
1638 if( $doOpenWrite ){
1639 if( ! open( $handle, ">$path" ) ){
1640 die( "$0: can't open $path for writing: ($!)\n" );
1641 }
9ef589d8 1642 }
86a59229
WL
1643 } else {
1644 $handle = $wFiles{$path};
1645 }
1646 return $handle;
1647}
9ef589d8 1648
d16f50bd
WL
1649# printQ: Print queued output which is either a string or a reference
1650# to a pathname.
86a59229
WL
1651sub printQ(){
1652 for my $q ( @Q ){
1653 if( ref( $q ) ){
d16f50bd 1654 # flush open w files so that reading this file gets it all
86a59229
WL
1655 if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1656 open( $wFiles{$$q}, ">>$$q" );
1657 }
d16f50bd 1658 # copy file to stdout: slow, but safe
86a59229 1659 if( open( RF, "<$$q" ) ){
d16f50bd 1660 while( defined( my $line = <RF> ) ){
86a59229
WL
1661 print $line;
1662 }
1663 close( RF );
1664 }
1665 } else {
1666 print $q;
a687059c 1667 }
86a59229
WL
1668 }
1669 undef( @Q );
1670}
1671
d16f50bd
WL
1672[TheEnd]
1673
1674# generate the sed loop
1675#
1676$Code .= <<'[TheEnd]';
1677sub openARGV();
1678sub getsARGV(;\$);
1679sub eofARGV();
1680sub printQ();
1681
1682# Run: the sed loop reading input and applying the script
1683#
86a59229
WL
1684sub Run(){
1685 my( $h, $icnt, $s, $n );
1686 # hack (not unbreakable :-/) to avoid // matching an empty string
1687 my $z = "\000"; $z =~ /$z/;
1688 # Initialize.
1689 openARGV();
1690 $Hold = '';
1691 $CondReg = 0;
1692 $doPrint = $doAutoPrint;
1693CYCLE:
1694 while( getsARGV() ){
1695 chomp();
1696 $CondReg = 0; # cleared on t
1697BOS:;
1698[TheEnd]
1699
1700 # parse - avoid opening files when doing s2p
1701 #
1702 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1703 if $doGenerate;
1704 Parse();
1705 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1706 if $doGenerate;
1707
1708 # append trailing code
1709 #
1710 $Code .= <<'[TheEnd]';
1711EOS: if( $doPrint ){
1712 print $_, "\n";
1713 } else {
1714 $doPrint = $doAutoPrint;
a687059c 1715 }
86a59229 1716 printQ() if @Q;
a687059c 1717 }
86a59229
WL
1718
1719 exit( 0 );
a687059c 1720}
86a59229
WL
1721[TheEnd]
1722
d16f50bd
WL
1723
1724# append optional functions, prepend prototypes
1725#
1726my $Proto = "# prototypes\n";
1727if( $GenKey{'l'} ){
1728 $Proto .= "sub _l();\n";
1729 $Func .= <<'[TheEnd]';
1730# _l: l command processing
1731#
1732sub _l(){
1733 my $h = $_;
1734 my $mcpl = 70;
1735 # transform non printing chars into escape notation
1736 $h =~ s/\\/\\\\/g;
1737 if( $h =~ /[^[:print:]]/ ){
1738 $h =~ s/\a/\\a/g;
1739 $h =~ s/\f/\\f/g;
1740 $h =~ s/\n/\\n/g;
1741 $h =~ s/\t/\\t/g;
1742 $h =~ s/\r/\\r/g;
1743 $h =~ s/\e/\\e/g;
1744 $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1745 }
1746 # split into lines of length $mcpl
1747 while( length( $h ) > $mcpl ){
1748 my $l = substr( $h, 0, $mcpl-1 );
1749 $h = substr( $h, $mcpl );
1750 # remove incomplete \-escape from end of line
1751 if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1752 $h = $1 . $h;
1753 }
1754 print $l, "\\\n";
1755 }
1756 print "$h\$\n";
1757}
1758
1759[TheEnd]
1760}
1761
1762if( $GenKey{'r'} ){
1763 $Proto .= "sub _r(\$);\n";
1764 $Func .= <<'[TheEnd]';
1765# _r: r command processing: Save a reference to the pathname.
1766#
1767sub _r($){
1768 my $path = shift();
1769 push( @Q, \$path );
1770}
1771
1772[TheEnd]
1773}
1774
1775if( $GenKey{'t'} ){
1776 $Proto .= "sub _t();\n";
1777 $Func .= <<'[TheEnd]';
1778# _t: t command - condition register test/reset
1779#
1780sub _t(){
1781 my $res = $CondReg;
1782 $CondReg = 0;
1783 $res;
1784}
1785
1786[TheEnd]
1787}
1788
1789if( $GenKey{'w'} ){
1790 $Proto .= "sub _w(\$);\n";
1791 $Func .= <<'[TheEnd]';
1792# _w: w command and s command's w flag - write to file
1793#
1794sub _w($){
1795 my $path = shift();
1796 my $handle = $wFiles{$path};
1797 if( ! $doOpenWrite && ! defined( fileno( $handle ) ) ){
1798 open( $handle, ">$path" )
1799 || die( "$0: $path: cannot open ($!)\n" );
1800 }
1801 print $handle $_, "\n";
1802}
1803
1804[TheEnd]
1805}
1806
1807$Code = $Proto . $Code;
1808
86a59229
WL
1809# magic "#n" - same as -n option
1810#
1811$doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
a687059c 1812
86a59229
WL
1813# eval code - check for errors
1814#
d16f50bd
WL
1815print "Code:\n$Code$Func" if $useDEBUG;
1816eval $Code . $Func;
86a59229 1817if( $@ ){
d16f50bd 1818 print "Code:\n$Code$Func";
86a59229 1819 die( "$0: internal error - generated incorrect Perl code: $@\n" );
9ef589d8
LW
1820}
1821
86a59229
WL
1822if( $doGenerate ){
1823
1824 # write full Perl program
1825 #
1826
d16f50bd 1827 # bang line, declarations, prototypes
86a59229
WL
1828 print <<TheEnd;
1829#!$perlpath -w
1830eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1831 if 0;
9ae2b9a0 1832\$0 =~ s/^.*?(\\w+)\[\\.\\w+\]*\$/\$1/;
86a59229
WL
1833
1834use strict;
1835use Symbol;
1836use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1837 \$doAutoPrint \$doOpenWrite \$doPrint };
1838\$doAutoPrint = $doAutoPrint;
1839\$doOpenWrite = $doOpenWrite;
1840TheEnd
1841
1842 my $wf = "'" . join( "', '", keys( %wFiles ) ) . "'";
1843 if( $wf ne "''" ){
1844 print <<TheEnd;
1845sub makeHandle(\$);
1846for my \$p ( $wf ){
1847 exit( 1 ) unless makeHandle( \$p );
9ef589d8 1848}
86a59229
WL
1849TheEnd
1850 }
9ef589d8 1851
86a59229 1852 print $Code;
d16f50bd
WL
1853 print "Run();\n";
1854 print $Func;
86a59229 1855 exit( 0 );
1aa91729 1856
86a59229
WL
1857} else {
1858
1859 # execute: make handles (and optionally open) all w files; run!
86a59229
WL
1860 for my $p ( keys( %wFiles ) ){
1861 exit( 1 ) unless makeHandle( $p );
1862 }
d16f50bd 1863 Run();
1aa91729 1864}
86a59229
WL
1865
1866
1867=head1 ENVIRONMENT
1868
1869The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1870See L<"Additional Atoms">.
1871
1872=head1 DIAGNOSTICS
1873
1874=over 4
1875
3484938b 1876=item ambiguous translation for character `%s' in `y' command
86a59229
WL
1877
1878The indicated character appears twice, with different translations.
1879
1880=item `[' cannot be last in pattern
1881
1882A `[' in a BRE indicates the beginning of a I<bracket expression>.
1883
1884=item `\' cannot be last in pattern
1885
1886A `\' in a BRE is used to make the subsequent character literal.
1887
1888=item `\' cannot be last in substitution
1889
1890A `\' in a subsitution string is used to make the subsequent character literal.
1891
1892=item conflicting flags `%s'
1893
1894In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1895multiple n-th occurrence flags are specified. Note that only the digits
1896`1' through `9' are permitted.
1897
1898=item duplicate label %s (first defined at %s)
1899
1900=item excess address(es)
1901
1902The command has more than the permitted number of addresses.
1903
1904=item extra characters after command (%s)
1905
1906=item illegal option `%s'
1907
1908=item improper delimiter in s command
1909
1910The BRE and substitution may not be delimited with `\' or newline.
1911
1912=item invalid address after `,'
1913
1914=item invalid backreference (%s)
1915
1916The specified backreference number exceeds the number of backreferences
1917in the BRE.
1918
1919=item invalid repeat clause `\{%s\}'
1920
1921The repeat clause does not contain a valid integer value, or pair of
1922values.
1923
1924=item malformed regex, 1st address
1925
1926=item malformed regex, 2nd address
1927
1928=item malformed regular expression
1929
1930=item malformed substitution expression
1931
1932=item malformed `y' command argument
1933
1934The first or second string of a B<y> command is syntactically incorrect.
1935
1936=item maximum less than minimum in `\{%s\}'
1937
1938=item no script command given
1939
1940There must be at least one B<-e> or one B<-f> option specifying a
1941script or script file.
1942
1943=item `\' not valid as delimiter in `y' command
1944
1945=item option -e requires an argument
1946
1947=item option -f requires an argument
1948
1949=item `s' command requires argument
1950
1951=item start of unterminated `{'
1952
1953=item string lengths in `y' command differ
1954
863b2ca0 1955The translation table strings in a B<y> command must have equal lengths.
86a59229
WL
1956
1957=item undefined label `%s'
1958
1959=item unexpected `}'
1960
1961A B<}> command without a preceding B<{> command was encountered.
1962
1963=item unexpected end of script
1964
1965The end of the script was reached although a text line after a
1966B<a>, B<c> or B<i> command indicated another line.
1967
1968=item unknown command `%s'
1969
1970=item unterminated `['
1971
1972A BRE contains an unterminated bracket expression.
1973
1974=item unterminated `\('
1975
1976A BRE contains an unterminated backreference.
1977
1978=item `\{' without closing `\}'
1979
1980A BRE contains an unterminated bounds specification.
1981
1982=item `\)' without preceding `\('
1983
1984=item `y' command requires argument
1985
1986=back
1987
1988=head1 EXAMPLE
1989
1990The basic material for the preceding section was generated by running
1991the sed script
1992
1993 #no autoprint
1994 s/^.*Warn( *"\([^"]*\)".*$/\1/
1995 t process
1996 b
1997 :process
1998 s/$!/%s/g
1999 s/$[_[:alnum:]]\{1,\}/%s/g
2000 s/\\\\/\\/g
2001 s/^/=item /
2002 p
2003
2004on the program's own text, and piping the output into C<sort -u>.
2005
2006
2007=head1 SED SCRIPT TRANSLATION
2008
2009If this program is invoked with the name F<s2p> it will act as a
2010sed-to-Perl translator. After option processing (all other
2011arguments are ignored), a Perl program is printed on standard
2012output, which will process the input stream (as read from all
2013arguments) in the way defined by the sed script and the option setting
2014used for the translation.
2015
2016=head1 SEE ALSO
2017
2018perl(1), re_format(7)
2019
2020=head1 BUGS
2021
2022The B<l> command will show escape characters (ESC) as `C<\e>', but
2023a vertical tab (VT) in octal.
2024
2025Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
2026
2027The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
2028is "the last pattern used, at run time". This deviates from the Perl
2029interpretation, which will re-use the "last last successfully executed
2030regular expression". Since keeping track of pattern usage would create
2031terribly cluttered code, and differences would only appear in obscure
2032context (where other B<sed> implementations appear to deviate, too),
2033the Perl semantics was adopted. Note that common usage of this feature,
2034such as in C</abc/s//xyz/>, will work as expected.
2035
2036Collating elements (of bracket expressions in BREs) are not implemented.
2037
2038=head1 STANDARDS
2039
2040This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
2041definition of B<sed>, and is compatible with the I<OpenBSD>
2042implementation, except where otherwise noted (see L<"BUGS">).
2043
2044=head1 AUTHOR
2045
2046This Perl implementation of I<sed> was written by Wolfgang Laun,
2047I<Wolfgang.Laun@alcatel.at>.
2048
2049=head1 COPYRIGHT and LICENSE
2050
2051This program is free and open software. You may use, modify,
2052distribute, and sell this program (and any modified variants) in any
2053way you wish, provided you do not restrict others from doing the same.
2054
2055=cut
2056
a687059c 2057!NO!SUBS!
4633a7c4
LW
2058
2059close OUT or die "Can't close $file: $!";
2060chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
d38b618a 2061unlink 'psed';
96021e87 2062print "Linking $file to psed.\n";
d38b618a 2063if (defined $Config{d_link}) {
96021e87 2064 link $file, 'psed';
d38b618a
A
2065} else {
2066 unshift @INC, '../lib';
2067 require File::Copy;
2068 File::Copy::syscopy('s2p', 'psed');
2069}
4633a7c4 2070exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
8a5546a1 2071chdir $origdir;