This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Make pstruct and psed appear also on platforms with
[perl5.git] / x2p / s2p.PL
CommitLineData
86a59229 1#!/usr/bin/perl
4633a7c4
LW
2
3use Config;
4use File::Basename qw(&basename &dirname);
8a5546a1 5use Cwd;
231bc313
JH
6use subs qw(link);
7
8sub link { # This is a cutdown vesion of installperl:link().
9 my($from,$to) = @_;
10 my($success) = 0;
11
12 eval {
13 CORE::link($from, $to)
14 ? $success++
15 : ($from =~ m#^/afs/# || $to =~ m#^/afs/#)
16 ? die "AFS" # okay inside eval {}
17 : die "Couldn't link $from to $to: $!\n";
18 };
19 if ($@) {
20 warn $@;
21 require File::Copy;
22 File::Copy::copy($from, $to)
23 ? $success++
24 : warn "Couldn't copy $from to $to: $!\n";
25 }
26 $success;
27}
4633a7c4
LW
28
29# List explicitly here the variables you want Configure to
30# generate. Metaconfig only looks for shell variables, so you
31# have to mention them as if they were shell variables, not
32# %Config entries. Thus you write
33# $startperl
34# to ensure Configure will look for $Config{startperl}.
35
36# This forces PL files to create target in same directory as PL file.
37# This is so that make depend always knows where to find PL derivatives.
8a5546a1 38$origdir = cwd;
44a8e56a 39chdir dirname($0);
40$file = basename($0, '.PL');
774d564b 41$file .= '.com' if $^O eq 'VMS';
4633a7c4
LW
42
43open OUT,">$file" or die "Can't create $file: $!";
44
45print "Extracting $file (with variable substitutions)\n";
46
47# In this section, perl variables will be expanded during extraction.
48# You can use $Config{...} to use Configure variables.
49
50print OUT <<"!GROK!THIS!";
5f05dabc 51$Config{startperl}
52 eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
53 if \$running_under_some_shell;
c0393c90
JH
54my \$startperl;
55my \$perlpath;
ed6d8ea1
JH
56(\$startperl = <<'/../') =~ s/\\s*\\z//;
57$Config{startperl}
58/../
59(\$perlpath = <<'/../') =~ s/\\s*\\z//;
60$Config{perlpath}
61/../
a687059c
LW
62!GROK!THIS!
63
4633a7c4
LW
64# In the following, perl variables are not expanded during extraction.
65
66print OUT <<'!NO!SUBS!';
a687059c 67
9ae2b9a0 68$0 =~ s/^.*?(\w+)[\.\w]*$/$1/;
86a59229
WL
69
70# (p)sed - a stream editor
71# History: Aug 12 2000: Original version.
d16f50bd 72# Mar 25 2002: Rearrange generated Perl program.
86a59229
WL
73
74use strict;
75use integer;
76use Symbol;
8d063cd8 77
d83e3bda
JM
78=head1 NAME
79
ba34211e 80psed - a stream editor
d83e3bda
JM
81
82=head1 SYNOPSIS
83
ba34211e
JH
84 psed [-an] script [file ...]
85 psed [-an] [-e script] [-f script-file] [file ...]
86
87 s2p [-an] [-e script] [-f script-file]
d83e3bda
JM
88
89=head1 DESCRIPTION
90
86a59229
WL
91A stream editor reads the input stream consisting of the specified files
92(or standard input, if none are given), processes is line by line by
93applying a script consisting of edit commands, and writes resulting lines
94to standard output. The filename `C<->' may be used to read standard input.
95
96The edit script is composed from arguments of B<-e> options and
97script-files, in the given order. A single script argument may be specified
98as the first parameter.
99
100If this program is invoked with the name F<s2p>, it will act as a
101sed-to-Perl translator. See L<"sed Script Translation">.
d83e3bda 102
86a59229 103B<sed> returns an exit code of 0 on success or >0 if an error occurred.
d83e3bda 104
86a59229 105=head1 OPTIONS
d83e3bda 106
86a59229 107=over 4
d83e3bda 108
86a59229 109=item B<-a>
d83e3bda 110
86a59229
WL
111A file specified as argument to the B<w> edit command is by default
112opened before input processing starts. Using B<-a>, opening of such
113files is delayed until the first line is actually written to the file.
114
115=item B<-e> I<script>
116
117The editing commands defined by I<script> are appended to the script.
118Multiple commands must be separated by newlines.
119
120=item B<-f> I<script-file>
121
122Editing commands from the specified I<script-file> are read and appended
123to the script.
d83e3bda
JM
124
125=item B<-n>
126
86a59229
WL
127By default, a line is written to standard output after the editing script
128has been applied to it. The B<-n> option suppresses automatic printing.
129
130=back
131
132=head1 COMMANDS
133
134B<sed> command syntax is defined as
135
136Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
137
138with whitespace being permitted before or after addresses, and between
139the function character and the argument. The I<address>es and the
140address inverter (C<!>) are used to restrict the application of a
141command to the selected line(s) of input.
142
143Each command must be on a line of its own, except where noted in
144the synopses below.
145
146The edit cycle performed on each input line consist of reading the line
147(without its trailing newline character) into the I<pattern space>,
148applying the applicable commands of the edit script, writing the final
149contents of the pattern space and a newline to the standard output.
150A I<hold space> is provided for saving the contents of the
151pattern space for later use.
152
153=head2 Addresses
154
155A sed address is either a line number or a pattern, which may be combined
156arbitrarily to construct ranges. Lines are numbered across all input files.
157
158Any address may be followed by an exclamation mark (`C<!>'), selecting
159all lines not matching that address.
d83e3bda 160
86a59229 161=over 4
d83e3bda 162
86a59229
WL
163=item I<number>
164
165The line with the given number is selected.
166
167=item B<$>
168
169A dollar sign (C<$>) is the line number of the last line of the input stream.
170
171=item B</>I<regular expression>B</>
172
173A pattern address is a basic regular expression (see
174L<"Basic Regular Expressions">), between the delimiting character C</>.
175Any other character except C<\> or newline may be used to delimit a
176pattern address when the initial delimiter is prefixed with a
177backslash (`C<\>').
d83e3bda
JM
178
179=back
180
86a59229 181If no address is given, the command selects every line.
d83e3bda 182
86a59229
WL
183If one address is given, it selects the line (or lines) matching the
184address.
d83e3bda 185
86a59229
WL
186Two addresses select a range that begins whenever the first address
187matches, and ends (including that line) when the second address matches.
188If the first (second) address is a matching pattern, the second
189address is not applied to the very same line to determine the end of
190the range. Likewise, if the second address is a matching pattern, the
191first address is not applied to the very same line to determine the
192begin of another range. If both addresses are line numbers,
193and the second line number is less than the first line number, then
194only the first line is selected.
d83e3bda 195
d83e3bda 196
86a59229 197=head2 Functions
d83e3bda 198
86a59229
WL
199The maximum permitted number of addresses is indicated with each
200function synopsis below.
d83e3bda 201
86a59229
WL
202The argument I<text> consists of one or more lines following the command.
203Embedded newlines in I<text> must be preceded with a backslash. Other
204backslashes in I<text> are deleted and the following character is taken
205literally.
d83e3bda 206
86a59229 207=over 4
d83e3bda 208
86a59229
WL
209=cut
210
211my %ComTab;
d16f50bd 212my %GenKey;
86a59229
WL
213#--------------------------------------------------------------------------
214$ComTab{'a'}=[ 1, 'txt', \&Emit, '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
215
216=item [1addr]B<a\> I<text>
217
218Write I<text> (which must start on the line following the command)
219to standard output immediately before reading the next line
220of input, either by executing the B<N> function or by beginning a new cycle.
221
222=cut
223
224#--------------------------------------------------------------------------
225$ComTab{'b'}=[ 2, 'str', \&Branch, '{ goto XXX; }' ]; #ok
226
227=item [2addr]B<b> [I<label>]
228
229Branch to the B<:> function with the specified I<label>. If no label
230is given, branch to the end of the script.
231
232=cut
233
234#--------------------------------------------------------------------------
235$ComTab{'c'}=[ 2, 'txt', \&Change, <<'-X-' ]; #ok
236{ print <<'TheEnd'; } $doPrint = 0; goto EOS;
237-X-
238### continue OK => next CYCLE;
239
240=item [2addr]B<c\> I<text>
241
242The line, or range of lines, selected by the address is deleted.
243The I<text> (which must start on the line following the command)
244is written to standard output. With an address range, this occurs at
245the end of the range.
246
247=cut
248
249#--------------------------------------------------------------------------
250$ComTab{'d'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
251{ $doPrint = 0;
252 goto EOS;
253}
254-X-
255### continue OK => next CYCLE;
256
257=item [2addr]B<d>
258
259Deletes the pattern space and starts the next cycle.
260
261=cut
262
263#--------------------------------------------------------------------------
264$ComTab{'D'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
265{ s/^.*\n?//;
266 if(length($_)){ goto BOS } else { goto EOS }
267}
268-X-
269### continue OK => next CYCLE;
270
271=item [2addr]B<D>
272
273Deletes the pattern space through the first embedded newline or to the end.
274If the pattern space becomes empty, a new cycle is started, otherwise
275execution of the script is restarted.
276
277=cut
278
279#--------------------------------------------------------------------------
280$ComTab{'g'}=[ 2, '', \&Emit, '{ $_ = $Hold };' ]; #ok
281
282=item [2addr]B<g>
283
284Replace the contents of the pattern space with the hold space.
285
286=cut
287
288#--------------------------------------------------------------------------
289$ComTab{'G'}=[ 2, '', \&Emit, '{ $_ .= "\n"; $_ .= $Hold };' ]; #ok
290
291=item [2addr]B<G>
292
293Append a newline and the contents of the hold space to the pattern space.
294
295=cut
296
297#--------------------------------------------------------------------------
298$ComTab{'h'}=[ 2, '', \&Emit, '{ $Hold = $_ }' ]; #ok
299
300=item [2addr]B<h>
301
302Replace the contents of the hold space with the pattern space.
303
304=cut
305
306#--------------------------------------------------------------------------
307$ComTab{'H'}=[ 2, '', \&Emit, '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
308
309=item [2addr]B<H>
310
311Append a newline and the contents of the pattern space to the hold space.
312
313=cut
314
315#--------------------------------------------------------------------------
316$ComTab{'i'}=[ 1, 'txt', \&Emit, '{ print <<'."'TheEnd' }\n" ]; #ok
317
318=item [1addr]B<i\> I<text>
319
320Write the I<text> (which must start on the line following the command)
321to standard output.
322
323=cut
324
325#--------------------------------------------------------------------------
326$ComTab{'l'}=[ 2, '', \&Emit, '{ _l() }' ]; #okUTF8
327
328=item [2addr]B<l>
329
330Print the contents of the pattern space: non-printable characters are
331shown in C-style escaped form; long lines are split and have a trailing
332`C<\>' at the point of the split; the true end of a line is marked with
333a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
334BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
335octal number for all other non-printable characters.
336
337=cut
338
339#--------------------------------------------------------------------------
340$ComTab{'n'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
341{ print $_, "\n" if $doPrint;
d16f50bd 342 printQ() if @Q;
86a59229
WL
343 $CondReg = 0;
344 last CYCLE unless getsARGV();
345 chomp();
346}
347-X-
348
349=item [2addr]B<n>
350
351If automatic printing is enabled, write the pattern space to the standard
352output. Replace the pattern space with the next line of input. If
353there is no more input, processing is terminated.
354
355=cut
356
357#--------------------------------------------------------------------------
358$ComTab{'N'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
d16f50bd 359{ printQ() if @Q;
86a59229
WL
360 $CondReg = 0;
361 last CYCLE unless getsARGV( $h );
362 chomp( $h );
363 $_ .= "\n$h";
364}
365-X-
366
367=item [2addr]B<N>
368
369Append a newline and the next line of input to the pattern space. If
370there is no more input, processing is terminated.
371
372=cut
373
374#--------------------------------------------------------------------------
375$ComTab{'p'}=[ 2, '', \&Emit, '{ print $_, "\n"; }' ]; #ok
376
377=item [2addr]B<p>
378
379Print the pattern space to the standard output. (Use the B<-n> option
380to suppress automatic printing at the end of a cycle if you want to
381avoid double printing of lines.)
382
383=cut
384
385#--------------------------------------------------------------------------
386$ComTab{'P'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
387{ if( /^(.*)/ ){ print $1, "\n"; } }
388-X-
389
390=item [2addr]B<P>
391
392Prints the pattern space through the first embedded newline or to the end.
393
394=cut
395
396#--------------------------------------------------------------------------
397$ComTab{'q'}=[ 1, '', \&Emit, <<'-X-' ]; #ok
398{ print $_, "\n" if $doPrint;
399 last CYCLE;
400}
401-X-
402
403=item [1addr]B<q>
404
405Branch to the end of the script and quit without starting a new cycle.
406
407=cut
408
409#--------------------------------------------------------------------------
410$ComTab{'r'}=[ 1, 'str', \&Emit, "{ _r( '-X-' ) }" ]; #ok
86a59229
WL
411
412=item [1addr]B<r> I<file>
413
414Copy the contents of the I<file> to standard output immediately before
415the next attempt to read a line of input. Any error encountered while
416reading I<file> is silently ignored.
417
418=cut
419
420#--------------------------------------------------------------------------
421$ComTab{'s'}=[ 2, 'sub', \&Emit, '' ]; #ok
422
423=item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
424
425Substitute the I<replacement> string for the first substring in
426the pattern space that matches the I<regular expression>.
427Any character other than backslash or newline can be used instead of a
428slash to delimit the regular expression and the replacement.
429To use the delimiter as a literal character within the regular expression
430and the replacement, precede the character by a backslash (`C<\>').
431
432Literal newlines may be embedded in the replacement string by
433preceding a newline with a backslash.
434
435Within the replacement, an ampersand (`C<&>') is replaced by the string
436matching the regular expression. The strings `C<\1>' through `C<\9>' are
437replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
438To get a literal `C<&>' or `C<\>' in the replacement text, precede it
439by a backslash.
440
441The following I<flags> modify the behaviour of the B<s> command:
442
443=over 8
444
445=item B<g>
446
447The replacement is performed for all matching, non-overlapping substrings
448of the pattern space.
449
450=item B<1>..B<9>
451
452Replace only the n-th matching substring of the pattern space.
453
454=item B<p>
455
456If the substitution was made, print the new value of the pattern space.
457
458=item B<w> I<file>
459
460If the substitution was made, write the new value of the pattern space
461to the specified file.
462
463=back
464
465=cut
466
467#--------------------------------------------------------------------------
468$ComTab{'t'}=[ 2, 'str', \&Branch, '{ goto XXX if _t() }' ]; #ok
469
470=item [2addr]B<t> [I<label>]
471
472Branch to the B<:> function with the specified I<label> if any B<s>
473substitutions have been made since the most recent reading of an input line
474or execution of a B<t> function. If no label is given, branch to the end of
475the script.
476
477
478=cut
479
480#--------------------------------------------------------------------------
481$ComTab{'w'}=[ 2, 'str', \&Write, "{ _w( '-X-' ) }" ]; #ok
482
483=item [2addr]B<w> I<file>
484
485The contents of the pattern space are written to the I<file>.
486
487=cut
488
489#--------------------------------------------------------------------------
490$ComTab{'x'}=[ 2, '', \&Emit, '{ ($Hold, $_) = ($_, $Hold) }' ]; #ok
491
492=item [2addr]B<x>
493
494Swap the contents of the pattern space and the hold space.
495
496=cut
497
498#--------------------------------------------------------------------------
499$ComTab{'y'}=[ 2, 'tra', \&Emit, '' ]; #ok
500=item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
501
502In the pattern space, replace all characters occuring in I<string1> by the
503character at the corresponding position in I<string2>. It is possible
504to use any character (other than a backslash or newline) instead of a
505slash to delimit the strings. Within I<string1> and I<string2>, a
506backslash followed by any character other than a newline is that literal
507character, and a backslash followed by an `n' is replaced by a newline
508character.
509
510=cut
d83e3bda 511
86a59229
WL
512#--------------------------------------------------------------------------
513$ComTab{'='}=[ 1, '', \&Emit, '{ print "$.\n" }' ]; #ok
514
515=item [1addr]B<=>
516
517Prints the current line number on the standard output.
518
519=cut
520
521#--------------------------------------------------------------------------
522$ComTab{':'}=[ 0, 'str', \&Label, '' ]; #ok
523
524=item [0addr]B<:> [I<label>]
525
526The command specifies the position of the I<label>. It has no other effect.
527
528=cut
529
530#--------------------------------------------------------------------------
531$ComTab{'{'}=[ 2, '', \&BeginBlock, '{' ]; #ok
532$ComTab{'}'}=[ 0, '', \&EndBlock, ';}' ]; #ok
533# ';' to avoid warning on empty {}-block
534
535=item [2addr]B<{> [I<command>]
536
537=item [0addr]B<}>
538
539These two commands begin and end a command list. The first command may
540be given on the same line as the opening B<{> command. The commands
541within the list are jointly selected by the address(es) given on the
542B<{> command (but may still have individual addresses).
543
544=cut
545
546#--------------------------------------------------------------------------
547$ComTab{'#'}=[ 0, 'str', \&Comment, '' ]; #ok
548
549=item [0addr]B<#> [I<comment>]
550
551The entire line is ignored (treated as a comment). If, however, the first
552two characters in the script are `C<#n>', automatic printing of output is
553suppressed, as if the B<-n> option were given on the command line.
554
555=back
556
557=cut
558
559use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
560
561my $useDEBUG = exists( $ENV{PSEDDEBUG} );
562my $useEXTBRE = $ENV{PSEDEXTBRE} || '';
563$useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
564
565my $doAutoPrint = 1; # automatic printing of pattern space (-n => 0)
566my $doOpenWrite = 1; # open w command output files at start (-a => 0)
567my $svOpenWrite = 0; # save $doOpenWrite
568my $doGenerate = $0 eq 's2p';
569
570# Collected and compiled script
571#
d16f50bd
WL
572my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code, $Func );
573$Code = '';
86a59229
WL
574
575##################
576# Compile Time
577#
578# Labels
579#
580# Error handling
581#
582sub Warn($;$){
583 my( $msg, $loc ) = @_;
584 $loc ||= '';
585 $loc .= ': ' if length( $loc );
586 warn( "$0: $loc$msg\n" );
587}
588
589$labNum = 0;
590sub newLabel(){
591 return 'L_'.++$labNum;
592}
593
594# safeHere: create safe here delimiter and modify opcode and argument
595#
596sub safeHere($$){
597 my( $codref, $argref ) = @_;
598 my $eod = 'EOD000';
599 while( $$argref =~ /^$eod$/m ){
600 $eod++;
601 }
602 $$codref =~ s/TheEnd/$eod/e;
603 $$argref .= "$eod\n";
604}
605
606# Emit: create address logic and emit command
607#
608sub Emit($$$$$$){
609 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
610 my $cond = '';
611 if( defined( $addr1 ) ){
612 if( defined( $addr2 ) ){
613 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
614 } else {
615 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
616 }
617 $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
618 }
619
620 if( $opcode eq '' ){
621 $Code .= "$cond$arg\n";
622
623 } elsif( $opcode =~ s/-X-/$arg/e ){
624 $Code .= "$cond$opcode\n";
625
626 } elsif( $opcode =~ /TheEnd/ ){
627 safeHere( \$opcode, \$arg );
628 $Code .= "$cond$opcode$arg";
629
630 } else {
631 $Code .= "$cond$opcode\n";
632 }
633 0;
634}
635
636# Write (w command, w flag): store pathname
637#
638sub Write($$$$$$){
639 my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
640 $wFiles{$path} = '';
641 Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
642}
643
644
645# Label (: command): label definition
646#
647sub Label($$$$$$){
648 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
649 my $rc = 0;
650 $lab =~ s/\s+//;
651 if( length( $lab ) ){
652 my $h;
653 if( ! exists( $Label{$lab} ) ){
654 $h = $Label{$lab}{name} = newLabel();
655 } else {
656 $h = $Label{$lab}{name};
657 if( exists( $Label{$lab}{defined} ) ){
658 my $dl = $Label{$lab}{defined};
659 Warn( "duplicate label $lab (first defined at $dl)", $fl );
660 $rc = 1;
661 }
662 }
663 $Label{$lab}{defined} = $fl;
664 $Code .= "$h:;\n";
665 }
666 $rc;
667}
668
669# BeginBlock ({ command): push block start
670#
671sub BeginBlock($$$$$$){
672 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
673 push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
674 Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
675}
676
677# EndBlock (} command): check proper nesting
678#
679sub EndBlock($$$$$$){
680 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
681 my $rc;
682 my $jcom = pop( @BlockStack );
683 if( defined( $jcom ) ){
684 $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
685 } else {
686 Warn( "unexpected `}'", $fl );
687 $rc = 1;
688 }
689 $rc;
690}
691
692# Branch (t, b commands): check or create label, substitute default
693#
694sub Branch($$$$$$){
695 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
696 $lab =~ s/\s+//; # no spaces at end
697 my $h;
698 if( length( $lab ) ){
699 if( ! exists( $Label{$lab} ) ){
700 $h = $Label{$lab}{name} = newLabel();
701 } else {
702 $h = $Label{$lab}{name};
703 }
704 push( @{$Label{$lab}{used}}, $fl );
705 } else {
706 $h = 'EOS';
707 }
708 $opcode =~ s/XXX/$h/e;
709 Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
710}
711
712# Change (c command): is special due to range end watching
713#
714sub Change($$$$$$){
715 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
716 my $kwd = $negated ? 'unless' : 'if';
717 if( defined( $addr2 ) ){
718 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
719 if( ! $negated ){
720 $addr1 = '$icnt = ('.$addr1.')';
721 $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
722 }
723 } else {
724 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
725 }
726 safeHere( \$opcode, \$arg );
727 $Code .= "$kwd( $addr1 ){\n $opcode$arg}\n";
728 0;
729}
730
731
732# Comment (# command): A no-op. Who would've thought that!
733#
734sub Comment($$$$$$){
735 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
736### $Code .= "# $arg\n";
737 0;
738}
739
740
741sub stripRegex($$){
742 my( $del, $sref ) = @_;
743 my $regex = $del;
744 print "stripRegex:$del:$$sref:\n" if $useDEBUG;
745 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
746 my $sl = $2;
747 $regex .= $1.$sl.$del;
748 if( length( $sl ) % 2 == 0 ){
749 return $regex;
750 }
751 $regex .= $3;
752 }
753 undef();
754}
755
756# stripTrans: take a <del> terminated string from y command
757# honoring and cleaning up of \-escaped <del>'s
758#
759sub stripTrans($$){
760 my( $del, $sref ) = @_;
761 my $t = '';
762 print "stripTrans:$del:$$sref:\n" if $useDEBUG;
763 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
764 my $sl = $2;
765 $t .= $1;
766 if( length( $sl ) % 2 == 0 ){
767 $t .= $sl;
768 $t =~ s/\\\\/\\/g;
769 return $t;
770 }
771 chop( $sl );
772 $t .= $sl.$del.$3;
773 }
774 undef();
775}
776
777# makey - construct Perl y/// from sed y///
778#
779sub makey($$$){
780 my( $fr, $to, $fl ) = @_;
781 my $error = 0;
782
783 # Ensure that any '-' is up front.
784 # Diagnose duplicate contradicting mappings
785 my %tr;
786 for( my $i = 0; $i < length($fr); $i++ ){
787 my $fc = substr($fr,$i,1);
788 my $tc = substr($to,$i,1);
789 if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
790 Warn( "ambiguos translation for character `$fc' in `y' command",
791 $fl );
792 $error++;
793 }
794 $tr{$fc} = $tc;
795 }
796 $fr = $to = '';
797 if( exists( $tr{'-'} ) ){
798 ( $fr, $to ) = ( '-', $tr{'-'} );
799 delete( $tr{'-'} );
800 } else {
801 $fr = $to = '';
802 }
803 # might just as well sort it...
804 for my $fc ( sort keys( %tr ) ){
805 $fr .= $fc;
806 $to .= $tr{$fc};
807 }
808 # make embedded delimiters and newlines safe
809 $fr =~ s/([{}])/\$1/g;
810 $to =~ s/([{}])/\$1/g;
811 $fr =~ s/\n/\\n/g;
812 $to =~ s/\n/\\n/g;
813 return $error ? undef() : "{ y{$fr}{$to}; }";
814}
815
816######
817# makes - construct Perl s/// from sed s///
818#
819sub makes($$$$$$$){
820 my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
821
822 # make embedded newlines safe
823 $regex =~ s/\n/\\n/g;
824 $subst =~ s/\n/\\n/g;
825
826 my $code;
827 # n-th occurrence
828 #
829 if( length( $nmatch ) ){
830 $code = <<TheEnd;
831{ \$n = $nmatch;
832 while( --\$n && ( \$s = m ${regex}g ) ){}
833 \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
834 \$CondReg ||= \$s;
835TheEnd
836 } else {
837 $code = <<TheEnd;
838{ \$s = s ${regex}${subst}s${global};
839 \$CondReg ||= \$s;
840TheEnd
841 }
842 if( $print ){
843 $code .= ' print $_, "\n" if $s;'."\n";
844 }
845 if( defined( $path ) ){
846 $wFiles{$path} = '';
847 $code .= " _w( '$path' ) if \$s;\n";
d16f50bd 848 $GenKey{'w'} = 1;
86a59229
WL
849 }
850 $code .= "}";
851}
852
853=head1 BASIC REGULAR EXPRESSIONS
854
855A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
856of I<atoms>, for matching parts of a string, and I<bounds>, specifying
857repetitions of a preceding atom.
858
859=head2 Atoms
860
861The possible atoms of a BRE are: B<.>, matching any single character;
862B<^> and B<$>, matching the null string at the beginning or end
863of a string, respectively; a I<bracket expressions>, enclosed
864in B<[> and B<]> (see below); and any single character with no
865other significance (matching that character). A B<\> before one
866of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
867after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
868becomes an atom and establishes the target for a I<backreference>,
869consisting of the substring that actually matches the enclosed atoms.
870Finally, B<\> followed by one of the digits B<0> through B<9> is a
871backreference.
872
873A B<^> that is not first, or a B<$> that is not last does not have
874a special significance and need not be preceded by a backslash to
875become literal. The same is true for a B<]>, that does not terminate
876a bracket expression.
877
878An unescaped backslash cannot be last in a BRE.
879
880=head2 Bounds
881
882The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
883atom; B<\{>I<count>B<\}>, specifying that many repetitions;
884B<\{>I<minimum>B<,\}>, giving a lower limit; and
885B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
886bound.
887
888A bound appearing as the first item in a BRE is taken literally.
889
890=head2 Bracket Expressions
891
892A I<bracket expression> is a list of characters, character ranges
893and character classes enclosed in B<[> and B<]> and matches any
894single character from the represented set of characters.
895
896A character range is written as two characters separated by B<-> and
897represents all characters (according to the character collating sequence)
898that are not less than the first and not greater than the second.
899(Ranges are very collating-sequence-dependent, and portable programs
900should avoid relying on them.)
901
902A character class is one of the class names
903
904 alnum digit punct
905 alpha graph space
906 blank lower upper
907 cntrl print xdigit
908
909enclosed in B<[:> and B<:]> and represents the set of characters
910as defined in ctype(3).
911
912If the first character after B<[> is B<^>, the sense of matching is
913inverted.
914
915To include a literal `C<^>', place it anywhere else but first. To
916include a literal 'C<]>' place it first or immediately after an
917initial B<^>. To include a literal `C<->' make it the first (or
918second after B<^>) or last character, or the second endpoint of
919a range.
920
921The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
922match the null string at the beginning and end of a word respectively.
923(Note that neither is identical to Perl's `\b' atom.)
924
925=head2 Additional Atoms
926
927Since some sed implementations provide additional regular expression
928atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
929the following backslash escapes:
930
931=over 4
932
933=item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
934
935=item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
936
937=item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
938
939=item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
940
941=item B<\y> Match the empty string at a word boundary.
942
943=item B<\B> Match the empty string between any two either word or non-word characters.
944
945=back
946
947To enable this feature, the environment variable PSEDEXTBRE must be set
948to a string containing the requested characters, e.g.:
949C<PSEDEXTBRE='E<lt>E<gt>wW'>.
950
951=cut
952
953#####
954# bre2p - convert BRE to Perl RE
955#
956sub peek(\$$){
957 my( $pref, $ic ) = @_;
958 $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
959}
960
961sub bre2p($$$){
962 my( $del, $pat, $fl ) = @_;
963 my $led = $del;
964 $led =~ tr/{([</})]>/;
965 $led = '' if $led eq $del;
966
967 $pat = substr( $pat, 1, length($pat) - 2 );
968 my $res = '';
969 my $bracklev = 0;
970 my $backref = 0;
971 my $parlev = 0;
972 for( my $ic = 0; $ic < length( $pat ); $ic++ ){
973 my $c = substr( $pat, $ic, 1 );
974 if( $c eq '\\' ){
975 ### backslash escapes
976 my $nc = peek($pat,$ic);
977 if( $nc eq '' ){
978 Warn( "`\\' cannot be last in pattern", $fl );
979 return undef();
980 }
981 $ic++;
982 if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
983 $res .= "\\$del";
984
985 } elsif( $nc =~ /([[.*\\n])/ ){
986 ## check for \-escaped magics and \n:
987 ## \[ \. \* \\ \n stay as they are
988 $res .= '\\'.$nc;
989
990 } elsif( $nc eq '(' ){ ## \( => (
991 $parlev++;
992 $res .= '(';
993
994 } elsif( $nc eq ')' ){ ## \) => )
995 $parlev--;
996 $backref++;
997 if( $parlev < 0 ){
998 Warn( "unmatched `\\)'", $fl );
999 return undef();
1000 }
1001 $res .= ')';
1002
1003 } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
1004 my $endpos = index( $pat, '\\}', $ic );
1005 if( $endpos < 0 ){
1006 Warn( "unmatched `\\{'", $fl );
1007 return undef();
1008 }
1009 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
1010 $ic = $endpos + 1;
1011
1012 if( $res =~ /^\^?$/ ){
1013 $res .= "\\{$rep\}";
1014 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
1015 my $min = $1;
1016 my $com = $2 || '';
1017 my $max = $3;
1018 if( length( $max ) ){
1019 if( $max < $min ){
1020 Warn( "maximum less than minimum in `\\{$rep\\}'",
1021 $fl );
1022 return undef();
1023 }
1024 } else {
1025 $max = '';
1026 }
1027 # simplify some
1028 if( $min == 0 && $max eq '1' ){
1029 $res .= '?';
1030 } elsif( $min == 1 && "$com$max" eq ',' ){
1031 $res .= '+';
1032 } elsif( $min == 0 && "$com$max" eq ',' ){
1033 $res .= '*';
1034 } else {
1035 $res .= "{$min$com$max}";
1036 }
1037 } else {
1038 Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1039 return undef();
1040 }
1041
1042 } elsif( $nc =~ /^[1-9]$/ ){
1043 ## \1 .. \9 => \1 .. \9, but check for a following digit
1044 if( $nc > $backref ){
1045 Warn( "invalid backreference ($nc)", $fl );
1046 return undef();
1047 }
1048 $res .= "\\$nc";
1049 if( peek($pat,$ic) =~ /[0-9]/ ){
1050 $res .= '(?:)';
1051 }
1052
1053 } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1054 ## extensions - at most <>wWyB - not in POSIX
1055 if( $nc eq '<' ){ ## \< => \b(?=\w), be precise
1056 $res .= '\\b(?<=\\W)';
1057 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1058 $res .= '\\b(?=\\W)';
1059 } elsif( $nc eq 'y' ){ ## \y => \b
1060 $res .= '\\b';
1061 } else { ## \B, \w, \W remain the same
1062 $res .= "\\$nc";
1063 }
1064 } elsif( $nc eq $led ){
1065 ## \<closing bracketing-delimiter> - keep '\'
1066 $res .= "\\$nc";
1067
1068 } else { ## \ <char> => <char> ("as if `\' were not present")
1069 $res .= $nc;
1070 }
1071
1072 } elsif( $c eq '.' ){ ## . => .
1073 $res .= $c;
1074
1075 } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1076 if( $res =~ /^\^?$/ ){
1077 $res .= '\\*';
1078 } elsif( substr( $res, -1, 1 ) ne '*' ){
1079 $res .= $c;
1080 }
1081
1082 } elsif( $c eq '[' ){
1083 ## parse []: [^...] [^]...] [-...]
1084 my $add = '[';
1085 if( peek($pat,$ic) eq '^' ){
1086 $ic++;
1087 $add .= '^';
1088 }
1089 my $nc = peek($pat,$ic);
1090 if( $nc eq ']' || $nc eq '-' ){
1091 $add .= $nc;
1092 $ic++;
1093 }
1094 # check that [ is not trailing
1095 if( $ic >= length( $pat ) - 1 ){
1096 Warn( "unmatched `['", $fl );
1097 return undef();
1098 }
1099 # look for [:...:] and x-y
1100 my $rstr = substr( $pat, $ic+1 );
1101 if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1102 my $cnt = $1;
1103 $ic += length( $cnt );
1104 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1105 # try some simplifications
1106 my $red = $cnt;
1107 if( $red =~ s/0-9// ){
1108 $cnt = $red.'\d';
1109 if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1110 $cnt = $red.'\w';
1111 }
1112 }
1113 $add .= $cnt;
1114
1115 # POSIX 1003.2 has this (optional) for begin/end word
1116 $add = '\\b(?=\\W)' if $add eq '[[:<:]]';
1117 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1118
1119 }
1120
1121 ## may have a trailing `-' before `]'
1122 if( $ic < length($pat) - 1 &&
1123 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1124 $ic += length( $1 );
1125 $add .= $1;
1126 # another simplification
1127 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1128 $res .= $add;
1129 } else {
1130 Warn( "unmatched `['", $fl );
1131 return undef();
1132 }
3cb6de81 1133
86a59229
WL
1134 } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1135 $res .= "\\$c";
d83e3bda 1136
86a59229
WL
1137 } elsif( $c eq ']' ){ ## unmatched ] is not magic
1138 $res .= ']';
d83e3bda 1139
86a59229
WL
1140 } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1141 $res .= "\\$c";
d83e3bda 1142
86a59229
WL
1143 } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1144 $res .= length( $res ) ? '\\^' : '^';
d83e3bda 1145
86a59229
WL
1146 } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1147 $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
8d063cd8 1148
86a59229
WL
1149 } else {
1150 $res .= $c;
1151 }
8d063cd8 1152 }
0a12ae7d 1153
86a59229
WL
1154 if( $parlev ){
1155 Warn( "unmatched `\\('", $fl );
1156 return undef();
8d063cd8 1157 }
0a12ae7d 1158
86a59229
WL
1159 # final cleanup: eliminate raw HTs
1160 $res =~ s/\t/\\t/g;
1161 return $del . $res . ( $led ? $led : $del );
1162}
0a12ae7d 1163
86a59229
WL
1164
1165#####
1166# sub2p - convert sed substitution to Perl substitution
1167#
1168sub sub2p($$$){
1169 my( $del, $subst, $fl ) = @_;
1170 my $led = $del;
1171 $led =~ tr/{([</})]>/;
1172 $led = '' if $led eq $del;
1173
1174 $subst = substr( $subst, 1, length($subst) - 2 );
1175 my $res = '';
1176
1177 for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1178 my $c = substr( $subst, $ic, 1 );
1179 if( $c eq '\\' ){
1180 ### backslash escapes
1181 my $nc = peek($subst,$ic);
1182 if( $nc eq '' ){
1183 Warn( "`\\' cannot be last in substitution", $fl );
1184 return undef();
1185 }
1186 $ic++;
1187 if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1188 $res .= '\\' . $nc;
1189 } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1190 $res .= '${' . $nc . '}';
1191 } else { ## everything else (includes &): omit \
1192 $res .= $nc;
1193 }
1194 } elsif( $c eq '&' ){ ## & => $&
1195 $res .= '$&';
1196 } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1197 $res .= '\\' . $c;
1198 } else {
1199 $res .= $c;
8d063cd8 1200 }
8d063cd8
LW
1201 }
1202
86a59229
WL
1203 # final cleanup: eliminate raw HTs
1204 $res =~ s/\t/\\t/g;
1205 return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1206}
0a12ae7d 1207
86a59229
WL
1208
1209sub Parse(){
1210 my $error = 0;
1211 my( $pdef, $pfil, $plin );
1212 for( my $icom = 0; $icom < @Commands; $icom++ ){
1213 my $cmd = $Commands[$icom];
1214 print "Parse:$cmd:\n" if $useDEBUG;
1215 $cmd =~ s/^\s+//;
1216 next unless length( $cmd );
1217 my $scom = $icom;
1218 if( exists( $Defined{$icom} ) ){
1219 $pdef = $Defined{$icom};
1220 if( $pdef =~ /^ #(\d+)/ ){
1221 $pfil = 'expression #';
1222 $plin = $1;
1223 } else {
1224 $pfil = "$pdef l.";
1225 $plin = 1;
1226 }
1227 } else {
1228 $plin++;
1229 }
1230 my $fl = "$pfil$plin";
1231
1232 # insert command as comment in gnerated code
1233 #
1234 $Code .= "# $cmd\n" if $doGenerate;
1235
1236 # The Address(es)
1237 #
1238 my( $negated, $naddr, $addr1, $addr2 );
1239 $naddr = 0;
1240 if( $cmd =~ s/^(\d+)\s*// ){
1241 $addr1 = "$1"; $naddr++;
1242 } elsif( $cmd =~ s/^\$\s*// ){
1243 $addr1 = 'eofARGV()'; $naddr++;
1244 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1245 my $del = $1;
1246 my $regex = stripRegex( $del, \$cmd );
1247 if( defined( $regex ) ){
1248 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1249 $naddr++;
1250 } else {
1251 Warn( "malformed regex, 1st address", $fl );
1252 $error++;
1253 next;
1254 }
1255 }
1256 if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1257 if( $cmd =~ s/^(\d+)\s*// ){
1258 $addr2 = "$1"; $naddr++;
1259 } elsif( $cmd =~ s/^\$\s*// ){
1260 $addr2 = 'eofARGV()'; $naddr++;
1261 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1262 my $del = $1;
1263 my $regex = stripRegex( $del, \$cmd );
1264 if( defined( $regex ) ){
1265 $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1266 $naddr++;
1267 } else {
1268 Warn( "malformed regex, 2nd address", $fl );
1269 $error++;
1270 next;
1271 }
1272 } else {
1273 Warn( "invalid address after `,'", $fl );
1274 $error++;
1275 next;
1276 }
1277 }
1278
1279 # address modifier `!'
1280 #
1281 $negated = $cmd =~ s/^!\s*//;
1282 if( defined( $addr1 ) ){
1283 print "Parse: addr1=$addr1" if $useDEBUG;
1284 if( defined( $addr2 ) ){
1285 print ", addr2=$addr2 " if $useDEBUG;
1286 # both numeric and addr1 > addr2 => eliminate addr2
1287 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1288 $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1289 }
9ef589d8 1290 }
86a59229
WL
1291 print 'negated' if $useDEBUG && $negated;
1292 print " command:$cmd\n" if $useDEBUG;
1293
1294 # The Command
1295 #
1296 if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1297 my $h = substr( $cmd, 0, 1 );
1298 Warn( "unknown command `$h'", $fl );
1299 $error++;
8d063cd8
LW
1300 next;
1301 }
86a59229 1302 my $key = $1;
8d063cd8 1303
86a59229 1304 my $tabref = $ComTab{$key};
d16f50bd 1305 $GenKey{$key} = 1;
86a59229
WL
1306 if( $naddr > $tabref->[0] ){
1307 Warn( "excess address(es)", $fl );
1308 $error++;
8d063cd8
LW
1309 next;
1310 }
1311
86a59229
WL
1312 my $arg = '';
1313 if( $tabref->[1] eq 'str' ){
1314 # take remainder - don't care if it is empty
1315 $arg = $cmd;
1316 $cmd = '';
1317
1318 } elsif( $tabref->[1] eq 'txt' ){
1319 # multi-line text
1320 my $goon = $cmd =~ /(.*)\\$/;
1321 if( length( $1 ) ){
1322 Warn( "extra characters after command ($cmd)", $fl );
1323 $error++;
8d063cd8 1324 }
86a59229
WL
1325 while( $goon ){
1326 $icom++;
1327 if( $icom > $#Commands ){
1328 Warn( "unexpected end of script", $fl );
1329 $error++;
1330 last;
1331 }
1332 $cmd = $Commands[$icom];
1333 $Code .= "# $cmd\n" if $doGenerate;
1334 $goon = $cmd =~ s/\\$//;
1335 $cmd =~ s/\\(.)/$1/g;
1336 $arg .= "\n" if length( $arg );
1337 $arg .= $cmd;
8d063cd8 1338 }
86a59229
WL
1339 $arg .= "\n" if length( $arg );
1340 $cmd = '';
8d063cd8 1341
86a59229
WL
1342 } elsif( $tabref->[1] eq 'sub' ){
1343 # s///
1344 if( ! length( $cmd ) ){
1345 Warn( "`s' command requires argument", $fl );
1346 $error++;
1347 next;
1348 }
1349 if( $cmd =~ s{^([^\\\n])}{} ){
1350 my $del = $1;
1351 my $regex = stripRegex( $del, \$cmd );
1352 if( ! defined( $regex ) ){
1353 Warn( "malformed regular expression", $fl );
1354 $error++;
1355 next;
a687059c 1356 }
86a59229
WL
1357 $regex = bre2p( $del, $regex, $fl );
1358
1359 # a trailing \ indicates embedded NL (in replacement string)
1360 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1361 $icom++;
1362 if( $icom > $#Commands ){
1363 Warn( "unexpected end of script", $fl );
1364 $error++;
1365 last;
8d063cd8 1366 }
86a59229
WL
1367 $cmd .= $Commands[$icom];
1368 $Code .= "# $Commands[$icom]\n" if $doGenerate;
9ef589d8 1369 }
86a59229
WL
1370
1371 my $subst = stripRegex( $del, \$cmd );
1372 if( ! defined( $regex ) ){
1373 Warn( "malformed substitution expression", $fl );
1374 $error++;
0a12ae7d
LW
1375 next;
1376 }
86a59229
WL
1377 $subst = sub2p( $del, $subst, $fl );
1378
1379 # parse s/// modifier: g|p|0-9|w <file>
1380 my( $global, $nmatch, $print, $write ) =
1381 ( '', '', 0, undef );
1382 while( $cmd =~ s/^([gp0-9])// ){
1383 $1 eq 'g' ? ( $global = 'g' ) :
1384 $1 eq 'p' ? ( $print = $1 ) : ( $nmatch .= $1 );
1385 }
1386 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1387 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1388 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1389 Warn( "conflicting flags `$global$nmatch'", $fl );
1390 $error++;
0a12ae7d
LW
1391 next;
1392 }
86a59229
WL
1393
1394 $arg = makes( $regex, $subst,
1395 $write, $global, $print, $nmatch, $fl );
1396 if( ! defined( $arg ) ){
1397 $error++;
8d063cd8
LW
1398 next;
1399 }
86a59229
WL
1400
1401 } else {
1402 Warn( "improper delimiter in s command", $fl );
1403 $error++;
1404 next;
1405 }
1406
1407 } elsif( $tabref->[1] eq 'tra' ){
1408 # y///
1409 # a trailing \ indicates embedded newline
1410 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1411 $icom++;
1412 if( $icom > $#Commands ){
1413 Warn( "unexpected end of script", $fl );
1414 $error++;
1415 last;
1416 }
1417 $cmd .= $Commands[$icom];
1418 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1419 }
1420 if( ! length( $cmd ) ){
1421 Warn( "`y' command requires argument", $fl );
1422 $error++;
1423 next;
1424 }
1425 my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1426 if( $d eq '\\' ){
1427 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1428 $error++;
1429 next;
1430 }
1431 my $fr = stripTrans( $d, \$cmd );
1432 if( ! defined( $fr ) || ! length( $cmd ) ){
1433 Warn( "malformed `y' command argument", $fl );
1434 $error++;
1435 next;
1436 }
1437 my $to = stripTrans( $d, \$cmd );
1438 if( ! defined( $to ) ){
1439 Warn( "malformed `y' command argument", $fl );
1440 $error++;
1441 next;
1442 }
1443 if( length($fr) != length($to) ){
1444 Warn( "string lengths in `y' command differ", $fl );
1445 $error++;
1446 next;
1447 }
1448 if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1449 $error++;
1450 next;
8d063cd8 1451 }
8d063cd8 1452
8d063cd8
LW
1453 }
1454
86a59229
WL
1455 # $cmd must be now empty - exception is {
1456 if( $cmd !~ /^\s*$/ ){
1457 if( $key eq '{' ){
1458 # dirty hack to process command on '{' line
1459 $Commands[$icom--] = $cmd;
1460 } else {
1461 Warn( "extra characters after command ($cmd)", $fl );
1462 $error++;
1463 next;
1464 }
8d063cd8
LW
1465 }
1466
86a59229
WL
1467 # Make Code
1468 #
1469 if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1470 $tabref->[3], $arg, $fl ) ){
1471 $error++;
8d063cd8 1472 }
86a59229 1473 }
8d063cd8 1474
86a59229
WL
1475 while( @BlockStack ){
1476 my $bl = pop( @BlockStack );
1477 Warn( "start of unterminated `{'", $bl );
1478 $error++;
1479 }
8d063cd8 1480
86a59229
WL
1481 for my $lab ( keys( %Label ) ){
1482 if( ! exists( $Label{$lab}{defined} ) ){
1483 for my $used ( @{$Label{$lab}{used}} ){
1484 Warn( "undefined label `$lab'", $used );
1485 $error++;
1486 }
8d063cd8 1487 }
86a59229 1488 }
8d063cd8 1489
86a59229
WL
1490 exit( 1 ) if $error;
1491}
8d063cd8 1492
8d063cd8 1493
86a59229
WL
1494##############
1495#### MAIN ####
1496##############
8d063cd8 1497
86a59229
WL
1498sub usage(){
1499 print STDERR "Usage: sed [-an] command [file...]\n";
1500 print STDERR " [-an] [-e command] [-f script-file] [file...]\n";
1501}
8d063cd8 1502
86a59229
WL
1503###################
1504# Here we go again...
1505#
1506my $expr = 0;
1507while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1508 my $opt = $1;
1509 my $arg = $2;
1510 shift( @ARGV );
1511 if( $opt eq 'e' ){
1512 if( length( $arg ) ){
1513 push( @Commands, split( "\n", $arg ) );
1514 } elsif( @ARGV ){
1515 push( @Commands, shift( @ARGV ) );
1516 } else {
1517 Warn( "option -e requires an argument" );
1518 usage();
1519 exit( 1 );
1520 }
1521 $expr++;
1522 $Defined{$#Commands} = " #$expr";
1523 next;
1524 }
1525 if( $opt eq 'f' ){
1526 my $path;
1527 if( length( $arg ) ){
1528 $path = $arg;
1529 } elsif( @ARGV ){
1530 $path = shift( @ARGV );
1531 } else {
1532 Warn( "option -f requires an argument" );
1533 usage();
1534 exit( 1 );
1535 }
1536 my $fst = $#Commands + 1;
1537 open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1538 my $cmd;
1539 while( defined( $cmd = <SCRIPT> ) ){
1540 chomp( $cmd );
1541 push( @Commands, $cmd );
1542 }
1543 close( SCRIPT );
1544 if( $#Commands >= $fst ){
1545 $Defined{$fst} = "$path";
8d063cd8 1546 }
86a59229
WL
1547 next;
1548 }
1549 if( $opt eq '-' && $arg eq '' ){
1550 last;
1551 }
1552 if( $opt eq 'h' || $opt eq '?' ){
1553 usage();
1554 exit( 0 );
1555 }
1556 if( $opt eq 'n' ){
1557 $doAutoPrint = 0;
1558 } elsif( $opt eq 'a' ){
1559 $doOpenWrite = 0;
1560 } else {
1561 Warn( "illegal option `$opt'" );
1562 usage();
1563 exit( 1 );
1564 }
1565 if( length( $arg ) ){
1566 unshift( @ARGV, "-$arg" );
1567 }
1568}
8d063cd8 1569
86a59229
WL
1570# A singleton command may be the 1st argument when there are no options.
1571#
1572if( @Commands == 0 ){
1573 if( @ARGV == 0 ){
1574 Warn( "no script command given" );
1575 usage();
1576 exit( 1 );
1577 }
1578 push( @Commands, split( "\n", shift( @ARGV ) ) );
1579 $Defined{0} = ' #1';
1580}
8d063cd8 1581
86a59229 1582print STDERR "Files: @ARGV\n" if $useDEBUG;
8d063cd8 1583
86a59229
WL
1584# generate leading code
1585#
d16f50bd 1586$Func = <<'[TheEnd]';
86a59229 1587
d16f50bd
WL
1588# openARGV: open 1st input file
1589#
86a59229
WL
1590sub openARGV(){
1591 unshift( @ARGV, '-' ) unless @ARGV;
1592 my $file = shift( @ARGV );
1593 open( ARG, "<$file" )
1594 || die( "$0: can't open $file for reading ($!)\n" );
1595 $isEOF = 0;
1596}
8d063cd8 1597
d16f50bd
WL
1598# getsARGV: Read another input line into argument (default: $_).
1599# Move on to next input file, and reset EOF flag $isEOF.
86a59229
WL
1600sub getsARGV(;\$){
1601 my $argref = @_ ? shift() : \$_;
1602 while( $isEOF || ! defined( $$argref = <ARG> ) ){
1603 close( ARG );
1604 return 0 unless @ARGV;
1605 my $file = shift( @ARGV );
1606 open( ARG, "<$file" )
1607 || die( "$0: can't open $file for reading ($!)\n" );
1608 $isEOF = 0;
1609 }
1610 1;
1611}
8d063cd8 1612
d16f50bd
WL
1613# eofARGV: end-of-file test
1614#
86a59229
WL
1615sub eofARGV(){
1616 return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1617}
8d063cd8 1618
d16f50bd
WL
1619# makeHandle: Generates another file handle for some file (given by its path)
1620# to be written due to a w command or an s command's w flag.
86a59229
WL
1621sub makeHandle($){
1622 my( $path ) = @_;
1623 my $handle;
1624 if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1625 $handle = $wFiles{$path} = gensym();
1626 if( $doOpenWrite ){
1627 if( ! open( $handle, ">$path" ) ){
1628 die( "$0: can't open $path for writing: ($!)\n" );
1629 }
9ef589d8 1630 }
86a59229
WL
1631 } else {
1632 $handle = $wFiles{$path};
1633 }
1634 return $handle;
1635}
9ef589d8 1636
d16f50bd
WL
1637# printQ: Print queued output which is either a string or a reference
1638# to a pathname.
86a59229
WL
1639sub printQ(){
1640 for my $q ( @Q ){
1641 if( ref( $q ) ){
d16f50bd 1642 # flush open w files so that reading this file gets it all
86a59229
WL
1643 if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1644 open( $wFiles{$$q}, ">>$$q" );
1645 }
d16f50bd 1646 # copy file to stdout: slow, but safe
86a59229 1647 if( open( RF, "<$$q" ) ){
d16f50bd 1648 while( defined( my $line = <RF> ) ){
86a59229
WL
1649 print $line;
1650 }
1651 close( RF );
1652 }
1653 } else {
1654 print $q;
a687059c 1655 }
86a59229
WL
1656 }
1657 undef( @Q );
1658}
1659
d16f50bd
WL
1660[TheEnd]
1661
1662# generate the sed loop
1663#
1664$Code .= <<'[TheEnd]';
1665sub openARGV();
1666sub getsARGV(;\$);
1667sub eofARGV();
1668sub printQ();
1669
1670# Run: the sed loop reading input and applying the script
1671#
86a59229
WL
1672sub Run(){
1673 my( $h, $icnt, $s, $n );
1674 # hack (not unbreakable :-/) to avoid // matching an empty string
1675 my $z = "\000"; $z =~ /$z/;
1676 # Initialize.
1677 openARGV();
1678 $Hold = '';
1679 $CondReg = 0;
1680 $doPrint = $doAutoPrint;
1681CYCLE:
1682 while( getsARGV() ){
1683 chomp();
1684 $CondReg = 0; # cleared on t
1685BOS:;
1686[TheEnd]
1687
1688 # parse - avoid opening files when doing s2p
1689 #
1690 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1691 if $doGenerate;
1692 Parse();
1693 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1694 if $doGenerate;
1695
1696 # append trailing code
1697 #
1698 $Code .= <<'[TheEnd]';
1699EOS: if( $doPrint ){
1700 print $_, "\n";
1701 } else {
1702 $doPrint = $doAutoPrint;
a687059c 1703 }
86a59229 1704 printQ() if @Q;
a687059c 1705 }
86a59229
WL
1706
1707 exit( 0 );
a687059c 1708}
86a59229
WL
1709[TheEnd]
1710
d16f50bd
WL
1711
1712# append optional functions, prepend prototypes
1713#
1714my $Proto = "# prototypes\n";
1715if( $GenKey{'l'} ){
1716 $Proto .= "sub _l();\n";
1717 $Func .= <<'[TheEnd]';
1718# _l: l command processing
1719#
1720sub _l(){
1721 my $h = $_;
1722 my $mcpl = 70;
1723 # transform non printing chars into escape notation
1724 $h =~ s/\\/\\\\/g;
1725 if( $h =~ /[^[:print:]]/ ){
1726 $h =~ s/\a/\\a/g;
1727 $h =~ s/\f/\\f/g;
1728 $h =~ s/\n/\\n/g;
1729 $h =~ s/\t/\\t/g;
1730 $h =~ s/\r/\\r/g;
1731 $h =~ s/\e/\\e/g;
1732 $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1733 }
1734 # split into lines of length $mcpl
1735 while( length( $h ) > $mcpl ){
1736 my $l = substr( $h, 0, $mcpl-1 );
1737 $h = substr( $h, $mcpl );
1738 # remove incomplete \-escape from end of line
1739 if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1740 $h = $1 . $h;
1741 }
1742 print $l, "\\\n";
1743 }
1744 print "$h\$\n";
1745}
1746
1747[TheEnd]
1748}
1749
1750if( $GenKey{'r'} ){
1751 $Proto .= "sub _r(\$);\n";
1752 $Func .= <<'[TheEnd]';
1753# _r: r command processing: Save a reference to the pathname.
1754#
1755sub _r($){
1756 my $path = shift();
1757 push( @Q, \$path );
1758}
1759
1760[TheEnd]
1761}
1762
1763if( $GenKey{'t'} ){
1764 $Proto .= "sub _t();\n";
1765 $Func .= <<'[TheEnd]';
1766# _t: t command - condition register test/reset
1767#
1768sub _t(){
1769 my $res = $CondReg;
1770 $CondReg = 0;
1771 $res;
1772}
1773
1774[TheEnd]
1775}
1776
1777if( $GenKey{'w'} ){
1778 $Proto .= "sub _w(\$);\n";
1779 $Func .= <<'[TheEnd]';
1780# _w: w command and s command's w flag - write to file
1781#
1782sub _w($){
1783 my $path = shift();
1784 my $handle = $wFiles{$path};
1785 if( ! $doOpenWrite && ! defined( fileno( $handle ) ) ){
1786 open( $handle, ">$path" )
1787 || die( "$0: $path: cannot open ($!)\n" );
1788 }
1789 print $handle $_, "\n";
1790}
1791
1792[TheEnd]
1793}
1794
1795$Code = $Proto . $Code;
1796
86a59229
WL
1797# magic "#n" - same as -n option
1798#
1799$doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
a687059c 1800
86a59229
WL
1801# eval code - check for errors
1802#
d16f50bd
WL
1803print "Code:\n$Code$Func" if $useDEBUG;
1804eval $Code . $Func;
86a59229 1805if( $@ ){
d16f50bd 1806 print "Code:\n$Code$Func";
86a59229 1807 die( "$0: internal error - generated incorrect Perl code: $@\n" );
9ef589d8
LW
1808}
1809
86a59229
WL
1810if( $doGenerate ){
1811
1812 # write full Perl program
1813 #
1814
d16f50bd 1815 # bang line, declarations, prototypes
86a59229
WL
1816 print <<TheEnd;
1817#!$perlpath -w
1818eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1819 if 0;
9ae2b9a0 1820\$0 =~ s/^.*?(\\w+)\[\\.\\w+\]*\$/\$1/;
86a59229
WL
1821
1822use strict;
1823use Symbol;
1824use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1825 \$doAutoPrint \$doOpenWrite \$doPrint };
1826\$doAutoPrint = $doAutoPrint;
1827\$doOpenWrite = $doOpenWrite;
1828TheEnd
1829
1830 my $wf = "'" . join( "', '", keys( %wFiles ) ) . "'";
1831 if( $wf ne "''" ){
1832 print <<TheEnd;
1833sub makeHandle(\$);
1834for my \$p ( $wf ){
1835 exit( 1 ) unless makeHandle( \$p );
9ef589d8 1836}
86a59229
WL
1837TheEnd
1838 }
9ef589d8 1839
86a59229 1840 print $Code;
d16f50bd
WL
1841 print "Run();\n";
1842 print $Func;
86a59229 1843 exit( 0 );
1aa91729 1844
86a59229
WL
1845} else {
1846
1847 # execute: make handles (and optionally open) all w files; run!
86a59229
WL
1848 for my $p ( keys( %wFiles ) ){
1849 exit( 1 ) unless makeHandle( $p );
1850 }
d16f50bd 1851 Run();
1aa91729 1852}
86a59229
WL
1853
1854
1855=head1 ENVIRONMENT
1856
1857The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1858See L<"Additional Atoms">.
1859
1860=head1 DIAGNOSTICS
1861
1862=over 4
1863
1864=item ambiguos translation for character `%s' in `y' command
1865
1866The indicated character appears twice, with different translations.
1867
1868=item `[' cannot be last in pattern
1869
1870A `[' in a BRE indicates the beginning of a I<bracket expression>.
1871
1872=item `\' cannot be last in pattern
1873
1874A `\' in a BRE is used to make the subsequent character literal.
1875
1876=item `\' cannot be last in substitution
1877
1878A `\' in a subsitution string is used to make the subsequent character literal.
1879
1880=item conflicting flags `%s'
1881
1882In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1883multiple n-th occurrence flags are specified. Note that only the digits
1884`1' through `9' are permitted.
1885
1886=item duplicate label %s (first defined at %s)
1887
1888=item excess address(es)
1889
1890The command has more than the permitted number of addresses.
1891
1892=item extra characters after command (%s)
1893
1894=item illegal option `%s'
1895
1896=item improper delimiter in s command
1897
1898The BRE and substitution may not be delimited with `\' or newline.
1899
1900=item invalid address after `,'
1901
1902=item invalid backreference (%s)
1903
1904The specified backreference number exceeds the number of backreferences
1905in the BRE.
1906
1907=item invalid repeat clause `\{%s\}'
1908
1909The repeat clause does not contain a valid integer value, or pair of
1910values.
1911
1912=item malformed regex, 1st address
1913
1914=item malformed regex, 2nd address
1915
1916=item malformed regular expression
1917
1918=item malformed substitution expression
1919
1920=item malformed `y' command argument
1921
1922The first or second string of a B<y> command is syntactically incorrect.
1923
1924=item maximum less than minimum in `\{%s\}'
1925
1926=item no script command given
1927
1928There must be at least one B<-e> or one B<-f> option specifying a
1929script or script file.
1930
1931=item `\' not valid as delimiter in `y' command
1932
1933=item option -e requires an argument
1934
1935=item option -f requires an argument
1936
1937=item `s' command requires argument
1938
1939=item start of unterminated `{'
1940
1941=item string lengths in `y' command differ
1942
1943The translation table strings in a B<y> commanf must have equal lengths.
1944
1945=item undefined label `%s'
1946
1947=item unexpected `}'
1948
1949A B<}> command without a preceding B<{> command was encountered.
1950
1951=item unexpected end of script
1952
1953The end of the script was reached although a text line after a
1954B<a>, B<c> or B<i> command indicated another line.
1955
1956=item unknown command `%s'
1957
1958=item unterminated `['
1959
1960A BRE contains an unterminated bracket expression.
1961
1962=item unterminated `\('
1963
1964A BRE contains an unterminated backreference.
1965
1966=item `\{' without closing `\}'
1967
1968A BRE contains an unterminated bounds specification.
1969
1970=item `\)' without preceding `\('
1971
1972=item `y' command requires argument
1973
1974=back
1975
1976=head1 EXAMPLE
1977
1978The basic material for the preceding section was generated by running
1979the sed script
1980
1981 #no autoprint
1982 s/^.*Warn( *"\([^"]*\)".*$/\1/
1983 t process
1984 b
1985 :process
1986 s/$!/%s/g
1987 s/$[_[:alnum:]]\{1,\}/%s/g
1988 s/\\\\/\\/g
1989 s/^/=item /
1990 p
1991
1992on the program's own text, and piping the output into C<sort -u>.
1993
1994
1995=head1 SED SCRIPT TRANSLATION
1996
1997If this program is invoked with the name F<s2p> it will act as a
1998sed-to-Perl translator. After option processing (all other
1999arguments are ignored), a Perl program is printed on standard
2000output, which will process the input stream (as read from all
2001arguments) in the way defined by the sed script and the option setting
2002used for the translation.
2003
2004=head1 SEE ALSO
2005
2006perl(1), re_format(7)
2007
2008=head1 BUGS
2009
2010The B<l> command will show escape characters (ESC) as `C<\e>', but
2011a vertical tab (VT) in octal.
2012
2013Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
2014
2015The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
2016is "the last pattern used, at run time". This deviates from the Perl
2017interpretation, which will re-use the "last last successfully executed
2018regular expression". Since keeping track of pattern usage would create
2019terribly cluttered code, and differences would only appear in obscure
2020context (where other B<sed> implementations appear to deviate, too),
2021the Perl semantics was adopted. Note that common usage of this feature,
2022such as in C</abc/s//xyz/>, will work as expected.
2023
2024Collating elements (of bracket expressions in BREs) are not implemented.
2025
2026=head1 STANDARDS
2027
2028This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
2029definition of B<sed>, and is compatible with the I<OpenBSD>
2030implementation, except where otherwise noted (see L<"BUGS">).
2031
2032=head1 AUTHOR
2033
2034This Perl implementation of I<sed> was written by Wolfgang Laun,
2035I<Wolfgang.Laun@alcatel.at>.
2036
2037=head1 COPYRIGHT and LICENSE
2038
2039This program is free and open software. You may use, modify,
2040distribute, and sell this program (and any modified variants) in any
2041way you wish, provided you do not restrict others from doing the same.
2042
2043=cut
2044
a687059c 2045!NO!SUBS!
4633a7c4
LW
2046
2047close OUT or die "Can't close $file: $!";
2048chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
d38b618a
A
2049unlink 'psed';
2050print "Linking s2p to psed.\n";
2051if (defined $Config{d_link}) {
2052 link 's2p', 'psed';
2053} else {
2054 unshift @INC, '../lib';
2055 require File::Copy;
2056 File::Copy::syscopy('s2p', 'psed');
2057}
4633a7c4 2058exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
8a5546a1 2059chdir $origdir;