This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Make offsets support conditional
[perl5.git] / x2p / s2p.PL
CommitLineData
86a59229 1#!/usr/bin/perl
4633a7c4
LW
2
3use Config;
4use File::Basename qw(&basename &dirname);
8a5546a1 5use Cwd;
231bc313
JH
6use subs qw(link);
7
6ef9d486 8sub link { # This is a cut-down version of installperl:link().
231bc313
JH
9 my($from,$to) = @_;
10 my($success) = 0;
11
12 eval {
13 CORE::link($from, $to)
14 ? $success++
15 : ($from =~ m#^/afs/# || $to =~ m#^/afs/#)
16 ? die "AFS" # okay inside eval {}
17 : die "Couldn't link $from to $to: $!\n";
18 };
19 if ($@) {
231bc313
JH
20 require File::Copy;
21 File::Copy::copy($from, $to)
22 ? $success++
23 : warn "Couldn't copy $from to $to: $!\n";
24 }
25 $success;
26}
4633a7c4
LW
27
28# List explicitly here the variables you want Configure to
29# generate. Metaconfig only looks for shell variables, so you
30# have to mention them as if they were shell variables, not
31# %Config entries. Thus you write
32# $startperl
33# to ensure Configure will look for $Config{startperl}.
34
35# This forces PL files to create target in same directory as PL file.
36# This is so that make depend always knows where to find PL derivatives.
8a5546a1 37$origdir = cwd;
44a8e56a 38chdir dirname($0);
39$file = basename($0, '.PL');
774d564b 40$file .= '.com' if $^O eq 'VMS';
4633a7c4
LW
41
42open OUT,">$file" or die "Can't create $file: $!";
43
44print "Extracting $file (with variable substitutions)\n";
45
46# In this section, perl variables will be expanded during extraction.
47# You can use $Config{...} to use Configure variables.
48
49print OUT <<"!GROK!THIS!";
5f05dabc 50$Config{startperl}
51 eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
52 if \$running_under_some_shell;
c0393c90
JH
53my \$startperl;
54my \$perlpath;
ed6d8ea1
JH
55(\$startperl = <<'/../') =~ s/\\s*\\z//;
56$Config{startperl}
57/../
58(\$perlpath = <<'/../') =~ s/\\s*\\z//;
59$Config{perlpath}
60/../
a687059c
LW
61!GROK!THIS!
62
4633a7c4
LW
63# In the following, perl variables are not expanded during extraction.
64
65print OUT <<'!NO!SUBS!';
a687059c 66
9ae2b9a0 67$0 =~ s/^.*?(\w+)[\.\w]*$/$1/;
86a59229
WL
68
69# (p)sed - a stream editor
70# History: Aug 12 2000: Original version.
d16f50bd 71# Mar 25 2002: Rearrange generated Perl program.
86a59229
WL
72
73use strict;
74use integer;
75use Symbol;
8d063cd8 76
d83e3bda
JM
77=head1 NAME
78
ba34211e 79psed - a stream editor
d83e3bda
JM
80
81=head1 SYNOPSIS
82
ba34211e
JH
83 psed [-an] script [file ...]
84 psed [-an] [-e script] [-f script-file] [file ...]
85
86 s2p [-an] [-e script] [-f script-file]
d83e3bda
JM
87
88=head1 DESCRIPTION
89
86a59229
WL
90A stream editor reads the input stream consisting of the specified files
91(or standard input, if none are given), processes is line by line by
92applying a script consisting of edit commands, and writes resulting lines
93to standard output. The filename `C<->' may be used to read standard input.
94
95The edit script is composed from arguments of B<-e> options and
96script-files, in the given order. A single script argument may be specified
97as the first parameter.
98
99If this program is invoked with the name F<s2p>, it will act as a
100sed-to-Perl translator. See L<"sed Script Translation">.
d83e3bda 101
86a59229 102B<sed> returns an exit code of 0 on success or >0 if an error occurred.
d83e3bda 103
86a59229 104=head1 OPTIONS
d83e3bda 105
86a59229 106=over 4
d83e3bda 107
86a59229 108=item B<-a>
d83e3bda 109
86a59229
WL
110A file specified as argument to the B<w> edit command is by default
111opened before input processing starts. Using B<-a>, opening of such
112files is delayed until the first line is actually written to the file.
113
114=item B<-e> I<script>
115
116The editing commands defined by I<script> are appended to the script.
117Multiple commands must be separated by newlines.
118
119=item B<-f> I<script-file>
120
121Editing commands from the specified I<script-file> are read and appended
122to the script.
d83e3bda
JM
123
124=item B<-n>
125
86a59229
WL
126By default, a line is written to standard output after the editing script
127has been applied to it. The B<-n> option suppresses automatic printing.
128
129=back
130
131=head1 COMMANDS
132
133B<sed> command syntax is defined as
134
135Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
136
137with whitespace being permitted before or after addresses, and between
138the function character and the argument. The I<address>es and the
139address inverter (C<!>) are used to restrict the application of a
140command to the selected line(s) of input.
141
142Each command must be on a line of its own, except where noted in
143the synopses below.
144
145The edit cycle performed on each input line consist of reading the line
146(without its trailing newline character) into the I<pattern space>,
147applying the applicable commands of the edit script, writing the final
148contents of the pattern space and a newline to the standard output.
149A I<hold space> is provided for saving the contents of the
150pattern space for later use.
151
152=head2 Addresses
153
154A sed address is either a line number or a pattern, which may be combined
155arbitrarily to construct ranges. Lines are numbered across all input files.
156
157Any address may be followed by an exclamation mark (`C<!>'), selecting
158all lines not matching that address.
d83e3bda 159
86a59229 160=over 4
d83e3bda 161
86a59229
WL
162=item I<number>
163
164The line with the given number is selected.
165
166=item B<$>
167
168A dollar sign (C<$>) is the line number of the last line of the input stream.
169
170=item B</>I<regular expression>B</>
171
172A pattern address is a basic regular expression (see
173L<"Basic Regular Expressions">), between the delimiting character C</>.
174Any other character except C<\> or newline may be used to delimit a
175pattern address when the initial delimiter is prefixed with a
176backslash (`C<\>').
d83e3bda
JM
177
178=back
179
86a59229 180If no address is given, the command selects every line.
d83e3bda 181
86a59229
WL
182If one address is given, it selects the line (or lines) matching the
183address.
d83e3bda 184
86a59229
WL
185Two addresses select a range that begins whenever the first address
186matches, and ends (including that line) when the second address matches.
187If the first (second) address is a matching pattern, the second
188address is not applied to the very same line to determine the end of
189the range. Likewise, if the second address is a matching pattern, the
190first address is not applied to the very same line to determine the
191begin of another range. If both addresses are line numbers,
192and the second line number is less than the first line number, then
193only the first line is selected.
d83e3bda 194
d83e3bda 195
86a59229 196=head2 Functions
d83e3bda 197
86a59229
WL
198The maximum permitted number of addresses is indicated with each
199function synopsis below.
d83e3bda 200
86a59229
WL
201The argument I<text> consists of one or more lines following the command.
202Embedded newlines in I<text> must be preceded with a backslash. Other
203backslashes in I<text> are deleted and the following character is taken
204literally.
d83e3bda 205
86a59229 206=over 4
d83e3bda 207
86a59229
WL
208=cut
209
210my %ComTab;
d16f50bd 211my %GenKey;
86a59229
WL
212#--------------------------------------------------------------------------
213$ComTab{'a'}=[ 1, 'txt', \&Emit, '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
214
215=item [1addr]B<a\> I<text>
216
217Write I<text> (which must start on the line following the command)
218to standard output immediately before reading the next line
219of input, either by executing the B<N> function or by beginning a new cycle.
220
221=cut
222
223#--------------------------------------------------------------------------
224$ComTab{'b'}=[ 2, 'str', \&Branch, '{ goto XXX; }' ]; #ok
225
226=item [2addr]B<b> [I<label>]
227
228Branch to the B<:> function with the specified I<label>. If no label
229is given, branch to the end of the script.
230
231=cut
232
233#--------------------------------------------------------------------------
234$ComTab{'c'}=[ 2, 'txt', \&Change, <<'-X-' ]; #ok
235{ print <<'TheEnd'; } $doPrint = 0; goto EOS;
236-X-
237### continue OK => next CYCLE;
238
239=item [2addr]B<c\> I<text>
240
241The line, or range of lines, selected by the address is deleted.
242The I<text> (which must start on the line following the command)
243is written to standard output. With an address range, this occurs at
244the end of the range.
245
246=cut
247
248#--------------------------------------------------------------------------
249$ComTab{'d'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
250{ $doPrint = 0;
251 goto EOS;
252}
253-X-
254### continue OK => next CYCLE;
255
256=item [2addr]B<d>
257
258Deletes the pattern space and starts the next cycle.
259
260=cut
261
262#--------------------------------------------------------------------------
263$ComTab{'D'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
264{ s/^.*\n?//;
265 if(length($_)){ goto BOS } else { goto EOS }
266}
267-X-
268### continue OK => next CYCLE;
269
270=item [2addr]B<D>
271
272Deletes the pattern space through the first embedded newline or to the end.
273If the pattern space becomes empty, a new cycle is started, otherwise
274execution of the script is restarted.
275
276=cut
277
278#--------------------------------------------------------------------------
279$ComTab{'g'}=[ 2, '', \&Emit, '{ $_ = $Hold };' ]; #ok
280
281=item [2addr]B<g>
282
283Replace the contents of the pattern space with the hold space.
284
285=cut
286
287#--------------------------------------------------------------------------
288$ComTab{'G'}=[ 2, '', \&Emit, '{ $_ .= "\n"; $_ .= $Hold };' ]; #ok
289
290=item [2addr]B<G>
291
292Append a newline and the contents of the hold space to the pattern space.
293
294=cut
295
296#--------------------------------------------------------------------------
297$ComTab{'h'}=[ 2, '', \&Emit, '{ $Hold = $_ }' ]; #ok
298
299=item [2addr]B<h>
300
301Replace the contents of the hold space with the pattern space.
302
303=cut
304
305#--------------------------------------------------------------------------
306$ComTab{'H'}=[ 2, '', \&Emit, '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
307
308=item [2addr]B<H>
309
310Append a newline and the contents of the pattern space to the hold space.
311
312=cut
313
314#--------------------------------------------------------------------------
315$ComTab{'i'}=[ 1, 'txt', \&Emit, '{ print <<'."'TheEnd' }\n" ]; #ok
316
317=item [1addr]B<i\> I<text>
318
319Write the I<text> (which must start on the line following the command)
320to standard output.
321
322=cut
323
324#--------------------------------------------------------------------------
325$ComTab{'l'}=[ 2, '', \&Emit, '{ _l() }' ]; #okUTF8
326
327=item [2addr]B<l>
328
329Print the contents of the pattern space: non-printable characters are
330shown in C-style escaped form; long lines are split and have a trailing
331`C<\>' at the point of the split; the true end of a line is marked with
332a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
333BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
334octal number for all other non-printable characters.
335
336=cut
337
338#--------------------------------------------------------------------------
339$ComTab{'n'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
340{ print $_, "\n" if $doPrint;
d16f50bd 341 printQ() if @Q;
86a59229
WL
342 $CondReg = 0;
343 last CYCLE unless getsARGV();
344 chomp();
345}
346-X-
347
348=item [2addr]B<n>
349
350If automatic printing is enabled, write the pattern space to the standard
351output. Replace the pattern space with the next line of input. If
352there is no more input, processing is terminated.
353
354=cut
355
356#--------------------------------------------------------------------------
357$ComTab{'N'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
d16f50bd 358{ printQ() if @Q;
86a59229
WL
359 $CondReg = 0;
360 last CYCLE unless getsARGV( $h );
361 chomp( $h );
362 $_ .= "\n$h";
363}
364-X-
365
366=item [2addr]B<N>
367
368Append a newline and the next line of input to the pattern space. If
369there is no more input, processing is terminated.
370
371=cut
372
373#--------------------------------------------------------------------------
374$ComTab{'p'}=[ 2, '', \&Emit, '{ print $_, "\n"; }' ]; #ok
375
376=item [2addr]B<p>
377
378Print the pattern space to the standard output. (Use the B<-n> option
379to suppress automatic printing at the end of a cycle if you want to
380avoid double printing of lines.)
381
382=cut
383
384#--------------------------------------------------------------------------
385$ComTab{'P'}=[ 2, '', \&Emit, <<'-X-' ]; #ok
386{ if( /^(.*)/ ){ print $1, "\n"; } }
387-X-
388
389=item [2addr]B<P>
390
391Prints the pattern space through the first embedded newline or to the end.
392
393=cut
394
395#--------------------------------------------------------------------------
396$ComTab{'q'}=[ 1, '', \&Emit, <<'-X-' ]; #ok
397{ print $_, "\n" if $doPrint;
398 last CYCLE;
399}
400-X-
401
402=item [1addr]B<q>
403
404Branch to the end of the script and quit without starting a new cycle.
405
406=cut
407
408#--------------------------------------------------------------------------
409$ComTab{'r'}=[ 1, 'str', \&Emit, "{ _r( '-X-' ) }" ]; #ok
86a59229
WL
410
411=item [1addr]B<r> I<file>
412
413Copy the contents of the I<file> to standard output immediately before
414the next attempt to read a line of input. Any error encountered while
415reading I<file> is silently ignored.
416
417=cut
418
419#--------------------------------------------------------------------------
420$ComTab{'s'}=[ 2, 'sub', \&Emit, '' ]; #ok
421
422=item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
423
424Substitute the I<replacement> string for the first substring in
425the pattern space that matches the I<regular expression>.
426Any character other than backslash or newline can be used instead of a
427slash to delimit the regular expression and the replacement.
428To use the delimiter as a literal character within the regular expression
429and the replacement, precede the character by a backslash (`C<\>').
430
431Literal newlines may be embedded in the replacement string by
432preceding a newline with a backslash.
433
434Within the replacement, an ampersand (`C<&>') is replaced by the string
435matching the regular expression. The strings `C<\1>' through `C<\9>' are
436replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
437To get a literal `C<&>' or `C<\>' in the replacement text, precede it
438by a backslash.
439
440The following I<flags> modify the behaviour of the B<s> command:
441
442=over 8
443
444=item B<g>
445
446The replacement is performed for all matching, non-overlapping substrings
447of the pattern space.
448
449=item B<1>..B<9>
450
451Replace only the n-th matching substring of the pattern space.
452
453=item B<p>
454
455If the substitution was made, print the new value of the pattern space.
456
457=item B<w> I<file>
458
459If the substitution was made, write the new value of the pattern space
460to the specified file.
461
462=back
463
464=cut
465
466#--------------------------------------------------------------------------
467$ComTab{'t'}=[ 2, 'str', \&Branch, '{ goto XXX if _t() }' ]; #ok
468
469=item [2addr]B<t> [I<label>]
470
471Branch to the B<:> function with the specified I<label> if any B<s>
472substitutions have been made since the most recent reading of an input line
473or execution of a B<t> function. If no label is given, branch to the end of
474the script.
475
476
477=cut
478
479#--------------------------------------------------------------------------
480$ComTab{'w'}=[ 2, 'str', \&Write, "{ _w( '-X-' ) }" ]; #ok
481
482=item [2addr]B<w> I<file>
483
484The contents of the pattern space are written to the I<file>.
485
486=cut
487
488#--------------------------------------------------------------------------
489$ComTab{'x'}=[ 2, '', \&Emit, '{ ($Hold, $_) = ($_, $Hold) }' ]; #ok
490
491=item [2addr]B<x>
492
493Swap the contents of the pattern space and the hold space.
494
495=cut
496
497#--------------------------------------------------------------------------
498$ComTab{'y'}=[ 2, 'tra', \&Emit, '' ]; #ok
499=item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
500
501In the pattern space, replace all characters occuring in I<string1> by the
502character at the corresponding position in I<string2>. It is possible
503to use any character (other than a backslash or newline) instead of a
504slash to delimit the strings. Within I<string1> and I<string2>, a
505backslash followed by any character other than a newline is that literal
506character, and a backslash followed by an `n' is replaced by a newline
507character.
508
509=cut
d83e3bda 510
86a59229
WL
511#--------------------------------------------------------------------------
512$ComTab{'='}=[ 1, '', \&Emit, '{ print "$.\n" }' ]; #ok
513
514=item [1addr]B<=>
515
516Prints the current line number on the standard output.
517
518=cut
519
520#--------------------------------------------------------------------------
521$ComTab{':'}=[ 0, 'str', \&Label, '' ]; #ok
522
523=item [0addr]B<:> [I<label>]
524
525The command specifies the position of the I<label>. It has no other effect.
526
527=cut
528
529#--------------------------------------------------------------------------
530$ComTab{'{'}=[ 2, '', \&BeginBlock, '{' ]; #ok
531$ComTab{'}'}=[ 0, '', \&EndBlock, ';}' ]; #ok
532# ';' to avoid warning on empty {}-block
533
534=item [2addr]B<{> [I<command>]
535
536=item [0addr]B<}>
537
538These two commands begin and end a command list. The first command may
539be given on the same line as the opening B<{> command. The commands
540within the list are jointly selected by the address(es) given on the
541B<{> command (but may still have individual addresses).
542
543=cut
544
545#--------------------------------------------------------------------------
546$ComTab{'#'}=[ 0, 'str', \&Comment, '' ]; #ok
547
548=item [0addr]B<#> [I<comment>]
549
550The entire line is ignored (treated as a comment). If, however, the first
551two characters in the script are `C<#n>', automatic printing of output is
552suppressed, as if the B<-n> option were given on the command line.
553
554=back
555
556=cut
557
558use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
559
560my $useDEBUG = exists( $ENV{PSEDDEBUG} );
561my $useEXTBRE = $ENV{PSEDEXTBRE} || '';
562$useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
563
564my $doAutoPrint = 1; # automatic printing of pattern space (-n => 0)
565my $doOpenWrite = 1; # open w command output files at start (-a => 0)
566my $svOpenWrite = 0; # save $doOpenWrite
96021e87
JM
567
568# lower case $0 below as a VMSism. The VMS build procedure creates the
569# s2p file traditionally in upper case on the disk. When VMS is in a
570# case preserved or case sensitive mode, $0 will be returned in the exact
571# case which will be on the disk, and that is not predictable at this time.
572
573my $doGenerate = lc($0) eq 's2p';
86a59229
WL
574
575# Collected and compiled script
576#
d16f50bd
WL
577my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code, $Func );
578$Code = '';
86a59229
WL
579
580##################
581# Compile Time
582#
583# Labels
584#
585# Error handling
586#
587sub Warn($;$){
588 my( $msg, $loc ) = @_;
589 $loc ||= '';
590 $loc .= ': ' if length( $loc );
591 warn( "$0: $loc$msg\n" );
592}
593
594$labNum = 0;
595sub newLabel(){
596 return 'L_'.++$labNum;
597}
598
599# safeHere: create safe here delimiter and modify opcode and argument
600#
601sub safeHere($$){
602 my( $codref, $argref ) = @_;
603 my $eod = 'EOD000';
604 while( $$argref =~ /^$eod$/m ){
605 $eod++;
606 }
607 $$codref =~ s/TheEnd/$eod/e;
608 $$argref .= "$eod\n";
609}
610
611# Emit: create address logic and emit command
612#
613sub Emit($$$$$$){
614 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
615 my $cond = '';
616 if( defined( $addr1 ) ){
617 if( defined( $addr2 ) ){
618 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
619 } else {
620 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
621 }
622 $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
623 }
624
625 if( $opcode eq '' ){
626 $Code .= "$cond$arg\n";
627
628 } elsif( $opcode =~ s/-X-/$arg/e ){
629 $Code .= "$cond$opcode\n";
630
631 } elsif( $opcode =~ /TheEnd/ ){
632 safeHere( \$opcode, \$arg );
633 $Code .= "$cond$opcode$arg";
634
635 } else {
636 $Code .= "$cond$opcode\n";
637 }
638 0;
639}
640
641# Write (w command, w flag): store pathname
642#
643sub Write($$$$$$){
644 my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
645 $wFiles{$path} = '';
646 Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
647}
648
649
650# Label (: command): label definition
651#
652sub Label($$$$$$){
653 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
654 my $rc = 0;
655 $lab =~ s/\s+//;
656 if( length( $lab ) ){
657 my $h;
658 if( ! exists( $Label{$lab} ) ){
659 $h = $Label{$lab}{name} = newLabel();
660 } else {
661 $h = $Label{$lab}{name};
662 if( exists( $Label{$lab}{defined} ) ){
663 my $dl = $Label{$lab}{defined};
664 Warn( "duplicate label $lab (first defined at $dl)", $fl );
665 $rc = 1;
666 }
667 }
668 $Label{$lab}{defined} = $fl;
669 $Code .= "$h:;\n";
670 }
671 $rc;
672}
673
674# BeginBlock ({ command): push block start
675#
676sub BeginBlock($$$$$$){
677 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
678 push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
679 Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
680}
681
682# EndBlock (} command): check proper nesting
683#
684sub EndBlock($$$$$$){
685 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
686 my $rc;
687 my $jcom = pop( @BlockStack );
688 if( defined( $jcom ) ){
689 $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
690 } else {
691 Warn( "unexpected `}'", $fl );
692 $rc = 1;
693 }
694 $rc;
695}
696
697# Branch (t, b commands): check or create label, substitute default
698#
699sub Branch($$$$$$){
700 my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
701 $lab =~ s/\s+//; # no spaces at end
702 my $h;
703 if( length( $lab ) ){
704 if( ! exists( $Label{$lab} ) ){
705 $h = $Label{$lab}{name} = newLabel();
706 } else {
707 $h = $Label{$lab}{name};
708 }
709 push( @{$Label{$lab}{used}}, $fl );
710 } else {
711 $h = 'EOS';
712 }
713 $opcode =~ s/XXX/$h/e;
714 Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
715}
716
717# Change (c command): is special due to range end watching
718#
719sub Change($$$$$$){
720 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
721 my $kwd = $negated ? 'unless' : 'if';
722 if( defined( $addr2 ) ){
723 $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
724 if( ! $negated ){
725 $addr1 = '$icnt = ('.$addr1.')';
726 $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
727 }
728 } else {
729 $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
730 }
731 safeHere( \$opcode, \$arg );
732 $Code .= "$kwd( $addr1 ){\n $opcode$arg}\n";
733 0;
734}
735
736
737# Comment (# command): A no-op. Who would've thought that!
738#
739sub Comment($$$$$$){
740 my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
741### $Code .= "# $arg\n";
742 0;
743}
744
745
746sub stripRegex($$){
747 my( $del, $sref ) = @_;
748 my $regex = $del;
749 print "stripRegex:$del:$$sref:\n" if $useDEBUG;
750 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
751 my $sl = $2;
752 $regex .= $1.$sl.$del;
753 if( length( $sl ) % 2 == 0 ){
754 return $regex;
755 }
756 $regex .= $3;
757 }
758 undef();
759}
760
761# stripTrans: take a <del> terminated string from y command
762# honoring and cleaning up of \-escaped <del>'s
763#
764sub stripTrans($$){
765 my( $del, $sref ) = @_;
766 my $t = '';
767 print "stripTrans:$del:$$sref:\n" if $useDEBUG;
768 while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
769 my $sl = $2;
770 $t .= $1;
771 if( length( $sl ) % 2 == 0 ){
772 $t .= $sl;
773 $t =~ s/\\\\/\\/g;
774 return $t;
775 }
776 chop( $sl );
777 $t .= $sl.$del.$3;
778 }
779 undef();
780}
781
782# makey - construct Perl y/// from sed y///
783#
784sub makey($$$){
785 my( $fr, $to, $fl ) = @_;
786 my $error = 0;
787
788 # Ensure that any '-' is up front.
789 # Diagnose duplicate contradicting mappings
790 my %tr;
791 for( my $i = 0; $i < length($fr); $i++ ){
792 my $fc = substr($fr,$i,1);
793 my $tc = substr($to,$i,1);
794 if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
3484938b 795 Warn( "ambiguous translation for character `$fc' in `y' command",
86a59229
WL
796 $fl );
797 $error++;
798 }
799 $tr{$fc} = $tc;
800 }
801 $fr = $to = '';
802 if( exists( $tr{'-'} ) ){
803 ( $fr, $to ) = ( '-', $tr{'-'} );
804 delete( $tr{'-'} );
805 } else {
806 $fr = $to = '';
807 }
808 # might just as well sort it...
809 for my $fc ( sort keys( %tr ) ){
810 $fr .= $fc;
811 $to .= $tr{$fc};
812 }
813 # make embedded delimiters and newlines safe
814 $fr =~ s/([{}])/\$1/g;
815 $to =~ s/([{}])/\$1/g;
816 $fr =~ s/\n/\\n/g;
817 $to =~ s/\n/\\n/g;
818 return $error ? undef() : "{ y{$fr}{$to}; }";
819}
820
821######
822# makes - construct Perl s/// from sed s///
823#
824sub makes($$$$$$$){
825 my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
826
827 # make embedded newlines safe
828 $regex =~ s/\n/\\n/g;
829 $subst =~ s/\n/\\n/g;
830
831 my $code;
832 # n-th occurrence
833 #
834 if( length( $nmatch ) ){
835 $code = <<TheEnd;
836{ \$n = $nmatch;
837 while( --\$n && ( \$s = m ${regex}g ) ){}
838 \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
839 \$CondReg ||= \$s;
840TheEnd
841 } else {
842 $code = <<TheEnd;
843{ \$s = s ${regex}${subst}s${global};
844 \$CondReg ||= \$s;
845TheEnd
846 }
847 if( $print ){
848 $code .= ' print $_, "\n" if $s;'."\n";
849 }
850 if( defined( $path ) ){
851 $wFiles{$path} = '';
852 $code .= " _w( '$path' ) if \$s;\n";
d16f50bd 853 $GenKey{'w'} = 1;
86a59229
WL
854 }
855 $code .= "}";
856}
857
858=head1 BASIC REGULAR EXPRESSIONS
859
860A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
861of I<atoms>, for matching parts of a string, and I<bounds>, specifying
862repetitions of a preceding atom.
863
864=head2 Atoms
865
866The possible atoms of a BRE are: B<.>, matching any single character;
867B<^> and B<$>, matching the null string at the beginning or end
868of a string, respectively; a I<bracket expressions>, enclosed
869in B<[> and B<]> (see below); and any single character with no
870other significance (matching that character). A B<\> before one
871of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
872after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
873becomes an atom and establishes the target for a I<backreference>,
874consisting of the substring that actually matches the enclosed atoms.
875Finally, B<\> followed by one of the digits B<0> through B<9> is a
876backreference.
877
878A B<^> that is not first, or a B<$> that is not last does not have
879a special significance and need not be preceded by a backslash to
880become literal. The same is true for a B<]>, that does not terminate
881a bracket expression.
882
883An unescaped backslash cannot be last in a BRE.
884
885=head2 Bounds
886
887The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
888atom; B<\{>I<count>B<\}>, specifying that many repetitions;
889B<\{>I<minimum>B<,\}>, giving a lower limit; and
890B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
891bound.
892
893A bound appearing as the first item in a BRE is taken literally.
894
895=head2 Bracket Expressions
896
897A I<bracket expression> is a list of characters, character ranges
898and character classes enclosed in B<[> and B<]> and matches any
899single character from the represented set of characters.
900
901A character range is written as two characters separated by B<-> and
902represents all characters (according to the character collating sequence)
903that are not less than the first and not greater than the second.
904(Ranges are very collating-sequence-dependent, and portable programs
905should avoid relying on them.)
906
907A character class is one of the class names
908
909 alnum digit punct
910 alpha graph space
911 blank lower upper
912 cntrl print xdigit
913
914enclosed in B<[:> and B<:]> and represents the set of characters
915as defined in ctype(3).
916
917If the first character after B<[> is B<^>, the sense of matching is
918inverted.
919
920To include a literal `C<^>', place it anywhere else but first. To
921include a literal 'C<]>' place it first or immediately after an
922initial B<^>. To include a literal `C<->' make it the first (or
923second after B<^>) or last character, or the second endpoint of
924a range.
925
926The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
927match the null string at the beginning and end of a word respectively.
928(Note that neither is identical to Perl's `\b' atom.)
929
930=head2 Additional Atoms
931
932Since some sed implementations provide additional regular expression
933atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
934the following backslash escapes:
935
936=over 4
937
938=item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
939
940=item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
941
942=item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
943
944=item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
945
946=item B<\y> Match the empty string at a word boundary.
947
948=item B<\B> Match the empty string between any two either word or non-word characters.
949
950=back
951
952To enable this feature, the environment variable PSEDEXTBRE must be set
953to a string containing the requested characters, e.g.:
954C<PSEDEXTBRE='E<lt>E<gt>wW'>.
955
956=cut
957
958#####
959# bre2p - convert BRE to Perl RE
960#
961sub peek(\$$){
962 my( $pref, $ic ) = @_;
963 $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
964}
965
966sub bre2p($$$){
967 my( $del, $pat, $fl ) = @_;
968 my $led = $del;
969 $led =~ tr/{([</})]>/;
970 $led = '' if $led eq $del;
971
972 $pat = substr( $pat, 1, length($pat) - 2 );
973 my $res = '';
974 my $bracklev = 0;
975 my $backref = 0;
976 my $parlev = 0;
977 for( my $ic = 0; $ic < length( $pat ); $ic++ ){
978 my $c = substr( $pat, $ic, 1 );
979 if( $c eq '\\' ){
980 ### backslash escapes
981 my $nc = peek($pat,$ic);
982 if( $nc eq '' ){
983 Warn( "`\\' cannot be last in pattern", $fl );
984 return undef();
985 }
986 $ic++;
987 if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
988 $res .= "\\$del";
989
990 } elsif( $nc =~ /([[.*\\n])/ ){
991 ## check for \-escaped magics and \n:
992 ## \[ \. \* \\ \n stay as they are
993 $res .= '\\'.$nc;
994
995 } elsif( $nc eq '(' ){ ## \( => (
996 $parlev++;
997 $res .= '(';
998
999 } elsif( $nc eq ')' ){ ## \) => )
1000 $parlev--;
1001 $backref++;
1002 if( $parlev < 0 ){
1003 Warn( "unmatched `\\)'", $fl );
1004 return undef();
1005 }
1006 $res .= ')';
1007
1008 } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
1009 my $endpos = index( $pat, '\\}', $ic );
1010 if( $endpos < 0 ){
1011 Warn( "unmatched `\\{'", $fl );
1012 return undef();
1013 }
1014 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
1015 $ic = $endpos + 1;
1016
1017 if( $res =~ /^\^?$/ ){
1018 $res .= "\\{$rep\}";
1019 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
1020 my $min = $1;
1021 my $com = $2 || '';
1022 my $max = $3;
1023 if( length( $max ) ){
1024 if( $max < $min ){
1025 Warn( "maximum less than minimum in `\\{$rep\\}'",
1026 $fl );
1027 return undef();
1028 }
1029 } else {
1030 $max = '';
1031 }
1032 # simplify some
1033 if( $min == 0 && $max eq '1' ){
1034 $res .= '?';
1035 } elsif( $min == 1 && "$com$max" eq ',' ){
1036 $res .= '+';
1037 } elsif( $min == 0 && "$com$max" eq ',' ){
1038 $res .= '*';
1039 } else {
1040 $res .= "{$min$com$max}";
1041 }
1042 } else {
1043 Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1044 return undef();
1045 }
1046
1047 } elsif( $nc =~ /^[1-9]$/ ){
1048 ## \1 .. \9 => \1 .. \9, but check for a following digit
1049 if( $nc > $backref ){
1050 Warn( "invalid backreference ($nc)", $fl );
1051 return undef();
1052 }
1053 $res .= "\\$nc";
1054 if( peek($pat,$ic) =~ /[0-9]/ ){
1055 $res .= '(?:)';
1056 }
1057
1058 } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1059 ## extensions - at most <>wWyB - not in POSIX
1060 if( $nc eq '<' ){ ## \< => \b(?=\w), be precise
1061 $res .= '\\b(?<=\\W)';
1062 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1063 $res .= '\\b(?=\\W)';
1064 } elsif( $nc eq 'y' ){ ## \y => \b
1065 $res .= '\\b';
1066 } else { ## \B, \w, \W remain the same
1067 $res .= "\\$nc";
1068 }
1069 } elsif( $nc eq $led ){
1070 ## \<closing bracketing-delimiter> - keep '\'
1071 $res .= "\\$nc";
1072
1073 } else { ## \ <char> => <char> ("as if `\' were not present")
1074 $res .= $nc;
1075 }
1076
1077 } elsif( $c eq '.' ){ ## . => .
1078 $res .= $c;
1079
1080 } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1081 if( $res =~ /^\^?$/ ){
1082 $res .= '\\*';
1083 } elsif( substr( $res, -1, 1 ) ne '*' ){
1084 $res .= $c;
1085 }
1086
1087 } elsif( $c eq '[' ){
1088 ## parse []: [^...] [^]...] [-...]
1089 my $add = '[';
1090 if( peek($pat,$ic) eq '^' ){
1091 $ic++;
1092 $add .= '^';
1093 }
1094 my $nc = peek($pat,$ic);
1095 if( $nc eq ']' || $nc eq '-' ){
1096 $add .= $nc;
1097 $ic++;
1098 }
1099 # check that [ is not trailing
1100 if( $ic >= length( $pat ) - 1 ){
1101 Warn( "unmatched `['", $fl );
1102 return undef();
1103 }
1104 # look for [:...:] and x-y
1105 my $rstr = substr( $pat, $ic+1 );
1106 if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1107 my $cnt = $1;
1108 $ic += length( $cnt );
1109 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1110 # try some simplifications
1111 my $red = $cnt;
1112 if( $red =~ s/0-9// ){
1113 $cnt = $red.'\d';
1114 if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1115 $cnt = $red.'\w';
1116 }
1117 }
1118 $add .= $cnt;
1119
1120 # POSIX 1003.2 has this (optional) for begin/end word
1121 $add = '\\b(?=\\W)' if $add eq '[[:<:]]';
1122 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1123
1124 }
1125
1126 ## may have a trailing `-' before `]'
1127 if( $ic < length($pat) - 1 &&
1128 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1129 $ic += length( $1 );
1130 $add .= $1;
1131 # another simplification
1132 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1133 $res .= $add;
1134 } else {
1135 Warn( "unmatched `['", $fl );
1136 return undef();
1137 }
3cb6de81 1138
86a59229
WL
1139 } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1140 $res .= "\\$c";
d83e3bda 1141
86a59229
WL
1142 } elsif( $c eq ']' ){ ## unmatched ] is not magic
1143 $res .= ']';
d83e3bda 1144
86a59229
WL
1145 } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1146 $res .= "\\$c";
d83e3bda 1147
86a59229
WL
1148 } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1149 $res .= length( $res ) ? '\\^' : '^';
d83e3bda 1150
86a59229
WL
1151 } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1152 $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
8d063cd8 1153
86a59229
WL
1154 } else {
1155 $res .= $c;
1156 }
8d063cd8 1157 }
0a12ae7d 1158
86a59229
WL
1159 if( $parlev ){
1160 Warn( "unmatched `\\('", $fl );
1161 return undef();
8d063cd8 1162 }
0a12ae7d 1163
86a59229
WL
1164 # final cleanup: eliminate raw HTs
1165 $res =~ s/\t/\\t/g;
1166 return $del . $res . ( $led ? $led : $del );
1167}
0a12ae7d 1168
86a59229
WL
1169
1170#####
1171# sub2p - convert sed substitution to Perl substitution
1172#
1173sub sub2p($$$){
1174 my( $del, $subst, $fl ) = @_;
1175 my $led = $del;
1176 $led =~ tr/{([</})]>/;
1177 $led = '' if $led eq $del;
1178
1179 $subst = substr( $subst, 1, length($subst) - 2 );
1180 my $res = '';
1181
1182 for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1183 my $c = substr( $subst, $ic, 1 );
1184 if( $c eq '\\' ){
1185 ### backslash escapes
1186 my $nc = peek($subst,$ic);
1187 if( $nc eq '' ){
1188 Warn( "`\\' cannot be last in substitution", $fl );
1189 return undef();
1190 }
1191 $ic++;
1192 if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1193 $res .= '\\' . $nc;
1194 } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1195 $res .= '${' . $nc . '}';
1196 } else { ## everything else (includes &): omit \
1197 $res .= $nc;
1198 }
1199 } elsif( $c eq '&' ){ ## & => $&
1200 $res .= '$&';
1201 } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1202 $res .= '\\' . $c;
1203 } else {
1204 $res .= $c;
8d063cd8 1205 }
8d063cd8
LW
1206 }
1207
86a59229
WL
1208 # final cleanup: eliminate raw HTs
1209 $res =~ s/\t/\\t/g;
1210 return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1211}
0a12ae7d 1212
86a59229
WL
1213
1214sub Parse(){
1215 my $error = 0;
1216 my( $pdef, $pfil, $plin );
1217 for( my $icom = 0; $icom < @Commands; $icom++ ){
1218 my $cmd = $Commands[$icom];
1219 print "Parse:$cmd:\n" if $useDEBUG;
1220 $cmd =~ s/^\s+//;
1221 next unless length( $cmd );
1222 my $scom = $icom;
1223 if( exists( $Defined{$icom} ) ){
1224 $pdef = $Defined{$icom};
1225 if( $pdef =~ /^ #(\d+)/ ){
1226 $pfil = 'expression #';
1227 $plin = $1;
1228 } else {
1229 $pfil = "$pdef l.";
1230 $plin = 1;
1231 }
1232 } else {
1233 $plin++;
1234 }
1235 my $fl = "$pfil$plin";
1236
1237 # insert command as comment in gnerated code
1238 #
1239 $Code .= "# $cmd\n" if $doGenerate;
1240
1241 # The Address(es)
1242 #
1243 my( $negated, $naddr, $addr1, $addr2 );
1244 $naddr = 0;
1245 if( $cmd =~ s/^(\d+)\s*// ){
1246 $addr1 = "$1"; $naddr++;
1247 } elsif( $cmd =~ s/^\$\s*// ){
1248 $addr1 = 'eofARGV()'; $naddr++;
1249 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1250 my $del = $1;
1251 my $regex = stripRegex( $del, \$cmd );
1252 if( defined( $regex ) ){
1253 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1254 $naddr++;
1255 } else {
1256 Warn( "malformed regex, 1st address", $fl );
1257 $error++;
1258 next;
1259 }
1260 }
1261 if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1262 if( $cmd =~ s/^(\d+)\s*// ){
1263 $addr2 = "$1"; $naddr++;
1264 } elsif( $cmd =~ s/^\$\s*// ){
1265 $addr2 = 'eofARGV()'; $naddr++;
1266 } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1267 my $del = $1;
1268 my $regex = stripRegex( $del, \$cmd );
1269 if( defined( $regex ) ){
1270 $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1271 $naddr++;
1272 } else {
1273 Warn( "malformed regex, 2nd address", $fl );
1274 $error++;
1275 next;
1276 }
1277 } else {
1278 Warn( "invalid address after `,'", $fl );
1279 $error++;
1280 next;
1281 }
1282 }
1283
1284 # address modifier `!'
1285 #
1286 $negated = $cmd =~ s/^!\s*//;
1287 if( defined( $addr1 ) ){
1288 print "Parse: addr1=$addr1" if $useDEBUG;
1289 if( defined( $addr2 ) ){
1290 print ", addr2=$addr2 " if $useDEBUG;
1291 # both numeric and addr1 > addr2 => eliminate addr2
1292 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1293 $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1294 }
9ef589d8 1295 }
86a59229
WL
1296 print 'negated' if $useDEBUG && $negated;
1297 print " command:$cmd\n" if $useDEBUG;
1298
1299 # The Command
1300 #
1301 if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1302 my $h = substr( $cmd, 0, 1 );
1303 Warn( "unknown command `$h'", $fl );
1304 $error++;
8d063cd8
LW
1305 next;
1306 }
86a59229 1307 my $key = $1;
8d063cd8 1308
86a59229 1309 my $tabref = $ComTab{$key};
d16f50bd 1310 $GenKey{$key} = 1;
86a59229
WL
1311 if( $naddr > $tabref->[0] ){
1312 Warn( "excess address(es)", $fl );
1313 $error++;
8d063cd8
LW
1314 next;
1315 }
1316
86a59229
WL
1317 my $arg = '';
1318 if( $tabref->[1] eq 'str' ){
1319 # take remainder - don't care if it is empty
1320 $arg = $cmd;
1321 $cmd = '';
1322
1323 } elsif( $tabref->[1] eq 'txt' ){
1324 # multi-line text
1325 my $goon = $cmd =~ /(.*)\\$/;
1326 if( length( $1 ) ){
1327 Warn( "extra characters after command ($cmd)", $fl );
1328 $error++;
8d063cd8 1329 }
86a59229
WL
1330 while( $goon ){
1331 $icom++;
1332 if( $icom > $#Commands ){
1333 Warn( "unexpected end of script", $fl );
1334 $error++;
1335 last;
1336 }
1337 $cmd = $Commands[$icom];
1338 $Code .= "# $cmd\n" if $doGenerate;
1339 $goon = $cmd =~ s/\\$//;
1340 $cmd =~ s/\\(.)/$1/g;
1341 $arg .= "\n" if length( $arg );
1342 $arg .= $cmd;
8d063cd8 1343 }
86a59229
WL
1344 $arg .= "\n" if length( $arg );
1345 $cmd = '';
8d063cd8 1346
86a59229
WL
1347 } elsif( $tabref->[1] eq 'sub' ){
1348 # s///
1349 if( ! length( $cmd ) ){
1350 Warn( "`s' command requires argument", $fl );
1351 $error++;
1352 next;
1353 }
1354 if( $cmd =~ s{^([^\\\n])}{} ){
1355 my $del = $1;
1356 my $regex = stripRegex( $del, \$cmd );
1357 if( ! defined( $regex ) ){
1358 Warn( "malformed regular expression", $fl );
1359 $error++;
1360 next;
a687059c 1361 }
86a59229
WL
1362 $regex = bre2p( $del, $regex, $fl );
1363
1364 # a trailing \ indicates embedded NL (in replacement string)
1365 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1366 $icom++;
1367 if( $icom > $#Commands ){
1368 Warn( "unexpected end of script", $fl );
1369 $error++;
1370 last;
8d063cd8 1371 }
86a59229
WL
1372 $cmd .= $Commands[$icom];
1373 $Code .= "# $Commands[$icom]\n" if $doGenerate;
9ef589d8 1374 }
86a59229
WL
1375
1376 my $subst = stripRegex( $del, \$cmd );
1377 if( ! defined( $regex ) ){
1378 Warn( "malformed substitution expression", $fl );
1379 $error++;
0a12ae7d
LW
1380 next;
1381 }
86a59229
WL
1382 $subst = sub2p( $del, $subst, $fl );
1383
1384 # parse s/// modifier: g|p|0-9|w <file>
1385 my( $global, $nmatch, $print, $write ) =
1386 ( '', '', 0, undef );
1387 while( $cmd =~ s/^([gp0-9])// ){
1388 $1 eq 'g' ? ( $global = 'g' ) :
1389 $1 eq 'p' ? ( $print = $1 ) : ( $nmatch .= $1 );
1390 }
1391 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1392 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1393 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1394 Warn( "conflicting flags `$global$nmatch'", $fl );
1395 $error++;
0a12ae7d
LW
1396 next;
1397 }
86a59229
WL
1398
1399 $arg = makes( $regex, $subst,
1400 $write, $global, $print, $nmatch, $fl );
1401 if( ! defined( $arg ) ){
1402 $error++;
8d063cd8
LW
1403 next;
1404 }
86a59229
WL
1405
1406 } else {
1407 Warn( "improper delimiter in s command", $fl );
1408 $error++;
1409 next;
1410 }
1411
1412 } elsif( $tabref->[1] eq 'tra' ){
1413 # y///
1414 # a trailing \ indicates embedded newline
1415 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1416 $icom++;
1417 if( $icom > $#Commands ){
1418 Warn( "unexpected end of script", $fl );
1419 $error++;
1420 last;
1421 }
1422 $cmd .= $Commands[$icom];
1423 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1424 }
1425 if( ! length( $cmd ) ){
1426 Warn( "`y' command requires argument", $fl );
1427 $error++;
1428 next;
1429 }
1430 my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1431 if( $d eq '\\' ){
1432 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1433 $error++;
1434 next;
1435 }
1436 my $fr = stripTrans( $d, \$cmd );
1437 if( ! defined( $fr ) || ! length( $cmd ) ){
1438 Warn( "malformed `y' command argument", $fl );
1439 $error++;
1440 next;
1441 }
1442 my $to = stripTrans( $d, \$cmd );
1443 if( ! defined( $to ) ){
1444 Warn( "malformed `y' command argument", $fl );
1445 $error++;
1446 next;
1447 }
1448 if( length($fr) != length($to) ){
1449 Warn( "string lengths in `y' command differ", $fl );
1450 $error++;
1451 next;
1452 }
1453 if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1454 $error++;
1455 next;
8d063cd8 1456 }
8d063cd8 1457
8d063cd8
LW
1458 }
1459
86a59229
WL
1460 # $cmd must be now empty - exception is {
1461 if( $cmd !~ /^\s*$/ ){
1462 if( $key eq '{' ){
1463 # dirty hack to process command on '{' line
1464 $Commands[$icom--] = $cmd;
1465 } else {
1466 Warn( "extra characters after command ($cmd)", $fl );
1467 $error++;
1468 next;
1469 }
8d063cd8
LW
1470 }
1471
86a59229
WL
1472 # Make Code
1473 #
1474 if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1475 $tabref->[3], $arg, $fl ) ){
1476 $error++;
8d063cd8 1477 }
86a59229 1478 }
8d063cd8 1479
86a59229
WL
1480 while( @BlockStack ){
1481 my $bl = pop( @BlockStack );
1482 Warn( "start of unterminated `{'", $bl );
1483 $error++;
1484 }
8d063cd8 1485
86a59229
WL
1486 for my $lab ( keys( %Label ) ){
1487 if( ! exists( $Label{$lab}{defined} ) ){
1488 for my $used ( @{$Label{$lab}{used}} ){
1489 Warn( "undefined label `$lab'", $used );
1490 $error++;
1491 }
8d063cd8 1492 }
86a59229 1493 }
8d063cd8 1494
86a59229
WL
1495 exit( 1 ) if $error;
1496}
8d063cd8 1497
8d063cd8 1498
86a59229
WL
1499##############
1500#### MAIN ####
1501##############
8d063cd8 1502
86a59229
WL
1503sub usage(){
1504 print STDERR "Usage: sed [-an] command [file...]\n";
1505 print STDERR " [-an] [-e command] [-f script-file] [file...]\n";
1506}
8d063cd8 1507
86a59229
WL
1508###################
1509# Here we go again...
1510#
1511my $expr = 0;
1512while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1513 my $opt = $1;
1514 my $arg = $2;
1515 shift( @ARGV );
1516 if( $opt eq 'e' ){
1517 if( length( $arg ) ){
1518 push( @Commands, split( "\n", $arg ) );
1519 } elsif( @ARGV ){
1520 push( @Commands, shift( @ARGV ) );
1521 } else {
1522 Warn( "option -e requires an argument" );
1523 usage();
1524 exit( 1 );
1525 }
1526 $expr++;
1527 $Defined{$#Commands} = " #$expr";
1528 next;
1529 }
1530 if( $opt eq 'f' ){
1531 my $path;
1532 if( length( $arg ) ){
1533 $path = $arg;
1534 } elsif( @ARGV ){
1535 $path = shift( @ARGV );
1536 } else {
1537 Warn( "option -f requires an argument" );
1538 usage();
1539 exit( 1 );
1540 }
1541 my $fst = $#Commands + 1;
1542 open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1543 my $cmd;
1544 while( defined( $cmd = <SCRIPT> ) ){
1545 chomp( $cmd );
1546 push( @Commands, $cmd );
1547 }
1548 close( SCRIPT );
1549 if( $#Commands >= $fst ){
1550 $Defined{$fst} = "$path";
8d063cd8 1551 }
86a59229
WL
1552 next;
1553 }
1554 if( $opt eq '-' && $arg eq '' ){
1555 last;
1556 }
1557 if( $opt eq 'h' || $opt eq '?' ){
1558 usage();
1559 exit( 0 );
1560 }
1561 if( $opt eq 'n' ){
1562 $doAutoPrint = 0;
1563 } elsif( $opt eq 'a' ){
1564 $doOpenWrite = 0;
1565 } else {
1566 Warn( "illegal option `$opt'" );
1567 usage();
1568 exit( 1 );
1569 }
1570 if( length( $arg ) ){
1571 unshift( @ARGV, "-$arg" );
1572 }
1573}
8d063cd8 1574
86a59229
WL
1575# A singleton command may be the 1st argument when there are no options.
1576#
1577if( @Commands == 0 ){
1578 if( @ARGV == 0 ){
1579 Warn( "no script command given" );
1580 usage();
1581 exit( 1 );
1582 }
1583 push( @Commands, split( "\n", shift( @ARGV ) ) );
1584 $Defined{0} = ' #1';
1585}
8d063cd8 1586
86a59229 1587print STDERR "Files: @ARGV\n" if $useDEBUG;
8d063cd8 1588
86a59229
WL
1589# generate leading code
1590#
d16f50bd 1591$Func = <<'[TheEnd]';
86a59229 1592
d16f50bd
WL
1593# openARGV: open 1st input file
1594#
86a59229
WL
1595sub openARGV(){
1596 unshift( @ARGV, '-' ) unless @ARGV;
1597 my $file = shift( @ARGV );
1598 open( ARG, "<$file" )
1599 || die( "$0: can't open $file for reading ($!)\n" );
1600 $isEOF = 0;
1601}
8d063cd8 1602
d16f50bd
WL
1603# getsARGV: Read another input line into argument (default: $_).
1604# Move on to next input file, and reset EOF flag $isEOF.
86a59229
WL
1605sub getsARGV(;\$){
1606 my $argref = @_ ? shift() : \$_;
1607 while( $isEOF || ! defined( $$argref = <ARG> ) ){
1608 close( ARG );
1609 return 0 unless @ARGV;
1610 my $file = shift( @ARGV );
1611 open( ARG, "<$file" )
1612 || die( "$0: can't open $file for reading ($!)\n" );
1613 $isEOF = 0;
1614 }
1615 1;
1616}
8d063cd8 1617
d16f50bd
WL
1618# eofARGV: end-of-file test
1619#
86a59229
WL
1620sub eofARGV(){
1621 return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1622}
8d063cd8 1623
d16f50bd
WL
1624# makeHandle: Generates another file handle for some file (given by its path)
1625# to be written due to a w command or an s command's w flag.
86a59229
WL
1626sub makeHandle($){
1627 my( $path ) = @_;
1628 my $handle;
1629 if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1630 $handle = $wFiles{$path} = gensym();
1631 if( $doOpenWrite ){
1632 if( ! open( $handle, ">$path" ) ){
1633 die( "$0: can't open $path for writing: ($!)\n" );
1634 }
9ef589d8 1635 }
86a59229
WL
1636 } else {
1637 $handle = $wFiles{$path};
1638 }
1639 return $handle;
1640}
9ef589d8 1641
d16f50bd
WL
1642# printQ: Print queued output which is either a string or a reference
1643# to a pathname.
86a59229
WL
1644sub printQ(){
1645 for my $q ( @Q ){
1646 if( ref( $q ) ){
d16f50bd 1647 # flush open w files so that reading this file gets it all
86a59229
WL
1648 if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1649 open( $wFiles{$$q}, ">>$$q" );
1650 }
d16f50bd 1651 # copy file to stdout: slow, but safe
86a59229 1652 if( open( RF, "<$$q" ) ){
d16f50bd 1653 while( defined( my $line = <RF> ) ){
86a59229
WL
1654 print $line;
1655 }
1656 close( RF );
1657 }
1658 } else {
1659 print $q;
a687059c 1660 }
86a59229
WL
1661 }
1662 undef( @Q );
1663}
1664
d16f50bd
WL
1665[TheEnd]
1666
1667# generate the sed loop
1668#
1669$Code .= <<'[TheEnd]';
1670sub openARGV();
1671sub getsARGV(;\$);
1672sub eofARGV();
1673sub printQ();
1674
1675# Run: the sed loop reading input and applying the script
1676#
86a59229
WL
1677sub Run(){
1678 my( $h, $icnt, $s, $n );
1679 # hack (not unbreakable :-/) to avoid // matching an empty string
1680 my $z = "\000"; $z =~ /$z/;
1681 # Initialize.
1682 openARGV();
1683 $Hold = '';
1684 $CondReg = 0;
1685 $doPrint = $doAutoPrint;
1686CYCLE:
1687 while( getsARGV() ){
1688 chomp();
1689 $CondReg = 0; # cleared on t
1690BOS:;
1691[TheEnd]
1692
1693 # parse - avoid opening files when doing s2p
1694 #
1695 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1696 if $doGenerate;
1697 Parse();
1698 ( $svOpenWrite, $doOpenWrite ) = ( $doOpenWrite, $svOpenWrite )
1699 if $doGenerate;
1700
1701 # append trailing code
1702 #
1703 $Code .= <<'[TheEnd]';
1704EOS: if( $doPrint ){
1705 print $_, "\n";
1706 } else {
1707 $doPrint = $doAutoPrint;
a687059c 1708 }
86a59229 1709 printQ() if @Q;
a687059c 1710 }
86a59229
WL
1711
1712 exit( 0 );
a687059c 1713}
86a59229
WL
1714[TheEnd]
1715
d16f50bd
WL
1716
1717# append optional functions, prepend prototypes
1718#
1719my $Proto = "# prototypes\n";
1720if( $GenKey{'l'} ){
1721 $Proto .= "sub _l();\n";
1722 $Func .= <<'[TheEnd]';
1723# _l: l command processing
1724#
1725sub _l(){
1726 my $h = $_;
1727 my $mcpl = 70;
1728 # transform non printing chars into escape notation
1729 $h =~ s/\\/\\\\/g;
1730 if( $h =~ /[^[:print:]]/ ){
1731 $h =~ s/\a/\\a/g;
1732 $h =~ s/\f/\\f/g;
1733 $h =~ s/\n/\\n/g;
1734 $h =~ s/\t/\\t/g;
1735 $h =~ s/\r/\\r/g;
1736 $h =~ s/\e/\\e/g;
1737 $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1738 }
1739 # split into lines of length $mcpl
1740 while( length( $h ) > $mcpl ){
1741 my $l = substr( $h, 0, $mcpl-1 );
1742 $h = substr( $h, $mcpl );
1743 # remove incomplete \-escape from end of line
1744 if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1745 $h = $1 . $h;
1746 }
1747 print $l, "\\\n";
1748 }
1749 print "$h\$\n";
1750}
1751
1752[TheEnd]
1753}
1754
1755if( $GenKey{'r'} ){
1756 $Proto .= "sub _r(\$);\n";
1757 $Func .= <<'[TheEnd]';
1758# _r: r command processing: Save a reference to the pathname.
1759#
1760sub _r($){
1761 my $path = shift();
1762 push( @Q, \$path );
1763}
1764
1765[TheEnd]
1766}
1767
1768if( $GenKey{'t'} ){
1769 $Proto .= "sub _t();\n";
1770 $Func .= <<'[TheEnd]';
1771# _t: t command - condition register test/reset
1772#
1773sub _t(){
1774 my $res = $CondReg;
1775 $CondReg = 0;
1776 $res;
1777}
1778
1779[TheEnd]
1780}
1781
1782if( $GenKey{'w'} ){
1783 $Proto .= "sub _w(\$);\n";
1784 $Func .= <<'[TheEnd]';
1785# _w: w command and s command's w flag - write to file
1786#
1787sub _w($){
1788 my $path = shift();
1789 my $handle = $wFiles{$path};
1790 if( ! $doOpenWrite && ! defined( fileno( $handle ) ) ){
1791 open( $handle, ">$path" )
1792 || die( "$0: $path: cannot open ($!)\n" );
1793 }
1794 print $handle $_, "\n";
1795}
1796
1797[TheEnd]
1798}
1799
1800$Code = $Proto . $Code;
1801
86a59229
WL
1802# magic "#n" - same as -n option
1803#
1804$doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
a687059c 1805
86a59229
WL
1806# eval code - check for errors
1807#
d16f50bd
WL
1808print "Code:\n$Code$Func" if $useDEBUG;
1809eval $Code . $Func;
86a59229 1810if( $@ ){
d16f50bd 1811 print "Code:\n$Code$Func";
86a59229 1812 die( "$0: internal error - generated incorrect Perl code: $@\n" );
9ef589d8
LW
1813}
1814
86a59229
WL
1815if( $doGenerate ){
1816
1817 # write full Perl program
1818 #
1819
d16f50bd 1820 # bang line, declarations, prototypes
86a59229
WL
1821 print <<TheEnd;
1822#!$perlpath -w
1823eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1824 if 0;
9ae2b9a0 1825\$0 =~ s/^.*?(\\w+)\[\\.\\w+\]*\$/\$1/;
86a59229
WL
1826
1827use strict;
1828use Symbol;
1829use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1830 \$doAutoPrint \$doOpenWrite \$doPrint };
1831\$doAutoPrint = $doAutoPrint;
1832\$doOpenWrite = $doOpenWrite;
1833TheEnd
1834
1835 my $wf = "'" . join( "', '", keys( %wFiles ) ) . "'";
1836 if( $wf ne "''" ){
1837 print <<TheEnd;
1838sub makeHandle(\$);
1839for my \$p ( $wf ){
1840 exit( 1 ) unless makeHandle( \$p );
9ef589d8 1841}
86a59229
WL
1842TheEnd
1843 }
9ef589d8 1844
86a59229 1845 print $Code;
d16f50bd
WL
1846 print "Run();\n";
1847 print $Func;
86a59229 1848 exit( 0 );
1aa91729 1849
86a59229
WL
1850} else {
1851
1852 # execute: make handles (and optionally open) all w files; run!
86a59229
WL
1853 for my $p ( keys( %wFiles ) ){
1854 exit( 1 ) unless makeHandle( $p );
1855 }
d16f50bd 1856 Run();
1aa91729 1857}
86a59229
WL
1858
1859
1860=head1 ENVIRONMENT
1861
1862The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1863See L<"Additional Atoms">.
1864
1865=head1 DIAGNOSTICS
1866
1867=over 4
1868
3484938b 1869=item ambiguous translation for character `%s' in `y' command
86a59229
WL
1870
1871The indicated character appears twice, with different translations.
1872
1873=item `[' cannot be last in pattern
1874
1875A `[' in a BRE indicates the beginning of a I<bracket expression>.
1876
1877=item `\' cannot be last in pattern
1878
1879A `\' in a BRE is used to make the subsequent character literal.
1880
1881=item `\' cannot be last in substitution
1882
1883A `\' in a subsitution string is used to make the subsequent character literal.
1884
1885=item conflicting flags `%s'
1886
1887In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1888multiple n-th occurrence flags are specified. Note that only the digits
1889`1' through `9' are permitted.
1890
1891=item duplicate label %s (first defined at %s)
1892
1893=item excess address(es)
1894
1895The command has more than the permitted number of addresses.
1896
1897=item extra characters after command (%s)
1898
1899=item illegal option `%s'
1900
1901=item improper delimiter in s command
1902
1903The BRE and substitution may not be delimited with `\' or newline.
1904
1905=item invalid address after `,'
1906
1907=item invalid backreference (%s)
1908
1909The specified backreference number exceeds the number of backreferences
1910in the BRE.
1911
1912=item invalid repeat clause `\{%s\}'
1913
1914The repeat clause does not contain a valid integer value, or pair of
1915values.
1916
1917=item malformed regex, 1st address
1918
1919=item malformed regex, 2nd address
1920
1921=item malformed regular expression
1922
1923=item malformed substitution expression
1924
1925=item malformed `y' command argument
1926
1927The first or second string of a B<y> command is syntactically incorrect.
1928
1929=item maximum less than minimum in `\{%s\}'
1930
1931=item no script command given
1932
1933There must be at least one B<-e> or one B<-f> option specifying a
1934script or script file.
1935
1936=item `\' not valid as delimiter in `y' command
1937
1938=item option -e requires an argument
1939
1940=item option -f requires an argument
1941
1942=item `s' command requires argument
1943
1944=item start of unterminated `{'
1945
1946=item string lengths in `y' command differ
1947
863b2ca0 1948The translation table strings in a B<y> command must have equal lengths.
86a59229
WL
1949
1950=item undefined label `%s'
1951
1952=item unexpected `}'
1953
1954A B<}> command without a preceding B<{> command was encountered.
1955
1956=item unexpected end of script
1957
1958The end of the script was reached although a text line after a
1959B<a>, B<c> or B<i> command indicated another line.
1960
1961=item unknown command `%s'
1962
1963=item unterminated `['
1964
1965A BRE contains an unterminated bracket expression.
1966
1967=item unterminated `\('
1968
1969A BRE contains an unterminated backreference.
1970
1971=item `\{' without closing `\}'
1972
1973A BRE contains an unterminated bounds specification.
1974
1975=item `\)' without preceding `\('
1976
1977=item `y' command requires argument
1978
1979=back
1980
1981=head1 EXAMPLE
1982
1983The basic material for the preceding section was generated by running
1984the sed script
1985
1986 #no autoprint
1987 s/^.*Warn( *"\([^"]*\)".*$/\1/
1988 t process
1989 b
1990 :process
1991 s/$!/%s/g
1992 s/$[_[:alnum:]]\{1,\}/%s/g
1993 s/\\\\/\\/g
1994 s/^/=item /
1995 p
1996
1997on the program's own text, and piping the output into C<sort -u>.
1998
1999
2000=head1 SED SCRIPT TRANSLATION
2001
2002If this program is invoked with the name F<s2p> it will act as a
2003sed-to-Perl translator. After option processing (all other
2004arguments are ignored), a Perl program is printed on standard
2005output, which will process the input stream (as read from all
2006arguments) in the way defined by the sed script and the option setting
2007used for the translation.
2008
2009=head1 SEE ALSO
2010
2011perl(1), re_format(7)
2012
2013=head1 BUGS
2014
2015The B<l> command will show escape characters (ESC) as `C<\e>', but
2016a vertical tab (VT) in octal.
2017
2018Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
2019
2020The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
2021is "the last pattern used, at run time". This deviates from the Perl
2022interpretation, which will re-use the "last last successfully executed
2023regular expression". Since keeping track of pattern usage would create
2024terribly cluttered code, and differences would only appear in obscure
2025context (where other B<sed> implementations appear to deviate, too),
2026the Perl semantics was adopted. Note that common usage of this feature,
2027such as in C</abc/s//xyz/>, will work as expected.
2028
2029Collating elements (of bracket expressions in BREs) are not implemented.
2030
2031=head1 STANDARDS
2032
2033This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
2034definition of B<sed>, and is compatible with the I<OpenBSD>
2035implementation, except where otherwise noted (see L<"BUGS">).
2036
2037=head1 AUTHOR
2038
2039This Perl implementation of I<sed> was written by Wolfgang Laun,
2040I<Wolfgang.Laun@alcatel.at>.
2041
2042=head1 COPYRIGHT and LICENSE
2043
2044This program is free and open software. You may use, modify,
2045distribute, and sell this program (and any modified variants) in any
2046way you wish, provided you do not restrict others from doing the same.
2047
2048=cut
2049
a687059c 2050!NO!SUBS!
4633a7c4
LW
2051
2052close OUT or die "Can't close $file: $!";
2053chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
d38b618a 2054unlink 'psed';
96021e87 2055print "Linking $file to psed.\n";
d38b618a 2056if (defined $Config{d_link}) {
96021e87 2057 link $file, 'psed';
d38b618a
A
2058} else {
2059 unshift @INC, '../lib';
2060 require File::Copy;
2061 File::Copy::syscopy('s2p', 'psed');
2062}
4633a7c4 2063exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
8a5546a1 2064chdir $origdir;