x2p/s2p.PL

   1 #!/usr/bin/perl
   2
   3 use Config;
   4 use File::Basename qw(&basename &dirname);
   5 use Cwd;
   6 use subs qw(link);
   7
   8 sub link { # This is a cut-down version of installperl:link().
   9     my($from,$to) = @_;
  10     my($success) = 0;
  11
  12     eval {
  13         CORE::link($from, $to)
  14             ? $success++
  15             : ($from =~ m#^/afs/# || $to =~ m#^/afs/#)
  16               ? die "AFS"  # okay inside eval {}
  17               : die "Couldn't link $from to $to: $!\n";
  18     };
  19     if ($@) {
  20         require File::Copy;
  21         File::Copy::copy($from, $to)
  22             ? $success++
  23             : warn "Couldn't copy $from to $to: $!\n";
  24     }
  25     $success;
  26 }
  27
  28 # List explicitly here the variables you want Configure to
  29 # generate.  Metaconfig only looks for shell variables, so you
  30 # have to mention them as if they were shell variables, not
  31 # %Config entries.  Thus you write
  32 #  $startperl
  33 # to ensure Configure will look for $Config{startperl}.
  34
  35 # This forces PL files to create target in same directory as PL file.
  36 # This is so that make depend always knows where to find PL derivatives.
  37 $origdir = cwd;
  38 chdir dirname($0);
  39 $file = basename($0, '.PL');
  40 $file .= '.com' if $^O eq 'VMS';
  41
  42 open OUT,">$file" or die "Can't create $file: $!";
  43
  44 print "Extracting $file (with variable substitutions)\n";
  45
  46 # In this section, perl variables will be expanded during extraction.
  47 # You can use $Config{...} to use Configure variables.
  48
  49 print OUT <<"!GROK!THIS!";
  50 $Config{startperl}
  51     eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
  52         if \$running_under_some_shell;
  53 my \$startperl;
  54 my \$perlpath;
  55 (\$startperl = <<'/../') =~ s/\\s*\\z//;
  56 $Config{startperl}
  57 /../
  58 (\$perlpath = <<'/../') =~ s/\\s*\\z//;
  59 $Config{perlpath}
  60 /../
  61 !GROK!THIS!
  62
  63 # In the following, perl variables are not expanded during extraction.
  64
  65 print OUT <<'!NO!SUBS!';
  66
  67 $0 =~ s/^.*?(\w+)[\.\w]*$/$1/;
  68
  69 # (p)sed - a stream editor
  70 # History:  Aug 12 2000: Original version.
  71 #           Mar 25 2002: Rearrange generated Perl program.
  72 #           Jul 23 2007: Fix bug in regex stripping (M.Thorland)
  73
  74 use strict;
  75 use integer;
  76 use Symbol;
  77
  78 =head1 NAME
  79
  80 psed - a stream editor
  81
  82 =head1 SYNOPSIS
  83
  84    psed [-an] script [file ...]
  85    psed [-an] [-e script] [-f script-file] [file ...]
  86
  87    s2p  [-an] [-e script] [-f script-file]
  88
  89 =head1 DESCRIPTION
  90
  91 A stream editor reads the input stream consisting of the specified files
  92 (or standard input, if none are given), processes is line by line by
  93 applying a script consisting of edit commands, and writes resulting lines
  94 to standard output. The filename 'C<->' may be used to read standard input.
  95
  96 The edit script is composed from arguments of B<-e> options and
  97 script-files, in the given order. A single script argument may be specified
  98 as the first parameter.
  99
 100 If this program is invoked with the name F<s2p>, it will act as a
 101 sed-to-Perl translator. See L<"SED SCRIPT TRANSLATION">.
 102
 103 B<sed> returns an exit code of 0 on success or >0 if an error occurred.
 104
 105 =head1 OPTIONS
 106
 107 =over 4
 108
 109 =item B<-a>
 110
 111 A file specified as argument to the B<w> edit command is by default
 112 opened before input processing starts. Using B<-a>, opening of such
 113 files is delayed until the first line is actually written to the file.
 114
 115 =item B<-e> I<script>
 116
 117 The editing commands defined by I<script> are appended to the script.
 118 Multiple commands must be separated by newlines.
 119
 120 =item B<-f> I<script-file>
 121
 122 Editing commands from the specified I<script-file> are read and appended
 123 to the script.
 124
 125 =item B<-n>
 126
 127 By default, a line is written to standard output after the editing script
 128 has been applied to it. The B<-n> option suppresses automatic printing.
 129
 130 =back
 131
 132 =head1 COMMANDS
 133
 134 B<sed> command syntax is defined as
 135
 136 Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
 137
 138 with whitespace being permitted before or after addresses, and between
 139 the function character and the argument. The I<address>es and the
 140 address inverter (C<!>) are used to restrict the application of a
 141 command to the selected line(s) of input.
 142
 143 Each command must be on a line of its own, except where noted in
 144 the synopses below.
 145
 146 The edit cycle performed on each input line consist of reading the line
 147 (without its trailing newline character) into the I<pattern space>,
 148 applying the applicable commands of the edit script, writing the final
 149 contents of the pattern space and a newline to the standard output.
 150 A I<hold space> is provided for saving the contents of the
 151 pattern space for later use.
 152
 153 =head2 Addresses
 154
 155 A sed address is either a line number or a pattern, which may be combined
 156 arbitrarily to construct ranges. Lines are numbered across all input files.
 157
 158 Any address may be followed by an exclamation mark ('C<!>'), selecting
 159 all lines not matching that address.
 160
 161 =over 4
 162
 163 =item I<number>
 164
 165 The line with the given number is selected.
 166
 167 =item B<$>
 168
 169 A dollar sign (C<$>) is the line number of the last line of the input stream.
 170
 171 =item B</>I<regular expression>B</>
 172
 173 A pattern address is a basic regular expression (see
 174 L<"BASIC REGULAR EXPRESSIONS">), between the delimiting character C</>.
 175 Any other character except C<\> or newline may be used to delimit a
 176 pattern address when the initial delimiter is prefixed with a
 177 backslash ('C<\>').
 178
 179 =back
 180
 181 If no address is given, the command selects every line.
 182
 183 If one address is given, it selects the line (or lines) matching the
 184 address.
 185
 186 Two addresses select a range that begins whenever the first address
 187 matches, and ends (including that line) when the second address matches.
 188 If the first (second) address is a matching pattern, the second
 189 address is not applied to the very same line to determine the end of
 190 the range. Likewise, if the second address is a matching pattern, the
 191 first address is not applied to the very same line to determine the
 192 begin of another range. If both addresses are line numbers,
 193 and the second line number is less than the first line number, then
 194 only the first line is selected.
 195
 196
 197 =head2 Functions
 198
 199 The maximum permitted number of addresses is indicated with each
 200 function synopsis below.
 201
 202 The argument I<text> consists of one or more lines following the command.
 203 Embedded newlines in I<text> must be preceded with a backslash.  Other
 204 backslashes in I<text> are deleted and the following character is taken
 205 literally.
 206
 207 =over 4
 208
 209 =cut
 210
 211 my %ComTab;
 212 my %GenKey;
 213 #--------------------------------------------------------------------------
 214 $ComTab{'a'}=[ 1, 'txt', \&Emit,       '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
 215
 216 =item [1addr]B<a\> I<text>
 217
 218 Write I<text> (which must start on the line following the command)
 219 to standard output immediately before reading the next line
 220 of input, either by executing the B<N> function or by beginning a new cycle.
 221
 222 =cut
 223
 224 #--------------------------------------------------------------------------
 225 $ComTab{'b'}=[ 2, 'str', \&Branch,     '{ goto XXX; }'                   ]; #ok
 226
 227 =item [2addr]B<b> [I<label>]
 228
 229 Branch to the B<:> function with the specified I<label>. If no label
 230 is given, branch to the end of the script.
 231
 232 =cut
 233
 234 #--------------------------------------------------------------------------
 235 $ComTab{'c'}=[ 2, 'txt', \&Change,     <<'-X-'                           ]; #ok
 236 { print <<'TheEnd'; } $doPrint = 0; goto EOS;
 237 -X-
 238 ### continue OK => next CYCLE;
 239
 240 =item [2addr]B<c\> I<text>
 241
 242 The line, or range of lines, selected by the address is deleted.
 243 The I<text> (which must start on the line following the command)
 244 is written to standard output. With an address range, this occurs at
 245 the end of the range.
 246
 247 =cut
 248
 249 #--------------------------------------------------------------------------
 250 $ComTab{'d'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 251 { $doPrint = 0;
 252   goto EOS;
 253 }
 254 -X-
 255 ### continue OK => next CYCLE;
 256
 257 =item [2addr]B<d>
 258
 259 Deletes the pattern space and starts the next cycle.
 260
 261 =cut
 262
 263 #--------------------------------------------------------------------------
 264 $ComTab{'D'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 265 { s/^.*\n?//;
 266   if(length($_)){ goto BOS } else { goto EOS }
 267 }
 268 -X-
 269 ### continue OK => next CYCLE;
 270
 271 =item [2addr]B<D>
 272
 273 Deletes the pattern space through the first embedded newline or to the end.
 274 If the pattern space becomes empty, a new cycle is started, otherwise
 275 execution of the script is restarted.
 276
 277 =cut
 278
 279 #--------------------------------------------------------------------------
 280 $ComTab{'g'}=[ 2, '',    \&Emit,       '{ $_ = $Hold };'                 ]; #ok
 281
 282 =item [2addr]B<g>
 283
 284 Replace the contents of the pattern space with the hold space.
 285
 286 =cut
 287
 288 #--------------------------------------------------------------------------
 289 $ComTab{'G'}=[ 2, '',    \&Emit,       '{ $_ .= "\n"; $_ .= $Hold };'    ]; #ok
 290
 291 =item [2addr]B<G>
 292
 293 Append a newline and the contents of the hold space to the pattern space.
 294
 295 =cut
 296
 297 #--------------------------------------------------------------------------
 298 $ComTab{'h'}=[ 2, '',    \&Emit,       '{ $Hold = $_ }'                  ]; #ok
 299
 300 =item [2addr]B<h>
 301
 302 Replace the contents of the hold space with the pattern space.
 303
 304 =cut
 305
 306 #--------------------------------------------------------------------------
 307 $ComTab{'H'}=[ 2, '',    \&Emit,       '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
 308
 309 =item [2addr]B<H>
 310
 311 Append a newline and the contents of the pattern space to the hold space.
 312
 313 =cut
 314
 315 #--------------------------------------------------------------------------
 316 $ComTab{'i'}=[ 1, 'txt', \&Emit,       '{ print <<'."'TheEnd' }\n"       ]; #ok
 317
 318 =item [1addr]B<i\> I<text>
 319
 320 Write the I<text> (which must start on the line following the command)
 321 to standard output.
 322
 323 =cut
 324
 325 #--------------------------------------------------------------------------
 326 $ComTab{'l'}=[ 2, '',    \&Emit,       '{ _l() }'                        ]; #okUTF8
 327
 328 =item [2addr]B<l>
 329
 330 Print the contents of the pattern space: non-printable characters are
 331 shown in C-style escaped form; long lines are split and have a trailing
 332 ^'C<\>' at the point of the split; the true end of a line is marked with
 333 a 'C<$>'. Escapes are: '\a', '\t', '\n', '\f', '\r', '\e' for
 334 BEL, HT, LF, FF, CR, ESC, respectively, and '\' followed by a three-digit
 335 octal number for all other non-printable characters.
 336
 337 =cut
 338
 339 #--------------------------------------------------------------------------
 340 $ComTab{'n'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 341 { print $_, "\n" if $doPrint;
 342   printQ() if @Q;
 343   $CondReg = 0;
 344   last CYCLE unless getsARGV();
 345   chomp();
 346 }
 347 -X-
 348
 349 =item [2addr]B<n>
 350
 351 If automatic printing is enabled, write the pattern space to the standard
 352 output. Replace the pattern space with the next line of input. If
 353 there is no more input, processing is terminated.
 354
 355 =cut
 356
 357 #--------------------------------------------------------------------------
 358 $ComTab{'N'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 359 { printQ() if @Q;
 360   $CondReg = 0;
 361   last CYCLE unless getsARGV( $h );
 362   chomp( $h );
 363   $_ .= "\n$h";
 364 }
 365 -X-
 366
 367 =item [2addr]B<N>
 368
 369 Append a newline and the next line of input to the pattern space. If
 370 there is no more input, processing is terminated.
 371
 372 =cut
 373
 374 #--------------------------------------------------------------------------
 375 $ComTab{'p'}=[ 2, '',    \&Emit,       '{ print $_, "\n"; }'             ]; #ok
 376
 377 =item [2addr]B<p>
 378
 379 Print the pattern space to the standard output. (Use the B<-n> option
 380 to suppress automatic printing at the end of a cycle if you want to
 381 avoid double printing of lines.)
 382
 383 =cut
 384
 385 #--------------------------------------------------------------------------
 386 $ComTab{'P'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 387 { if( /^(.*)/ ){ print $1, "\n"; } }
 388 -X-
 389
 390 =item [2addr]B<P>
 391
 392 Prints the pattern space through the first embedded newline or to the end.
 393
 394 =cut
 395
 396 #--------------------------------------------------------------------------
 397 $ComTab{'q'}=[ 1, '',    \&Emit,       <<'-X-'                           ]; #ok
 398 { print $_, "\n" if $doPrint;
 399   last CYCLE;
 400 }
 401 -X-
 402
 403 =item [1addr]B<q>
 404
 405 Branch to the end of the script and quit without starting a new cycle.
 406
 407 =cut
 408
 409 #--------------------------------------------------------------------------
 410 $ComTab{'r'}=[ 1, 'str', \&Emit,       "{ _r( '-X-' ) }"                 ]; #ok
 411
 412 =item [1addr]B<r> I<file>
 413
 414 Copy the contents of the I<file> to standard output immediately before
 415 the next attempt to read a line of input. Any error encountered while
 416 reading I<file> is silently ignored.
 417
 418 =cut
 419
 420 #--------------------------------------------------------------------------
 421 $ComTab{'s'}=[ 2, 'sub', \&Emit,       ''                                ]; #ok
 422
 423 =item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
 424
 425 Substitute the I<replacement> string for the first substring in
 426 the pattern space that matches the I<regular expression>.
 427 Any character other than backslash or newline can be used instead of a
 428 slash to delimit the regular expression and the replacement.
 429 To use the delimiter as a literal character within the regular expression
 430 and the replacement, precede the character by a backslash ('C<\>').
 431
 432 Literal newlines may be embedded in the replacement string by
 433 preceding a newline with a backslash.
 434
 435 Within the replacement, an ampersand ('C<&>') is replaced by the string
 436 matching the regular expression. The strings 'C<\1>' through 'C<\9>' are
 437 replaced by the corresponding subpattern (see L<"BASIC REGULAR EXPRESSIONS">).
 438 To get a literal 'C<&>' or 'C<\>' in the replacement text, precede it
 439 by a backslash.
 440
 441 The following I<flags> modify the behaviour of the B<s> command:
 442
 443 =over 8
 444
 445 =item B<g>
 446
 447 The replacement is performed for all matching, non-overlapping substrings
 448 of the pattern space.
 449
 450 =item B<1>..B<9>
 451
 452 Replace only the n-th matching substring of the pattern space.
 453
 454 =item B<p>
 455
 456 If the substitution was made, print the new value of the pattern space.
 457
 458 =item B<w> I<file>
 459
 460 If the substitution was made, write the new value of the pattern space
 461 to the specified file.
 462
 463 =back
 464
 465 =cut
 466
 467 #--------------------------------------------------------------------------
 468 $ComTab{'t'}=[ 2, 'str', \&Branch,     '{ goto XXX if _t() }'            ]; #ok
 469
 470 =item [2addr]B<t> [I<label>]
 471
 472 Branch to the B<:> function with the specified I<label> if any B<s>
 473 substitutions have been made since the most recent reading of an input line
 474 or execution of a B<t> function. If no label is given, branch to the end of
 475 the script.
 476
 477
 478 =cut
 479
 480 #--------------------------------------------------------------------------
 481 $ComTab{'w'}=[ 2, 'str', \&Write,      "{ _w( '-X-' ) }"                 ]; #ok
 482
 483 =item [2addr]B<w> I<file>
 484
 485 The contents of the pattern space are written to the I<file>.
 486
 487 =cut
 488
 489 #--------------------------------------------------------------------------
 490 $ComTab{'x'}=[ 2, '',    \&Emit,       '{ ($Hold, $_) = ($_, $Hold) }'   ]; #ok
 491
 492 =item [2addr]B<x>
 493
 494 Swap the contents of the pattern space and the hold space.
 495
 496 =cut
 497
 498 #--------------------------------------------------------------------------
 499 $ComTab{'y'}=[ 2, 'tra', \&Emit,       ''                                ]; #ok
 500
 501 =item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
 502
 503 In the pattern space, replace all characters occurring in I<string1> by the
 504 character at the corresponding position in I<string2>. It is possible
 505 to use any character (other than a backslash or newline) instead of a
 506 slash to delimit the strings.  Within I<string1> and I<string2>, a
 507 backslash followed by any character other than a newline is that literal
 508 character, and a backslash followed by an 'n' is replaced by a newline
 509 character.
 510
 511 =cut
 512
 513 #--------------------------------------------------------------------------
 514 $ComTab{'='}=[ 1, '',    \&Emit,       '{ print "$.\n" }'                ]; #ok
 515
 516 =item [1addr]B<=>
 517
 518 Prints the current line number on the standard output.
 519
 520 =cut
 521
 522 #--------------------------------------------------------------------------
 523 $ComTab{':'}=[ 0, 'str', \&Label,      ''                                ]; #ok
 524
 525 =item [0addr]B<:> [I<label>]
 526
 527 The command specifies the position of the I<label>. It has no other effect.
 528
 529 =cut
 530
 531 #--------------------------------------------------------------------------
 532 $ComTab{'{'}=[ 2, '',    \&BeginBlock, '{'                               ]; #ok
 533 $ComTab{'}'}=[ 0, '',    \&EndBlock,   ';}'                              ]; #ok
 534 # ';' to avoid warning on empty {}-block
 535
 536 =item [2addr]B<{> [I<command>]
 537
 538 =item [0addr]B<}>
 539
 540 These two commands begin and end a command list. The first command may
 541 be given on the same line as the opening B<{> command. The commands
 542 within the list are jointly selected by the address(es) given on the
 543 B<{> command (but may still have individual addresses).
 544
 545 =cut
 546
 547 #--------------------------------------------------------------------------
 548 $ComTab{'#'}=[ 0, 'str', \&Comment,    ''                                ]; #ok
 549
 550 =item [0addr]B<#> [I<comment>]
 551
 552 The entire line is ignored (treated as a comment). If, however, the first
 553 two characters in the script are 'C<#n>', automatic printing of output is
 554 suppressed, as if the B<-n> option were given on the command line.
 555
 556 =back
 557
 558 =cut
 559
 560 use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
 561
 562 my $useDEBUG    = exists( $ENV{PSEDDEBUG} );
 563 my $useEXTBRE   = $ENV{PSEDEXTBRE} || '';
 564 $useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
 565
 566 my $doAutoPrint = 1;          # automatic printing of pattern space (-n => 0)
 567 my $doOpenWrite = 1;          # open w command output files at start (-a => 0)
 568 my $svOpenWrite = 0;          # save $doOpenWrite
 569
 570 # lower case $0 below as a VMSism.  The VMS build procedure creates the
 571 # s2p file traditionally in upper case on the disk.  When VMS is in a
 572 # case preserved or case sensitive mode, $0 will be returned in the exact
 573 # case which will be on the disk, and that is not predictable at this time.
 574
 575 my $doGenerate  = lc($0) eq 's2p';
 576
 577 # Collected and compiled script
 578 #
 579 my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code, $Func );
 580 $Code = '';
 581
 582 ##################
 583 #  Compile Time
 584 #
 585 # Labels
 586 #
 587 # Error handling
 588 #
 589 sub Warn($;$){
 590     my( $msg, $loc ) = @_;
 591     $loc ||= '';
 592     $loc .= ': ' if length( $loc );
 593     warn( "$0: $loc$msg\n" );
 594 }
 595
 596 $labNum = 0;
 597 sub newLabel(){
 598     return 'L_'.++$labNum;
 599 }
 600
 601 # safeHere: create safe here delimiter and  modify opcode and argument
 602 #
 603 sub safeHere($$){
 604     my( $codref, $argref ) = @_;
 605     my $eod = 'EOD000';
 606     while( $$argref =~ /^$eod$/m ){
 607         $eod++;
 608     }
 609     $$codref =~ s/TheEnd/$eod/e;
 610     $$argref .= "$eod\n";
 611 }
 612
 613 # Emit: create address logic and emit command
 614 #
 615 sub Emit($$$$$$){
 616     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 617     my $cond = '';
 618     if( defined( $addr1 ) ){
 619         if( defined( $addr2 ) ){
 620             $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 621         } else {
 622             $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 623         }
 624         $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
 625     }
 626
 627     if( $opcode eq '' ){
 628         $Code .= "$cond$arg\n";
 629
 630     } elsif( $opcode =~ s/-X-/$arg/e ){
 631         $Code .= "$cond$opcode\n";
 632
 633     } elsif( $opcode =~ /TheEnd/ ){
 634         safeHere( \$opcode, \$arg );
 635         $Code .= "$cond$opcode$arg";
 636
 637     } else {
 638         $Code .= "$cond$opcode\n";
 639     }
 640     0;
 641 }
 642
 643 # Write (w command, w flag): store pathname
 644 #
 645 sub Write($$$$$$){
 646     my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
 647     $wFiles{$path} = '';
 648     Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
 649 }
 650
 651
 652 # Label (: command): label definition
 653 #
 654 sub Label($$$$$$){
 655     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 656     my $rc = 0;
 657     $lab =~ s/\s+//;
 658     if( length( $lab ) ){
 659         my $h;
 660         if( ! exists( $Label{$lab} ) ){
 661             $h = $Label{$lab}{name} = newLabel();
 662         } else {
 663             $h = $Label{$lab}{name};
 664             if( exists( $Label{$lab}{defined} ) ){
 665                 my $dl = $Label{$lab}{defined};
 666                 Warn( "duplicate label $lab (first defined at $dl)", $fl );
 667                 $rc = 1;
 668             }
 669         }
 670         $Label{$lab}{defined} = $fl;
 671         $Code .= "$h:;\n";
 672     }
 673     $rc;
 674 }
 675
 676 # BeginBlock ({ command): push block start
 677 #
 678 sub BeginBlock($$$$$$){
 679     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 680     push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
 681     Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 682 }
 683
 684 # EndBlock (} command): check proper nesting
 685 #
 686 sub EndBlock($$$$$$){
 687     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 688     my $rc;
 689     my $jcom = pop( @BlockStack );
 690     if( defined( $jcom ) ){
 691         $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 692     } else {
 693         Warn( "unexpected '}'", $fl );
 694         $rc = 1;
 695     }
 696     $rc;
 697 }
 698
 699 # Branch (t, b commands): check or create label, substitute default
 700 #
 701 sub Branch($$$$$$){
 702     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 703     $lab =~ s/\s+//; # no spaces at end
 704     my $h;
 705     if( length( $lab ) ){
 706         if( ! exists( $Label{$lab} ) ){
 707             $h = $Label{$lab}{name} = newLabel();
 708         } else {
 709             $h = $Label{$lab}{name};
 710         }
 711         push( @{$Label{$lab}{used}}, $fl );
 712     } else {
 713         $h = 'EOS';
 714     }
 715     $opcode =~ s/XXX/$h/e;
 716     Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
 717 }
 718
 719 # Change (c command): is special due to range end watching
 720 #
 721 sub Change($$$$$$){
 722     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 723     my $kwd = $negated ? 'unless' : 'if';
 724     if( defined( $addr2 ) ){
 725         $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 726         if( ! $negated ){
 727             $addr1  = '$icnt = ('.$addr1.')';
 728             $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
 729         }
 730     } else {
 731         $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 732     }
 733     safeHere( \$opcode, \$arg );
 734     $Code .= "$kwd( $addr1 ){\n  $opcode$arg}\n";
 735     0;
 736 }
 737
 738
 739 # Comment (# command): A no-op. Who would've thought that!
 740 #
 741 sub Comment($$$$$$){
 742     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 743 ### $Code .= "# $arg\n";
 744     0;
 745 }
 746
 747 # stripRegex from the current command. If we're in the first
 748 # part of s///, trailing spaces have to be kept as the initial
 749 # part of the replacement string.
 750 #
 751 sub stripRegex($$;$){
 752     my( $del, $sref, $sub ) = @_;
 753     my $regex = $del;
 754     print "stripRegex:$del:$$sref:\n" if $useDEBUG;
 755     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
 756         my $sl = $2;
 757         $regex .= $1.$sl.$del;
 758         if( length( $sl ) % 2 == 0 ){
 759             if( $sub && (length( $3 ) > 0) ){
 760                 $$sref = $3 . $$sref;
 761             }
 762             return $regex;
 763         }
 764         $regex .= $3;
 765     }
 766     undef();
 767 }
 768
 769 # stripTrans: take a <del> terminated string from y command
 770 #   honoring and cleaning up of \-escaped <del>'s
 771 #
 772 sub stripTrans($$){
 773     my( $del, $sref ) = @_;
 774     my $t = '';
 775     print "stripTrans:$del:$$sref:\n" if $useDEBUG;
 776     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
 777         my $sl = $2;
 778         $t .= $1;
 779         if( length( $sl ) % 2 == 0 ){
 780             $t .= $sl;
 781             $t =~ s/\\\\/\\/g;
 782             return $t;
 783         }
 784         chop( $sl );
 785         $t .= $sl.$del.$3;
 786     }
 787     undef();
 788 }
 789
 790 # makey - construct Perl y/// from sed y///
 791 #
 792 sub makey($$$){
 793     my( $fr, $to, $fl ) = @_;
 794     my $error = 0;
 795
 796     # Ensure that any '-' is up front.
 797     # Diagnose duplicate contradicting mappings
 798     my %tr;
 799     for( my $i = 0; $i < length($fr); $i++ ){
 800         my $fc = substr($fr,$i,1);
 801         my $tc = substr($to,$i,1);
 802         if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
 803             Warn( "ambiguous translation for character '$fc' in 'y' command",
 804                   $fl );
 805             $error++;
 806         }
 807         $tr{$fc} = $tc;
 808     }
 809     $fr = $to = '';
 810     if( exists( $tr{'-'} ) ){
 811         ( $fr, $to ) = ( '-', $tr{'-'} );
 812         delete( $tr{'-'} );
 813     } else {
 814         $fr = $to = '';
 815     }
 816     # might just as well sort it...
 817     for my $fc ( sort keys( %tr ) ){
 818         $fr .= $fc;
 819         $to .= $tr{$fc};
 820     }
 821     # make embedded delimiters and newlines safe
 822     $fr =~ s/([{}])/\$1/g;
 823     $to =~ s/([{}])/\$1/g;
 824     $fr =~ s/\n/\\n/g;
 825     $to =~ s/\n/\\n/g;
 826     return $error ? undef() : "{ y{$fr}{$to}; }";
 827 }
 828
 829 ######
 830 # makes - construct Perl s/// from sed s///
 831 #
 832 sub makes($$$$$$$){
 833     my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
 834
 835     # make embedded newlines safe
 836     $regex =~ s/\n/\\n/g;
 837     $subst =~ s/\n/\\n/g;
 838
 839     my $code;
 840     # n-th occurrence
 841     #
 842     if( length( $nmatch ) ){
 843         $code = <<TheEnd;
 844 { \$n = $nmatch;
 845   while( --\$n && ( \$s = m ${regex}g ) ){}
 846   \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
 847   \$CondReg ||= \$s;
 848 TheEnd
 849     } else {
 850         $code = <<TheEnd;
 851 { \$s = s ${regex}${subst}s${global};
 852   \$CondReg ||= \$s;
 853 TheEnd
 854     }
 855     if( $print ){
 856         $code .= '  print $_, "\n" if $s;'."\n";
 857     }
 858     if( defined( $path ) ){
 859         $wFiles{$path} = '';
 860         $code .= " _w( '$path' ) if \$s;\n";
 861         $GenKey{'w'} = 1;
 862     }
 863     $code .= "}";
 864 }
 865
 866 =head1 BASIC REGULAR EXPRESSIONS
 867
 868 A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
 869 of I<atoms>, for matching parts of a string, and I<bounds>, specifying
 870 repetitions of a preceding atom.
 871
 872 =head2 Atoms
 873
 874 The possible atoms of a BRE are: B<.>, matching any single character;
 875 B<^> and B<$>, matching the null string at the beginning or end
 876 of a string, respectively; a I<bracket expressions>, enclosed
 877 in B<[> and B<]> (see below); and any single character with no
 878 other significance (matching that character). A B<\> before one
 879 of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
 880 after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
 881 becomes an atom and establishes the target for a I<backreference>,
 882 consisting of the substring that actually matches the enclosed atoms.
 883 Finally, B<\> followed by one of the digits B<0> through B<9> is a
 884 backreference.
 885
 886 A B<^> that is not first, or a B<$> that is not last does not have
 887 a special significance and need not be preceded by a backslash to
 888 become literal. The same is true for a B<]>, that does not terminate
 889 a bracket expression.
 890
 891 An unescaped backslash cannot be last in a BRE.
 892
 893 =head2 Bounds
 894
 895 The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
 896 atom; B<\{>I<count>B<\}>, specifying that many repetitions;
 897 B<\{>I<minimum>B<,\}>, giving a lower limit; and
 898 B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
 899 bound.
 900
 901 A bound appearing as the first item in a BRE is taken literally.
 902
 903 =head2 Bracket Expressions
 904
 905 A I<bracket expression> is a list of characters, character ranges
 906 and character classes enclosed in B<[> and B<]> and matches any
 907 single character from the represented set of characters.
 908
 909 A character range is written as two characters separated by B<-> and
 910 represents all characters (according to the character collating sequence)
 911 that are not less than the first and not greater than the second.
 912 (Ranges are very collating-sequence-dependent, and portable programs
 913 should avoid relying on them.)
 914
 915 A character class is one of the class names
 916
 917    alnum     digit     punct
 918    alpha     graph     space
 919    blank     lower     upper
 920    cntrl     print     xdigit
 921
 922 enclosed in B<[:> and B<:]> and represents the set of characters
 923 as defined in ctype(3).
 924
 925 If the first character after B<[> is B<^>, the sense of matching is
 926 inverted.
 927
 928 To include a literal 'C<^>', place it anywhere else but first. To
 929 include a literal 'C<]>' place it first or immediately after an
 930 initial B<^>. To include a literal 'C<->' make it the first (or
 931 second after B<^>) or last character, or the second endpoint of
 932 a range.
 933
 934 The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
 935 match the null string at the beginning and end of a word respectively.
 936 (Note that neither is identical to Perl's '\b' atom.)
 937
 938 =head2 Additional Atoms
 939
 940 Since some sed implementations provide additional regular expression
 941 atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
 942 the following backslash escapes:
 943
 944 =over 4
 945
 946 =item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
 947
 948 =item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
 949
 950 =item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
 951
 952 =item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
 953
 954 =item B<\y> Match the empty string at a word boundary.
 955
 956 =item B<\B> Match the empty string between any two either word or non-word characters.
 957
 958 =back
 959
 960 To enable this feature, the environment variable PSEDEXTBRE must be set
 961 to a string containing the requested characters, e.g.:
 962 C<PSEDEXTBRE='E<lt>E<gt>wW'>.
 963
 964 =cut
 965
 966 #####
 967 # bre2p - convert BRE to Perl RE
 968 #
 969 sub peek(\$$){
 970     my( $pref, $ic ) = @_;
 971     $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
 972 }
 973
 974 sub bre2p($$$){
 975     my( $del, $pat, $fl ) = @_;
 976     my $led = $del;
 977     $led =~ tr/{([</})]>/;
 978     $led = '' if $led eq $del;
 979
 980     $pat = substr( $pat, 1, length($pat) - 2 );
 981     my $res = '';
 982     my $bracklev = 0;
 983     my $backref  = 0;
 984     my $parlev = 0;
 985     for( my $ic = 0; $ic < length( $pat ); $ic++ ){
 986         my $c = substr( $pat, $ic, 1 );
 987         if( $c eq '\\' ){
 988             ### backslash escapes
 989             my $nc = peek($pat,$ic);
 990             if( $nc eq '' ){
 991                 Warn( "'\\' cannot be last in pattern", $fl );
 992                 return undef();
 993             }
 994             $ic++;
 995             if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
 996                 $res .= "\\$del";
 997
 998             } elsif( $nc =~ /([[.*\\n])/ ){
 999                 ## check for \-escaped magics and \n:
1000                 ## \[ \. \* \\ \n stay as they are
1001                 $res .= '\\'.$nc;
1002
1003             } elsif( $nc eq '(' ){ ## \( => (
1004                 $parlev++;
1005                 $res .= '(';
1006
1007             } elsif( $nc eq ')' ){ ## \) => )
1008                 $parlev--;
1009                 $backref++;
1010                 if( $parlev < 0 ){
1011                     Warn( "unmatched '\\)'", $fl );
1012                     return undef();
1013                 }
1014                 $res .= ')';
1015
1016             } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
1017                 my $endpos = index( $pat, '\\}', $ic );
1018                 if( $endpos < 0 ){
1019                     Warn( "unmatched '\\{'", $fl );
1020                     return undef();
1021                 }
1022                 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
1023                 $ic = $endpos + 1;
1024
1025                 if( $res =~ /^\^?$/ ){
1026                     $res .= "\\{$rep\}";
1027                 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
1028                     my $min = $1;
1029                     my $com = $2 || '';
1030                     my $max = $3;
1031                     if( length( $max ) ){
1032                         if( $max < $min ){
1033                             Warn( "maximum less than minimum in '\\{$rep\\}'",
1034                                   $fl );
1035                             return undef();
1036                         }
1037                     } else {
1038                         $max = '';
1039                     }
1040                     # simplify some
1041                     if( $min == 0 && $max eq '1' ){
1042                         $res .= '?';
1043                     } elsif( $min == 1 && "$com$max" eq ',' ){
1044                         $res .= '+';
1045                     } elsif( $min == 0 && "$com$max" eq ',' ){
1046                         $res .= '*';
1047                     } else {
1048                         $res .= "{$min$com$max}";
1049                     }
1050                 } else {
1051                     Warn( "invalid repeat clause '\\{$rep\\}'", $fl );
1052                     return undef();
1053                 }
1054
1055             } elsif( $nc =~ /^[1-9]$/ ){
1056                 ## \1 .. \9 => \1 .. \9, but check for a following digit
1057                 if( $nc > $backref ){
1058                     Warn( "invalid backreference ($nc)", $fl );
1059                     return undef();
1060                 }
1061                 $res .= "\\$nc";
1062                 if( peek($pat,$ic) =~ /[0-9]/ ){
1063                     $res .= '(?:)';
1064                 }
1065
1066             } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1067                 ## extensions - at most <>wWyB - not in POSIX
1068                 if(      $nc eq '<' ){ ## \< => \b(?=\w), be precise
1069                     $res .= '\\b(?<=\\W)';
1070                 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1071                     $res .= '\\b(?=\\W)';
1072                 } elsif( $nc eq 'y' ){ ## \y => \b
1073                     $res .= '\\b';
1074                 } else {               ## \B, \w, \W remain the same
1075                     $res .= "\\$nc";
1076                 }
1077             } elsif( $nc eq $led ){
1078                 ## \<closing bracketing-delimiter> - keep '\'
1079                 $res .= "\\$nc";
1080
1081             } else { ## \ <char> => <char> ("as if '\' were not present")
1082                 $res .= $nc;
1083             }
1084
1085         } elsif( $c eq '.' ){ ## . => .
1086             $res .= $c;
1087
1088         } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1089             if( $res =~ /^\^?$/ ){
1090                 $res .= '\\*';
1091             } elsif( substr( $res, -1, 1 ) ne '*' ){
1092                 $res .= $c;
1093             }
1094
1095         } elsif( $c eq '[' ){
1096             ## parse []: [^...] [^]...] [-...]
1097             my $add = '[';
1098             if( peek($pat,$ic) eq '^' ){
1099                 $ic++;
1100                 $add .= '^';
1101             }
1102             my $nc = peek($pat,$ic);
1103             if( $nc eq ']' || $nc eq '-' ){
1104                 $add .= $nc;
1105                 $ic++;
1106             }
1107             # check that [ is not trailing
1108             if( $ic >= length( $pat ) - 1 ){
1109                 Warn( "unmatched '['", $fl );
1110                 return undef();
1111             }
1112             # look for [:...:] and x-y
1113             my $rstr = substr( $pat, $ic+1 );
1114             if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1115                 my $cnt = $1;
1116                 $ic += length( $cnt );
1117                 $cnt =~ s/([\\\$])/\\$1/g; # '\', '$' are magic in Perl []
1118                 # try some simplifications
1119                 my $red = $cnt;
1120                 if( $red =~ s/0-9// ){
1121                     $cnt = $red.'\d';
1122                     if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1123                         $cnt = $red.'\w';
1124                     }
1125                 }
1126                 $add .= $cnt;
1127
1128                 # POSIX 1003.2 has this (optional) for begin/end word
1129                 $add = '\\b(?=\\W)'  if $add eq '[[:<:]]';
1130                 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1131
1132             }
1133
1134             ## may have a trailing '-' before ']'
1135             if( $ic < length($pat) - 1 &&
1136                 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1137                 $ic += length( $1 );
1138                 $add .= $1;
1139                 # another simplification
1140                 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1141                 $res .= $add;
1142             } else {
1143                 Warn( "unmatched '['", $fl );
1144                 return undef();
1145             }
1146
1147         } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1148             $res .= "\\$c";
1149
1150         } elsif( $c eq ']' ){ ## unmatched ] is not magic
1151             $res .= ']';
1152
1153         } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1154             $res .= "\\$c";
1155
1156         } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1157             $res .= length( $res ) ? '\\^' : '^';
1158
1159         } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1160             $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
1161
1162         } else {
1163             $res .= $c;
1164         }
1165     }
1166
1167     if( $parlev ){
1168        Warn( "unmatched '\\('", $fl );
1169        return undef();
1170     }
1171
1172     # final cleanup: eliminate raw HTs
1173     $res =~ s/\t/\\t/g;
1174     return $del . $res . ( $led ? $led : $del );
1175 }
1176
1177
1178 #####
1179 # sub2p - convert sed substitution to Perl substitution
1180 #
1181 sub sub2p($$$){
1182     my( $del, $subst, $fl ) = @_;
1183     my $led = $del;
1184     $led =~ tr/{([</})]>/;
1185     $led = '' if $led eq $del;
1186
1187     $subst = substr( $subst, 1, length($subst) - 2 );
1188     my $res = '';
1189
1190     for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1191         my $c = substr( $subst, $ic, 1 );
1192         if( $c eq '\\' ){
1193             ### backslash escapes
1194             my $nc = peek($subst,$ic);
1195             if( $nc eq '' ){
1196                 Warn( "'\\' cannot be last in substitution", $fl );
1197                 return undef();
1198             }
1199             $ic++;
1200             if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1201                 $res .= '\\' . $nc;
1202             } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1203                 $res .= '${' . $nc . '}';
1204             } else { ## everything else (includes &): omit \
1205                 $res .= $nc;
1206             }
1207         } elsif( $c eq '&' ){ ## & => $&
1208             $res .= '$&';
1209         } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1210             $res .= '\\' . $c;
1211         } else {
1212             $res .= $c;
1213         }
1214     }
1215
1216     # final cleanup: eliminate raw HTs
1217     $res =~ s/\t/\\t/g;
1218     return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1219 }
1220
1221
1222 sub Parse(){
1223     my $error = 0;
1224     my( $pdef, $pfil, $plin );
1225     for( my $icom = 0; $icom < @Commands; $icom++ ){
1226         my $cmd = $Commands[$icom];
1227         print "Parse:$cmd:\n" if $useDEBUG;
1228         $cmd =~ s/^\s+//;
1229         next unless length( $cmd );
1230         my $scom = $icom;
1231         if( exists( $Defined{$icom} ) ){
1232             $pdef = $Defined{$icom};
1233             if( $pdef =~ /^ #(\d+)/ ){
1234                 $pfil = 'expression #';
1235                 $plin = $1;
1236             } else {
1237                 $pfil = "$pdef l.";
1238                 $plin = 1;
1239             }
1240         } else {
1241             $plin++;
1242         }
1243         my $fl = "$pfil$plin";
1244
1245         # insert command as comment in gnerated code
1246         #
1247         $Code .= "# $cmd\n" if $doGenerate;
1248
1249         # The Address(es)
1250         #
1251         my( $negated, $naddr, $addr1, $addr2 );
1252         $naddr = 0;
1253         if(      $cmd =~ s/^(\d+)\s*// ){
1254             $addr1 = "$1"; $naddr++;
1255         } elsif( $cmd =~ s/^\$\s*// ){
1256             $addr1 = 'eofARGV()'; $naddr++;
1257         } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1258             my $del = $1;
1259             my $regex = stripRegex( $del, \$cmd );
1260             if( defined( $regex ) ){
1261                 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1262                 $naddr++;
1263             } else {
1264                 Warn( "malformed regex, 1st address", $fl );
1265                 $error++;
1266                 next;
1267             }
1268         }
1269         if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1270             if(      $cmd =~ s/^(\d+)\s*// ){
1271                 $addr2 = "$1"; $naddr++;
1272             } elsif( $cmd =~ s/^\$\s*// ){
1273                 $addr2 = 'eofARGV()'; $naddr++;
1274             } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1275                 my $del = $1;
1276                 my $regex = stripRegex( $del, \$cmd );
1277                 if( defined( $regex ) ){
1278                     $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1279                     $naddr++;
1280                 } else {
1281                     Warn( "malformed regex, 2nd address", $fl );
1282                     $error++;
1283                     next;
1284                 }
1285             } else {
1286                 Warn( "invalid address after ','", $fl );
1287                 $error++;
1288                 next;
1289             }
1290         }
1291
1292         # address modifier '!'
1293         #
1294         $negated = $cmd =~ s/^!\s*//;
1295         if( defined( $addr1 ) ){
1296             print "Parse: addr1=$addr1" if $useDEBUG;
1297             if( defined( $addr2 ) ){
1298                 print ", addr2=$addr2 " if $useDEBUG;
1299                 # both numeric and addr1 > addr2 => eliminate addr2
1300                 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1301                                    $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1302             }
1303         }
1304         print 'negated' if $useDEBUG && $negated;
1305         print " command:$cmd\n" if $useDEBUG;
1306
1307         # The Command
1308         #
1309         if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1310             my $h = substr( $cmd, 0, 1 );
1311             Warn( "unknown command '$h'", $fl );
1312             $error++;
1313             next;
1314         }
1315         my $key = $1;
1316
1317         my $tabref = $ComTab{$key};
1318         $GenKey{$key} = 1;
1319         if( $naddr > $tabref->[0] ){
1320             Warn( "excess address(es)", $fl );
1321             $error++;
1322             next;
1323         }
1324
1325         my $arg = '';
1326         if(      $tabref->[1] eq 'str' ){
1327             # take remainder - don't care if it is empty
1328             $arg = $cmd;
1329             $cmd = '';
1330
1331         } elsif( $tabref->[1] eq 'txt' ){
1332             # multi-line text
1333             my $goon = $cmd =~ /(.*)\\$/;
1334             if( length( $1 ) ){
1335                 Warn( "extra characters after command ($cmd)", $fl );
1336                 $error++;
1337             }
1338             while( $goon ){
1339                 $icom++;
1340                 if( $icom > $#Commands ){
1341                     Warn( "unexpected end of script", $fl );
1342                     $error++;
1343                     last;
1344                 }
1345                 $cmd = $Commands[$icom];
1346                 $Code .= "# $cmd\n" if $doGenerate;
1347                 $goon = $cmd =~ s/\\$//;
1348                 $cmd =~ s/\\(.)/$1/g;
1349                 $arg .= "\n" if length( $arg );
1350                 $arg .= $cmd;
1351             }
1352             $arg .= "\n" if length( $arg );
1353             $cmd = '';
1354
1355         } elsif( $tabref->[1] eq 'sub' ){
1356             # s///
1357             if( ! length( $cmd ) ){
1358                 Warn( "'s' command requires argument", $fl );
1359                 $error++;
1360                 next;
1361             }
1362             if( $cmd =~ s{^([^\\\n])}{} ){
1363                 my $del = $1;
1364                 my $regex = stripRegex( $del, \$cmd, "s" );
1365                 if( ! defined( $regex ) ){
1366                     Warn( "malformed regular expression", $fl );
1367                     $error++;
1368                     next;
1369                 }
1370                 $regex = bre2p( $del, $regex, $fl );
1371
1372                 # a trailing \ indicates embedded NL (in replacement string)
1373                 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1374                     $icom++;
1375                     if( $icom > $#Commands ){
1376                         Warn( "unexpected end of script", $fl );
1377                         $error++;
1378                         last;
1379                     }
1380                     $cmd .= $Commands[$icom];
1381                     $Code .= "# $Commands[$icom]\n" if $doGenerate;
1382                 }
1383
1384                 my $subst = stripRegex( $del, \$cmd );
1385                 if( ! defined( $regex ) ){
1386                     Warn( "malformed substitution expression", $fl );
1387                     $error++;
1388                     next;
1389                 }
1390                 $subst = sub2p( $del, $subst, $fl );
1391
1392                 # parse s/// modifier: g|p|0-9|w <file>
1393                 my( $global, $nmatch, $print, $write ) =
1394                   ( '',      '',      0,      undef );
1395                 while( $cmd =~ s/^([gp0-9])// ){
1396                     $1 eq 'g' ? ( $global = 'g' ) :
1397                     $1 eq 'p' ? ( $print  = $1  ) : ( $nmatch .= $1 );
1398                 }
1399                 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1400                 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1401                 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1402                     Warn( "conflicting flags '$global$nmatch'", $fl );
1403                     $error++;
1404                     next;
1405                 }
1406
1407                 $arg = makes( $regex, $subst,
1408                               $write, $global, $print, $nmatch, $fl );
1409                 if( ! defined( $arg ) ){
1410                     $error++;
1411                     next;
1412                 }
1413
1414             } else {
1415                 Warn( "improper delimiter in s command", $fl );
1416                 $error++;
1417                 next;
1418             }
1419
1420         } elsif( $tabref->[1] eq 'tra' ){
1421             # y///
1422             # a trailing \ indicates embedded newline
1423             while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1424                 $icom++;
1425                 if( $icom > $#Commands ){
1426                     Warn( "unexpected end of script", $fl );
1427                     $error++;
1428                     last;
1429                 }
1430                 $cmd .= $Commands[$icom];
1431                 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1432             }
1433             if( ! length( $cmd ) ){
1434                 Warn( "'y' command requires argument", $fl );
1435                 $error++;
1436                 next;
1437             }
1438             my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1439             if( $d eq '\\' ){
1440                 Warn( "'\\' not valid as delimiter in 'y' command", $fl );
1441                 $error++;
1442                 next;
1443             }
1444             my $fr = stripTrans( $d, \$cmd );
1445             if( ! defined( $fr ) || ! length( $cmd ) ){
1446                 Warn( "malformed 'y' command argument", $fl );
1447                 $error++;
1448                 next;
1449             }
1450             my $to = stripTrans( $d, \$cmd );
1451             if( ! defined( $to ) ){
1452                 Warn( "malformed 'y' command argument", $fl );
1453                 $error++;
1454                 next;
1455             }
1456             if( length($fr) != length($to) ){
1457                 Warn( "string lengths in 'y' command differ", $fl );
1458                 $error++;
1459                 next;
1460             }
1461             if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1462                 $error++;
1463                 next;
1464             }
1465
1466         }
1467
1468         # $cmd must be now empty - exception is {
1469         if( $cmd !~ /^\s*$/ ){
1470             if( $key eq '{' ){
1471                 # dirty hack to process command on '{' line
1472                 $Commands[$icom--] = $cmd;
1473             } else {
1474                 Warn( "extra characters after command ($cmd)", $fl );
1475                 $error++;
1476                 next;
1477             }
1478         }
1479
1480         # Make Code
1481         #
1482         if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1483                              $tabref->[3], $arg, $fl ) ){
1484             $error++;
1485         }
1486     }
1487
1488     while( @BlockStack ){
1489         my $bl = pop( @BlockStack );
1490         Warn( "start of unterminated '{'", $bl );
1491         $error++;
1492     }
1493
1494     for my $lab ( keys( %Label ) ){
1495         if( ! exists( $Label{$lab}{defined} ) ){
1496             for my $used ( @{$Label{$lab}{used}} ){
1497                 Warn( "undefined label '$lab'", $used );
1498                 $error++;
1499             }
1500         }
1501     }
1502
1503     exit( 1 ) if $error;
1504 }
1505
1506
1507 ##############
1508 #### MAIN ####
1509 ##############
1510
1511 sub usage(){
1512     print STDERR "Usage: sed [-an] command [file...]\n";
1513     print STDERR "           [-an] [-e command] [-f script-file] [file...]\n";
1514 }
1515
1516 ###################
1517 # Here we go again...
1518 #
1519 my $expr = 0;
1520 while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1521     my $opt = $1;
1522     my $arg = $2;
1523     shift( @ARGV );
1524     if(      $opt eq 'e' ){
1525         if( length( $arg ) ){
1526             push( @Commands, split( "\n", $arg ) );
1527         } elsif( @ARGV ){
1528             push( @Commands, shift( @ARGV ) );
1529         } else {
1530             Warn( "option -e requires an argument" );
1531             usage();
1532             exit( 1 );
1533         }
1534         $expr++;
1535         $Defined{$#Commands} = " #$expr";
1536         next;
1537     }
1538     if( $opt eq 'f' ){
1539         my $path;
1540         if( length( $arg ) ){
1541             $path = $arg;
1542         } elsif( @ARGV ){
1543             $path = shift( @ARGV );
1544         } else {
1545             Warn( "option -f requires an argument" );
1546             usage();
1547             exit( 1 );
1548         }
1549         my $fst = $#Commands + 1;
1550         open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1551         my $cmd;
1552         while( defined( $cmd = <SCRIPT> ) ){
1553             chomp( $cmd );
1554             push( @Commands, $cmd );
1555         }
1556         close( SCRIPT );
1557         if( $#Commands >= $fst ){
1558             $Defined{$fst} = "$path";
1559         }
1560         next;
1561     }
1562     if( $opt eq '-' && $arg eq '' ){
1563         last;
1564     }
1565     if( $opt eq 'h' || $opt eq '?' ){
1566         usage();
1567         exit( 0 );
1568     }
1569     if( $opt eq 'n' ){
1570         $doAutoPrint = 0;
1571     } elsif( $opt eq 'a' ){
1572         $doOpenWrite = 0;
1573     } else {
1574         Warn( "illegal option '$opt'" );
1575         usage();
1576         exit( 1 );
1577     }
1578     if( length( $arg ) ){
1579         unshift( @ARGV, "-$arg" );
1580     }
1581 }
1582
1583 # A singleton command may be the 1st argument when there are no options.
1584 #
1585 if( @Commands == 0 ){
1586     if( @ARGV == 0 ){
1587         Warn( "no script command given" );
1588         usage();
1589         exit( 1 );
1590     }
1591     push( @Commands, split( "\n", shift( @ARGV ) ) );
1592     $Defined{0} = ' #1';
1593 }
1594
1595 print STDERR "Files: @ARGV\n" if $useDEBUG;
1596
1597 # generate leading code
1598 #
1599 $Func = <<'[TheEnd]';
1600
1601 # openARGV: open 1st input file
1602 #
1603 sub openARGV(){
1604     unshift( @ARGV, '-' ) unless @ARGV;
1605     my $file = shift( @ARGV );
1606     open( ARG, "<$file" )
1607     || die( "$0: can't open $file for reading ($!)\n" );
1608     $isEOF = 0;
1609 }
1610
1611 # getsARGV: Read another input line into argument (default: $_).
1612 #           Move on to next input file, and reset EOF flag $isEOF.
1613 sub getsARGV(;\$){
1614     my $argref = @_ ? shift() : \$_;
1615     while( $isEOF || ! defined( $$argref = <ARG> ) ){
1616         close( ARG );
1617         return 0 unless @ARGV;
1618         my $file = shift( @ARGV );
1619         open( ARG, "<$file" )
1620         || die( "$0: can't open $file for reading ($!)\n" );
1621         $isEOF = 0;
1622     }
1623     1;
1624 }
1625
1626 # eofARGV: end-of-file test
1627 #
1628 sub eofARGV(){
1629     return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1630 }
1631
1632 # makeHandle: Generates another file handle for some file (given by its path)
1633 #             to be written due to a w command or an s command's w flag.
1634 sub makeHandle($){
1635     my( $path ) = @_;
1636     my $handle;
1637     if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1638         $handle = $wFiles{$path} = gensym();
1639         if( $doOpenWrite ){
1640             if( ! open( $handle, ">$path" ) ){
1641                 die( "$0: can't open $path for writing: ($!)\n" );
1642             }
1643         }
1644     } else {
1645         $handle = $wFiles{$path};
1646     }
1647     return $handle;
1648 }
1649
1650 # printQ: Print queued output which is either a string or a reference
1651 #         to a pathname.
1652 sub printQ(){
1653     for my $q ( @Q ){
1654         if( ref( $q ) ){
1655             # flush open w files so that reading this file gets it all
1656             if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1657                 open( $wFiles{$$q}, ">>$$q" );
1658             }
1659             # copy file to stdout: slow, but safe
1660             if( open( RF, "<$$q" ) ){
1661                 while( defined( my $line = <RF> ) ){
1662                     print $line;
1663                 }
1664                 close( RF );
1665             }
1666         } else {
1667             print $q;
1668         }
1669     }
1670     undef( @Q );
1671 }
1672
1673 [TheEnd]
1674
1675 # generate the sed loop
1676 #
1677 $Code .= <<'[TheEnd]';
1678 sub openARGV();
1679 sub getsARGV(;\$);
1680 sub eofARGV();
1681 sub printQ();
1682
1683 # Run: the sed loop reading input and applying the script
1684 #
1685 sub Run(){
1686     my( $h, $icnt, $s, $n );
1687     # hack (not unbreakable :-/) to avoid // matching an empty string
1688     my $z = "\000"; $z =~ /$z/;
1689     # Initialize.
1690     openARGV();
1691     $Hold    = '';
1692     $CondReg = 0;
1693     $doPrint = $doAutoPrint;
1694 CYCLE:
1695     while( getsARGV() ){
1696         chomp();
1697         $CondReg = 0;   # cleared on t
1698 BOS:;
1699 [TheEnd]
1700
1701     # parse - avoid opening files when doing s2p
1702     #
1703     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1704       if $doGenerate;
1705     Parse();
1706     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1707       if $doGenerate;
1708
1709     # append trailing code
1710     #
1711     $Code .= <<'[TheEnd]';
1712 EOS:    if( $doPrint ){
1713             print $_, "\n";
1714         } else {
1715             $doPrint = $doAutoPrint;
1716         }
1717         printQ() if @Q;
1718     }
1719
1720     exit( 0 );
1721 }
1722 [TheEnd]
1723
1724
1725 # append optional functions, prepend prototypes
1726 #
1727 my $Proto = "# prototypes\n";
1728 if( $GenKey{'l'} ){
1729     $Proto .= "sub _l();\n";
1730     $Func .= <<'[TheEnd]';
1731 # _l: l command processing
1732 #
1733 sub _l(){
1734     my $h = $_;
1735     my $mcpl = 70;
1736     # transform non printing chars into escape notation
1737     $h =~ s/\\/\\\\/g;
1738     if( $h =~ /[^[:print:]]/ ){
1739         $h =~ s/\a/\\a/g;
1740         $h =~ s/\f/\\f/g;
1741         $h =~ s/\n/\\n/g;
1742         $h =~ s/\t/\\t/g;
1743         $h =~ s/\r/\\r/g;
1744         $h =~ s/\e/\\e/g;
1745         $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1746     }
1747     # split into lines of length $mcpl
1748     while( length( $h ) > $mcpl ){
1749         my $l = substr( $h, 0, $mcpl-1 );
1750         $h = substr( $h, $mcpl );
1751         # remove incomplete \-escape from end of line
1752         if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1753             $h = $1 . $h;
1754         }
1755         print $l, "\\\n";
1756     }
1757     print "$h\$\n";
1758 }
1759
1760 [TheEnd]
1761 }
1762
1763 if( $GenKey{'r'} ){
1764     $Proto .= "sub _r(\$);\n";
1765     $Func .= <<'[TheEnd]';
1766 # _r: r command processing: Save a reference to the pathname.
1767 #
1768 sub _r($){
1769     my $path = shift();
1770     push( @Q, \$path );
1771 }
1772
1773 [TheEnd]
1774 }
1775
1776 if( $GenKey{'t'} ){
1777     $Proto .= "sub _t();\n";
1778     $Func .= <<'[TheEnd]';
1779 # _t: t command - condition register test/reset
1780 #
1781 sub _t(){
1782     my $res = $CondReg;
1783     $CondReg = 0;
1784     $res;
1785 }
1786
1787 [TheEnd]
1788 }
1789
1790 if( $GenKey{'w'} ){
1791     $Proto .= "sub _w(\$);\n";
1792     $Func .= <<'[TheEnd]';
1793 # _w: w command and s command's w flag - write to file
1794 #
1795 sub _w($){
1796     my $path   = shift();
1797     my $handle = $wFiles{$path};
1798     if( ! $doOpenWrite && ! defined( fileno( $handle ) ) ){
1799         open( $handle, ">$path" )
1800         || die( "$0: $path: cannot open ($!)\n" );
1801     }
1802     print $handle $_, "\n";
1803 }
1804
1805 [TheEnd]
1806 }
1807
1808 $Code = $Proto . $Code;
1809
1810 # magic "#n" - same as -n option
1811 #
1812 $doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
1813
1814 # eval code - check for errors
1815 #
1816 print "Code:\n$Code$Func" if $useDEBUG;
1817 eval $Code . $Func;
1818 if( $@ ){
1819     print "Code:\n$Code$Func";
1820     die( "$0: internal error - generated incorrect Perl code: $@\n" );
1821 }
1822
1823 if( $doGenerate ){
1824
1825     # write full Perl program
1826     #
1827
1828     # bang line, declarations, prototypes
1829     print <<TheEnd;
1830 #!$perlpath -w
1831 eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1832   if 0;
1833 \$0 =~ s/^.*?(\\w+)\[\\.\\w+\]*\$/\$1/;
1834
1835 use strict;
1836 use Symbol;
1837 use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1838              \$doAutoPrint \$doOpenWrite \$doPrint };
1839 \$doAutoPrint = $doAutoPrint;
1840 \$doOpenWrite = $doOpenWrite;
1841 TheEnd
1842
1843     my $wf = "'" . join( "', '",  keys( %wFiles ) ) . "'";
1844     if( $wf ne "''" ){
1845         print <<TheEnd;
1846 sub makeHandle(\$);
1847 for my \$p ( $wf ){
1848    exit( 1 ) unless makeHandle( \$p );
1849 }
1850 TheEnd
1851    }
1852
1853    print $Code;
1854    print "Run();\n";
1855    print $Func;
1856    exit( 0 );
1857
1858 } else {
1859
1860     # execute: make handles (and optionally open) all w files; run!
1861     for my $p ( keys( %wFiles ) ){
1862         exit( 1 ) unless makeHandle( $p );
1863     }
1864     Run();
1865 }
1866
1867
1868 =head1 ENVIRONMENT
1869
1870 The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1871 See L<"Additional Atoms">.
1872
1873 =head1 DIAGNOSTICS
1874
1875 =over 4
1876
1877 =item ambiguous translation for character '%s' in 'y' command
1878
1879 The indicated character appears twice, with different translations.
1880
1881 =item '[' cannot be last in pattern
1882
1883 A '[' in a BRE indicates the beginning of a I<bracket expression>.
1884
1885 =item '\' cannot be last in pattern
1886
1887 A '\' in a BRE is used to make the subsequent character literal.
1888
1889 =item '\' cannot be last in substitution
1890
1891 A '\' in a substitution string is used to make the subsequent character literal.
1892
1893 =item conflicting flags '%s'
1894
1895 In an B<s> command, either the 'g' flag and an n-th occurrence flag, or
1896 multiple n-th occurrence flags are specified. Note that only the digits
1897 ^'1' through '9' are permitted.
1898
1899 =item duplicate label %s (first defined at %s)
1900
1901 =item excess address(es)
1902
1903 The command has more than the permitted number of addresses.
1904
1905 =item extra characters after command (%s)
1906
1907 =item illegal option '%s'
1908
1909 =item improper delimiter in s command
1910
1911 The BRE and substitution may not be delimited with '\' or newline.
1912
1913 =item invalid address after ','
1914
1915 =item invalid backreference (%s)
1916
1917 The specified backreference number exceeds the number of backreferences
1918 in the BRE.
1919
1920 =item invalid repeat clause '\{%s\}'
1921
1922 The repeat clause does not contain a valid integer value, or pair of
1923 values.
1924
1925 =item malformed regex, 1st address
1926
1927 =item malformed regex, 2nd address
1928
1929 =item malformed regular expression
1930
1931 =item malformed substitution expression
1932
1933 =item malformed 'y' command argument
1934
1935 The first or second string of a B<y> command  is syntactically incorrect.
1936
1937 =item maximum less than minimum in '\{%s\}'
1938
1939 =item no script command given
1940
1941 There must be at least one B<-e> or one B<-f> option specifying a
1942 script or script file.
1943
1944 =item '\' not valid as delimiter in 'y' command
1945
1946 =item option -e requires an argument
1947
1948 =item option -f requires an argument
1949
1950 =item 's' command requires argument
1951
1952 =item start of unterminated '{'
1953
1954 =item string lengths in 'y' command differ
1955
1956 The translation table strings in a B<y> command must have equal lengths.
1957
1958 =item undefined label '%s'
1959
1960 =item unexpected '}'
1961
1962 A B<}> command without a preceding B<{> command was encountered.
1963
1964 =item unexpected end of script
1965
1966 The end of the script was reached although a text line after a
1967 B<a>, B<c> or B<i> command indicated another line.
1968
1969 =item unknown command '%s'
1970
1971 =item unterminated '['
1972
1973 A BRE contains an unterminated bracket expression.
1974
1975 =item unterminated '\('
1976
1977 A BRE contains an unterminated backreference.
1978
1979 =item '\{' without closing '\}'
1980
1981 A BRE contains an unterminated bounds specification.
1982
1983 =item '\)' without preceding '\('
1984
1985 =item 'y' command requires argument
1986
1987 =back
1988
1989 =head1 EXAMPLE
1990
1991 The basic material for the preceding section was generated by running
1992 the sed script
1993
1994    #no autoprint
1995    s/^.*Warn( *"\([^"]*\)".*$/\1/
1996    t process
1997    b
1998    :process
1999    s/$!/%s/g
2000    s/$[_[:alnum:]]\{1,\}/%s/g
2001    s/\\\\/\\/g
2002    s/^/=item /
2003    p
2004
2005 on the program's own text, and piping the output into C<sort -u>.
2006
2007
2008 =head1 SED SCRIPT TRANSLATION
2009
2010 If this program is invoked with the name F<s2p> it will act as a
2011 sed-to-Perl translator. After option processing (all other
2012 arguments are ignored), a Perl program is printed on standard
2013 output, which will process the input stream (as read from all
2014 arguments) in the way defined by the sed script and the option setting
2015 used for the translation.
2016
2017 =head1 SEE ALSO
2018
2019 perl(1), re_format(7)
2020
2021 =head1 BUGS
2022
2023 The B<l> command will show escape characters (ESC) as 'C<\e>', but
2024 a vertical tab (VT) in octal.
2025
2026 Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
2027
2028 The meaning of an empty regular expression ('C<//>'), as defined by B<sed>,
2029 is "the last pattern used, at run time". This deviates from the Perl
2030 interpretation, which will re-use the "last last successfully executed
2031 regular expression". Since keeping track of pattern usage would create
2032 terribly cluttered code, and differences would only appear in obscure
2033 context (where other B<sed> implementations appear to deviate, too),
2034 the Perl semantics was adopted. Note that common usage of this feature,
2035 such as in C</abc/s//xyz/>, will work as expected.
2036
2037 Collating elements (of bracket expressions in BREs) are not implemented.
2038
2039 =head1 STANDARDS
2040
2041 This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
2042 definition of B<sed>, and is compatible with the I<OpenBSD>
2043 implementation, except where otherwise noted (see L<"BUGS">).
2044
2045 =head1 AUTHOR
2046
2047 This Perl implementation of I<sed> was written by Wolfgang Laun,
2048 I<Wolfgang.Laun@alcatel.at>.
2049
2050 =head1 COPYRIGHT and LICENSE
2051
2052 This program is free and open software. You may use, modify,
2053 distribute, and sell this program (and any modified variants) in any
2054 way you wish, provided you do not restrict others from doing the same.
2055
2056 =cut
2057
2058 !NO!SUBS!
2059
2060 close OUT or die "Can't close $file: $!";
2061 chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
2062 unlink 'psed';
2063 print "Linking $file to psed.\n";
2064 if (defined $Config{d_link}) {
2065   link $file, 'psed';
2066 } else {
2067   unshift @INC, '../lib';
2068   require File::Copy;
2069   File::Copy::syscopy('s2p', 'psed');
2070 }
2071 exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
2072 chdir $origdir;