x2p/s2p.PL

   1 #!/usr/bin/perl
   2
   3 use Config;
   4 use File::Basename qw(&basename &dirname);
   5 use Cwd;
   6 use subs qw(link);
   7
   8 sub link { # This is a cut-down version of installperl:link().
   9     my($from,$to) = @_;
  10     my($success) = 0;
  11
  12     eval {
  13         CORE::link($from, $to)
  14             ? $success++
  15             : ($from =~ m#^/afs/# || $to =~ m#^/afs/#)
  16               ? die "AFS"  # okay inside eval {}
  17               : die "Couldn't link $from to $to: $!\n";
  18     };
  19     if ($@) {
  20         require File::Copy;
  21         File::Copy::copy($from, $to)
  22             ? $success++
  23             : warn "Couldn't copy $from to $to: $!\n";
  24     }
  25     $success;
  26 }
  27
  28 # List explicitly here the variables you want Configure to
  29 # generate.  Metaconfig only looks for shell variables, so you
  30 # have to mention them as if they were shell variables, not
  31 # %Config entries.  Thus you write
  32 #  $startperl
  33 # to ensure Configure will look for $Config{startperl}.
  34
  35 # This forces PL files to create target in same directory as PL file.
  36 # This is so that make depend always knows where to find PL derivatives.
  37 $origdir = cwd;
  38 chdir dirname($0);
  39 $file = basename($0, '.PL');
  40 $file .= '.com' if $^O eq 'VMS';
  41
  42 open OUT,">$file" or die "Can't create $file: $!";
  43
  44 print "Extracting $file (with variable substitutions)\n";
  45
  46 # In this section, perl variables will be expanded during extraction.
  47 # You can use $Config{...} to use Configure variables.
  48
  49 print OUT <<"!GROK!THIS!";
  50 $Config{startperl}
  51     eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
  52         if \$running_under_some_shell;
  53 my \$startperl;
  54 my \$perlpath;
  55 (\$startperl = <<'/../') =~ s/\\s*\\z//;
  56 $Config{startperl}
  57 /../
  58 (\$perlpath = <<'/../') =~ s/\\s*\\z//;
  59 $Config{perlpath}
  60 /../
  61 !GROK!THIS!
  62
  63 # In the following, perl variables are not expanded during extraction.
  64
  65 print OUT <<'!NO!SUBS!';
  66
  67 $0 =~ s/^.*?(\w+)[\.\w]*$/$1/;
  68
  69 # (p)sed - a stream editor
  70 # History:  Aug 12 2000: Original version.
  71 #           Mar 25 2002: Rearrange generated Perl program.
  72 #           Jul 23 2007: Fix bug in regex stripping (M.Thorland)
  73
  74 use strict;
  75 use integer;
  76 use Symbol;
  77
  78 =head1 NAME
  79
  80 psed - a stream editor
  81
  82 =head1 SYNOPSIS
  83
  84    psed [-an] script [file ...]
  85    psed [-an] [-e script] [-f script-file] [file ...]
  86
  87    s2p  [-an] [-e script] [-f script-file]
  88
  89 =head1 DESCRIPTION
  90
  91 A stream editor reads the input stream consisting of the specified files
  92 (or standard input, if none are given), processes is line by line by
  93 applying a script consisting of edit commands, and writes resulting lines
  94 to standard output. The filename `C<->' may be used to read standard input.
  95
  96 The edit script is composed from arguments of B<-e> options and
  97 script-files, in the given order. A single script argument may be specified
  98 as the first parameter.
  99
 100 If this program is invoked with the name F<s2p>, it will act as a
 101 sed-to-Perl translator. See L<"sed Script Translation">.
 102
 103 B<sed> returns an exit code of 0 on success or >0 if an error occurred.
 104
 105 =head1 OPTIONS
 106
 107 =over 4
 108
 109 =item B<-a>
 110
 111 A file specified as argument to the B<w> edit command is by default
 112 opened before input processing starts. Using B<-a>, opening of such
 113 files is delayed until the first line is actually written to the file.
 114
 115 =item B<-e> I<script>
 116
 117 The editing commands defined by I<script> are appended to the script.
 118 Multiple commands must be separated by newlines.
 119
 120 =item B<-f> I<script-file>
 121
 122 Editing commands from the specified I<script-file> are read and appended
 123 to the script.
 124
 125 =item B<-n>
 126
 127 By default, a line is written to standard output after the editing script
 128 has been applied to it. The B<-n> option suppresses automatic printing.
 129
 130 =back
 131
 132 =head1 COMMANDS
 133
 134 B<sed> command syntax is defined as
 135
 136 Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
 137
 138 with whitespace being permitted before or after addresses, and between
 139 the function character and the argument. The I<address>es and the
 140 address inverter (C<!>) are used to restrict the application of a
 141 command to the selected line(s) of input.
 142
 143 Each command must be on a line of its own, except where noted in
 144 the synopses below.
 145
 146 The edit cycle performed on each input line consist of reading the line
 147 (without its trailing newline character) into the I<pattern space>,
 148 applying the applicable commands of the edit script, writing the final
 149 contents of the pattern space and a newline to the standard output.
 150 A I<hold space> is provided for saving the contents of the
 151 pattern space for later use.
 152
 153 =head2 Addresses
 154
 155 A sed address is either a line number or a pattern, which may be combined
 156 arbitrarily to construct ranges. Lines are numbered across all input files.
 157
 158 Any address may be followed by an exclamation mark (`C<!>'), selecting
 159 all lines not matching that address.
 160
 161 =over 4
 162
 163 =item I<number>
 164
 165 The line with the given number is selected.
 166
 167 =item B<$>
 168
 169 A dollar sign (C<$>) is the line number of the last line of the input stream.
 170
 171 =item B</>I<regular expression>B</>
 172
 173 A pattern address is a basic regular expression (see
 174 L<"Basic Regular Expressions">), between the delimiting character C</>.
 175 Any other character except C<\> or newline may be used to delimit a
 176 pattern address when the initial delimiter is prefixed with a
 177 backslash (`C<\>').
 178
 179 =back
 180
 181 If no address is given, the command selects every line.
 182
 183 If one address is given, it selects the line (or lines) matching the
 184 address.
 185
 186 Two addresses select a range that begins whenever the first address
 187 matches, and ends (including that line) when the second address matches.
 188 If the first (second) address is a matching pattern, the second
 189 address is not applied to the very same line to determine the end of
 190 the range. Likewise, if the second address is a matching pattern, the
 191 first address is not applied to the very same line to determine the
 192 begin of another range. If both addresses are line numbers,
 193 and the second line number is less than the first line number, then
 194 only the first line is selected.
 195
 196
 197 =head2 Functions
 198
 199 The maximum permitted number of addresses is indicated with each
 200 function synopsis below.
 201
 202 The argument I<text> consists of one or more lines following the command.
 203 Embedded newlines in I<text> must be preceded with a backslash.  Other
 204 backslashes in I<text> are deleted and the following character is taken
 205 literally.
 206
 207 =over 4
 208
 209 =cut
 210
 211 my %ComTab;
 212 my %GenKey;
 213 #--------------------------------------------------------------------------
 214 $ComTab{'a'}=[ 1, 'txt', \&Emit,       '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
 215
 216 =item [1addr]B<a\> I<text>
 217
 218 Write I<text> (which must start on the line following the command)
 219 to standard output immediately before reading the next line
 220 of input, either by executing the B<N> function or by beginning a new cycle.
 221
 222 =cut
 223
 224 #--------------------------------------------------------------------------
 225 $ComTab{'b'}=[ 2, 'str', \&Branch,     '{ goto XXX; }'                   ]; #ok
 226
 227 =item [2addr]B<b> [I<label>]
 228
 229 Branch to the B<:> function with the specified I<label>. If no label
 230 is given, branch to the end of the script.
 231
 232 =cut
 233
 234 #--------------------------------------------------------------------------
 235 $ComTab{'c'}=[ 2, 'txt', \&Change,     <<'-X-'                           ]; #ok
 236 { print <<'TheEnd'; } $doPrint = 0; goto EOS;
 237 -X-
 238 ### continue OK => next CYCLE;
 239
 240 =item [2addr]B<c\> I<text>
 241
 242 The line, or range of lines, selected by the address is deleted.
 243 The I<text> (which must start on the line following the command)
 244 is written to standard output. With an address range, this occurs at
 245 the end of the range.
 246
 247 =cut
 248
 249 #--------------------------------------------------------------------------
 250 $ComTab{'d'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 251 { $doPrint = 0;
 252   goto EOS;
 253 }
 254 -X-
 255 ### continue OK => next CYCLE;
 256
 257 =item [2addr]B<d>
 258
 259 Deletes the pattern space and starts the next cycle.
 260
 261 =cut
 262
 263 #--------------------------------------------------------------------------
 264 $ComTab{'D'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 265 { s/^.*\n?//;
 266   if(length($_)){ goto BOS } else { goto EOS }
 267 }
 268 -X-
 269 ### continue OK => next CYCLE;
 270
 271 =item [2addr]B<D>
 272
 273 Deletes the pattern space through the first embedded newline or to the end.
 274 If the pattern space becomes empty, a new cycle is started, otherwise
 275 execution of the script is restarted.
 276
 277 =cut
 278
 279 #--------------------------------------------------------------------------
 280 $ComTab{'g'}=[ 2, '',    \&Emit,       '{ $_ = $Hold };'                 ]; #ok
 281
 282 =item [2addr]B<g>
 283
 284 Replace the contents of the pattern space with the hold space.
 285
 286 =cut
 287
 288 #--------------------------------------------------------------------------
 289 $ComTab{'G'}=[ 2, '',    \&Emit,       '{ $_ .= "\n"; $_ .= $Hold };'    ]; #ok
 290
 291 =item [2addr]B<G>
 292
 293 Append a newline and the contents of the hold space to the pattern space.
 294
 295 =cut
 296
 297 #--------------------------------------------------------------------------
 298 $ComTab{'h'}=[ 2, '',    \&Emit,       '{ $Hold = $_ }'                  ]; #ok
 299
 300 =item [2addr]B<h>
 301
 302 Replace the contents of the hold space with the pattern space.
 303
 304 =cut
 305
 306 #--------------------------------------------------------------------------
 307 $ComTab{'H'}=[ 2, '',    \&Emit,       '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
 308
 309 =item [2addr]B<H>
 310
 311 Append a newline and the contents of the pattern space to the hold space.
 312
 313 =cut
 314
 315 #--------------------------------------------------------------------------
 316 $ComTab{'i'}=[ 1, 'txt', \&Emit,       '{ print <<'."'TheEnd' }\n"       ]; #ok
 317
 318 =item [1addr]B<i\> I<text>
 319
 320 Write the I<text> (which must start on the line following the command)
 321 to standard output.
 322
 323 =cut
 324
 325 #--------------------------------------------------------------------------
 326 $ComTab{'l'}=[ 2, '',    \&Emit,       '{ _l() }'                        ]; #okUTF8
 327
 328 =item [2addr]B<l>
 329
 330 Print the contents of the pattern space: non-printable characters are
 331 shown in C-style escaped form; long lines are split and have a trailing
 332 `C<\>' at the point of the split; the true end of a line is marked with
 333 a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
 334 BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
 335 octal number for all other non-printable characters.
 336
 337 =cut
 338
 339 #--------------------------------------------------------------------------
 340 $ComTab{'n'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 341 { print $_, "\n" if $doPrint;
 342   printQ() if @Q;
 343   $CondReg = 0;
 344   last CYCLE unless getsARGV();
 345   chomp();
 346 }
 347 -X-
 348
 349 =item [2addr]B<n>
 350
 351 If automatic printing is enabled, write the pattern space to the standard
 352 output. Replace the pattern space with the next line of input. If
 353 there is no more input, processing is terminated.
 354
 355 =cut
 356
 357 #--------------------------------------------------------------------------
 358 $ComTab{'N'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 359 { printQ() if @Q;
 360   $CondReg = 0;
 361   last CYCLE unless getsARGV( $h );
 362   chomp( $h );
 363   $_ .= "\n$h";
 364 }
 365 -X-
 366
 367 =item [2addr]B<N>
 368
 369 Append a newline and the next line of input to the pattern space. If
 370 there is no more input, processing is terminated.
 371
 372 =cut
 373
 374 #--------------------------------------------------------------------------
 375 $ComTab{'p'}=[ 2, '',    \&Emit,       '{ print $_, "\n"; }'             ]; #ok
 376
 377 =item [2addr]B<p>
 378
 379 Print the pattern space to the standard output. (Use the B<-n> option
 380 to suppress automatic printing at the end of a cycle if you want to
 381 avoid double printing of lines.)
 382
 383 =cut
 384
 385 #--------------------------------------------------------------------------
 386 $ComTab{'P'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 387 { if( /^(.*)/ ){ print $1, "\n"; } }
 388 -X-
 389
 390 =item [2addr]B<P>
 391
 392 Prints the pattern space through the first embedded newline or to the end.
 393
 394 =cut
 395
 396 #--------------------------------------------------------------------------
 397 $ComTab{'q'}=[ 1, '',    \&Emit,       <<'-X-'                           ]; #ok
 398 { print $_, "\n" if $doPrint;
 399   last CYCLE;
 400 }
 401 -X-
 402
 403 =item [1addr]B<q>
 404
 405 Branch to the end of the script and quit without starting a new cycle.
 406
 407 =cut
 408
 409 #--------------------------------------------------------------------------
 410 $ComTab{'r'}=[ 1, 'str', \&Emit,       "{ _r( '-X-' ) }"                 ]; #ok
 411
 412 =item [1addr]B<r> I<file>
 413
 414 Copy the contents of the I<file> to standard output immediately before
 415 the next attempt to read a line of input. Any error encountered while
 416 reading I<file> is silently ignored.
 417
 418 =cut
 419
 420 #--------------------------------------------------------------------------
 421 $ComTab{'s'}=[ 2, 'sub', \&Emit,       ''                                ]; #ok
 422
 423 =item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
 424
 425 Substitute the I<replacement> string for the first substring in
 426 the pattern space that matches the I<regular expression>.
 427 Any character other than backslash or newline can be used instead of a
 428 slash to delimit the regular expression and the replacement.
 429 To use the delimiter as a literal character within the regular expression
 430 and the replacement, precede the character by a backslash (`C<\>').
 431
 432 Literal newlines may be embedded in the replacement string by
 433 preceding a newline with a backslash.
 434
 435 Within the replacement, an ampersand (`C<&>') is replaced by the string
 436 matching the regular expression. The strings `C<\1>' through `C<\9>' are
 437 replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
 438 To get a literal `C<&>' or `C<\>' in the replacement text, precede it
 439 by a backslash.
 440
 441 The following I<flags> modify the behaviour of the B<s> command:
 442
 443 =over 8
 444
 445 =item B<g>
 446
 447 The replacement is performed for all matching, non-overlapping substrings
 448 of the pattern space.
 449
 450 =item B<1>..B<9>
 451
 452 Replace only the n-th matching substring of the pattern space.
 453
 454 =item B<p>
 455
 456 If the substitution was made, print the new value of the pattern space.
 457
 458 =item B<w> I<file>
 459
 460 If the substitution was made, write the new value of the pattern space
 461 to the specified file.
 462
 463 =back
 464
 465 =cut
 466
 467 #--------------------------------------------------------------------------
 468 $ComTab{'t'}=[ 2, 'str', \&Branch,     '{ goto XXX if _t() }'            ]; #ok
 469
 470 =item [2addr]B<t> [I<label>]
 471
 472 Branch to the B<:> function with the specified I<label> if any B<s>
 473 substitutions have been made since the most recent reading of an input line
 474 or execution of a B<t> function. If no label is given, branch to the end of
 475 the script.
 476
 477
 478 =cut
 479
 480 #--------------------------------------------------------------------------
 481 $ComTab{'w'}=[ 2, 'str', \&Write,      "{ _w( '-X-' ) }"                 ]; #ok
 482
 483 =item [2addr]B<w> I<file>
 484
 485 The contents of the pattern space are written to the I<file>.
 486
 487 =cut
 488
 489 #--------------------------------------------------------------------------
 490 $ComTab{'x'}=[ 2, '',    \&Emit,       '{ ($Hold, $_) = ($_, $Hold) }'   ]; #ok
 491
 492 =item [2addr]B<x>
 493
 494 Swap the contents of the pattern space and the hold space.
 495
 496 =cut
 497
 498 #--------------------------------------------------------------------------
 499 $ComTab{'y'}=[ 2, 'tra', \&Emit,       ''                                ]; #ok
 500 =item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
 501
 502 In the pattern space, replace all characters occuring in I<string1> by the
 503 character at the corresponding position in I<string2>. It is possible
 504 to use any character (other than a backslash or newline) instead of a
 505 slash to delimit the strings.  Within I<string1> and I<string2>, a
 506 backslash followed by any character other than a newline is that literal
 507 character, and a backslash followed by an `n' is replaced by a newline
 508 character.
 509
 510 =cut
 511
 512 #--------------------------------------------------------------------------
 513 $ComTab{'='}=[ 1, '',    \&Emit,       '{ print "$.\n" }'                ]; #ok
 514
 515 =item [1addr]B<=>
 516
 517 Prints the current line number on the standard output.
 518
 519 =cut
 520
 521 #--------------------------------------------------------------------------
 522 $ComTab{':'}=[ 0, 'str', \&Label,      ''                                ]; #ok
 523
 524 =item [0addr]B<:> [I<label>]
 525
 526 The command specifies the position of the I<label>. It has no other effect.
 527
 528 =cut
 529
 530 #--------------------------------------------------------------------------
 531 $ComTab{'{'}=[ 2, '',    \&BeginBlock, '{'                               ]; #ok
 532 $ComTab{'}'}=[ 0, '',    \&EndBlock,   ';}'                              ]; #ok
 533 # ';' to avoid warning on empty {}-block
 534
 535 =item [2addr]B<{> [I<command>]
 536
 537 =item [0addr]B<}>
 538
 539 These two commands begin and end a command list. The first command may
 540 be given on the same line as the opening B<{> command. The commands
 541 within the list are jointly selected by the address(es) given on the
 542 B<{> command (but may still have individual addresses).
 543
 544 =cut
 545
 546 #--------------------------------------------------------------------------
 547 $ComTab{'#'}=[ 0, 'str', \&Comment,    ''                                ]; #ok
 548
 549 =item [0addr]B<#> [I<comment>]
 550
 551 The entire line is ignored (treated as a comment). If, however, the first
 552 two characters in the script are `C<#n>', automatic printing of output is
 553 suppressed, as if the B<-n> option were given on the command line.
 554
 555 =back
 556
 557 =cut
 558
 559 use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
 560
 561 my $useDEBUG    = exists( $ENV{PSEDDEBUG} );
 562 my $useEXTBRE   = $ENV{PSEDEXTBRE} || '';
 563 $useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
 564
 565 my $doAutoPrint = 1;          # automatic printing of pattern space (-n => 0)
 566 my $doOpenWrite = 1;          # open w command output files at start (-a => 0)
 567 my $svOpenWrite = 0;          # save $doOpenWrite
 568
 569 # lower case $0 below as a VMSism.  The VMS build procedure creates the
 570 # s2p file traditionally in upper case on the disk.  When VMS is in a
 571 # case preserved or case sensitive mode, $0 will be returned in the exact
 572 # case which will be on the disk, and that is not predictable at this time.
 573
 574 my $doGenerate  = lc($0) eq 's2p';
 575
 576 # Collected and compiled script
 577 #
 578 my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code, $Func );
 579 $Code = '';
 580
 581 ##################
 582 #  Compile Time
 583 #
 584 # Labels
 585 #
 586 # Error handling
 587 #
 588 sub Warn($;$){
 589     my( $msg, $loc ) = @_;
 590     $loc ||= '';
 591     $loc .= ': ' if length( $loc );
 592     warn( "$0: $loc$msg\n" );
 593 }
 594
 595 $labNum = 0;
 596 sub newLabel(){
 597     return 'L_'.++$labNum;
 598 }
 599
 600 # safeHere: create safe here delimiter and  modify opcode and argument
 601 #
 602 sub safeHere($$){
 603     my( $codref, $argref ) = @_;
 604     my $eod = 'EOD000';
 605     while( $$argref =~ /^$eod$/m ){
 606         $eod++;
 607     }
 608     $$codref =~ s/TheEnd/$eod/e;
 609     $$argref .= "$eod\n";
 610 }
 611
 612 # Emit: create address logic and emit command
 613 #
 614 sub Emit($$$$$$){
 615     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 616     my $cond = '';
 617     if( defined( $addr1 ) ){
 618         if( defined( $addr2 ) ){
 619             $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 620         } else {
 621             $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 622         }
 623         $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
 624     }
 625
 626     if( $opcode eq '' ){
 627         $Code .= "$cond$arg\n";
 628
 629     } elsif( $opcode =~ s/-X-/$arg/e ){
 630         $Code .= "$cond$opcode\n";
 631
 632     } elsif( $opcode =~ /TheEnd/ ){
 633         safeHere( \$opcode, \$arg );
 634         $Code .= "$cond$opcode$arg";
 635
 636     } else {
 637         $Code .= "$cond$opcode\n";
 638     }
 639     0;
 640 }
 641
 642 # Write (w command, w flag): store pathname
 643 #
 644 sub Write($$$$$$){
 645     my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
 646     $wFiles{$path} = '';
 647     Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
 648 }
 649
 650
 651 # Label (: command): label definition
 652 #
 653 sub Label($$$$$$){
 654     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 655     my $rc = 0;
 656     $lab =~ s/\s+//;
 657     if( length( $lab ) ){
 658         my $h;
 659         if( ! exists( $Label{$lab} ) ){
 660             $h = $Label{$lab}{name} = newLabel();
 661         } else {
 662             $h = $Label{$lab}{name};
 663             if( exists( $Label{$lab}{defined} ) ){
 664                 my $dl = $Label{$lab}{defined};
 665                 Warn( "duplicate label $lab (first defined at $dl)", $fl );
 666                 $rc = 1;
 667             }
 668         }
 669         $Label{$lab}{defined} = $fl;
 670         $Code .= "$h:;\n";
 671     }
 672     $rc;
 673 }
 674
 675 # BeginBlock ({ command): push block start
 676 #
 677 sub BeginBlock($$$$$$){
 678     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 679     push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
 680     Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 681 }
 682
 683 # EndBlock (} command): check proper nesting
 684 #
 685 sub EndBlock($$$$$$){
 686     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 687     my $rc;
 688     my $jcom = pop( @BlockStack );
 689     if( defined( $jcom ) ){
 690         $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 691     } else {
 692         Warn( "unexpected `}'", $fl );
 693         $rc = 1;
 694     }
 695     $rc;
 696 }
 697
 698 # Branch (t, b commands): check or create label, substitute default
 699 #
 700 sub Branch($$$$$$){
 701     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 702     $lab =~ s/\s+//; # no spaces at end
 703     my $h;
 704     if( length( $lab ) ){
 705         if( ! exists( $Label{$lab} ) ){
 706             $h = $Label{$lab}{name} = newLabel();
 707         } else {
 708             $h = $Label{$lab}{name};
 709         }
 710         push( @{$Label{$lab}{used}}, $fl );
 711     } else {
 712         $h = 'EOS';
 713     }
 714     $opcode =~ s/XXX/$h/e;
 715     Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
 716 }
 717
 718 # Change (c command): is special due to range end watching
 719 #
 720 sub Change($$$$$$){
 721     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 722     my $kwd = $negated ? 'unless' : 'if';
 723     if( defined( $addr2 ) ){
 724         $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 725         if( ! $negated ){
 726             $addr1  = '$icnt = ('.$addr1.')';
 727             $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
 728         }
 729     } else {
 730         $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 731     }
 732     safeHere( \$opcode, \$arg );
 733     $Code .= "$kwd( $addr1 ){\n  $opcode$arg}\n";
 734     0;
 735 }
 736
 737
 738 # Comment (# command): A no-op. Who would've thought that!
 739 #
 740 sub Comment($$$$$$){
 741     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 742 ### $Code .= "# $arg\n";
 743     0;
 744 }
 745
 746 # stripRegex from the current command. If we're in the first
 747 # part of s///, trailing spaces have to be kept as the initial
 748 # part of the replacement string.
 749 #
 750 sub stripRegex($$;$){
 751     my( $del, $sref, $sub ) = @_;
 752     my $regex = $del;
 753     print "stripRegex:$del:$$sref:\n" if $useDEBUG;
 754     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
 755         my $sl = $2;
 756         $regex .= $1.$sl.$del;
 757         if( length( $sl ) % 2 == 0 ){
 758             if( $sub && (length( $3 ) > 0) ){
 759                 $$sref = $3 . $$sref;
 760             }
 761             return $regex;
 762         }
 763         $regex .= $3;
 764     }
 765     undef();
 766 }
 767
 768 # stripTrans: take a <del> terminated string from y command
 769 #   honoring and cleaning up of \-escaped <del>'s
 770 #
 771 sub stripTrans($$){
 772     my( $del, $sref ) = @_;
 773     my $t = '';
 774     print "stripTrans:$del:$$sref:\n" if $useDEBUG;
 775     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
 776         my $sl = $2;
 777         $t .= $1;
 778         if( length( $sl ) % 2 == 0 ){
 779             $t .= $sl;
 780             $t =~ s/\\\\/\\/g;
 781             return $t;
 782         }
 783         chop( $sl );
 784         $t .= $sl.$del.$3;
 785     }
 786     undef();
 787 }
 788
 789 # makey - construct Perl y/// from sed y///
 790 #
 791 sub makey($$$){
 792     my( $fr, $to, $fl ) = @_;
 793     my $error = 0;
 794
 795     # Ensure that any '-' is up front.
 796     # Diagnose duplicate contradicting mappings
 797     my %tr;
 798     for( my $i = 0; $i < length($fr); $i++ ){
 799         my $fc = substr($fr,$i,1);
 800         my $tc = substr($to,$i,1);
 801         if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
 802             Warn( "ambiguous translation for character `$fc' in `y' command",
 803                   $fl );
 804             $error++;
 805         }
 806         $tr{$fc} = $tc;
 807     }
 808     $fr = $to = '';
 809     if( exists( $tr{'-'} ) ){
 810         ( $fr, $to ) = ( '-', $tr{'-'} );
 811         delete( $tr{'-'} );
 812     } else {
 813         $fr = $to = '';
 814     }
 815     # might just as well sort it...
 816     for my $fc ( sort keys( %tr ) ){
 817         $fr .= $fc;
 818         $to .= $tr{$fc};
 819     }
 820     # make embedded delimiters and newlines safe
 821     $fr =~ s/([{}])/\$1/g;
 822     $to =~ s/([{}])/\$1/g;
 823     $fr =~ s/\n/\\n/g;
 824     $to =~ s/\n/\\n/g;
 825     return $error ? undef() : "{ y{$fr}{$to}; }";
 826 }
 827
 828 ######
 829 # makes - construct Perl s/// from sed s///
 830 #
 831 sub makes($$$$$$$){
 832     my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
 833
 834     # make embedded newlines safe
 835     $regex =~ s/\n/\\n/g;
 836     $subst =~ s/\n/\\n/g;
 837
 838     my $code;
 839     # n-th occurrence
 840     #
 841     if( length( $nmatch ) ){
 842         $code = <<TheEnd;
 843 { \$n = $nmatch;
 844   while( --\$n && ( \$s = m ${regex}g ) ){}
 845   \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
 846   \$CondReg ||= \$s;
 847 TheEnd
 848     } else {
 849         $code = <<TheEnd;
 850 { \$s = s ${regex}${subst}s${global};
 851   \$CondReg ||= \$s;
 852 TheEnd
 853     }
 854     if( $print ){
 855         $code .= '  print $_, "\n" if $s;'."\n";
 856     }
 857     if( defined( $path ) ){
 858         $wFiles{$path} = '';
 859         $code .= " _w( '$path' ) if \$s;\n";
 860         $GenKey{'w'} = 1;
 861     }
 862     $code .= "}";
 863 }
 864
 865 =head1 BASIC REGULAR EXPRESSIONS
 866
 867 A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
 868 of I<atoms>, for matching parts of a string, and I<bounds>, specifying
 869 repetitions of a preceding atom.
 870
 871 =head2 Atoms
 872
 873 The possible atoms of a BRE are: B<.>, matching any single character;
 874 B<^> and B<$>, matching the null string at the beginning or end
 875 of a string, respectively; a I<bracket expressions>, enclosed
 876 in B<[> and B<]> (see below); and any single character with no
 877 other significance (matching that character). A B<\> before one
 878 of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
 879 after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
 880 becomes an atom and establishes the target for a I<backreference>,
 881 consisting of the substring that actually matches the enclosed atoms.
 882 Finally, B<\> followed by one of the digits B<0> through B<9> is a
 883 backreference.
 884
 885 A B<^> that is not first, or a B<$> that is not last does not have
 886 a special significance and need not be preceded by a backslash to
 887 become literal. The same is true for a B<]>, that does not terminate
 888 a bracket expression.
 889
 890 An unescaped backslash cannot be last in a BRE.
 891
 892 =head2 Bounds
 893
 894 The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
 895 atom; B<\{>I<count>B<\}>, specifying that many repetitions;
 896 B<\{>I<minimum>B<,\}>, giving a lower limit; and
 897 B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
 898 bound.
 899
 900 A bound appearing as the first item in a BRE is taken literally.
 901
 902 =head2 Bracket Expressions
 903
 904 A I<bracket expression> is a list of characters, character ranges
 905 and character classes enclosed in B<[> and B<]> and matches any
 906 single character from the represented set of characters.
 907
 908 A character range is written as two characters separated by B<-> and
 909 represents all characters (according to the character collating sequence)
 910 that are not less than the first and not greater than the second.
 911 (Ranges are very collating-sequence-dependent, and portable programs
 912 should avoid relying on them.)
 913
 914 A character class is one of the class names
 915
 916    alnum     digit     punct
 917    alpha     graph     space
 918    blank     lower     upper
 919    cntrl     print     xdigit
 920
 921 enclosed in B<[:> and B<:]> and represents the set of characters
 922 as defined in ctype(3).
 923
 924 If the first character after B<[> is B<^>, the sense of matching is
 925 inverted.
 926
 927 To include a literal `C<^>', place it anywhere else but first. To
 928 include a literal 'C<]>' place it first or immediately after an
 929 initial B<^>. To include a literal `C<->' make it the first (or
 930 second after B<^>) or last character, or the second endpoint of
 931 a range.
 932
 933 The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
 934 match the null string at the beginning and end of a word respectively.
 935 (Note that neither is identical to Perl's `\b' atom.)
 936
 937 =head2 Additional Atoms
 938
 939 Since some sed implementations provide additional regular expression
 940 atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
 941 the following backslash escapes:
 942
 943 =over 4
 944
 945 =item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
 946
 947 =item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
 948
 949 =item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
 950
 951 =item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
 952
 953 =item B<\y> Match the empty string at a word boundary.
 954
 955 =item B<\B> Match the empty string between any two either word or non-word characters.
 956
 957 =back
 958
 959 To enable this feature, the environment variable PSEDEXTBRE must be set
 960 to a string containing the requested characters, e.g.:
 961 C<PSEDEXTBRE='E<lt>E<gt>wW'>.
 962
 963 =cut
 964
 965 #####
 966 # bre2p - convert BRE to Perl RE
 967 #
 968 sub peek(\$$){
 969     my( $pref, $ic ) = @_;
 970     $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
 971 }
 972
 973 sub bre2p($$$){
 974     my( $del, $pat, $fl ) = @_;
 975     my $led = $del;
 976     $led =~ tr/{([</})]>/;
 977     $led = '' if $led eq $del;
 978
 979     $pat = substr( $pat, 1, length($pat) - 2 );
 980     my $res = '';
 981     my $bracklev = 0;
 982     my $backref  = 0;
 983     my $parlev = 0;
 984     for( my $ic = 0; $ic < length( $pat ); $ic++ ){
 985         my $c = substr( $pat, $ic, 1 );
 986         if( $c eq '\\' ){
 987             ### backslash escapes
 988             my $nc = peek($pat,$ic);
 989             if( $nc eq '' ){
 990                 Warn( "`\\' cannot be last in pattern", $fl );
 991                 return undef();
 992             }
 993             $ic++;
 994             if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
 995                 $res .= "\\$del";
 996
 997             } elsif( $nc =~ /([[.*\\n])/ ){
 998                 ## check for \-escaped magics and \n:
 999                 ## \[ \. \* \\ \n stay as they are
1000                 $res .= '\\'.$nc;
1001
1002             } elsif( $nc eq '(' ){ ## \( => (
1003                 $parlev++;
1004                 $res .= '(';
1005
1006             } elsif( $nc eq ')' ){ ## \) => )
1007                 $parlev--;
1008                 $backref++;
1009                 if( $parlev < 0 ){
1010                     Warn( "unmatched `\\)'", $fl );
1011                     return undef();
1012                 }
1013                 $res .= ')';
1014
1015             } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
1016                 my $endpos = index( $pat, '\\}', $ic );
1017                 if( $endpos < 0 ){
1018                     Warn( "unmatched `\\{'", $fl );
1019                     return undef();
1020                 }
1021                 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
1022                 $ic = $endpos + 1;
1023
1024                 if( $res =~ /^\^?$/ ){
1025                     $res .= "\\{$rep\}";
1026                 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
1027                     my $min = $1;
1028                     my $com = $2 || '';
1029                     my $max = $3;
1030                     if( length( $max ) ){
1031                         if( $max < $min ){
1032                             Warn( "maximum less than minimum in `\\{$rep\\}'",
1033                                   $fl );
1034                             return undef();
1035                         }
1036                     } else {
1037                         $max = '';
1038                     }
1039                     # simplify some
1040                     if( $min == 0 && $max eq '1' ){
1041                         $res .= '?';
1042                     } elsif( $min == 1 && "$com$max" eq ',' ){
1043                         $res .= '+';
1044                     } elsif( $min == 0 && "$com$max" eq ',' ){
1045                         $res .= '*';
1046                     } else {
1047                         $res .= "{$min$com$max}";
1048                     }
1049                 } else {
1050                     Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1051                     return undef();
1052                 }
1053
1054             } elsif( $nc =~ /^[1-9]$/ ){
1055                 ## \1 .. \9 => \1 .. \9, but check for a following digit
1056                 if( $nc > $backref ){
1057                     Warn( "invalid backreference ($nc)", $fl );
1058                     return undef();
1059                 }
1060                 $res .= "\\$nc";
1061                 if( peek($pat,$ic) =~ /[0-9]/ ){
1062                     $res .= '(?:)';
1063                 }
1064
1065             } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1066                 ## extensions - at most <>wWyB - not in POSIX
1067                 if(      $nc eq '<' ){ ## \< => \b(?=\w), be precise
1068                     $res .= '\\b(?<=\\W)';
1069                 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1070                     $res .= '\\b(?=\\W)';
1071                 } elsif( $nc eq 'y' ){ ## \y => \b
1072                     $res .= '\\b';
1073                 } else {               ## \B, \w, \W remain the same
1074                     $res .= "\\$nc";
1075                 }
1076             } elsif( $nc eq $led ){
1077                 ## \<closing bracketing-delimiter> - keep '\'
1078                 $res .= "\\$nc";
1079
1080             } else { ## \ <char> => <char> ("as if `\' were not present")
1081                 $res .= $nc;
1082             }
1083
1084         } elsif( $c eq '.' ){ ## . => .
1085             $res .= $c;
1086
1087         } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1088             if( $res =~ /^\^?$/ ){
1089                 $res .= '\\*';
1090             } elsif( substr( $res, -1, 1 ) ne '*' ){
1091                 $res .= $c;
1092             }
1093
1094         } elsif( $c eq '[' ){
1095             ## parse []: [^...] [^]...] [-...]
1096             my $add = '[';
1097             if( peek($pat,$ic) eq '^' ){
1098                 $ic++;
1099                 $add .= '^';
1100             }
1101             my $nc = peek($pat,$ic);
1102             if( $nc eq ']' || $nc eq '-' ){
1103                 $add .= $nc;
1104                 $ic++;
1105             }
1106             # check that [ is not trailing
1107             if( $ic >= length( $pat ) - 1 ){
1108                 Warn( "unmatched `['", $fl );
1109                 return undef();
1110             }
1111             # look for [:...:] and x-y
1112             my $rstr = substr( $pat, $ic+1 );
1113             if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1114                 my $cnt = $1;
1115                 $ic += length( $cnt );
1116                 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1117                 # try some simplifications
1118                 my $red = $cnt;
1119                 if( $red =~ s/0-9// ){
1120                     $cnt = $red.'\d';
1121                     if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1122                         $cnt = $red.'\w';
1123                     }
1124                 }
1125                 $add .= $cnt;
1126
1127                 # POSIX 1003.2 has this (optional) for begin/end word
1128                 $add = '\\b(?=\\W)'  if $add eq '[[:<:]]';
1129                 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1130
1131             }
1132
1133             ## may have a trailing `-' before `]'
1134             if( $ic < length($pat) - 1 &&
1135                 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1136                 $ic += length( $1 );
1137                 $add .= $1;
1138                 # another simplification
1139                 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1140                 $res .= $add;
1141             } else {
1142                 Warn( "unmatched `['", $fl );
1143                 return undef();
1144             }
1145
1146         } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1147             $res .= "\\$c";
1148
1149         } elsif( $c eq ']' ){ ## unmatched ] is not magic
1150             $res .= ']';
1151
1152         } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1153             $res .= "\\$c";
1154
1155         } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1156             $res .= length( $res ) ? '\\^' : '^';
1157
1158         } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1159             $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
1160
1161         } else {
1162             $res .= $c;
1163         }
1164     }
1165
1166     if( $parlev ){
1167        Warn( "unmatched `\\('", $fl );
1168        return undef();
1169     }
1170
1171     # final cleanup: eliminate raw HTs
1172     $res =~ s/\t/\\t/g;
1173     return $del . $res . ( $led ? $led : $del );
1174 }
1175
1176
1177 #####
1178 # sub2p - convert sed substitution to Perl substitution
1179 #
1180 sub sub2p($$$){
1181     my( $del, $subst, $fl ) = @_;
1182     my $led = $del;
1183     $led =~ tr/{([</})]>/;
1184     $led = '' if $led eq $del;
1185
1186     $subst = substr( $subst, 1, length($subst) - 2 );
1187     my $res = '';
1188
1189     for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1190         my $c = substr( $subst, $ic, 1 );
1191         if( $c eq '\\' ){
1192             ### backslash escapes
1193             my $nc = peek($subst,$ic);
1194             if( $nc eq '' ){
1195                 Warn( "`\\' cannot be last in substitution", $fl );
1196                 return undef();
1197             }
1198             $ic++;
1199             if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1200                 $res .= '\\' . $nc;
1201             } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1202                 $res .= '${' . $nc . '}';
1203             } else { ## everything else (includes &): omit \
1204                 $res .= $nc;
1205             }
1206         } elsif( $c eq '&' ){ ## & => $&
1207             $res .= '$&';
1208         } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1209             $res .= '\\' . $c;
1210         } else {
1211             $res .= $c;
1212         }
1213     }
1214
1215     # final cleanup: eliminate raw HTs
1216     $res =~ s/\t/\\t/g;
1217     return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1218 }
1219
1220
1221 sub Parse(){
1222     my $error = 0;
1223     my( $pdef, $pfil, $plin );
1224     for( my $icom = 0; $icom < @Commands; $icom++ ){
1225         my $cmd = $Commands[$icom];
1226         print "Parse:$cmd:\n" if $useDEBUG;
1227         $cmd =~ s/^\s+//;
1228         next unless length( $cmd );
1229         my $scom = $icom;
1230         if( exists( $Defined{$icom} ) ){
1231             $pdef = $Defined{$icom};
1232             if( $pdef =~ /^ #(\d+)/ ){
1233                 $pfil = 'expression #';
1234                 $plin = $1;
1235             } else {
1236                 $pfil = "$pdef l.";
1237                 $plin = 1;
1238             }
1239         } else {
1240             $plin++;
1241         }
1242         my $fl = "$pfil$plin";
1243
1244         # insert command as comment in gnerated code
1245         #
1246         $Code .= "# $cmd\n" if $doGenerate;
1247
1248         # The Address(es)
1249         #
1250         my( $negated, $naddr, $addr1, $addr2 );
1251         $naddr = 0;
1252         if(      $cmd =~ s/^(\d+)\s*// ){
1253             $addr1 = "$1"; $naddr++;
1254         } elsif( $cmd =~ s/^\$\s*// ){
1255             $addr1 = 'eofARGV()'; $naddr++;
1256         } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1257             my $del = $1;
1258             my $regex = stripRegex( $del, \$cmd );
1259             if( defined( $regex ) ){
1260                 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1261                 $naddr++;
1262             } else {
1263                 Warn( "malformed regex, 1st address", $fl );
1264                 $error++;
1265                 next;
1266             }
1267         }
1268         if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1269             if(      $cmd =~ s/^(\d+)\s*// ){
1270                 $addr2 = "$1"; $naddr++;
1271             } elsif( $cmd =~ s/^\$\s*// ){
1272                 $addr2 = 'eofARGV()'; $naddr++;
1273             } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1274                 my $del = $1;
1275                 my $regex = stripRegex( $del, \$cmd );
1276                 if( defined( $regex ) ){
1277                     $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1278                     $naddr++;
1279                 } else {
1280                     Warn( "malformed regex, 2nd address", $fl );
1281                     $error++;
1282                     next;
1283                 }
1284             } else {
1285                 Warn( "invalid address after `,'", $fl );
1286                 $error++;
1287                 next;
1288             }
1289         }
1290
1291         # address modifier `!'
1292         #
1293         $negated = $cmd =~ s/^!\s*//;
1294         if( defined( $addr1 ) ){
1295             print "Parse: addr1=$addr1" if $useDEBUG;
1296             if( defined( $addr2 ) ){
1297                 print ", addr2=$addr2 " if $useDEBUG;
1298                 # both numeric and addr1 > addr2 => eliminate addr2
1299                 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1300                                    $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1301             }
1302         }
1303         print 'negated' if $useDEBUG && $negated;
1304         print " command:$cmd\n" if $useDEBUG;
1305
1306         # The Command
1307         #
1308         if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1309             my $h = substr( $cmd, 0, 1 );
1310             Warn( "unknown command `$h'", $fl );
1311             $error++;
1312             next;
1313         }
1314         my $key = $1;
1315
1316         my $tabref = $ComTab{$key};
1317         $GenKey{$key} = 1;
1318         if( $naddr > $tabref->[0] ){
1319             Warn( "excess address(es)", $fl );
1320             $error++;
1321             next;
1322         }
1323
1324         my $arg = '';
1325         if(      $tabref->[1] eq 'str' ){
1326             # take remainder - don't care if it is empty
1327             $arg = $cmd;
1328             $cmd = '';
1329
1330         } elsif( $tabref->[1] eq 'txt' ){
1331             # multi-line text
1332             my $goon = $cmd =~ /(.*)\\$/;
1333             if( length( $1 ) ){
1334                 Warn( "extra characters after command ($cmd)", $fl );
1335                 $error++;
1336             }
1337             while( $goon ){
1338                 $icom++;
1339                 if( $icom > $#Commands ){
1340                     Warn( "unexpected end of script", $fl );
1341                     $error++;
1342                     last;
1343                 }
1344                 $cmd = $Commands[$icom];
1345                 $Code .= "# $cmd\n" if $doGenerate;
1346                 $goon = $cmd =~ s/\\$//;
1347                 $cmd =~ s/\\(.)/$1/g;
1348                 $arg .= "\n" if length( $arg );
1349                 $arg .= $cmd;
1350             }
1351             $arg .= "\n" if length( $arg );
1352             $cmd = '';
1353
1354         } elsif( $tabref->[1] eq 'sub' ){
1355             # s///
1356             if( ! length( $cmd ) ){
1357                 Warn( "`s' command requires argument", $fl );
1358                 $error++;
1359                 next;
1360             }
1361             if( $cmd =~ s{^([^\\\n])}{} ){
1362                 my $del = $1;
1363                 my $regex = stripRegex( $del, \$cmd, "s" );
1364                 if( ! defined( $regex ) ){
1365                     Warn( "malformed regular expression", $fl );
1366                     $error++;
1367                     next;
1368                 }
1369                 $regex = bre2p( $del, $regex, $fl );
1370
1371                 # a trailing \ indicates embedded NL (in replacement string)
1372                 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1373                     $icom++;
1374                     if( $icom > $#Commands ){
1375                         Warn( "unexpected end of script", $fl );
1376                         $error++;
1377                         last;
1378                     }
1379                     $cmd .= $Commands[$icom];
1380                     $Code .= "# $Commands[$icom]\n" if $doGenerate;
1381                 }
1382
1383                 my $subst = stripRegex( $del, \$cmd );
1384                 if( ! defined( $regex ) ){
1385                     Warn( "malformed substitution expression", $fl );
1386                     $error++;
1387                     next;
1388                 }
1389                 $subst = sub2p( $del, $subst, $fl );
1390
1391                 # parse s/// modifier: g|p|0-9|w <file>
1392                 my( $global, $nmatch, $print, $write ) =
1393                   ( '',      '',      0,      undef );
1394                 while( $cmd =~ s/^([gp0-9])// ){
1395                     $1 eq 'g' ? ( $global = 'g' ) :
1396                     $1 eq 'p' ? ( $print  = $1  ) : ( $nmatch .= $1 );
1397                 }
1398                 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1399                 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1400                 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1401                     Warn( "conflicting flags `$global$nmatch'", $fl );
1402                     $error++;
1403                     next;
1404                 }
1405
1406                 $arg = makes( $regex, $subst,
1407                               $write, $global, $print, $nmatch, $fl );
1408                 if( ! defined( $arg ) ){
1409                     $error++;
1410                     next;
1411                 }
1412
1413             } else {
1414                 Warn( "improper delimiter in s command", $fl );
1415                 $error++;
1416                 next;
1417             }
1418
1419         } elsif( $tabref->[1] eq 'tra' ){
1420             # y///
1421             # a trailing \ indicates embedded newline
1422             while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1423                 $icom++;
1424                 if( $icom > $#Commands ){
1425                     Warn( "unexpected end of script", $fl );
1426                     $error++;
1427                     last;
1428                 }
1429                 $cmd .= $Commands[$icom];
1430                 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1431             }
1432             if( ! length( $cmd ) ){
1433                 Warn( "`y' command requires argument", $fl );
1434                 $error++;
1435                 next;
1436             }
1437             my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1438             if( $d eq '\\' ){
1439                 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1440                 $error++;
1441                 next;
1442             }
1443             my $fr = stripTrans( $d, \$cmd );
1444             if( ! defined( $fr ) || ! length( $cmd ) ){
1445                 Warn( "malformed `y' command argument", $fl );
1446                 $error++;
1447                 next;
1448             }
1449             my $to = stripTrans( $d, \$cmd );
1450             if( ! defined( $to ) ){
1451                 Warn( "malformed `y' command argument", $fl );
1452                 $error++;
1453                 next;
1454             }
1455             if( length($fr) != length($to) ){
1456                 Warn( "string lengths in `y' command differ", $fl );
1457                 $error++;
1458                 next;
1459             }
1460             if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1461                 $error++;
1462                 next;
1463             }
1464
1465         }
1466
1467         # $cmd must be now empty - exception is {
1468         if( $cmd !~ /^\s*$/ ){
1469             if( $key eq '{' ){
1470                 # dirty hack to process command on '{' line
1471                 $Commands[$icom--] = $cmd;
1472             } else {
1473                 Warn( "extra characters after command ($cmd)", $fl );
1474                 $error++;
1475                 next;
1476             }
1477         }
1478
1479         # Make Code
1480         #
1481         if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1482                              $tabref->[3], $arg, $fl ) ){
1483             $error++;
1484         }
1485     }
1486
1487     while( @BlockStack ){
1488         my $bl = pop( @BlockStack );
1489         Warn( "start of unterminated `{'", $bl );
1490         $error++;
1491     }
1492
1493     for my $lab ( keys( %Label ) ){
1494         if( ! exists( $Label{$lab}{defined} ) ){
1495             for my $used ( @{$Label{$lab}{used}} ){
1496                 Warn( "undefined label `$lab'", $used );
1497                 $error++;
1498             }
1499         }
1500     }
1501
1502     exit( 1 ) if $error;
1503 }
1504
1505
1506 ##############
1507 #### MAIN ####
1508 ##############
1509
1510 sub usage(){
1511     print STDERR "Usage: sed [-an] command [file...]\n";
1512     print STDERR "           [-an] [-e command] [-f script-file] [file...]\n";
1513 }
1514
1515 ###################
1516 # Here we go again...
1517 #
1518 my $expr = 0;
1519 while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1520     my $opt = $1;
1521     my $arg = $2;
1522     shift( @ARGV );
1523     if(      $opt eq 'e' ){
1524         if( length( $arg ) ){
1525             push( @Commands, split( "\n", $arg ) );
1526         } elsif( @ARGV ){
1527             push( @Commands, shift( @ARGV ) );
1528         } else {
1529             Warn( "option -e requires an argument" );
1530             usage();
1531             exit( 1 );
1532         }
1533         $expr++;
1534         $Defined{$#Commands} = " #$expr";
1535         next;
1536     }
1537     if( $opt eq 'f' ){
1538         my $path;
1539         if( length( $arg ) ){
1540             $path = $arg;
1541         } elsif( @ARGV ){
1542             $path = shift( @ARGV );
1543         } else {
1544             Warn( "option -f requires an argument" );
1545             usage();
1546             exit( 1 );
1547         }
1548         my $fst = $#Commands + 1;
1549         open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1550         my $cmd;
1551         while( defined( $cmd = <SCRIPT> ) ){
1552             chomp( $cmd );
1553             push( @Commands, $cmd );
1554         }
1555         close( SCRIPT );
1556         if( $#Commands >= $fst ){
1557             $Defined{$fst} = "$path";
1558         }
1559         next;
1560     }
1561     if( $opt eq '-' && $arg eq '' ){
1562         last;
1563     }
1564     if( $opt eq 'h' || $opt eq '?' ){
1565         usage();
1566         exit( 0 );
1567     }
1568     if( $opt eq 'n' ){
1569         $doAutoPrint = 0;
1570     } elsif( $opt eq 'a' ){
1571         $doOpenWrite = 0;
1572     } else {
1573         Warn( "illegal option `$opt'" );
1574         usage();
1575         exit( 1 );
1576     }
1577     if( length( $arg ) ){
1578         unshift( @ARGV, "-$arg" );
1579     }
1580 }
1581
1582 # A singleton command may be the 1st argument when there are no options.
1583 #
1584 if( @Commands == 0 ){
1585     if( @ARGV == 0 ){
1586         Warn( "no script command given" );
1587         usage();
1588         exit( 1 );
1589     }
1590     push( @Commands, split( "\n", shift( @ARGV ) ) );
1591     $Defined{0} = ' #1';
1592 }
1593
1594 print STDERR "Files: @ARGV\n" if $useDEBUG;
1595
1596 # generate leading code
1597 #
1598 $Func = <<'[TheEnd]';
1599
1600 # openARGV: open 1st input file
1601 #
1602 sub openARGV(){
1603     unshift( @ARGV, '-' ) unless @ARGV;
1604     my $file = shift( @ARGV );
1605     open( ARG, "<$file" )
1606     || die( "$0: can't open $file for reading ($!)\n" );
1607     $isEOF = 0;
1608 }
1609
1610 # getsARGV: Read another input line into argument (default: $_).
1611 #           Move on to next input file, and reset EOF flag $isEOF.
1612 sub getsARGV(;\$){
1613     my $argref = @_ ? shift() : \$_;
1614     while( $isEOF || ! defined( $$argref = <ARG> ) ){
1615         close( ARG );
1616         return 0 unless @ARGV;
1617         my $file = shift( @ARGV );
1618         open( ARG, "<$file" )
1619         || die( "$0: can't open $file for reading ($!)\n" );
1620         $isEOF = 0;
1621     }
1622     1;
1623 }
1624
1625 # eofARGV: end-of-file test
1626 #
1627 sub eofARGV(){
1628     return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1629 }
1630
1631 # makeHandle: Generates another file handle for some file (given by its path)
1632 #             to be written due to a w command or an s command's w flag.
1633 sub makeHandle($){
1634     my( $path ) = @_;
1635     my $handle;
1636     if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1637         $handle = $wFiles{$path} = gensym();
1638         if( $doOpenWrite ){
1639             if( ! open( $handle, ">$path" ) ){
1640                 die( "$0: can't open $path for writing: ($!)\n" );
1641             }
1642         }
1643     } else {
1644         $handle = $wFiles{$path};
1645     }
1646     return $handle;
1647 }
1648
1649 # printQ: Print queued output which is either a string or a reference
1650 #         to a pathname.
1651 sub printQ(){
1652     for my $q ( @Q ){
1653         if( ref( $q ) ){
1654             # flush open w files so that reading this file gets it all
1655             if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1656                 open( $wFiles{$$q}, ">>$$q" );
1657             }
1658             # copy file to stdout: slow, but safe
1659             if( open( RF, "<$$q" ) ){
1660                 while( defined( my $line = <RF> ) ){
1661                     print $line;
1662                 }
1663                 close( RF );
1664             }
1665         } else {
1666             print $q;
1667         }
1668     }
1669     undef( @Q );
1670 }
1671
1672 [TheEnd]
1673
1674 # generate the sed loop
1675 #
1676 $Code .= <<'[TheEnd]';
1677 sub openARGV();
1678 sub getsARGV(;\$);
1679 sub eofARGV();
1680 sub printQ();
1681
1682 # Run: the sed loop reading input and applying the script
1683 #
1684 sub Run(){
1685     my( $h, $icnt, $s, $n );
1686     # hack (not unbreakable :-/) to avoid // matching an empty string
1687     my $z = "\000"; $z =~ /$z/;
1688     # Initialize.
1689     openARGV();
1690     $Hold    = '';
1691     $CondReg = 0;
1692     $doPrint = $doAutoPrint;
1693 CYCLE:
1694     while( getsARGV() ){
1695         chomp();
1696         $CondReg = 0;   # cleared on t
1697 BOS:;
1698 [TheEnd]
1699
1700     # parse - avoid opening files when doing s2p
1701     #
1702     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1703       if $doGenerate;
1704     Parse();
1705     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1706       if $doGenerate;
1707
1708     # append trailing code
1709     #
1710     $Code .= <<'[TheEnd]';
1711 EOS:    if( $doPrint ){
1712             print $_, "\n";
1713         } else {
1714             $doPrint = $doAutoPrint;
1715         }
1716         printQ() if @Q;
1717     }
1718
1719     exit( 0 );
1720 }
1721 [TheEnd]
1722
1723
1724 # append optional functions, prepend prototypes
1725 #
1726 my $Proto = "# prototypes\n";
1727 if( $GenKey{'l'} ){
1728     $Proto .= "sub _l();\n";
1729     $Func .= <<'[TheEnd]';
1730 # _l: l command processing
1731 #
1732 sub _l(){
1733     my $h = $_;
1734     my $mcpl = 70;
1735     # transform non printing chars into escape notation
1736     $h =~ s/\\/\\\\/g;
1737     if( $h =~ /[^[:print:]]/ ){
1738         $h =~ s/\a/\\a/g;
1739         $h =~ s/\f/\\f/g;
1740         $h =~ s/\n/\\n/g;
1741         $h =~ s/\t/\\t/g;
1742         $h =~ s/\r/\\r/g;
1743         $h =~ s/\e/\\e/g;
1744         $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1745     }
1746     # split into lines of length $mcpl
1747     while( length( $h ) > $mcpl ){
1748         my $l = substr( $h, 0, $mcpl-1 );
1749         $h = substr( $h, $mcpl );
1750         # remove incomplete \-escape from end of line
1751         if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1752             $h = $1 . $h;
1753         }
1754         print $l, "\\\n";
1755     }
1756     print "$h\$\n";
1757 }
1758
1759 [TheEnd]
1760 }
1761
1762 if( $GenKey{'r'} ){
1763     $Proto .= "sub _r(\$);\n";
1764     $Func .= <<'[TheEnd]';
1765 # _r: r command processing: Save a reference to the pathname.
1766 #
1767 sub _r($){
1768     my $path = shift();
1769     push( @Q, \$path );
1770 }
1771
1772 [TheEnd]
1773 }
1774
1775 if( $GenKey{'t'} ){
1776     $Proto .= "sub _t();\n";
1777     $Func .= <<'[TheEnd]';
1778 # _t: t command - condition register test/reset
1779 #
1780 sub _t(){
1781     my $res = $CondReg;
1782     $CondReg = 0;
1783     $res;
1784 }
1785
1786 [TheEnd]
1787 }
1788
1789 if( $GenKey{'w'} ){
1790     $Proto .= "sub _w(\$);\n";
1791     $Func .= <<'[TheEnd]';
1792 # _w: w command and s command's w flag - write to file
1793 #
1794 sub _w($){
1795     my $path   = shift();
1796     my $handle = $wFiles{$path};
1797     if( ! $doOpenWrite && ! defined( fileno( $handle ) ) ){
1798         open( $handle, ">$path" )
1799         || die( "$0: $path: cannot open ($!)\n" );
1800     }
1801     print $handle $_, "\n";
1802 }
1803
1804 [TheEnd]
1805 }
1806
1807 $Code = $Proto . $Code;
1808
1809 # magic "#n" - same as -n option
1810 #
1811 $doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
1812
1813 # eval code - check for errors
1814 #
1815 print "Code:\n$Code$Func" if $useDEBUG;
1816 eval $Code . $Func;
1817 if( $@ ){
1818     print "Code:\n$Code$Func";
1819     die( "$0: internal error - generated incorrect Perl code: $@\n" );
1820 }
1821
1822 if( $doGenerate ){
1823
1824     # write full Perl program
1825     #
1826
1827     # bang line, declarations, prototypes
1828     print <<TheEnd;
1829 #!$perlpath -w
1830 eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1831   if 0;
1832 \$0 =~ s/^.*?(\\w+)\[\\.\\w+\]*\$/\$1/;
1833
1834 use strict;
1835 use Symbol;
1836 use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1837              \$doAutoPrint \$doOpenWrite \$doPrint };
1838 \$doAutoPrint = $doAutoPrint;
1839 \$doOpenWrite = $doOpenWrite;
1840 TheEnd
1841
1842     my $wf = "'" . join( "', '",  keys( %wFiles ) ) . "'";
1843     if( $wf ne "''" ){
1844         print <<TheEnd;
1845 sub makeHandle(\$);
1846 for my \$p ( $wf ){
1847    exit( 1 ) unless makeHandle( \$p );
1848 }
1849 TheEnd
1850    }
1851
1852    print $Code;
1853    print "Run();\n";
1854    print $Func;
1855    exit( 0 );
1856
1857 } else {
1858
1859     # execute: make handles (and optionally open) all w files; run!
1860     for my $p ( keys( %wFiles ) ){
1861         exit( 1 ) unless makeHandle( $p );
1862     }
1863     Run();
1864 }
1865
1866
1867 =head1 ENVIRONMENT
1868
1869 The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1870 See L<"Additional Atoms">.
1871
1872 =head1 DIAGNOSTICS
1873
1874 =over 4
1875
1876 =item ambiguous translation for character `%s' in `y' command
1877
1878 The indicated character appears twice, with different translations.
1879
1880 =item `[' cannot be last in pattern
1881
1882 A `[' in a BRE indicates the beginning of a I<bracket expression>.
1883
1884 =item `\' cannot be last in pattern
1885
1886 A `\' in a BRE is used to make the subsequent character literal.
1887
1888 =item `\' cannot be last in substitution
1889
1890 A `\' in a subsitution string is used to make the subsequent character literal.
1891
1892 =item conflicting flags `%s'
1893
1894 In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1895 multiple n-th occurrence flags are specified. Note that only the digits
1896 `1' through `9' are permitted.
1897
1898 =item duplicate label %s (first defined at %s)
1899
1900 =item excess address(es)
1901
1902 The command has more than the permitted number of addresses.
1903
1904 =item extra characters after command (%s)
1905
1906 =item illegal option `%s'
1907
1908 =item improper delimiter in s command
1909
1910 The BRE and substitution may not be delimited with `\' or newline.
1911
1912 =item invalid address after `,'
1913
1914 =item invalid backreference (%s)
1915
1916 The specified backreference number exceeds the number of backreferences
1917 in the BRE.
1918
1919 =item invalid repeat clause `\{%s\}'
1920
1921 The repeat clause does not contain a valid integer value, or pair of
1922 values.
1923
1924 =item malformed regex, 1st address
1925
1926 =item malformed regex, 2nd address
1927
1928 =item malformed regular expression
1929
1930 =item malformed substitution expression
1931
1932 =item malformed `y' command argument
1933
1934 The first or second string of a B<y> command  is syntactically incorrect.
1935
1936 =item maximum less than minimum in `\{%s\}'
1937
1938 =item no script command given
1939
1940 There must be at least one B<-e> or one B<-f> option specifying a
1941 script or script file.
1942
1943 =item `\' not valid as delimiter in `y' command
1944
1945 =item option -e requires an argument
1946
1947 =item option -f requires an argument
1948
1949 =item `s' command requires argument
1950
1951 =item start of unterminated `{'
1952
1953 =item string lengths in `y' command differ
1954
1955 The translation table strings in a B<y> command must have equal lengths.
1956
1957 =item undefined label `%s'
1958
1959 =item unexpected `}'
1960
1961 A B<}> command without a preceding B<{> command was encountered.
1962
1963 =item unexpected end of script
1964
1965 The end of the script was reached although a text line after a
1966 B<a>, B<c> or B<i> command indicated another line.
1967
1968 =item unknown command `%s'
1969
1970 =item unterminated `['
1971
1972 A BRE contains an unterminated bracket expression.
1973
1974 =item unterminated `\('
1975
1976 A BRE contains an unterminated backreference.
1977
1978 =item `\{' without closing `\}'
1979
1980 A BRE contains an unterminated bounds specification.
1981
1982 =item `\)' without preceding `\('
1983
1984 =item `y' command requires argument
1985
1986 =back
1987
1988 =head1 EXAMPLE
1989
1990 The basic material for the preceding section was generated by running
1991 the sed script
1992
1993    #no autoprint
1994    s/^.*Warn( *"\([^"]*\)".*$/\1/
1995    t process
1996    b
1997    :process
1998    s/$!/%s/g
1999    s/$[_[:alnum:]]\{1,\}/%s/g
2000    s/\\\\/\\/g
2001    s/^/=item /
2002    p
2003
2004 on the program's own text, and piping the output into C<sort -u>.
2005
2006
2007 =head1 SED SCRIPT TRANSLATION
2008
2009 If this program is invoked with the name F<s2p> it will act as a
2010 sed-to-Perl translator. After option processing (all other
2011 arguments are ignored), a Perl program is printed on standard
2012 output, which will process the input stream (as read from all
2013 arguments) in the way defined by the sed script and the option setting
2014 used for the translation.
2015
2016 =head1 SEE ALSO
2017
2018 perl(1), re_format(7)
2019
2020 =head1 BUGS
2021
2022 The B<l> command will show escape characters (ESC) as `C<\e>', but
2023 a vertical tab (VT) in octal.
2024
2025 Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
2026
2027 The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
2028 is "the last pattern used, at run time". This deviates from the Perl
2029 interpretation, which will re-use the "last last successfully executed
2030 regular expression". Since keeping track of pattern usage would create
2031 terribly cluttered code, and differences would only appear in obscure
2032 context (where other B<sed> implementations appear to deviate, too),
2033 the Perl semantics was adopted. Note that common usage of this feature,
2034 such as in C</abc/s//xyz/>, will work as expected.
2035
2036 Collating elements (of bracket expressions in BREs) are not implemented.
2037
2038 =head1 STANDARDS
2039
2040 This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
2041 definition of B<sed>, and is compatible with the I<OpenBSD>
2042 implementation, except where otherwise noted (see L<"BUGS">).
2043
2044 =head1 AUTHOR
2045
2046 This Perl implementation of I<sed> was written by Wolfgang Laun,
2047 I<Wolfgang.Laun@alcatel.at>.
2048
2049 =head1 COPYRIGHT and LICENSE
2050
2051 This program is free and open software. You may use, modify,
2052 distribute, and sell this program (and any modified variants) in any
2053 way you wish, provided you do not restrict others from doing the same.
2054
2055 =cut
2056
2057 !NO!SUBS!
2058
2059 close OUT or die "Can't close $file: $!";
2060 chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
2061 unlink 'psed';
2062 print "Linking $file to psed.\n";
2063 if (defined $Config{d_link}) {
2064   link $file, 'psed';
2065 } else {
2066   unshift @INC, '../lib';
2067   require File::Copy;
2068   File::Copy::syscopy('s2p', 'psed');
2069 }
2070 exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
2071 chdir $origdir;