x2p/s2p.PL

   1 #!/usr/bin/perl
   2
   3 use Config;
   4 use File::Basename qw(&basename &dirname);
   5 use Cwd;
   6 use subs qw(link);
   7
   8 sub link { # This is a cut-down version of installperl:link().
   9     my($from,$to) = @_;
  10     my($success) = 0;
  11
  12     eval {
  13         CORE::link($from, $to)
  14             ? $success++
  15             : ($from =~ m#^/afs/# || $to =~ m#^/afs/#)
  16               ? die "AFS"  # okay inside eval {}
  17               : die "Couldn't link $from to $to: $!\n";
  18     };
  19     if ($@) {
  20         require File::Copy;
  21         File::Copy::copy($from, $to)
  22             ? $success++
  23             : warn "Couldn't copy $from to $to: $!\n";
  24     }
  25     $success;
  26 }
  27
  28 # List explicitly here the variables you want Configure to
  29 # generate.  Metaconfig only looks for shell variables, so you
  30 # have to mention them as if they were shell variables, not
  31 # %Config entries.  Thus you write
  32 #  $startperl
  33 # to ensure Configure will look for $Config{startperl}.
  34
  35 # This forces PL files to create target in same directory as PL file.
  36 # This is so that make depend always knows where to find PL derivatives.
  37 $origdir = cwd;
  38 chdir dirname($0);
  39 $file = basename($0, '.PL');
  40 $file .= '.com' if $^O eq 'VMS';
  41
  42 open OUT,">$file" or die "Can't create $file: $!";
  43
  44 print "Extracting $file (with variable substitutions)\n";
  45
  46 # In this section, perl variables will be expanded during extraction.
  47 # You can use $Config{...} to use Configure variables.
  48
  49 print OUT <<"!GROK!THIS!";
  50 $Config{startperl}
  51     eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
  52         if \$running_under_some_shell;
  53 my \$startperl;
  54 my \$perlpath;
  55 (\$startperl = <<'/../') =~ s/\\s*\\z//;
  56 $Config{startperl}
  57 /../
  58 (\$perlpath = <<'/../') =~ s/\\s*\\z//;
  59 $Config{perlpath}
  60 /../
  61 !GROK!THIS!
  62
  63 # In the following, perl variables are not expanded during extraction.
  64
  65 print OUT <<'!NO!SUBS!';
  66
  67 $0 =~ s/^.*?(\w+)[\.\w]*$/$1/;
  68
  69 # (p)sed - a stream editor
  70 # History:  Aug 12 2000: Original version.
  71 #           Mar 25 2002: Rearrange generated Perl program.
  72
  73 use strict;
  74 use integer;
  75 use Symbol;
  76
  77 =head1 NAME
  78
  79 psed - a stream editor
  80
  81 =head1 SYNOPSIS
  82
  83    psed [-an] script [file ...]
  84    psed [-an] [-e script] [-f script-file] [file ...]
  85
  86    s2p  [-an] [-e script] [-f script-file]
  87
  88 =head1 DESCRIPTION
  89
  90 A stream editor reads the input stream consisting of the specified files
  91 (or standard input, if none are given), processes is line by line by
  92 applying a script consisting of edit commands, and writes resulting lines
  93 to standard output. The filename `C<->' may be used to read standard input.
  94
  95 The edit script is composed from arguments of B<-e> options and
  96 script-files, in the given order. A single script argument may be specified
  97 as the first parameter.
  98
  99 If this program is invoked with the name F<s2p>, it will act as a
 100 sed-to-Perl translator. See L<"sed Script Translation">.
 101
 102 B<sed> returns an exit code of 0 on success or >0 if an error occurred.
 103
 104 =head1 OPTIONS
 105
 106 =over 4
 107
 108 =item B<-a>
 109
 110 A file specified as argument to the B<w> edit command is by default
 111 opened before input processing starts. Using B<-a>, opening of such
 112 files is delayed until the first line is actually written to the file.
 113
 114 =item B<-e> I<script>
 115
 116 The editing commands defined by I<script> are appended to the script.
 117 Multiple commands must be separated by newlines.
 118
 119 =item B<-f> I<script-file>
 120
 121 Editing commands from the specified I<script-file> are read and appended
 122 to the script.
 123
 124 =item B<-n>
 125
 126 By default, a line is written to standard output after the editing script
 127 has been applied to it. The B<-n> option suppresses automatic printing.
 128
 129 =back
 130
 131 =head1 COMMANDS
 132
 133 B<sed> command syntax is defined as
 134
 135 Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
 136
 137 with whitespace being permitted before or after addresses, and between
 138 the function character and the argument. The I<address>es and the
 139 address inverter (C<!>) are used to restrict the application of a
 140 command to the selected line(s) of input.
 141
 142 Each command must be on a line of its own, except where noted in
 143 the synopses below.
 144
 145 The edit cycle performed on each input line consist of reading the line
 146 (without its trailing newline character) into the I<pattern space>,
 147 applying the applicable commands of the edit script, writing the final
 148 contents of the pattern space and a newline to the standard output.
 149 A I<hold space> is provided for saving the contents of the
 150 pattern space for later use.
 151
 152 =head2 Addresses
 153
 154 A sed address is either a line number or a pattern, which may be combined
 155 arbitrarily to construct ranges. Lines are numbered across all input files.
 156
 157 Any address may be followed by an exclamation mark (`C<!>'), selecting
 158 all lines not matching that address.
 159
 160 =over 4
 161
 162 =item I<number>
 163
 164 The line with the given number is selected.
 165
 166 =item B<$>
 167
 168 A dollar sign (C<$>) is the line number of the last line of the input stream.
 169
 170 =item B</>I<regular expression>B</>
 171
 172 A pattern address is a basic regular expression (see
 173 L<"Basic Regular Expressions">), between the delimiting character C</>.
 174 Any other character except C<\> or newline may be used to delimit a
 175 pattern address when the initial delimiter is prefixed with a
 176 backslash (`C<\>').
 177
 178 =back
 179
 180 If no address is given, the command selects every line.
 181
 182 If one address is given, it selects the line (or lines) matching the
 183 address.
 184
 185 Two addresses select a range that begins whenever the first address
 186 matches, and ends (including that line) when the second address matches.
 187 If the first (second) address is a matching pattern, the second
 188 address is not applied to the very same line to determine the end of
 189 the range. Likewise, if the second address is a matching pattern, the
 190 first address is not applied to the very same line to determine the
 191 begin of another range. If both addresses are line numbers,
 192 and the second line number is less than the first line number, then
 193 only the first line is selected.
 194
 195
 196 =head2 Functions
 197
 198 The maximum permitted number of addresses is indicated with each
 199 function synopsis below.
 200
 201 The argument I<text> consists of one or more lines following the command.
 202 Embedded newlines in I<text> must be preceded with a backslash.  Other
 203 backslashes in I<text> are deleted and the following character is taken
 204 literally.
 205
 206 =over 4
 207
 208 =cut
 209
 210 my %ComTab;
 211 my %GenKey;
 212 #--------------------------------------------------------------------------
 213 $ComTab{'a'}=[ 1, 'txt', \&Emit,       '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
 214
 215 =item [1addr]B<a\> I<text>
 216
 217 Write I<text> (which must start on the line following the command)
 218 to standard output immediately before reading the next line
 219 of input, either by executing the B<N> function or by beginning a new cycle.
 220
 221 =cut
 222
 223 #--------------------------------------------------------------------------
 224 $ComTab{'b'}=[ 2, 'str', \&Branch,     '{ goto XXX; }'                   ]; #ok
 225
 226 =item [2addr]B<b> [I<label>]
 227
 228 Branch to the B<:> function with the specified I<label>. If no label
 229 is given, branch to the end of the script.
 230
 231 =cut
 232
 233 #--------------------------------------------------------------------------
 234 $ComTab{'c'}=[ 2, 'txt', \&Change,     <<'-X-'                           ]; #ok
 235 { print <<'TheEnd'; } $doPrint = 0; goto EOS;
 236 -X-
 237 ### continue OK => next CYCLE;
 238
 239 =item [2addr]B<c\> I<text>
 240
 241 The line, or range of lines, selected by the address is deleted.
 242 The I<text> (which must start on the line following the command)
 243 is written to standard output. With an address range, this occurs at
 244 the end of the range.
 245
 246 =cut
 247
 248 #--------------------------------------------------------------------------
 249 $ComTab{'d'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 250 { $doPrint = 0;
 251   goto EOS;
 252 }
 253 -X-
 254 ### continue OK => next CYCLE;
 255
 256 =item [2addr]B<d>
 257
 258 Deletes the pattern space and starts the next cycle.
 259
 260 =cut
 261
 262 #--------------------------------------------------------------------------
 263 $ComTab{'D'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 264 { s/^.*\n?//;
 265   if(length($_)){ goto BOS } else { goto EOS }
 266 }
 267 -X-
 268 ### continue OK => next CYCLE;
 269
 270 =item [2addr]B<D>
 271
 272 Deletes the pattern space through the first embedded newline or to the end.
 273 If the pattern space becomes empty, a new cycle is started, otherwise
 274 execution of the script is restarted.
 275
 276 =cut
 277
 278 #--------------------------------------------------------------------------
 279 $ComTab{'g'}=[ 2, '',    \&Emit,       '{ $_ = $Hold };'                 ]; #ok
 280
 281 =item [2addr]B<g>
 282
 283 Replace the contents of the pattern space with the hold space.
 284
 285 =cut
 286
 287 #--------------------------------------------------------------------------
 288 $ComTab{'G'}=[ 2, '',    \&Emit,       '{ $_ .= "\n"; $_ .= $Hold };'    ]; #ok
 289
 290 =item [2addr]B<G>
 291
 292 Append a newline and the contents of the hold space to the pattern space.
 293
 294 =cut
 295
 296 #--------------------------------------------------------------------------
 297 $ComTab{'h'}=[ 2, '',    \&Emit,       '{ $Hold = $_ }'                  ]; #ok
 298
 299 =item [2addr]B<h>
 300
 301 Replace the contents of the hold space with the pattern space.
 302
 303 =cut
 304
 305 #--------------------------------------------------------------------------
 306 $ComTab{'H'}=[ 2, '',    \&Emit,       '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
 307
 308 =item [2addr]B<H>
 309
 310 Append a newline and the contents of the pattern space to the hold space.
 311
 312 =cut
 313
 314 #--------------------------------------------------------------------------
 315 $ComTab{'i'}=[ 1, 'txt', \&Emit,       '{ print <<'."'TheEnd' }\n"       ]; #ok
 316
 317 =item [1addr]B<i\> I<text>
 318
 319 Write the I<text> (which must start on the line following the command)
 320 to standard output.
 321
 322 =cut
 323
 324 #--------------------------------------------------------------------------
 325 $ComTab{'l'}=[ 2, '',    \&Emit,       '{ _l() }'                        ]; #okUTF8
 326
 327 =item [2addr]B<l>
 328
 329 Print the contents of the pattern space: non-printable characters are
 330 shown in C-style escaped form; long lines are split and have a trailing
 331 `C<\>' at the point of the split; the true end of a line is marked with
 332 a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
 333 BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
 334 octal number for all other non-printable characters.
 335
 336 =cut
 337
 338 #--------------------------------------------------------------------------
 339 $ComTab{'n'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 340 { print $_, "\n" if $doPrint;
 341   printQ() if @Q;
 342   $CondReg = 0;
 343   last CYCLE unless getsARGV();
 344   chomp();
 345 }
 346 -X-
 347
 348 =item [2addr]B<n>
 349
 350 If automatic printing is enabled, write the pattern space to the standard
 351 output. Replace the pattern space with the next line of input. If
 352 there is no more input, processing is terminated.
 353
 354 =cut
 355
 356 #--------------------------------------------------------------------------
 357 $ComTab{'N'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 358 { printQ() if @Q;
 359   $CondReg = 0;
 360   last CYCLE unless getsARGV( $h );
 361   chomp( $h );
 362   $_ .= "\n$h";
 363 }
 364 -X-
 365
 366 =item [2addr]B<N>
 367
 368 Append a newline and the next line of input to the pattern space. If
 369 there is no more input, processing is terminated.
 370
 371 =cut
 372
 373 #--------------------------------------------------------------------------
 374 $ComTab{'p'}=[ 2, '',    \&Emit,       '{ print $_, "\n"; }'             ]; #ok
 375
 376 =item [2addr]B<p>
 377
 378 Print the pattern space to the standard output. (Use the B<-n> option
 379 to suppress automatic printing at the end of a cycle if you want to
 380 avoid double printing of lines.)
 381
 382 =cut
 383
 384 #--------------------------------------------------------------------------
 385 $ComTab{'P'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 386 { if( /^(.*)/ ){ print $1, "\n"; } }
 387 -X-
 388
 389 =item [2addr]B<P>
 390
 391 Prints the pattern space through the first embedded newline or to the end.
 392
 393 =cut
 394
 395 #--------------------------------------------------------------------------
 396 $ComTab{'q'}=[ 1, '',    \&Emit,       <<'-X-'                           ]; #ok
 397 { print $_, "\n" if $doPrint;
 398   last CYCLE;
 399 }
 400 -X-
 401
 402 =item [1addr]B<q>
 403
 404 Branch to the end of the script and quit without starting a new cycle.
 405
 406 =cut
 407
 408 #--------------------------------------------------------------------------
 409 $ComTab{'r'}=[ 1, 'str', \&Emit,       "{ _r( '-X-' ) }"                 ]; #ok
 410
 411 =item [1addr]B<r> I<file>
 412
 413 Copy the contents of the I<file> to standard output immediately before
 414 the next attempt to read a line of input. Any error encountered while
 415 reading I<file> is silently ignored.
 416
 417 =cut
 418
 419 #--------------------------------------------------------------------------
 420 $ComTab{'s'}=[ 2, 'sub', \&Emit,       ''                                ]; #ok
 421
 422 =item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
 423
 424 Substitute the I<replacement> string for the first substring in
 425 the pattern space that matches the I<regular expression>.
 426 Any character other than backslash or newline can be used instead of a
 427 slash to delimit the regular expression and the replacement.
 428 To use the delimiter as a literal character within the regular expression
 429 and the replacement, precede the character by a backslash (`C<\>').
 430
 431 Literal newlines may be embedded in the replacement string by
 432 preceding a newline with a backslash.
 433
 434 Within the replacement, an ampersand (`C<&>') is replaced by the string
 435 matching the regular expression. The strings `C<\1>' through `C<\9>' are
 436 replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
 437 To get a literal `C<&>' or `C<\>' in the replacement text, precede it
 438 by a backslash.
 439
 440 The following I<flags> modify the behaviour of the B<s> command:
 441
 442 =over 8
 443
 444 =item B<g>
 445
 446 The replacement is performed for all matching, non-overlapping substrings
 447 of the pattern space.
 448
 449 =item B<1>..B<9>
 450
 451 Replace only the n-th matching substring of the pattern space.
 452
 453 =item B<p>
 454
 455 If the substitution was made, print the new value of the pattern space.
 456
 457 =item B<w> I<file>
 458
 459 If the substitution was made, write the new value of the pattern space
 460 to the specified file.
 461
 462 =back
 463
 464 =cut
 465
 466 #--------------------------------------------------------------------------
 467 $ComTab{'t'}=[ 2, 'str', \&Branch,     '{ goto XXX if _t() }'            ]; #ok
 468
 469 =item [2addr]B<t> [I<label>]
 470
 471 Branch to the B<:> function with the specified I<label> if any B<s>
 472 substitutions have been made since the most recent reading of an input line
 473 or execution of a B<t> function. If no label is given, branch to the end of
 474 the script.
 475
 476
 477 =cut
 478
 479 #--------------------------------------------------------------------------
 480 $ComTab{'w'}=[ 2, 'str', \&Write,      "{ _w( '-X-' ) }"                 ]; #ok
 481
 482 =item [2addr]B<w> I<file>
 483
 484 The contents of the pattern space are written to the I<file>.
 485
 486 =cut
 487
 488 #--------------------------------------------------------------------------
 489 $ComTab{'x'}=[ 2, '',    \&Emit,       '{ ($Hold, $_) = ($_, $Hold) }'   ]; #ok
 490
 491 =item [2addr]B<x>
 492
 493 Swap the contents of the pattern space and the hold space.
 494
 495 =cut
 496
 497 #--------------------------------------------------------------------------
 498 $ComTab{'y'}=[ 2, 'tra', \&Emit,       ''                                ]; #ok
 499 =item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
 500
 501 In the pattern space, replace all characters occuring in I<string1> by the
 502 character at the corresponding position in I<string2>. It is possible
 503 to use any character (other than a backslash or newline) instead of a
 504 slash to delimit the strings.  Within I<string1> and I<string2>, a
 505 backslash followed by any character other than a newline is that literal
 506 character, and a backslash followed by an `n' is replaced by a newline
 507 character.
 508
 509 =cut
 510
 511 #--------------------------------------------------------------------------
 512 $ComTab{'='}=[ 1, '',    \&Emit,       '{ print "$.\n" }'                ]; #ok
 513
 514 =item [1addr]B<=>
 515
 516 Prints the current line number on the standard output.
 517
 518 =cut
 519
 520 #--------------------------------------------------------------------------
 521 $ComTab{':'}=[ 0, 'str', \&Label,      ''                                ]; #ok
 522
 523 =item [0addr]B<:> [I<label>]
 524
 525 The command specifies the position of the I<label>. It has no other effect.
 526
 527 =cut
 528
 529 #--------------------------------------------------------------------------
 530 $ComTab{'{'}=[ 2, '',    \&BeginBlock, '{'                               ]; #ok
 531 $ComTab{'}'}=[ 0, '',    \&EndBlock,   ';}'                              ]; #ok
 532 # ';' to avoid warning on empty {}-block
 533
 534 =item [2addr]B<{> [I<command>]
 535
 536 =item [0addr]B<}>
 537
 538 These two commands begin and end a command list. The first command may
 539 be given on the same line as the opening B<{> command. The commands
 540 within the list are jointly selected by the address(es) given on the
 541 B<{> command (but may still have individual addresses).
 542
 543 =cut
 544
 545 #--------------------------------------------------------------------------
 546 $ComTab{'#'}=[ 0, 'str', \&Comment,    ''                                ]; #ok
 547
 548 =item [0addr]B<#> [I<comment>]
 549
 550 The entire line is ignored (treated as a comment). If, however, the first
 551 two characters in the script are `C<#n>', automatic printing of output is
 552 suppressed, as if the B<-n> option were given on the command line.
 553
 554 =back
 555
 556 =cut
 557
 558 use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
 559
 560 my $useDEBUG    = exists( $ENV{PSEDDEBUG} );
 561 my $useEXTBRE   = $ENV{PSEDEXTBRE} || '';
 562 $useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
 563
 564 my $doAutoPrint = 1;          # automatic printing of pattern space (-n => 0)
 565 my $doOpenWrite = 1;          # open w command output files at start (-a => 0)
 566 my $svOpenWrite = 0;          # save $doOpenWrite
 567 my $doGenerate  = $0 eq 's2p';
 568
 569 # Collected and compiled script
 570 #
 571 my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code, $Func );
 572 $Code = '';
 573
 574 ##################
 575 #  Compile Time
 576 #
 577 # Labels
 578 #
 579 # Error handling
 580 #
 581 sub Warn($;$){
 582     my( $msg, $loc ) = @_;
 583     $loc ||= '';
 584     $loc .= ': ' if length( $loc );
 585     warn( "$0: $loc$msg\n" );
 586 }
 587
 588 $labNum = 0;
 589 sub newLabel(){
 590     return 'L_'.++$labNum;
 591 }
 592
 593 # safeHere: create safe here delimiter and  modify opcode and argument
 594 #
 595 sub safeHere($$){
 596     my( $codref, $argref ) = @_;
 597     my $eod = 'EOD000';
 598     while( $$argref =~ /^$eod$/m ){
 599         $eod++;
 600     }
 601     $$codref =~ s/TheEnd/$eod/e;
 602     $$argref .= "$eod\n";
 603 }
 604
 605 # Emit: create address logic and emit command
 606 #
 607 sub Emit($$$$$$){
 608     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 609     my $cond = '';
 610     if( defined( $addr1 ) ){
 611         if( defined( $addr2 ) ){
 612             $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 613         } else {
 614             $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 615         }
 616         $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
 617     }
 618
 619     if( $opcode eq '' ){
 620         $Code .= "$cond$arg\n";
 621
 622     } elsif( $opcode =~ s/-X-/$arg/e ){
 623         $Code .= "$cond$opcode\n";
 624
 625     } elsif( $opcode =~ /TheEnd/ ){
 626         safeHere( \$opcode, \$arg );
 627         $Code .= "$cond$opcode$arg";
 628
 629     } else {
 630         $Code .= "$cond$opcode\n";
 631     }
 632     0;
 633 }
 634
 635 # Write (w command, w flag): store pathname
 636 #
 637 sub Write($$$$$$){
 638     my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
 639     $wFiles{$path} = '';
 640     Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
 641 }
 642
 643
 644 # Label (: command): label definition
 645 #
 646 sub Label($$$$$$){
 647     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 648     my $rc = 0;
 649     $lab =~ s/\s+//;
 650     if( length( $lab ) ){
 651         my $h;
 652         if( ! exists( $Label{$lab} ) ){
 653             $h = $Label{$lab}{name} = newLabel();
 654         } else {
 655             $h = $Label{$lab}{name};
 656             if( exists( $Label{$lab}{defined} ) ){
 657                 my $dl = $Label{$lab}{defined};
 658                 Warn( "duplicate label $lab (first defined at $dl)", $fl );
 659                 $rc = 1;
 660             }
 661         }
 662         $Label{$lab}{defined} = $fl;
 663         $Code .= "$h:;\n";
 664     }
 665     $rc;
 666 }
 667
 668 # BeginBlock ({ command): push block start
 669 #
 670 sub BeginBlock($$$$$$){
 671     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 672     push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
 673     Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 674 }
 675
 676 # EndBlock (} command): check proper nesting
 677 #
 678 sub EndBlock($$$$$$){
 679     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 680     my $rc;
 681     my $jcom = pop( @BlockStack );
 682     if( defined( $jcom ) ){
 683         $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 684     } else {
 685         Warn( "unexpected `}'", $fl );
 686         $rc = 1;
 687     }
 688     $rc;
 689 }
 690
 691 # Branch (t, b commands): check or create label, substitute default
 692 #
 693 sub Branch($$$$$$){
 694     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 695     $lab =~ s/\s+//; # no spaces at end
 696     my $h;
 697     if( length( $lab ) ){
 698         if( ! exists( $Label{$lab} ) ){
 699             $h = $Label{$lab}{name} = newLabel();
 700         } else {
 701             $h = $Label{$lab}{name};
 702         }
 703         push( @{$Label{$lab}{used}}, $fl );
 704     } else {
 705         $h = 'EOS';
 706     }
 707     $opcode =~ s/XXX/$h/e;
 708     Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
 709 }
 710
 711 # Change (c command): is special due to range end watching
 712 #
 713 sub Change($$$$$$){
 714     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 715     my $kwd = $negated ? 'unless' : 'if';
 716     if( defined( $addr2 ) ){
 717         $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 718         if( ! $negated ){
 719             $addr1  = '$icnt = ('.$addr1.')';
 720             $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
 721         }
 722     } else {
 723         $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 724     }
 725     safeHere( \$opcode, \$arg );
 726     $Code .= "$kwd( $addr1 ){\n  $opcode$arg}\n";
 727     0;
 728 }
 729
 730
 731 # Comment (# command): A no-op. Who would've thought that!
 732 #
 733 sub Comment($$$$$$){
 734     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 735 ### $Code .= "# $arg\n";
 736     0;
 737 }
 738
 739
 740 sub stripRegex($$){
 741     my( $del, $sref ) = @_;
 742     my $regex = $del;
 743     print "stripRegex:$del:$$sref:\n" if $useDEBUG;
 744     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
 745         my $sl = $2;
 746         $regex .= $1.$sl.$del;
 747         if( length( $sl ) % 2 == 0 ){
 748             return $regex;
 749         }
 750         $regex .= $3;
 751     }
 752     undef();
 753 }
 754
 755 # stripTrans: take a <del> terminated string from y command
 756 #   honoring and cleaning up of \-escaped <del>'s
 757 #
 758 sub stripTrans($$){
 759     my( $del, $sref ) = @_;
 760     my $t = '';
 761     print "stripTrans:$del:$$sref:\n" if $useDEBUG;
 762     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
 763         my $sl = $2;
 764         $t .= $1;
 765         if( length( $sl ) % 2 == 0 ){
 766             $t .= $sl;
 767             $t =~ s/\\\\/\\/g;
 768             return $t;
 769         }
 770         chop( $sl );
 771         $t .= $sl.$del.$3;
 772     }
 773     undef();
 774 }
 775
 776 # makey - construct Perl y/// from sed y///
 777 #
 778 sub makey($$$){
 779     my( $fr, $to, $fl ) = @_;
 780     my $error = 0;
 781
 782     # Ensure that any '-' is up front.
 783     # Diagnose duplicate contradicting mappings
 784     my %tr;
 785     for( my $i = 0; $i < length($fr); $i++ ){
 786         my $fc = substr($fr,$i,1);
 787         my $tc = substr($to,$i,1);
 788         if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
 789             Warn( "ambiguous translation for character `$fc' in `y' command",
 790                   $fl );
 791             $error++;
 792         }
 793         $tr{$fc} = $tc;
 794     }
 795     $fr = $to = '';
 796     if( exists( $tr{'-'} ) ){
 797         ( $fr, $to ) = ( '-', $tr{'-'} );
 798         delete( $tr{'-'} );
 799     } else {
 800         $fr = $to = '';
 801     }
 802     # might just as well sort it...
 803     for my $fc ( sort keys( %tr ) ){
 804         $fr .= $fc;
 805         $to .= $tr{$fc};
 806     }
 807     # make embedded delimiters and newlines safe
 808     $fr =~ s/([{}])/\$1/g;
 809     $to =~ s/([{}])/\$1/g;
 810     $fr =~ s/\n/\\n/g;
 811     $to =~ s/\n/\\n/g;
 812     return $error ? undef() : "{ y{$fr}{$to}; }";
 813 }
 814
 815 ######
 816 # makes - construct Perl s/// from sed s///
 817 #
 818 sub makes($$$$$$$){
 819     my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
 820
 821     # make embedded newlines safe
 822     $regex =~ s/\n/\\n/g;
 823     $subst =~ s/\n/\\n/g;
 824
 825     my $code;
 826     # n-th occurrence
 827     #
 828     if( length( $nmatch ) ){
 829         $code = <<TheEnd;
 830 { \$n = $nmatch;
 831   while( --\$n && ( \$s = m ${regex}g ) ){}
 832   \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
 833   \$CondReg ||= \$s;
 834 TheEnd
 835     } else {
 836         $code = <<TheEnd;
 837 { \$s = s ${regex}${subst}s${global};
 838   \$CondReg ||= \$s;
 839 TheEnd
 840     }
 841     if( $print ){
 842         $code .= '  print $_, "\n" if $s;'."\n";
 843     }
 844     if( defined( $path ) ){
 845         $wFiles{$path} = '';
 846         $code .= " _w( '$path' ) if \$s;\n";
 847         $GenKey{'w'} = 1;
 848     }
 849     $code .= "}";
 850 }
 851
 852 =head1 BASIC REGULAR EXPRESSIONS
 853
 854 A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
 855 of I<atoms>, for matching parts of a string, and I<bounds>, specifying
 856 repetitions of a preceding atom.
 857
 858 =head2 Atoms
 859
 860 The possible atoms of a BRE are: B<.>, matching any single character;
 861 B<^> and B<$>, matching the null string at the beginning or end
 862 of a string, respectively; a I<bracket expressions>, enclosed
 863 in B<[> and B<]> (see below); and any single character with no
 864 other significance (matching that character). A B<\> before one
 865 of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
 866 after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
 867 becomes an atom and establishes the target for a I<backreference>,
 868 consisting of the substring that actually matches the enclosed atoms.
 869 Finally, B<\> followed by one of the digits B<0> through B<9> is a
 870 backreference.
 871
 872 A B<^> that is not first, or a B<$> that is not last does not have
 873 a special significance and need not be preceded by a backslash to
 874 become literal. The same is true for a B<]>, that does not terminate
 875 a bracket expression.
 876
 877 An unescaped backslash cannot be last in a BRE.
 878
 879 =head2 Bounds
 880
 881 The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
 882 atom; B<\{>I<count>B<\}>, specifying that many repetitions;
 883 B<\{>I<minimum>B<,\}>, giving a lower limit; and
 884 B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
 885 bound.
 886
 887 A bound appearing as the first item in a BRE is taken literally.
 888
 889 =head2 Bracket Expressions
 890
 891 A I<bracket expression> is a list of characters, character ranges
 892 and character classes enclosed in B<[> and B<]> and matches any
 893 single character from the represented set of characters.
 894
 895 A character range is written as two characters separated by B<-> and
 896 represents all characters (according to the character collating sequence)
 897 that are not less than the first and not greater than the second.
 898 (Ranges are very collating-sequence-dependent, and portable programs
 899 should avoid relying on them.)
 900
 901 A character class is one of the class names
 902
 903    alnum     digit     punct
 904    alpha     graph     space
 905    blank     lower     upper
 906    cntrl     print     xdigit
 907
 908 enclosed in B<[:> and B<:]> and represents the set of characters
 909 as defined in ctype(3).
 910
 911 If the first character after B<[> is B<^>, the sense of matching is
 912 inverted.
 913
 914 To include a literal `C<^>', place it anywhere else but first. To
 915 include a literal 'C<]>' place it first or immediately after an
 916 initial B<^>. To include a literal `C<->' make it the first (or
 917 second after B<^>) or last character, or the second endpoint of
 918 a range.
 919
 920 The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
 921 match the null string at the beginning and end of a word respectively.
 922 (Note that neither is identical to Perl's `\b' atom.)
 923
 924 =head2 Additional Atoms
 925
 926 Since some sed implementations provide additional regular expression
 927 atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
 928 the following backslash escapes:
 929
 930 =over 4
 931
 932 =item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
 933
 934 =item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
 935
 936 =item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
 937
 938 =item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
 939
 940 =item B<\y> Match the empty string at a word boundary.
 941
 942 =item B<\B> Match the empty string between any two either word or non-word characters.
 943
 944 =back
 945
 946 To enable this feature, the environment variable PSEDEXTBRE must be set
 947 to a string containing the requested characters, e.g.:
 948 C<PSEDEXTBRE='E<lt>E<gt>wW'>.
 949
 950 =cut
 951
 952 #####
 953 # bre2p - convert BRE to Perl RE
 954 #
 955 sub peek(\$$){
 956     my( $pref, $ic ) = @_;
 957     $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
 958 }
 959
 960 sub bre2p($$$){
 961     my( $del, $pat, $fl ) = @_;
 962     my $led = $del;
 963     $led =~ tr/{([</})]>/;
 964     $led = '' if $led eq $del;
 965
 966     $pat = substr( $pat, 1, length($pat) - 2 );
 967     my $res = '';
 968     my $bracklev = 0;
 969     my $backref  = 0;
 970     my $parlev = 0;
 971     for( my $ic = 0; $ic < length( $pat ); $ic++ ){
 972         my $c = substr( $pat, $ic, 1 );
 973         if( $c eq '\\' ){
 974             ### backslash escapes
 975             my $nc = peek($pat,$ic);
 976             if( $nc eq '' ){
 977                 Warn( "`\\' cannot be last in pattern", $fl );
 978                 return undef();
 979             }
 980             $ic++;
 981             if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
 982                 $res .= "\\$del";
 983
 984             } elsif( $nc =~ /([[.*\\n])/ ){
 985                 ## check for \-escaped magics and \n:
 986                 ## \[ \. \* \\ \n stay as they are
 987                 $res .= '\\'.$nc;
 988
 989             } elsif( $nc eq '(' ){ ## \( => (
 990                 $parlev++;
 991                 $res .= '(';
 992
 993             } elsif( $nc eq ')' ){ ## \) => )
 994                 $parlev--;
 995                 $backref++;
 996                 if( $parlev < 0 ){
 997                     Warn( "unmatched `\\)'", $fl );
 998                     return undef();
 999                 }
1000                 $res .= ')';
1001
1002             } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
1003                 my $endpos = index( $pat, '\\}', $ic );
1004                 if( $endpos < 0 ){
1005                     Warn( "unmatched `\\{'", $fl );
1006                     return undef();
1007                 }
1008                 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
1009                 $ic = $endpos + 1;
1010
1011                 if( $res =~ /^\^?$/ ){
1012                     $res .= "\\{$rep\}";
1013                 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
1014                     my $min = $1;
1015                     my $com = $2 || '';
1016                     my $max = $3;
1017                     if( length( $max ) ){
1018                         if( $max < $min ){
1019                             Warn( "maximum less than minimum in `\\{$rep\\}'",
1020                                   $fl );
1021                             return undef();
1022                         }
1023                     } else {
1024                         $max = '';
1025                     }
1026                     # simplify some
1027                     if( $min == 0 && $max eq '1' ){
1028                         $res .= '?';
1029                     } elsif( $min == 1 && "$com$max" eq ',' ){
1030                         $res .= '+';
1031                     } elsif( $min == 0 && "$com$max" eq ',' ){
1032                         $res .= '*';
1033                     } else {
1034                         $res .= "{$min$com$max}";
1035                     }
1036                 } else {
1037                     Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1038                     return undef();
1039                 }
1040
1041             } elsif( $nc =~ /^[1-9]$/ ){
1042                 ## \1 .. \9 => \1 .. \9, but check for a following digit
1043                 if( $nc > $backref ){
1044                     Warn( "invalid backreference ($nc)", $fl );
1045                     return undef();
1046                 }
1047                 $res .= "\\$nc";
1048                 if( peek($pat,$ic) =~ /[0-9]/ ){
1049                     $res .= '(?:)';
1050                 }
1051
1052             } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1053                 ## extensions - at most <>wWyB - not in POSIX
1054                 if(      $nc eq '<' ){ ## \< => \b(?=\w), be precise
1055                     $res .= '\\b(?<=\\W)';
1056                 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1057                     $res .= '\\b(?=\\W)';
1058                 } elsif( $nc eq 'y' ){ ## \y => \b
1059                     $res .= '\\b';
1060                 } else {               ## \B, \w, \W remain the same
1061                     $res .= "\\$nc";
1062                 }
1063             } elsif( $nc eq $led ){
1064                 ## \<closing bracketing-delimiter> - keep '\'
1065                 $res .= "\\$nc";
1066
1067             } else { ## \ <char> => <char> ("as if `\' were not present")
1068                 $res .= $nc;
1069             }
1070
1071         } elsif( $c eq '.' ){ ## . => .
1072             $res .= $c;
1073
1074         } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1075             if( $res =~ /^\^?$/ ){
1076                 $res .= '\\*';
1077             } elsif( substr( $res, -1, 1 ) ne '*' ){
1078                 $res .= $c;
1079             }
1080
1081         } elsif( $c eq '[' ){
1082             ## parse []: [^...] [^]...] [-...]
1083             my $add = '[';
1084             if( peek($pat,$ic) eq '^' ){
1085                 $ic++;
1086                 $add .= '^';
1087             }
1088             my $nc = peek($pat,$ic);
1089             if( $nc eq ']' || $nc eq '-' ){
1090                 $add .= $nc;
1091                 $ic++;
1092             }
1093             # check that [ is not trailing
1094             if( $ic >= length( $pat ) - 1 ){
1095                 Warn( "unmatched `['", $fl );
1096                 return undef();
1097             }
1098             # look for [:...:] and x-y
1099             my $rstr = substr( $pat, $ic+1 );
1100             if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1101                 my $cnt = $1;
1102                 $ic += length( $cnt );
1103                 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1104                 # try some simplifications
1105                 my $red = $cnt;
1106                 if( $red =~ s/0-9// ){
1107                     $cnt = $red.'\d';
1108                     if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1109                         $cnt = $red.'\w';
1110                     }
1111                 }
1112                 $add .= $cnt;
1113
1114                 # POSIX 1003.2 has this (optional) for begin/end word
1115                 $add = '\\b(?=\\W)'  if $add eq '[[:<:]]';
1116                 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1117
1118             }
1119
1120             ## may have a trailing `-' before `]'
1121             if( $ic < length($pat) - 1 &&
1122                 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1123                 $ic += length( $1 );
1124                 $add .= $1;
1125                 # another simplification
1126                 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1127                 $res .= $add;
1128             } else {
1129                 Warn( "unmatched `['", $fl );
1130                 return undef();
1131             }
1132
1133         } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1134             $res .= "\\$c";
1135
1136         } elsif( $c eq ']' ){ ## unmatched ] is not magic
1137             $res .= ']';
1138
1139         } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1140             $res .= "\\$c";
1141
1142         } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1143             $res .= length( $res ) ? '\\^' : '^';
1144
1145         } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1146             $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
1147
1148         } else {
1149             $res .= $c;
1150         }
1151     }
1152
1153     if( $parlev ){
1154        Warn( "unmatched `\\('", $fl );
1155        return undef();
1156     }
1157
1158     # final cleanup: eliminate raw HTs
1159     $res =~ s/\t/\\t/g;
1160     return $del . $res . ( $led ? $led : $del );
1161 }
1162
1163
1164 #####
1165 # sub2p - convert sed substitution to Perl substitution
1166 #
1167 sub sub2p($$$){
1168     my( $del, $subst, $fl ) = @_;
1169     my $led = $del;
1170     $led =~ tr/{([</})]>/;
1171     $led = '' if $led eq $del;
1172
1173     $subst = substr( $subst, 1, length($subst) - 2 );
1174     my $res = '';
1175
1176     for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1177         my $c = substr( $subst, $ic, 1 );
1178         if( $c eq '\\' ){
1179             ### backslash escapes
1180             my $nc = peek($subst,$ic);
1181             if( $nc eq '' ){
1182                 Warn( "`\\' cannot be last in substitution", $fl );
1183                 return undef();
1184             }
1185             $ic++;
1186             if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1187                 $res .= '\\' . $nc;
1188             } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1189                 $res .= '${' . $nc . '}';
1190             } else { ## everything else (includes &): omit \
1191                 $res .= $nc;
1192             }
1193         } elsif( $c eq '&' ){ ## & => $&
1194             $res .= '$&';
1195         } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1196             $res .= '\\' . $c;
1197         } else {
1198             $res .= $c;
1199         }
1200     }
1201
1202     # final cleanup: eliminate raw HTs
1203     $res =~ s/\t/\\t/g;
1204     return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1205 }
1206
1207
1208 sub Parse(){
1209     my $error = 0;
1210     my( $pdef, $pfil, $plin );
1211     for( my $icom = 0; $icom < @Commands; $icom++ ){
1212         my $cmd = $Commands[$icom];
1213         print "Parse:$cmd:\n" if $useDEBUG;
1214         $cmd =~ s/^\s+//;
1215         next unless length( $cmd );
1216         my $scom = $icom;
1217         if( exists( $Defined{$icom} ) ){
1218             $pdef = $Defined{$icom};
1219             if( $pdef =~ /^ #(\d+)/ ){
1220                 $pfil = 'expression #';
1221                 $plin = $1;
1222             } else {
1223                 $pfil = "$pdef l.";
1224                 $plin = 1;
1225             }
1226         } else {
1227             $plin++;
1228         }
1229         my $fl = "$pfil$plin";
1230
1231         # insert command as comment in gnerated code
1232         #
1233         $Code .= "# $cmd\n" if $doGenerate;
1234
1235         # The Address(es)
1236         #
1237         my( $negated, $naddr, $addr1, $addr2 );
1238         $naddr = 0;
1239         if(      $cmd =~ s/^(\d+)\s*// ){
1240             $addr1 = "$1"; $naddr++;
1241         } elsif( $cmd =~ s/^\$\s*// ){
1242             $addr1 = 'eofARGV()'; $naddr++;
1243         } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1244             my $del = $1;
1245             my $regex = stripRegex( $del, \$cmd );
1246             if( defined( $regex ) ){
1247                 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1248                 $naddr++;
1249             } else {
1250                 Warn( "malformed regex, 1st address", $fl );
1251                 $error++;
1252                 next;
1253             }
1254         }
1255         if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1256             if(      $cmd =~ s/^(\d+)\s*// ){
1257                 $addr2 = "$1"; $naddr++;
1258             } elsif( $cmd =~ s/^\$\s*// ){
1259                 $addr2 = 'eofARGV()'; $naddr++;
1260             } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1261                 my $del = $1;
1262                 my $regex = stripRegex( $del, \$cmd );
1263                 if( defined( $regex ) ){
1264                     $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1265                     $naddr++;
1266                 } else {
1267                     Warn( "malformed regex, 2nd address", $fl );
1268                     $error++;
1269                     next;
1270                 }
1271             } else {
1272                 Warn( "invalid address after `,'", $fl );
1273                 $error++;
1274                 next;
1275             }
1276         }
1277
1278         # address modifier `!'
1279         #
1280         $negated = $cmd =~ s/^!\s*//;
1281         if( defined( $addr1 ) ){
1282             print "Parse: addr1=$addr1" if $useDEBUG;
1283             if( defined( $addr2 ) ){
1284                 print ", addr2=$addr2 " if $useDEBUG;
1285                 # both numeric and addr1 > addr2 => eliminate addr2
1286                 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1287                                    $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1288             }
1289         }
1290         print 'negated' if $useDEBUG && $negated;
1291         print " command:$cmd\n" if $useDEBUG;
1292
1293         # The Command
1294         #
1295         if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1296             my $h = substr( $cmd, 0, 1 );
1297             Warn( "unknown command `$h'", $fl );
1298             $error++;
1299             next;
1300         }
1301         my $key = $1;
1302
1303         my $tabref = $ComTab{$key};
1304         $GenKey{$key} = 1;
1305         if( $naddr > $tabref->[0] ){
1306             Warn( "excess address(es)", $fl );
1307             $error++;
1308             next;
1309         }
1310
1311         my $arg = '';
1312         if(      $tabref->[1] eq 'str' ){
1313             # take remainder - don't care if it is empty
1314             $arg = $cmd;
1315             $cmd = '';
1316
1317         } elsif( $tabref->[1] eq 'txt' ){
1318             # multi-line text
1319             my $goon = $cmd =~ /(.*)\\$/;
1320             if( length( $1 ) ){
1321                 Warn( "extra characters after command ($cmd)", $fl );
1322                 $error++;
1323             }
1324             while( $goon ){
1325                 $icom++;
1326                 if( $icom > $#Commands ){
1327                     Warn( "unexpected end of script", $fl );
1328                     $error++;
1329                     last;
1330                 }
1331                 $cmd = $Commands[$icom];
1332                 $Code .= "# $cmd\n" if $doGenerate;
1333                 $goon = $cmd =~ s/\\$//;
1334                 $cmd =~ s/\\(.)/$1/g;
1335                 $arg .= "\n" if length( $arg );
1336                 $arg .= $cmd;
1337             }
1338             $arg .= "\n" if length( $arg );
1339             $cmd = '';
1340
1341         } elsif( $tabref->[1] eq 'sub' ){
1342             # s///
1343             if( ! length( $cmd ) ){
1344                 Warn( "`s' command requires argument", $fl );
1345                 $error++;
1346                 next;
1347             }
1348             if( $cmd =~ s{^([^\\\n])}{} ){
1349                 my $del = $1;
1350                 my $regex = stripRegex( $del, \$cmd );
1351                 if( ! defined( $regex ) ){
1352                     Warn( "malformed regular expression", $fl );
1353                     $error++;
1354                     next;
1355                 }
1356                 $regex = bre2p( $del, $regex, $fl );
1357
1358                 # a trailing \ indicates embedded NL (in replacement string)
1359                 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1360                     $icom++;
1361                     if( $icom > $#Commands ){
1362                         Warn( "unexpected end of script", $fl );
1363                         $error++;
1364                         last;
1365                     }
1366                     $cmd .= $Commands[$icom];
1367                     $Code .= "# $Commands[$icom]\n" if $doGenerate;
1368                 }
1369
1370                 my $subst = stripRegex( $del, \$cmd );
1371                 if( ! defined( $regex ) ){
1372                     Warn( "malformed substitution expression", $fl );
1373                     $error++;
1374                     next;
1375                 }
1376                 $subst = sub2p( $del, $subst, $fl );
1377
1378                 # parse s/// modifier: g|p|0-9|w <file>
1379                 my( $global, $nmatch, $print, $write ) =
1380                   ( '',      '',      0,      undef );
1381                 while( $cmd =~ s/^([gp0-9])// ){
1382                     $1 eq 'g' ? ( $global = 'g' ) :
1383                     $1 eq 'p' ? ( $print  = $1  ) : ( $nmatch .= $1 );
1384                 }
1385                 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1386                 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1387                 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1388                     Warn( "conflicting flags `$global$nmatch'", $fl );
1389                     $error++;
1390                     next;
1391                 }
1392
1393                 $arg = makes( $regex, $subst,
1394                               $write, $global, $print, $nmatch, $fl );
1395                 if( ! defined( $arg ) ){
1396                     $error++;
1397                     next;
1398                 }
1399
1400             } else {
1401                 Warn( "improper delimiter in s command", $fl );
1402                 $error++;
1403                 next;
1404             }
1405
1406         } elsif( $tabref->[1] eq 'tra' ){
1407             # y///
1408             # a trailing \ indicates embedded newline
1409             while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1410                 $icom++;
1411                 if( $icom > $#Commands ){
1412                     Warn( "unexpected end of script", $fl );
1413                     $error++;
1414                     last;
1415                 }
1416                 $cmd .= $Commands[$icom];
1417                 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1418             }
1419             if( ! length( $cmd ) ){
1420                 Warn( "`y' command requires argument", $fl );
1421                 $error++;
1422                 next;
1423             }
1424             my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1425             if( $d eq '\\' ){
1426                 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1427                 $error++;
1428                 next;
1429             }
1430             my $fr = stripTrans( $d, \$cmd );
1431             if( ! defined( $fr ) || ! length( $cmd ) ){
1432                 Warn( "malformed `y' command argument", $fl );
1433                 $error++;
1434                 next;
1435             }
1436             my $to = stripTrans( $d, \$cmd );
1437             if( ! defined( $to ) ){
1438                 Warn( "malformed `y' command argument", $fl );
1439                 $error++;
1440                 next;
1441             }
1442             if( length($fr) != length($to) ){
1443                 Warn( "string lengths in `y' command differ", $fl );
1444                 $error++;
1445                 next;
1446             }
1447             if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1448                 $error++;
1449                 next;
1450             }
1451
1452         }
1453
1454         # $cmd must be now empty - exception is {
1455         if( $cmd !~ /^\s*$/ ){
1456             if( $key eq '{' ){
1457                 # dirty hack to process command on '{' line
1458                 $Commands[$icom--] = $cmd;
1459             } else {
1460                 Warn( "extra characters after command ($cmd)", $fl );
1461                 $error++;
1462                 next;
1463             }
1464         }
1465
1466         # Make Code
1467         #
1468         if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1469                              $tabref->[3], $arg, $fl ) ){
1470             $error++;
1471         }
1472     }
1473
1474     while( @BlockStack ){
1475         my $bl = pop( @BlockStack );
1476         Warn( "start of unterminated `{'", $bl );
1477         $error++;
1478     }
1479
1480     for my $lab ( keys( %Label ) ){
1481         if( ! exists( $Label{$lab}{defined} ) ){
1482             for my $used ( @{$Label{$lab}{used}} ){
1483                 Warn( "undefined label `$lab'", $used );
1484                 $error++;
1485             }
1486         }
1487     }
1488
1489     exit( 1 ) if $error;
1490 }
1491
1492
1493 ##############
1494 #### MAIN ####
1495 ##############
1496
1497 sub usage(){
1498     print STDERR "Usage: sed [-an] command [file...]\n";
1499     print STDERR "           [-an] [-e command] [-f script-file] [file...]\n";
1500 }
1501
1502 ###################
1503 # Here we go again...
1504 #
1505 my $expr = 0;
1506 while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1507     my $opt = $1;
1508     my $arg = $2;
1509     shift( @ARGV );
1510     if(      $opt eq 'e' ){
1511         if( length( $arg ) ){
1512             push( @Commands, split( "\n", $arg ) );
1513         } elsif( @ARGV ){
1514             push( @Commands, shift( @ARGV ) );
1515         } else {
1516             Warn( "option -e requires an argument" );
1517             usage();
1518             exit( 1 );
1519         }
1520         $expr++;
1521         $Defined{$#Commands} = " #$expr";
1522         next;
1523     }
1524     if( $opt eq 'f' ){
1525         my $path;
1526         if( length( $arg ) ){
1527             $path = $arg;
1528         } elsif( @ARGV ){
1529             $path = shift( @ARGV );
1530         } else {
1531             Warn( "option -f requires an argument" );
1532             usage();
1533             exit( 1 );
1534         }
1535         my $fst = $#Commands + 1;
1536         open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1537         my $cmd;
1538         while( defined( $cmd = <SCRIPT> ) ){
1539             chomp( $cmd );
1540             push( @Commands, $cmd );
1541         }
1542         close( SCRIPT );
1543         if( $#Commands >= $fst ){
1544             $Defined{$fst} = "$path";
1545         }
1546         next;
1547     }
1548     if( $opt eq '-' && $arg eq '' ){
1549         last;
1550     }
1551     if( $opt eq 'h' || $opt eq '?' ){
1552         usage();
1553         exit( 0 );
1554     }
1555     if( $opt eq 'n' ){
1556         $doAutoPrint = 0;
1557     } elsif( $opt eq 'a' ){
1558         $doOpenWrite = 0;
1559     } else {
1560         Warn( "illegal option `$opt'" );
1561         usage();
1562         exit( 1 );
1563     }
1564     if( length( $arg ) ){
1565         unshift( @ARGV, "-$arg" );
1566     }
1567 }
1568
1569 # A singleton command may be the 1st argument when there are no options.
1570 #
1571 if( @Commands == 0 ){
1572     if( @ARGV == 0 ){
1573         Warn( "no script command given" );
1574         usage();
1575         exit( 1 );
1576     }
1577     push( @Commands, split( "\n", shift( @ARGV ) ) );
1578     $Defined{0} = ' #1';
1579 }
1580
1581 print STDERR "Files: @ARGV\n" if $useDEBUG;
1582
1583 # generate leading code
1584 #
1585 $Func = <<'[TheEnd]';
1586
1587 # openARGV: open 1st input file
1588 #
1589 sub openARGV(){
1590     unshift( @ARGV, '-' ) unless @ARGV;
1591     my $file = shift( @ARGV );
1592     open( ARG, "<$file" )
1593     || die( "$0: can't open $file for reading ($!)\n" );
1594     $isEOF = 0;
1595 }
1596
1597 # getsARGV: Read another input line into argument (default: $_).
1598 #           Move on to next input file, and reset EOF flag $isEOF.
1599 sub getsARGV(;\$){
1600     my $argref = @_ ? shift() : \$_;
1601     while( $isEOF || ! defined( $$argref = <ARG> ) ){
1602         close( ARG );
1603         return 0 unless @ARGV;
1604         my $file = shift( @ARGV );
1605         open( ARG, "<$file" )
1606         || die( "$0: can't open $file for reading ($!)\n" );
1607         $isEOF = 0;
1608     }
1609     1;
1610 }
1611
1612 # eofARGV: end-of-file test
1613 #
1614 sub eofARGV(){
1615     return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1616 }
1617
1618 # makeHandle: Generates another file handle for some file (given by its path)
1619 #             to be written due to a w command or an s command's w flag.
1620 sub makeHandle($){
1621     my( $path ) = @_;
1622     my $handle;
1623     if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1624         $handle = $wFiles{$path} = gensym();
1625         if( $doOpenWrite ){
1626             if( ! open( $handle, ">$path" ) ){
1627                 die( "$0: can't open $path for writing: ($!)\n" );
1628             }
1629         }
1630     } else {
1631         $handle = $wFiles{$path};
1632     }
1633     return $handle;
1634 }
1635
1636 # printQ: Print queued output which is either a string or a reference
1637 #         to a pathname.
1638 sub printQ(){
1639     for my $q ( @Q ){
1640         if( ref( $q ) ){
1641             # flush open w files so that reading this file gets it all
1642             if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1643                 open( $wFiles{$$q}, ">>$$q" );
1644             }
1645             # copy file to stdout: slow, but safe
1646             if( open( RF, "<$$q" ) ){
1647                 while( defined( my $line = <RF> ) ){
1648                     print $line;
1649                 }
1650                 close( RF );
1651             }
1652         } else {
1653             print $q;
1654         }
1655     }
1656     undef( @Q );
1657 }
1658
1659 [TheEnd]
1660
1661 # generate the sed loop
1662 #
1663 $Code .= <<'[TheEnd]';
1664 sub openARGV();
1665 sub getsARGV(;\$);
1666 sub eofARGV();
1667 sub printQ();
1668
1669 # Run: the sed loop reading input and applying the script
1670 #
1671 sub Run(){
1672     my( $h, $icnt, $s, $n );
1673     # hack (not unbreakable :-/) to avoid // matching an empty string
1674     my $z = "\000"; $z =~ /$z/;
1675     # Initialize.
1676     openARGV();
1677     $Hold    = '';
1678     $CondReg = 0;
1679     $doPrint = $doAutoPrint;
1680 CYCLE:
1681     while( getsARGV() ){
1682         chomp();
1683         $CondReg = 0;   # cleared on t
1684 BOS:;
1685 [TheEnd]
1686
1687     # parse - avoid opening files when doing s2p
1688     #
1689     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1690       if $doGenerate;
1691     Parse();
1692     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1693       if $doGenerate;
1694
1695     # append trailing code
1696     #
1697     $Code .= <<'[TheEnd]';
1698 EOS:    if( $doPrint ){
1699             print $_, "\n";
1700         } else {
1701             $doPrint = $doAutoPrint;
1702         }
1703         printQ() if @Q;
1704     }
1705
1706     exit( 0 );
1707 }
1708 [TheEnd]
1709
1710
1711 # append optional functions, prepend prototypes
1712 #
1713 my $Proto = "# prototypes\n";
1714 if( $GenKey{'l'} ){
1715     $Proto .= "sub _l();\n";
1716     $Func .= <<'[TheEnd]';
1717 # _l: l command processing
1718 #
1719 sub _l(){
1720     my $h = $_;
1721     my $mcpl = 70;
1722     # transform non printing chars into escape notation
1723     $h =~ s/\\/\\\\/g;
1724     if( $h =~ /[^[:print:]]/ ){
1725         $h =~ s/\a/\\a/g;
1726         $h =~ s/\f/\\f/g;
1727         $h =~ s/\n/\\n/g;
1728         $h =~ s/\t/\\t/g;
1729         $h =~ s/\r/\\r/g;
1730         $h =~ s/\e/\\e/g;
1731         $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1732     }
1733     # split into lines of length $mcpl
1734     while( length( $h ) > $mcpl ){
1735         my $l = substr( $h, 0, $mcpl-1 );
1736         $h = substr( $h, $mcpl );
1737         # remove incomplete \-escape from end of line
1738         if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1739             $h = $1 . $h;
1740         }
1741         print $l, "\\\n";
1742     }
1743     print "$h\$\n";
1744 }
1745
1746 [TheEnd]
1747 }
1748
1749 if( $GenKey{'r'} ){
1750     $Proto .= "sub _r(\$);\n";
1751     $Func .= <<'[TheEnd]';
1752 # _r: r command processing: Save a reference to the pathname.
1753 #
1754 sub _r($){
1755     my $path = shift();
1756     push( @Q, \$path );
1757 }
1758
1759 [TheEnd]
1760 }
1761
1762 if( $GenKey{'t'} ){
1763     $Proto .= "sub _t();\n";
1764     $Func .= <<'[TheEnd]';
1765 # _t: t command - condition register test/reset
1766 #
1767 sub _t(){
1768     my $res = $CondReg;
1769     $CondReg = 0;
1770     $res;
1771 }
1772
1773 [TheEnd]
1774 }
1775
1776 if( $GenKey{'w'} ){
1777     $Proto .= "sub _w(\$);\n";
1778     $Func .= <<'[TheEnd]';
1779 # _w: w command and s command's w flag - write to file
1780 #
1781 sub _w($){
1782     my $path   = shift();
1783     my $handle = $wFiles{$path};
1784     if( ! $doOpenWrite && ! defined( fileno( $handle ) ) ){
1785         open( $handle, ">$path" )
1786         || die( "$0: $path: cannot open ($!)\n" );
1787     }
1788     print $handle $_, "\n";
1789 }
1790
1791 [TheEnd]
1792 }
1793
1794 $Code = $Proto . $Code;
1795
1796 # magic "#n" - same as -n option
1797 #
1798 $doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
1799
1800 # eval code - check for errors
1801 #
1802 print "Code:\n$Code$Func" if $useDEBUG;
1803 eval $Code . $Func;
1804 if( $@ ){
1805     print "Code:\n$Code$Func";
1806     die( "$0: internal error - generated incorrect Perl code: $@\n" );
1807 }
1808
1809 if( $doGenerate ){
1810
1811     # write full Perl program
1812     #
1813
1814     # bang line, declarations, prototypes
1815     print <<TheEnd;
1816 #!$perlpath -w
1817 eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1818   if 0;
1819 \$0 =~ s/^.*?(\\w+)\[\\.\\w+\]*\$/\$1/;
1820
1821 use strict;
1822 use Symbol;
1823 use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1824              \$doAutoPrint \$doOpenWrite \$doPrint };
1825 \$doAutoPrint = $doAutoPrint;
1826 \$doOpenWrite = $doOpenWrite;
1827 TheEnd
1828
1829     my $wf = "'" . join( "', '",  keys( %wFiles ) ) . "'";
1830     if( $wf ne "''" ){
1831         print <<TheEnd;
1832 sub makeHandle(\$);
1833 for my \$p ( $wf ){
1834    exit( 1 ) unless makeHandle( \$p );
1835 }
1836 TheEnd
1837    }
1838
1839    print $Code;
1840    print "Run();\n";
1841    print $Func;
1842    exit( 0 );
1843
1844 } else {
1845
1846     # execute: make handles (and optionally open) all w files; run!
1847     for my $p ( keys( %wFiles ) ){
1848         exit( 1 ) unless makeHandle( $p );
1849     }
1850     Run();
1851 }
1852
1853
1854 =head1 ENVIRONMENT
1855
1856 The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1857 See L<"Additional Atoms">.
1858
1859 =head1 DIAGNOSTICS
1860
1861 =over 4
1862
1863 =item ambiguous translation for character `%s' in `y' command
1864
1865 The indicated character appears twice, with different translations.
1866
1867 =item `[' cannot be last in pattern
1868
1869 A `[' in a BRE indicates the beginning of a I<bracket expression>.
1870
1871 =item `\' cannot be last in pattern
1872
1873 A `\' in a BRE is used to make the subsequent character literal.
1874
1875 =item `\' cannot be last in substitution
1876
1877 A `\' in a subsitution string is used to make the subsequent character literal.
1878
1879 =item conflicting flags `%s'
1880
1881 In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1882 multiple n-th occurrence flags are specified. Note that only the digits
1883 `1' through `9' are permitted.
1884
1885 =item duplicate label %s (first defined at %s)
1886
1887 =item excess address(es)
1888
1889 The command has more than the permitted number of addresses.
1890
1891 =item extra characters after command (%s)
1892
1893 =item illegal option `%s'
1894
1895 =item improper delimiter in s command
1896
1897 The BRE and substitution may not be delimited with `\' or newline.
1898
1899 =item invalid address after `,'
1900
1901 =item invalid backreference (%s)
1902
1903 The specified backreference number exceeds the number of backreferences
1904 in the BRE.
1905
1906 =item invalid repeat clause `\{%s\}'
1907
1908 The repeat clause does not contain a valid integer value, or pair of
1909 values.
1910
1911 =item malformed regex, 1st address
1912
1913 =item malformed regex, 2nd address
1914
1915 =item malformed regular expression
1916
1917 =item malformed substitution expression
1918
1919 =item malformed `y' command argument
1920
1921 The first or second string of a B<y> command  is syntactically incorrect.
1922
1923 =item maximum less than minimum in `\{%s\}'
1924
1925 =item no script command given
1926
1927 There must be at least one B<-e> or one B<-f> option specifying a
1928 script or script file.
1929
1930 =item `\' not valid as delimiter in `y' command
1931
1932 =item option -e requires an argument
1933
1934 =item option -f requires an argument
1935
1936 =item `s' command requires argument
1937
1938 =item start of unterminated `{'
1939
1940 =item string lengths in `y' command differ
1941
1942 The translation table strings in a B<y> commanf must have equal lengths.
1943
1944 =item undefined label `%s'
1945
1946 =item unexpected `}'
1947
1948 A B<}> command without a preceding B<{> command was encountered.
1949
1950 =item unexpected end of script
1951
1952 The end of the script was reached although a text line after a
1953 B<a>, B<c> or B<i> command indicated another line.
1954
1955 =item unknown command `%s'
1956
1957 =item unterminated `['
1958
1959 A BRE contains an unterminated bracket expression.
1960
1961 =item unterminated `\('
1962
1963 A BRE contains an unterminated backreference.
1964
1965 =item `\{' without closing `\}'
1966
1967 A BRE contains an unterminated bounds specification.
1968
1969 =item `\)' without preceding `\('
1970
1971 =item `y' command requires argument
1972
1973 =back
1974
1975 =head1 EXAMPLE
1976
1977 The basic material for the preceding section was generated by running
1978 the sed script
1979
1980    #no autoprint
1981    s/^.*Warn( *"\([^"]*\)".*$/\1/
1982    t process
1983    b
1984    :process
1985    s/$!/%s/g
1986    s/$[_[:alnum:]]\{1,\}/%s/g
1987    s/\\\\/\\/g
1988    s/^/=item /
1989    p
1990
1991 on the program's own text, and piping the output into C<sort -u>.
1992
1993
1994 =head1 SED SCRIPT TRANSLATION
1995
1996 If this program is invoked with the name F<s2p> it will act as a
1997 sed-to-Perl translator. After option processing (all other
1998 arguments are ignored), a Perl program is printed on standard
1999 output, which will process the input stream (as read from all
2000 arguments) in the way defined by the sed script and the option setting
2001 used for the translation.
2002
2003 =head1 SEE ALSO
2004
2005 perl(1), re_format(7)
2006
2007 =head1 BUGS
2008
2009 The B<l> command will show escape characters (ESC) as `C<\e>', but
2010 a vertical tab (VT) in octal.
2011
2012 Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
2013
2014 The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
2015 is "the last pattern used, at run time". This deviates from the Perl
2016 interpretation, which will re-use the "last last successfully executed
2017 regular expression". Since keeping track of pattern usage would create
2018 terribly cluttered code, and differences would only appear in obscure
2019 context (where other B<sed> implementations appear to deviate, too),
2020 the Perl semantics was adopted. Note that common usage of this feature,
2021 such as in C</abc/s//xyz/>, will work as expected.
2022
2023 Collating elements (of bracket expressions in BREs) are not implemented.
2024
2025 =head1 STANDARDS
2026
2027 This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
2028 definition of B<sed>, and is compatible with the I<OpenBSD>
2029 implementation, except where otherwise noted (see L<"BUGS">).
2030
2031 =head1 AUTHOR
2032
2033 This Perl implementation of I<sed> was written by Wolfgang Laun,
2034 I<Wolfgang.Laun@alcatel.at>.
2035
2036 =head1 COPYRIGHT and LICENSE
2037
2038 This program is free and open software. You may use, modify,
2039 distribute, and sell this program (and any modified variants) in any
2040 way you wish, provided you do not restrict others from doing the same.
2041
2042 =cut
2043
2044 !NO!SUBS!
2045
2046 close OUT or die "Can't close $file: $!";
2047 chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
2048 unlink 'psed';
2049 print "Linking s2p to psed.\n";
2050 if (defined $Config{d_link}) {
2051   link 's2p', 'psed';
2052 } else {
2053   unshift @INC, '../lib';
2054   require File::Copy;
2055   File::Copy::syscopy('s2p', 'psed');
2056 }
2057 exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
2058 chdir $origdir;