x2p/s2p.PL

   1 #!/usr/bin/perl
   2
   3 use Config;
   4 use File::Basename qw(&basename &dirname);
   5 use Cwd;
   6 use subs qw(link);
   7
   8 sub link { # This is a cut-down version of installperl:link().
   9     my($from,$to) = @_;
  10     my($success) = 0;
  11
  12     eval {
  13         CORE::link($from, $to)
  14             ? $success++
  15             : ($from =~ m#^/afs/# || $to =~ m#^/afs/#)
  16               ? die "AFS"  # okay inside eval {}
  17               : die "Couldn't link $from to $to: $!\n";
  18     };
  19     if ($@) {
  20         require File::Copy;
  21         File::Copy::copy($from, $to)
  22             ? $success++
  23             : warn "Couldn't copy $from to $to: $!\n";
  24     }
  25     $success;
  26 }
  27
  28 # List explicitly here the variables you want Configure to
  29 # generate.  Metaconfig only looks for shell variables, so you
  30 # have to mention them as if they were shell variables, not
  31 # %Config entries.  Thus you write
  32 #  $startperl
  33 # to ensure Configure will look for $Config{startperl}.
  34
  35 # This forces PL files to create target in same directory as PL file.
  36 # This is so that make depend always knows where to find PL derivatives.
  37 $origdir = cwd;
  38 chdir dirname($0);
  39 $file = basename($0, '.PL');
  40 $file .= '.com' if $^O eq 'VMS';
  41
  42 open OUT,">$file" or die "Can't create $file: $!";
  43
  44 print "Extracting $file (with variable substitutions)\n";
  45
  46 # In this section, perl variables will be expanded during extraction.
  47 # You can use $Config{...} to use Configure variables.
  48
  49 print OUT <<"!GROK!THIS!";
  50 $Config{startperl}
  51     eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
  52         if \$running_under_some_shell;
  53 my \$startperl;
  54 my \$perlpath;
  55 (\$startperl = <<'/../') =~ s/\\s*\\z//;
  56 $Config{startperl}
  57 /../
  58 (\$perlpath = <<'/../') =~ s/\\s*\\z//;
  59 $Config{perlpath}
  60 /../
  61 !GROK!THIS!
  62
  63 # In the following, perl variables are not expanded during extraction.
  64
  65 print OUT <<'!NO!SUBS!';
  66
  67 $0 =~ s/^.*?(\w+)[\.\w]*$/$1/;
  68
  69 # (p)sed - a stream editor
  70 # History:  Aug 12 2000: Original version.
  71 #           Mar 25 2002: Rearrange generated Perl program.
  72
  73 use strict;
  74 use integer;
  75 use Symbol;
  76
  77 =head1 NAME
  78
  79 psed - a stream editor
  80
  81 =head1 SYNOPSIS
  82
  83    psed [-an] script [file ...]
  84    psed [-an] [-e script] [-f script-file] [file ...]
  85
  86    s2p  [-an] [-e script] [-f script-file]
  87
  88 =head1 DESCRIPTION
  89
  90 A stream editor reads the input stream consisting of the specified files
  91 (or standard input, if none are given), processes is line by line by
  92 applying a script consisting of edit commands, and writes resulting lines
  93 to standard output. The filename `C<->' may be used to read standard input.
  94
  95 The edit script is composed from arguments of B<-e> options and
  96 script-files, in the given order. A single script argument may be specified
  97 as the first parameter.
  98
  99 If this program is invoked with the name F<s2p>, it will act as a
 100 sed-to-Perl translator. See L<"sed Script Translation">.
 101
 102 B<sed> returns an exit code of 0 on success or >0 if an error occurred.
 103
 104 =head1 OPTIONS
 105
 106 =over 4
 107
 108 =item B<-a>
 109
 110 A file specified as argument to the B<w> edit command is by default
 111 opened before input processing starts. Using B<-a>, opening of such
 112 files is delayed until the first line is actually written to the file.
 113
 114 =item B<-e> I<script>
 115
 116 The editing commands defined by I<script> are appended to the script.
 117 Multiple commands must be separated by newlines.
 118
 119 =item B<-f> I<script-file>
 120
 121 Editing commands from the specified I<script-file> are read and appended
 122 to the script.
 123
 124 =item B<-n>
 125
 126 By default, a line is written to standard output after the editing script
 127 has been applied to it. The B<-n> option suppresses automatic printing.
 128
 129 =back
 130
 131 =head1 COMMANDS
 132
 133 B<sed> command syntax is defined as
 134
 135 Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
 136
 137 with whitespace being permitted before or after addresses, and between
 138 the function character and the argument. The I<address>es and the
 139 address inverter (C<!>) are used to restrict the application of a
 140 command to the selected line(s) of input.
 141
 142 Each command must be on a line of its own, except where noted in
 143 the synopses below.
 144
 145 The edit cycle performed on each input line consist of reading the line
 146 (without its trailing newline character) into the I<pattern space>,
 147 applying the applicable commands of the edit script, writing the final
 148 contents of the pattern space and a newline to the standard output.
 149 A I<hold space> is provided for saving the contents of the
 150 pattern space for later use.
 151
 152 =head2 Addresses
 153
 154 A sed address is either a line number or a pattern, which may be combined
 155 arbitrarily to construct ranges. Lines are numbered across all input files.
 156
 157 Any address may be followed by an exclamation mark (`C<!>'), selecting
 158 all lines not matching that address.
 159
 160 =over 4
 161
 162 =item I<number>
 163
 164 The line with the given number is selected.
 165
 166 =item B<$>
 167
 168 A dollar sign (C<$>) is the line number of the last line of the input stream.
 169
 170 =item B</>I<regular expression>B</>
 171
 172 A pattern address is a basic regular expression (see
 173 L<"Basic Regular Expressions">), between the delimiting character C</>.
 174 Any other character except C<\> or newline may be used to delimit a
 175 pattern address when the initial delimiter is prefixed with a
 176 backslash (`C<\>').
 177
 178 =back
 179
 180 If no address is given, the command selects every line.
 181
 182 If one address is given, it selects the line (or lines) matching the
 183 address.
 184
 185 Two addresses select a range that begins whenever the first address
 186 matches, and ends (including that line) when the second address matches.
 187 If the first (second) address is a matching pattern, the second
 188 address is not applied to the very same line to determine the end of
 189 the range. Likewise, if the second address is a matching pattern, the
 190 first address is not applied to the very same line to determine the
 191 begin of another range. If both addresses are line numbers,
 192 and the second line number is less than the first line number, then
 193 only the first line is selected.
 194
 195
 196 =head2 Functions
 197
 198 The maximum permitted number of addresses is indicated with each
 199 function synopsis below.
 200
 201 The argument I<text> consists of one or more lines following the command.
 202 Embedded newlines in I<text> must be preceded with a backslash.  Other
 203 backslashes in I<text> are deleted and the following character is taken
 204 literally.
 205
 206 =over 4
 207
 208 =cut
 209
 210 my %ComTab;
 211 my %GenKey;
 212 #--------------------------------------------------------------------------
 213 $ComTab{'a'}=[ 1, 'txt', \&Emit,       '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
 214
 215 =item [1addr]B<a\> I<text>
 216
 217 Write I<text> (which must start on the line following the command)
 218 to standard output immediately before reading the next line
 219 of input, either by executing the B<N> function or by beginning a new cycle.
 220
 221 =cut
 222
 223 #--------------------------------------------------------------------------
 224 $ComTab{'b'}=[ 2, 'str', \&Branch,     '{ goto XXX; }'                   ]; #ok
 225
 226 =item [2addr]B<b> [I<label>]
 227
 228 Branch to the B<:> function with the specified I<label>. If no label
 229 is given, branch to the end of the script.
 230
 231 =cut
 232
 233 #--------------------------------------------------------------------------
 234 $ComTab{'c'}=[ 2, 'txt', \&Change,     <<'-X-'                           ]; #ok
 235 { print <<'TheEnd'; } $doPrint = 0; goto EOS;
 236 -X-
 237 ### continue OK => next CYCLE;
 238
 239 =item [2addr]B<c\> I<text>
 240
 241 The line, or range of lines, selected by the address is deleted.
 242 The I<text> (which must start on the line following the command)
 243 is written to standard output. With an address range, this occurs at
 244 the end of the range.
 245
 246 =cut
 247
 248 #--------------------------------------------------------------------------
 249 $ComTab{'d'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 250 { $doPrint = 0;
 251   goto EOS;
 252 }
 253 -X-
 254 ### continue OK => next CYCLE;
 255
 256 =item [2addr]B<d>
 257
 258 Deletes the pattern space and starts the next cycle.
 259
 260 =cut
 261
 262 #--------------------------------------------------------------------------
 263 $ComTab{'D'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 264 { s/^.*\n?//;
 265   if(length($_)){ goto BOS } else { goto EOS }
 266 }
 267 -X-
 268 ### continue OK => next CYCLE;
 269
 270 =item [2addr]B<D>
 271
 272 Deletes the pattern space through the first embedded newline or to the end.
 273 If the pattern space becomes empty, a new cycle is started, otherwise
 274 execution of the script is restarted.
 275
 276 =cut
 277
 278 #--------------------------------------------------------------------------
 279 $ComTab{'g'}=[ 2, '',    \&Emit,       '{ $_ = $Hold };'                 ]; #ok
 280
 281 =item [2addr]B<g>
 282
 283 Replace the contents of the pattern space with the hold space.
 284
 285 =cut
 286
 287 #--------------------------------------------------------------------------
 288 $ComTab{'G'}=[ 2, '',    \&Emit,       '{ $_ .= "\n"; $_ .= $Hold };'    ]; #ok
 289
 290 =item [2addr]B<G>
 291
 292 Append a newline and the contents of the hold space to the pattern space.
 293
 294 =cut
 295
 296 #--------------------------------------------------------------------------
 297 $ComTab{'h'}=[ 2, '',    \&Emit,       '{ $Hold = $_ }'                  ]; #ok
 298
 299 =item [2addr]B<h>
 300
 301 Replace the contents of the hold space with the pattern space.
 302
 303 =cut
 304
 305 #--------------------------------------------------------------------------
 306 $ComTab{'H'}=[ 2, '',    \&Emit,       '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
 307
 308 =item [2addr]B<H>
 309
 310 Append a newline and the contents of the pattern space to the hold space.
 311
 312 =cut
 313
 314 #--------------------------------------------------------------------------
 315 $ComTab{'i'}=[ 1, 'txt', \&Emit,       '{ print <<'."'TheEnd' }\n"       ]; #ok
 316
 317 =item [1addr]B<i\> I<text>
 318
 319 Write the I<text> (which must start on the line following the command)
 320 to standard output.
 321
 322 =cut
 323
 324 #--------------------------------------------------------------------------
 325 $ComTab{'l'}=[ 2, '',    \&Emit,       '{ _l() }'                        ]; #okUTF8
 326
 327 =item [2addr]B<l>
 328
 329 Print the contents of the pattern space: non-printable characters are
 330 shown in C-style escaped form; long lines are split and have a trailing
 331 `C<\>' at the point of the split; the true end of a line is marked with
 332 a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
 333 BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
 334 octal number for all other non-printable characters.
 335
 336 =cut
 337
 338 #--------------------------------------------------------------------------
 339 $ComTab{'n'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 340 { print $_, "\n" if $doPrint;
 341   printQ() if @Q;
 342   $CondReg = 0;
 343   last CYCLE unless getsARGV();
 344   chomp();
 345 }
 346 -X-
 347
 348 =item [2addr]B<n>
 349
 350 If automatic printing is enabled, write the pattern space to the standard
 351 output. Replace the pattern space with the next line of input. If
 352 there is no more input, processing is terminated.
 353
 354 =cut
 355
 356 #--------------------------------------------------------------------------
 357 $ComTab{'N'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 358 { printQ() if @Q;
 359   $CondReg = 0;
 360   last CYCLE unless getsARGV( $h );
 361   chomp( $h );
 362   $_ .= "\n$h";
 363 }
 364 -X-
 365
 366 =item [2addr]B<N>
 367
 368 Append a newline and the next line of input to the pattern space. If
 369 there is no more input, processing is terminated.
 370
 371 =cut
 372
 373 #--------------------------------------------------------------------------
 374 $ComTab{'p'}=[ 2, '',    \&Emit,       '{ print $_, "\n"; }'             ]; #ok
 375
 376 =item [2addr]B<p>
 377
 378 Print the pattern space to the standard output. (Use the B<-n> option
 379 to suppress automatic printing at the end of a cycle if you want to
 380 avoid double printing of lines.)
 381
 382 =cut
 383
 384 #--------------------------------------------------------------------------
 385 $ComTab{'P'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 386 { if( /^(.*)/ ){ print $1, "\n"; } }
 387 -X-
 388
 389 =item [2addr]B<P>
 390
 391 Prints the pattern space through the first embedded newline or to the end.
 392
 393 =cut
 394
 395 #--------------------------------------------------------------------------
 396 $ComTab{'q'}=[ 1, '',    \&Emit,       <<'-X-'                           ]; #ok
 397 { print $_, "\n" if $doPrint;
 398   last CYCLE;
 399 }
 400 -X-
 401
 402 =item [1addr]B<q>
 403
 404 Branch to the end of the script and quit without starting a new cycle.
 405
 406 =cut
 407
 408 #--------------------------------------------------------------------------
 409 $ComTab{'r'}=[ 1, 'str', \&Emit,       "{ _r( '-X-' ) }"                 ]; #ok
 410
 411 =item [1addr]B<r> I<file>
 412
 413 Copy the contents of the I<file> to standard output immediately before
 414 the next attempt to read a line of input. Any error encountered while
 415 reading I<file> is silently ignored.
 416
 417 =cut
 418
 419 #--------------------------------------------------------------------------
 420 $ComTab{'s'}=[ 2, 'sub', \&Emit,       ''                                ]; #ok
 421
 422 =item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
 423
 424 Substitute the I<replacement> string for the first substring in
 425 the pattern space that matches the I<regular expression>.
 426 Any character other than backslash or newline can be used instead of a
 427 slash to delimit the regular expression and the replacement.
 428 To use the delimiter as a literal character within the regular expression
 429 and the replacement, precede the character by a backslash (`C<\>').
 430
 431 Literal newlines may be embedded in the replacement string by
 432 preceding a newline with a backslash.
 433
 434 Within the replacement, an ampersand (`C<&>') is replaced by the string
 435 matching the regular expression. The strings `C<\1>' through `C<\9>' are
 436 replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
 437 To get a literal `C<&>' or `C<\>' in the replacement text, precede it
 438 by a backslash.
 439
 440 The following I<flags> modify the behaviour of the B<s> command:
 441
 442 =over 8
 443
 444 =item B<g>
 445
 446 The replacement is performed for all matching, non-overlapping substrings
 447 of the pattern space.
 448
 449 =item B<1>..B<9>
 450
 451 Replace only the n-th matching substring of the pattern space.
 452
 453 =item B<p>
 454
 455 If the substitution was made, print the new value of the pattern space.
 456
 457 =item B<w> I<file>
 458
 459 If the substitution was made, write the new value of the pattern space
 460 to the specified file.
 461
 462 =back
 463
 464 =cut
 465
 466 #--------------------------------------------------------------------------
 467 $ComTab{'t'}=[ 2, 'str', \&Branch,     '{ goto XXX if _t() }'            ]; #ok
 468
 469 =item [2addr]B<t> [I<label>]
 470
 471 Branch to the B<:> function with the specified I<label> if any B<s>
 472 substitutions have been made since the most recent reading of an input line
 473 or execution of a B<t> function. If no label is given, branch to the end of
 474 the script.
 475
 476
 477 =cut
 478
 479 #--------------------------------------------------------------------------
 480 $ComTab{'w'}=[ 2, 'str', \&Write,      "{ _w( '-X-' ) }"                 ]; #ok
 481
 482 =item [2addr]B<w> I<file>
 483
 484 The contents of the pattern space are written to the I<file>.
 485
 486 =cut
 487
 488 #--------------------------------------------------------------------------
 489 $ComTab{'x'}=[ 2, '',    \&Emit,       '{ ($Hold, $_) = ($_, $Hold) }'   ]; #ok
 490
 491 =item [2addr]B<x>
 492
 493 Swap the contents of the pattern space and the hold space.
 494
 495 =cut
 496
 497 #--------------------------------------------------------------------------
 498 $ComTab{'y'}=[ 2, 'tra', \&Emit,       ''                                ]; #ok
 499 =item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
 500
 501 In the pattern space, replace all characters occuring in I<string1> by the
 502 character at the corresponding position in I<string2>. It is possible
 503 to use any character (other than a backslash or newline) instead of a
 504 slash to delimit the strings.  Within I<string1> and I<string2>, a
 505 backslash followed by any character other than a newline is that literal
 506 character, and a backslash followed by an `n' is replaced by a newline
 507 character.
 508
 509 =cut
 510
 511 #--------------------------------------------------------------------------
 512 $ComTab{'='}=[ 1, '',    \&Emit,       '{ print "$.\n" }'                ]; #ok
 513
 514 =item [1addr]B<=>
 515
 516 Prints the current line number on the standard output.
 517
 518 =cut
 519
 520 #--------------------------------------------------------------------------
 521 $ComTab{':'}=[ 0, 'str', \&Label,      ''                                ]; #ok
 522
 523 =item [0addr]B<:> [I<label>]
 524
 525 The command specifies the position of the I<label>. It has no other effect.
 526
 527 =cut
 528
 529 #--------------------------------------------------------------------------
 530 $ComTab{'{'}=[ 2, '',    \&BeginBlock, '{'                               ]; #ok
 531 $ComTab{'}'}=[ 0, '',    \&EndBlock,   ';}'                              ]; #ok
 532 # ';' to avoid warning on empty {}-block
 533
 534 =item [2addr]B<{> [I<command>]
 535
 536 =item [0addr]B<}>
 537
 538 These two commands begin and end a command list. The first command may
 539 be given on the same line as the opening B<{> command. The commands
 540 within the list are jointly selected by the address(es) given on the
 541 B<{> command (but may still have individual addresses).
 542
 543 =cut
 544
 545 #--------------------------------------------------------------------------
 546 $ComTab{'#'}=[ 0, 'str', \&Comment,    ''                                ]; #ok
 547
 548 =item [0addr]B<#> [I<comment>]
 549
 550 The entire line is ignored (treated as a comment). If, however, the first
 551 two characters in the script are `C<#n>', automatic printing of output is
 552 suppressed, as if the B<-n> option were given on the command line.
 553
 554 =back
 555
 556 =cut
 557
 558 use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
 559
 560 my $useDEBUG    = exists( $ENV{PSEDDEBUG} );
 561 my $useEXTBRE   = $ENV{PSEDEXTBRE} || '';
 562 $useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
 563
 564 my $doAutoPrint = 1;          # automatic printing of pattern space (-n => 0)
 565 my $doOpenWrite = 1;          # open w command output files at start (-a => 0)
 566 my $svOpenWrite = 0;          # save $doOpenWrite
 567
 568 # lower case $0 below as a VMSism.  The VMS build procedure creates the
 569 # s2p file traditionally in upper case on the disk.  When VMS is in a
 570 # case preserved or case sensitive mode, $0 will be returned in the exact
 571 # case which will be on the disk, and that is not predictable at this time.
 572
 573 my $doGenerate  = lc($0) eq 's2p';
 574
 575 # Collected and compiled script
 576 #
 577 my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code, $Func );
 578 $Code = '';
 579
 580 ##################
 581 #  Compile Time
 582 #
 583 # Labels
 584 #
 585 # Error handling
 586 #
 587 sub Warn($;$){
 588     my( $msg, $loc ) = @_;
 589     $loc ||= '';
 590     $loc .= ': ' if length( $loc );
 591     warn( "$0: $loc$msg\n" );
 592 }
 593
 594 $labNum = 0;
 595 sub newLabel(){
 596     return 'L_'.++$labNum;
 597 }
 598
 599 # safeHere: create safe here delimiter and  modify opcode and argument
 600 #
 601 sub safeHere($$){
 602     my( $codref, $argref ) = @_;
 603     my $eod = 'EOD000';
 604     while( $$argref =~ /^$eod$/m ){
 605         $eod++;
 606     }
 607     $$codref =~ s/TheEnd/$eod/e;
 608     $$argref .= "$eod\n";
 609 }
 610
 611 # Emit: create address logic and emit command
 612 #
 613 sub Emit($$$$$$){
 614     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 615     my $cond = '';
 616     if( defined( $addr1 ) ){
 617         if( defined( $addr2 ) ){
 618             $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 619         } else {
 620             $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 621         }
 622         $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
 623     }
 624
 625     if( $opcode eq '' ){
 626         $Code .= "$cond$arg\n";
 627
 628     } elsif( $opcode =~ s/-X-/$arg/e ){
 629         $Code .= "$cond$opcode\n";
 630
 631     } elsif( $opcode =~ /TheEnd/ ){
 632         safeHere( \$opcode, \$arg );
 633         $Code .= "$cond$opcode$arg";
 634
 635     } else {
 636         $Code .= "$cond$opcode\n";
 637     }
 638     0;
 639 }
 640
 641 # Write (w command, w flag): store pathname
 642 #
 643 sub Write($$$$$$){
 644     my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
 645     $wFiles{$path} = '';
 646     Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
 647 }
 648
 649
 650 # Label (: command): label definition
 651 #
 652 sub Label($$$$$$){
 653     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 654     my $rc = 0;
 655     $lab =~ s/\s+//;
 656     if( length( $lab ) ){
 657         my $h;
 658         if( ! exists( $Label{$lab} ) ){
 659             $h = $Label{$lab}{name} = newLabel();
 660         } else {
 661             $h = $Label{$lab}{name};
 662             if( exists( $Label{$lab}{defined} ) ){
 663                 my $dl = $Label{$lab}{defined};
 664                 Warn( "duplicate label $lab (first defined at $dl)", $fl );
 665                 $rc = 1;
 666             }
 667         }
 668         $Label{$lab}{defined} = $fl;
 669         $Code .= "$h:;\n";
 670     }
 671     $rc;
 672 }
 673
 674 # BeginBlock ({ command): push block start
 675 #
 676 sub BeginBlock($$$$$$){
 677     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 678     push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
 679     Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 680 }
 681
 682 # EndBlock (} command): check proper nesting
 683 #
 684 sub EndBlock($$$$$$){
 685     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 686     my $rc;
 687     my $jcom = pop( @BlockStack );
 688     if( defined( $jcom ) ){
 689         $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 690     } else {
 691         Warn( "unexpected `}'", $fl );
 692         $rc = 1;
 693     }
 694     $rc;
 695 }
 696
 697 # Branch (t, b commands): check or create label, substitute default
 698 #
 699 sub Branch($$$$$$){
 700     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 701     $lab =~ s/\s+//; # no spaces at end
 702     my $h;
 703     if( length( $lab ) ){
 704         if( ! exists( $Label{$lab} ) ){
 705             $h = $Label{$lab}{name} = newLabel();
 706         } else {
 707             $h = $Label{$lab}{name};
 708         }
 709         push( @{$Label{$lab}{used}}, $fl );
 710     } else {
 711         $h = 'EOS';
 712     }
 713     $opcode =~ s/XXX/$h/e;
 714     Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
 715 }
 716
 717 # Change (c command): is special due to range end watching
 718 #
 719 sub Change($$$$$$){
 720     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 721     my $kwd = $negated ? 'unless' : 'if';
 722     if( defined( $addr2 ) ){
 723         $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 724         if( ! $negated ){
 725             $addr1  = '$icnt = ('.$addr1.')';
 726             $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
 727         }
 728     } else {
 729         $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 730     }
 731     safeHere( \$opcode, \$arg );
 732     $Code .= "$kwd( $addr1 ){\n  $opcode$arg}\n";
 733     0;
 734 }
 735
 736
 737 # Comment (# command): A no-op. Who would've thought that!
 738 #
 739 sub Comment($$$$$$){
 740     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 741 ### $Code .= "# $arg\n";
 742     0;
 743 }
 744
 745
 746 sub stripRegex($$){
 747     my( $del, $sref ) = @_;
 748     my $regex = $del;
 749     print "stripRegex:$del:$$sref:\n" if $useDEBUG;
 750     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
 751         my $sl = $2;
 752         $regex .= $1.$sl.$del;
 753         if( length( $sl ) % 2 == 0 ){
 754             return $regex;
 755         }
 756         $regex .= $3;
 757     }
 758     undef();
 759 }
 760
 761 # stripTrans: take a <del> terminated string from y command
 762 #   honoring and cleaning up of \-escaped <del>'s
 763 #
 764 sub stripTrans($$){
 765     my( $del, $sref ) = @_;
 766     my $t = '';
 767     print "stripTrans:$del:$$sref:\n" if $useDEBUG;
 768     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
 769         my $sl = $2;
 770         $t .= $1;
 771         if( length( $sl ) % 2 == 0 ){
 772             $t .= $sl;
 773             $t =~ s/\\\\/\\/g;
 774             return $t;
 775         }
 776         chop( $sl );
 777         $t .= $sl.$del.$3;
 778     }
 779     undef();
 780 }
 781
 782 # makey - construct Perl y/// from sed y///
 783 #
 784 sub makey($$$){
 785     my( $fr, $to, $fl ) = @_;
 786     my $error = 0;
 787
 788     # Ensure that any '-' is up front.
 789     # Diagnose duplicate contradicting mappings
 790     my %tr;
 791     for( my $i = 0; $i < length($fr); $i++ ){
 792         my $fc = substr($fr,$i,1);
 793         my $tc = substr($to,$i,1);
 794         if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
 795             Warn( "ambiguous translation for character `$fc' in `y' command",
 796                   $fl );
 797             $error++;
 798         }
 799         $tr{$fc} = $tc;
 800     }
 801     $fr = $to = '';
 802     if( exists( $tr{'-'} ) ){
 803         ( $fr, $to ) = ( '-', $tr{'-'} );
 804         delete( $tr{'-'} );
 805     } else {
 806         $fr = $to = '';
 807     }
 808     # might just as well sort it...
 809     for my $fc ( sort keys( %tr ) ){
 810         $fr .= $fc;
 811         $to .= $tr{$fc};
 812     }
 813     # make embedded delimiters and newlines safe
 814     $fr =~ s/([{}])/\$1/g;
 815     $to =~ s/([{}])/\$1/g;
 816     $fr =~ s/\n/\\n/g;
 817     $to =~ s/\n/\\n/g;
 818     return $error ? undef() : "{ y{$fr}{$to}; }";
 819 }
 820
 821 ######
 822 # makes - construct Perl s/// from sed s///
 823 #
 824 sub makes($$$$$$$){
 825     my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
 826
 827     # make embedded newlines safe
 828     $regex =~ s/\n/\\n/g;
 829     $subst =~ s/\n/\\n/g;
 830
 831     my $code;
 832     # n-th occurrence
 833     #
 834     if( length( $nmatch ) ){
 835         $code = <<TheEnd;
 836 { \$n = $nmatch;
 837   while( --\$n && ( \$s = m ${regex}g ) ){}
 838   \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
 839   \$CondReg ||= \$s;
 840 TheEnd
 841     } else {
 842         $code = <<TheEnd;
 843 { \$s = s ${regex}${subst}s${global};
 844   \$CondReg ||= \$s;
 845 TheEnd
 846     }
 847     if( $print ){
 848         $code .= '  print $_, "\n" if $s;'."\n";
 849     }
 850     if( defined( $path ) ){
 851         $wFiles{$path} = '';
 852         $code .= " _w( '$path' ) if \$s;\n";
 853         $GenKey{'w'} = 1;
 854     }
 855     $code .= "}";
 856 }
 857
 858 =head1 BASIC REGULAR EXPRESSIONS
 859
 860 A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
 861 of I<atoms>, for matching parts of a string, and I<bounds>, specifying
 862 repetitions of a preceding atom.
 863
 864 =head2 Atoms
 865
 866 The possible atoms of a BRE are: B<.>, matching any single character;
 867 B<^> and B<$>, matching the null string at the beginning or end
 868 of a string, respectively; a I<bracket expressions>, enclosed
 869 in B<[> and B<]> (see below); and any single character with no
 870 other significance (matching that character). A B<\> before one
 871 of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
 872 after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
 873 becomes an atom and establishes the target for a I<backreference>,
 874 consisting of the substring that actually matches the enclosed atoms.
 875 Finally, B<\> followed by one of the digits B<0> through B<9> is a
 876 backreference.
 877
 878 A B<^> that is not first, or a B<$> that is not last does not have
 879 a special significance and need not be preceded by a backslash to
 880 become literal. The same is true for a B<]>, that does not terminate
 881 a bracket expression.
 882
 883 An unescaped backslash cannot be last in a BRE.
 884
 885 =head2 Bounds
 886
 887 The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
 888 atom; B<\{>I<count>B<\}>, specifying that many repetitions;
 889 B<\{>I<minimum>B<,\}>, giving a lower limit; and
 890 B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
 891 bound.
 892
 893 A bound appearing as the first item in a BRE is taken literally.
 894
 895 =head2 Bracket Expressions
 896
 897 A I<bracket expression> is a list of characters, character ranges
 898 and character classes enclosed in B<[> and B<]> and matches any
 899 single character from the represented set of characters.
 900
 901 A character range is written as two characters separated by B<-> and
 902 represents all characters (according to the character collating sequence)
 903 that are not less than the first and not greater than the second.
 904 (Ranges are very collating-sequence-dependent, and portable programs
 905 should avoid relying on them.)
 906
 907 A character class is one of the class names
 908
 909    alnum     digit     punct
 910    alpha     graph     space
 911    blank     lower     upper
 912    cntrl     print     xdigit
 913
 914 enclosed in B<[:> and B<:]> and represents the set of characters
 915 as defined in ctype(3).
 916
 917 If the first character after B<[> is B<^>, the sense of matching is
 918 inverted.
 919
 920 To include a literal `C<^>', place it anywhere else but first. To
 921 include a literal 'C<]>' place it first or immediately after an
 922 initial B<^>. To include a literal `C<->' make it the first (or
 923 second after B<^>) or last character, or the second endpoint of
 924 a range.
 925
 926 The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
 927 match the null string at the beginning and end of a word respectively.
 928 (Note that neither is identical to Perl's `\b' atom.)
 929
 930 =head2 Additional Atoms
 931
 932 Since some sed implementations provide additional regular expression
 933 atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
 934 the following backslash escapes:
 935
 936 =over 4
 937
 938 =item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
 939
 940 =item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
 941
 942 =item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
 943
 944 =item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
 945
 946 =item B<\y> Match the empty string at a word boundary.
 947
 948 =item B<\B> Match the empty string between any two either word or non-word characters.
 949
 950 =back
 951
 952 To enable this feature, the environment variable PSEDEXTBRE must be set
 953 to a string containing the requested characters, e.g.:
 954 C<PSEDEXTBRE='E<lt>E<gt>wW'>.
 955
 956 =cut
 957
 958 #####
 959 # bre2p - convert BRE to Perl RE
 960 #
 961 sub peek(\$$){
 962     my( $pref, $ic ) = @_;
 963     $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
 964 }
 965
 966 sub bre2p($$$){
 967     my( $del, $pat, $fl ) = @_;
 968     my $led = $del;
 969     $led =~ tr/{([</})]>/;
 970     $led = '' if $led eq $del;
 971
 972     $pat = substr( $pat, 1, length($pat) - 2 );
 973     my $res = '';
 974     my $bracklev = 0;
 975     my $backref  = 0;
 976     my $parlev = 0;
 977     for( my $ic = 0; $ic < length( $pat ); $ic++ ){
 978         my $c = substr( $pat, $ic, 1 );
 979         if( $c eq '\\' ){
 980             ### backslash escapes
 981             my $nc = peek($pat,$ic);
 982             if( $nc eq '' ){
 983                 Warn( "`\\' cannot be last in pattern", $fl );
 984                 return undef();
 985             }
 986             $ic++;
 987             if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
 988                 $res .= "\\$del";
 989
 990             } elsif( $nc =~ /([[.*\\n])/ ){
 991                 ## check for \-escaped magics and \n:
 992                 ## \[ \. \* \\ \n stay as they are
 993                 $res .= '\\'.$nc;
 994
 995             } elsif( $nc eq '(' ){ ## \( => (
 996                 $parlev++;
 997                 $res .= '(';
 998
 999             } elsif( $nc eq ')' ){ ## \) => )
1000                 $parlev--;
1001                 $backref++;
1002                 if( $parlev < 0 ){
1003                     Warn( "unmatched `\\)'", $fl );
1004                     return undef();
1005                 }
1006                 $res .= ')';
1007
1008             } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
1009                 my $endpos = index( $pat, '\\}', $ic );
1010                 if( $endpos < 0 ){
1011                     Warn( "unmatched `\\{'", $fl );
1012                     return undef();
1013                 }
1014                 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
1015                 $ic = $endpos + 1;
1016
1017                 if( $res =~ /^\^?$/ ){
1018                     $res .= "\\{$rep\}";
1019                 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
1020                     my $min = $1;
1021                     my $com = $2 || '';
1022                     my $max = $3;
1023                     if( length( $max ) ){
1024                         if( $max < $min ){
1025                             Warn( "maximum less than minimum in `\\{$rep\\}'",
1026                                   $fl );
1027                             return undef();
1028                         }
1029                     } else {
1030                         $max = '';
1031                     }
1032                     # simplify some
1033                     if( $min == 0 && $max eq '1' ){
1034                         $res .= '?';
1035                     } elsif( $min == 1 && "$com$max" eq ',' ){
1036                         $res .= '+';
1037                     } elsif( $min == 0 && "$com$max" eq ',' ){
1038                         $res .= '*';
1039                     } else {
1040                         $res .= "{$min$com$max}";
1041                     }
1042                 } else {
1043                     Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1044                     return undef();
1045                 }
1046
1047             } elsif( $nc =~ /^[1-9]$/ ){
1048                 ## \1 .. \9 => \1 .. \9, but check for a following digit
1049                 if( $nc > $backref ){
1050                     Warn( "invalid backreference ($nc)", $fl );
1051                     return undef();
1052                 }
1053                 $res .= "\\$nc";
1054                 if( peek($pat,$ic) =~ /[0-9]/ ){
1055                     $res .= '(?:)';
1056                 }
1057
1058             } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1059                 ## extensions - at most <>wWyB - not in POSIX
1060                 if(      $nc eq '<' ){ ## \< => \b(?=\w), be precise
1061                     $res .= '\\b(?<=\\W)';
1062                 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1063                     $res .= '\\b(?=\\W)';
1064                 } elsif( $nc eq 'y' ){ ## \y => \b
1065                     $res .= '\\b';
1066                 } else {               ## \B, \w, \W remain the same
1067                     $res .= "\\$nc";
1068                 }
1069             } elsif( $nc eq $led ){
1070                 ## \<closing bracketing-delimiter> - keep '\'
1071                 $res .= "\\$nc";
1072
1073             } else { ## \ <char> => <char> ("as if `\' were not present")
1074                 $res .= $nc;
1075             }
1076
1077         } elsif( $c eq '.' ){ ## . => .
1078             $res .= $c;
1079
1080         } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1081             if( $res =~ /^\^?$/ ){
1082                 $res .= '\\*';
1083             } elsif( substr( $res, -1, 1 ) ne '*' ){
1084                 $res .= $c;
1085             }
1086
1087         } elsif( $c eq '[' ){
1088             ## parse []: [^...] [^]...] [-...]
1089             my $add = '[';
1090             if( peek($pat,$ic) eq '^' ){
1091                 $ic++;
1092                 $add .= '^';
1093             }
1094             my $nc = peek($pat,$ic);
1095             if( $nc eq ']' || $nc eq '-' ){
1096                 $add .= $nc;
1097                 $ic++;
1098             }
1099             # check that [ is not trailing
1100             if( $ic >= length( $pat ) - 1 ){
1101                 Warn( "unmatched `['", $fl );
1102                 return undef();
1103             }
1104             # look for [:...:] and x-y
1105             my $rstr = substr( $pat, $ic+1 );
1106             if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1107                 my $cnt = $1;
1108                 $ic += length( $cnt );
1109                 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1110                 # try some simplifications
1111                 my $red = $cnt;
1112                 if( $red =~ s/0-9// ){
1113                     $cnt = $red.'\d';
1114                     if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1115                         $cnt = $red.'\w';
1116                     }
1117                 }
1118                 $add .= $cnt;
1119
1120                 # POSIX 1003.2 has this (optional) for begin/end word
1121                 $add = '\\b(?=\\W)'  if $add eq '[[:<:]]';
1122                 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1123
1124             }
1125
1126             ## may have a trailing `-' before `]'
1127             if( $ic < length($pat) - 1 &&
1128                 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1129                 $ic += length( $1 );
1130                 $add .= $1;
1131                 # another simplification
1132                 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1133                 $res .= $add;
1134             } else {
1135                 Warn( "unmatched `['", $fl );
1136                 return undef();
1137             }
1138
1139         } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1140             $res .= "\\$c";
1141
1142         } elsif( $c eq ']' ){ ## unmatched ] is not magic
1143             $res .= ']';
1144
1145         } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1146             $res .= "\\$c";
1147
1148         } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1149             $res .= length( $res ) ? '\\^' : '^';
1150
1151         } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1152             $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
1153
1154         } else {
1155             $res .= $c;
1156         }
1157     }
1158
1159     if( $parlev ){
1160        Warn( "unmatched `\\('", $fl );
1161        return undef();
1162     }
1163
1164     # final cleanup: eliminate raw HTs
1165     $res =~ s/\t/\\t/g;
1166     return $del . $res . ( $led ? $led : $del );
1167 }
1168
1169
1170 #####
1171 # sub2p - convert sed substitution to Perl substitution
1172 #
1173 sub sub2p($$$){
1174     my( $del, $subst, $fl ) = @_;
1175     my $led = $del;
1176     $led =~ tr/{([</})]>/;
1177     $led = '' if $led eq $del;
1178
1179     $subst = substr( $subst, 1, length($subst) - 2 );
1180     my $res = '';
1181
1182     for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1183         my $c = substr( $subst, $ic, 1 );
1184         if( $c eq '\\' ){
1185             ### backslash escapes
1186             my $nc = peek($subst,$ic);
1187             if( $nc eq '' ){
1188                 Warn( "`\\' cannot be last in substitution", $fl );
1189                 return undef();
1190             }
1191             $ic++;
1192             if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1193                 $res .= '\\' . $nc;
1194             } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1195                 $res .= '${' . $nc . '}';
1196             } else { ## everything else (includes &): omit \
1197                 $res .= $nc;
1198             }
1199         } elsif( $c eq '&' ){ ## & => $&
1200             $res .= '$&';
1201         } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1202             $res .= '\\' . $c;
1203         } else {
1204             $res .= $c;
1205         }
1206     }
1207
1208     # final cleanup: eliminate raw HTs
1209     $res =~ s/\t/\\t/g;
1210     return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1211 }
1212
1213
1214 sub Parse(){
1215     my $error = 0;
1216     my( $pdef, $pfil, $plin );
1217     for( my $icom = 0; $icom < @Commands; $icom++ ){
1218         my $cmd = $Commands[$icom];
1219         print "Parse:$cmd:\n" if $useDEBUG;
1220         $cmd =~ s/^\s+//;
1221         next unless length( $cmd );
1222         my $scom = $icom;
1223         if( exists( $Defined{$icom} ) ){
1224             $pdef = $Defined{$icom};
1225             if( $pdef =~ /^ #(\d+)/ ){
1226                 $pfil = 'expression #';
1227                 $plin = $1;
1228             } else {
1229                 $pfil = "$pdef l.";
1230                 $plin = 1;
1231             }
1232         } else {
1233             $plin++;
1234         }
1235         my $fl = "$pfil$plin";
1236
1237         # insert command as comment in gnerated code
1238         #
1239         $Code .= "# $cmd\n" if $doGenerate;
1240
1241         # The Address(es)
1242         #
1243         my( $negated, $naddr, $addr1, $addr2 );
1244         $naddr = 0;
1245         if(      $cmd =~ s/^(\d+)\s*// ){
1246             $addr1 = "$1"; $naddr++;
1247         } elsif( $cmd =~ s/^\$\s*// ){
1248             $addr1 = 'eofARGV()'; $naddr++;
1249         } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1250             my $del = $1;
1251             my $regex = stripRegex( $del, \$cmd );
1252             if( defined( $regex ) ){
1253                 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1254                 $naddr++;
1255             } else {
1256                 Warn( "malformed regex, 1st address", $fl );
1257                 $error++;
1258                 next;
1259             }
1260         }
1261         if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1262             if(      $cmd =~ s/^(\d+)\s*// ){
1263                 $addr2 = "$1"; $naddr++;
1264             } elsif( $cmd =~ s/^\$\s*// ){
1265                 $addr2 = 'eofARGV()'; $naddr++;
1266             } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1267                 my $del = $1;
1268                 my $regex = stripRegex( $del, \$cmd );
1269                 if( defined( $regex ) ){
1270                     $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1271                     $naddr++;
1272                 } else {
1273                     Warn( "malformed regex, 2nd address", $fl );
1274                     $error++;
1275                     next;
1276                 }
1277             } else {
1278                 Warn( "invalid address after `,'", $fl );
1279                 $error++;
1280                 next;
1281             }
1282         }
1283
1284         # address modifier `!'
1285         #
1286         $negated = $cmd =~ s/^!\s*//;
1287         if( defined( $addr1 ) ){
1288             print "Parse: addr1=$addr1" if $useDEBUG;
1289             if( defined( $addr2 ) ){
1290                 print ", addr2=$addr2 " if $useDEBUG;
1291                 # both numeric and addr1 > addr2 => eliminate addr2
1292                 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1293                                    $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1294             }
1295         }
1296         print 'negated' if $useDEBUG && $negated;
1297         print " command:$cmd\n" if $useDEBUG;
1298
1299         # The Command
1300         #
1301         if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1302             my $h = substr( $cmd, 0, 1 );
1303             Warn( "unknown command `$h'", $fl );
1304             $error++;
1305             next;
1306         }
1307         my $key = $1;
1308
1309         my $tabref = $ComTab{$key};
1310         $GenKey{$key} = 1;
1311         if( $naddr > $tabref->[0] ){
1312             Warn( "excess address(es)", $fl );
1313             $error++;
1314             next;
1315         }
1316
1317         my $arg = '';
1318         if(      $tabref->[1] eq 'str' ){
1319             # take remainder - don't care if it is empty
1320             $arg = $cmd;
1321             $cmd = '';
1322
1323         } elsif( $tabref->[1] eq 'txt' ){
1324             # multi-line text
1325             my $goon = $cmd =~ /(.*)\\$/;
1326             if( length( $1 ) ){
1327                 Warn( "extra characters after command ($cmd)", $fl );
1328                 $error++;
1329             }
1330             while( $goon ){
1331                 $icom++;
1332                 if( $icom > $#Commands ){
1333                     Warn( "unexpected end of script", $fl );
1334                     $error++;
1335                     last;
1336                 }
1337                 $cmd = $Commands[$icom];
1338                 $Code .= "# $cmd\n" if $doGenerate;
1339                 $goon = $cmd =~ s/\\$//;
1340                 $cmd =~ s/\\(.)/$1/g;
1341                 $arg .= "\n" if length( $arg );
1342                 $arg .= $cmd;
1343             }
1344             $arg .= "\n" if length( $arg );
1345             $cmd = '';
1346
1347         } elsif( $tabref->[1] eq 'sub' ){
1348             # s///
1349             if( ! length( $cmd ) ){
1350                 Warn( "`s' command requires argument", $fl );
1351                 $error++;
1352                 next;
1353             }
1354             if( $cmd =~ s{^([^\\\n])}{} ){
1355                 my $del = $1;
1356                 my $regex = stripRegex( $del, \$cmd );
1357                 if( ! defined( $regex ) ){
1358                     Warn( "malformed regular expression", $fl );
1359                     $error++;
1360                     next;
1361                 }
1362                 $regex = bre2p( $del, $regex, $fl );
1363
1364                 # a trailing \ indicates embedded NL (in replacement string)
1365                 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1366                     $icom++;
1367                     if( $icom > $#Commands ){
1368                         Warn( "unexpected end of script", $fl );
1369                         $error++;
1370                         last;
1371                     }
1372                     $cmd .= $Commands[$icom];
1373                     $Code .= "# $Commands[$icom]\n" if $doGenerate;
1374                 }
1375
1376                 my $subst = stripRegex( $del, \$cmd );
1377                 if( ! defined( $regex ) ){
1378                     Warn( "malformed substitution expression", $fl );
1379                     $error++;
1380                     next;
1381                 }
1382                 $subst = sub2p( $del, $subst, $fl );
1383
1384                 # parse s/// modifier: g|p|0-9|w <file>
1385                 my( $global, $nmatch, $print, $write ) =
1386                   ( '',      '',      0,      undef );
1387                 while( $cmd =~ s/^([gp0-9])// ){
1388                     $1 eq 'g' ? ( $global = 'g' ) :
1389                     $1 eq 'p' ? ( $print  = $1  ) : ( $nmatch .= $1 );
1390                 }
1391                 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1392                 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1393                 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1394                     Warn( "conflicting flags `$global$nmatch'", $fl );
1395                     $error++;
1396                     next;
1397                 }
1398
1399                 $arg = makes( $regex, $subst,
1400                               $write, $global, $print, $nmatch, $fl );
1401                 if( ! defined( $arg ) ){
1402                     $error++;
1403                     next;
1404                 }
1405
1406             } else {
1407                 Warn( "improper delimiter in s command", $fl );
1408                 $error++;
1409                 next;
1410             }
1411
1412         } elsif( $tabref->[1] eq 'tra' ){
1413             # y///
1414             # a trailing \ indicates embedded newline
1415             while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1416                 $icom++;
1417                 if( $icom > $#Commands ){
1418                     Warn( "unexpected end of script", $fl );
1419                     $error++;
1420                     last;
1421                 }
1422                 $cmd .= $Commands[$icom];
1423                 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1424             }
1425             if( ! length( $cmd ) ){
1426                 Warn( "`y' command requires argument", $fl );
1427                 $error++;
1428                 next;
1429             }
1430             my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1431             if( $d eq '\\' ){
1432                 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1433                 $error++;
1434                 next;
1435             }
1436             my $fr = stripTrans( $d, \$cmd );
1437             if( ! defined( $fr ) || ! length( $cmd ) ){
1438                 Warn( "malformed `y' command argument", $fl );
1439                 $error++;
1440                 next;
1441             }
1442             my $to = stripTrans( $d, \$cmd );
1443             if( ! defined( $to ) ){
1444                 Warn( "malformed `y' command argument", $fl );
1445                 $error++;
1446                 next;
1447             }
1448             if( length($fr) != length($to) ){
1449                 Warn( "string lengths in `y' command differ", $fl );
1450                 $error++;
1451                 next;
1452             }
1453             if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1454                 $error++;
1455                 next;
1456             }
1457
1458         }
1459
1460         # $cmd must be now empty - exception is {
1461         if( $cmd !~ /^\s*$/ ){
1462             if( $key eq '{' ){
1463                 # dirty hack to process command on '{' line
1464                 $Commands[$icom--] = $cmd;
1465             } else {
1466                 Warn( "extra characters after command ($cmd)", $fl );
1467                 $error++;
1468                 next;
1469             }
1470         }
1471
1472         # Make Code
1473         #
1474         if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1475                              $tabref->[3], $arg, $fl ) ){
1476             $error++;
1477         }
1478     }
1479
1480     while( @BlockStack ){
1481         my $bl = pop( @BlockStack );
1482         Warn( "start of unterminated `{'", $bl );
1483         $error++;
1484     }
1485
1486     for my $lab ( keys( %Label ) ){
1487         if( ! exists( $Label{$lab}{defined} ) ){
1488             for my $used ( @{$Label{$lab}{used}} ){
1489                 Warn( "undefined label `$lab'", $used );
1490                 $error++;
1491             }
1492         }
1493     }
1494
1495     exit( 1 ) if $error;
1496 }
1497
1498
1499 ##############
1500 #### MAIN ####
1501 ##############
1502
1503 sub usage(){
1504     print STDERR "Usage: sed [-an] command [file...]\n";
1505     print STDERR "           [-an] [-e command] [-f script-file] [file...]\n";
1506 }
1507
1508 ###################
1509 # Here we go again...
1510 #
1511 my $expr = 0;
1512 while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1513     my $opt = $1;
1514     my $arg = $2;
1515     shift( @ARGV );
1516     if(      $opt eq 'e' ){
1517         if( length( $arg ) ){
1518             push( @Commands, split( "\n", $arg ) );
1519         } elsif( @ARGV ){
1520             push( @Commands, shift( @ARGV ) );
1521         } else {
1522             Warn( "option -e requires an argument" );
1523             usage();
1524             exit( 1 );
1525         }
1526         $expr++;
1527         $Defined{$#Commands} = " #$expr";
1528         next;
1529     }
1530     if( $opt eq 'f' ){
1531         my $path;
1532         if( length( $arg ) ){
1533             $path = $arg;
1534         } elsif( @ARGV ){
1535             $path = shift( @ARGV );
1536         } else {
1537             Warn( "option -f requires an argument" );
1538             usage();
1539             exit( 1 );
1540         }
1541         my $fst = $#Commands + 1;
1542         open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1543         my $cmd;
1544         while( defined( $cmd = <SCRIPT> ) ){
1545             chomp( $cmd );
1546             push( @Commands, $cmd );
1547         }
1548         close( SCRIPT );
1549         if( $#Commands >= $fst ){
1550             $Defined{$fst} = "$path";
1551         }
1552         next;
1553     }
1554     if( $opt eq '-' && $arg eq '' ){
1555         last;
1556     }
1557     if( $opt eq 'h' || $opt eq '?' ){
1558         usage();
1559         exit( 0 );
1560     }
1561     if( $opt eq 'n' ){
1562         $doAutoPrint = 0;
1563     } elsif( $opt eq 'a' ){
1564         $doOpenWrite = 0;
1565     } else {
1566         Warn( "illegal option `$opt'" );
1567         usage();
1568         exit( 1 );
1569     }
1570     if( length( $arg ) ){
1571         unshift( @ARGV, "-$arg" );
1572     }
1573 }
1574
1575 # A singleton command may be the 1st argument when there are no options.
1576 #
1577 if( @Commands == 0 ){
1578     if( @ARGV == 0 ){
1579         Warn( "no script command given" );
1580         usage();
1581         exit( 1 );
1582     }
1583     push( @Commands, split( "\n", shift( @ARGV ) ) );
1584     $Defined{0} = ' #1';
1585 }
1586
1587 print STDERR "Files: @ARGV\n" if $useDEBUG;
1588
1589 # generate leading code
1590 #
1591 $Func = <<'[TheEnd]';
1592
1593 # openARGV: open 1st input file
1594 #
1595 sub openARGV(){
1596     unshift( @ARGV, '-' ) unless @ARGV;
1597     my $file = shift( @ARGV );
1598     open( ARG, "<$file" )
1599     || die( "$0: can't open $file for reading ($!)\n" );
1600     $isEOF = 0;
1601 }
1602
1603 # getsARGV: Read another input line into argument (default: $_).
1604 #           Move on to next input file, and reset EOF flag $isEOF.
1605 sub getsARGV(;\$){
1606     my $argref = @_ ? shift() : \$_;
1607     while( $isEOF || ! defined( $$argref = <ARG> ) ){
1608         close( ARG );
1609         return 0 unless @ARGV;
1610         my $file = shift( @ARGV );
1611         open( ARG, "<$file" )
1612         || die( "$0: can't open $file for reading ($!)\n" );
1613         $isEOF = 0;
1614     }
1615     1;
1616 }
1617
1618 # eofARGV: end-of-file test
1619 #
1620 sub eofARGV(){
1621     return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1622 }
1623
1624 # makeHandle: Generates another file handle for some file (given by its path)
1625 #             to be written due to a w command or an s command's w flag.
1626 sub makeHandle($){
1627     my( $path ) = @_;
1628     my $handle;
1629     if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1630         $handle = $wFiles{$path} = gensym();
1631         if( $doOpenWrite ){
1632             if( ! open( $handle, ">$path" ) ){
1633                 die( "$0: can't open $path for writing: ($!)\n" );
1634             }
1635         }
1636     } else {
1637         $handle = $wFiles{$path};
1638     }
1639     return $handle;
1640 }
1641
1642 # printQ: Print queued output which is either a string or a reference
1643 #         to a pathname.
1644 sub printQ(){
1645     for my $q ( @Q ){
1646         if( ref( $q ) ){
1647             # flush open w files so that reading this file gets it all
1648             if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1649                 open( $wFiles{$$q}, ">>$$q" );
1650             }
1651             # copy file to stdout: slow, but safe
1652             if( open( RF, "<$$q" ) ){
1653                 while( defined( my $line = <RF> ) ){
1654                     print $line;
1655                 }
1656                 close( RF );
1657             }
1658         } else {
1659             print $q;
1660         }
1661     }
1662     undef( @Q );
1663 }
1664
1665 [TheEnd]
1666
1667 # generate the sed loop
1668 #
1669 $Code .= <<'[TheEnd]';
1670 sub openARGV();
1671 sub getsARGV(;\$);
1672 sub eofARGV();
1673 sub printQ();
1674
1675 # Run: the sed loop reading input and applying the script
1676 #
1677 sub Run(){
1678     my( $h, $icnt, $s, $n );
1679     # hack (not unbreakable :-/) to avoid // matching an empty string
1680     my $z = "\000"; $z =~ /$z/;
1681     # Initialize.
1682     openARGV();
1683     $Hold    = '';
1684     $CondReg = 0;
1685     $doPrint = $doAutoPrint;
1686 CYCLE:
1687     while( getsARGV() ){
1688         chomp();
1689         $CondReg = 0;   # cleared on t
1690 BOS:;
1691 [TheEnd]
1692
1693     # parse - avoid opening files when doing s2p
1694     #
1695     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1696       if $doGenerate;
1697     Parse();
1698     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1699       if $doGenerate;
1700
1701     # append trailing code
1702     #
1703     $Code .= <<'[TheEnd]';
1704 EOS:    if( $doPrint ){
1705             print $_, "\n";
1706         } else {
1707             $doPrint = $doAutoPrint;
1708         }
1709         printQ() if @Q;
1710     }
1711
1712     exit( 0 );
1713 }
1714 [TheEnd]
1715
1716
1717 # append optional functions, prepend prototypes
1718 #
1719 my $Proto = "# prototypes\n";
1720 if( $GenKey{'l'} ){
1721     $Proto .= "sub _l();\n";
1722     $Func .= <<'[TheEnd]';
1723 # _l: l command processing
1724 #
1725 sub _l(){
1726     my $h = $_;
1727     my $mcpl = 70;
1728     # transform non printing chars into escape notation
1729     $h =~ s/\\/\\\\/g;
1730     if( $h =~ /[^[:print:]]/ ){
1731         $h =~ s/\a/\\a/g;
1732         $h =~ s/\f/\\f/g;
1733         $h =~ s/\n/\\n/g;
1734         $h =~ s/\t/\\t/g;
1735         $h =~ s/\r/\\r/g;
1736         $h =~ s/\e/\\e/g;
1737         $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1738     }
1739     # split into lines of length $mcpl
1740     while( length( $h ) > $mcpl ){
1741         my $l = substr( $h, 0, $mcpl-1 );
1742         $h = substr( $h, $mcpl );
1743         # remove incomplete \-escape from end of line
1744         if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1745             $h = $1 . $h;
1746         }
1747         print $l, "\\\n";
1748     }
1749     print "$h\$\n";
1750 }
1751
1752 [TheEnd]
1753 }
1754
1755 if( $GenKey{'r'} ){
1756     $Proto .= "sub _r(\$);\n";
1757     $Func .= <<'[TheEnd]';
1758 # _r: r command processing: Save a reference to the pathname.
1759 #
1760 sub _r($){
1761     my $path = shift();
1762     push( @Q, \$path );
1763 }
1764
1765 [TheEnd]
1766 }
1767
1768 if( $GenKey{'t'} ){
1769     $Proto .= "sub _t();\n";
1770     $Func .= <<'[TheEnd]';
1771 # _t: t command - condition register test/reset
1772 #
1773 sub _t(){
1774     my $res = $CondReg;
1775     $CondReg = 0;
1776     $res;
1777 }
1778
1779 [TheEnd]
1780 }
1781
1782 if( $GenKey{'w'} ){
1783     $Proto .= "sub _w(\$);\n";
1784     $Func .= <<'[TheEnd]';
1785 # _w: w command and s command's w flag - write to file
1786 #
1787 sub _w($){
1788     my $path   = shift();
1789     my $handle = $wFiles{$path};
1790     if( ! $doOpenWrite && ! defined( fileno( $handle ) ) ){
1791         open( $handle, ">$path" )
1792         || die( "$0: $path: cannot open ($!)\n" );
1793     }
1794     print $handle $_, "\n";
1795 }
1796
1797 [TheEnd]
1798 }
1799
1800 $Code = $Proto . $Code;
1801
1802 # magic "#n" - same as -n option
1803 #
1804 $doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
1805
1806 # eval code - check for errors
1807 #
1808 print "Code:\n$Code$Func" if $useDEBUG;
1809 eval $Code . $Func;
1810 if( $@ ){
1811     print "Code:\n$Code$Func";
1812     die( "$0: internal error - generated incorrect Perl code: $@\n" );
1813 }
1814
1815 if( $doGenerate ){
1816
1817     # write full Perl program
1818     #
1819
1820     # bang line, declarations, prototypes
1821     print <<TheEnd;
1822 #!$perlpath -w
1823 eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1824   if 0;
1825 \$0 =~ s/^.*?(\\w+)\[\\.\\w+\]*\$/\$1/;
1826
1827 use strict;
1828 use Symbol;
1829 use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1830              \$doAutoPrint \$doOpenWrite \$doPrint };
1831 \$doAutoPrint = $doAutoPrint;
1832 \$doOpenWrite = $doOpenWrite;
1833 TheEnd
1834
1835     my $wf = "'" . join( "', '",  keys( %wFiles ) ) . "'";
1836     if( $wf ne "''" ){
1837         print <<TheEnd;
1838 sub makeHandle(\$);
1839 for my \$p ( $wf ){
1840    exit( 1 ) unless makeHandle( \$p );
1841 }
1842 TheEnd
1843    }
1844
1845    print $Code;
1846    print "Run();\n";
1847    print $Func;
1848    exit( 0 );
1849
1850 } else {
1851
1852     # execute: make handles (and optionally open) all w files; run!
1853     for my $p ( keys( %wFiles ) ){
1854         exit( 1 ) unless makeHandle( $p );
1855     }
1856     Run();
1857 }
1858
1859
1860 =head1 ENVIRONMENT
1861
1862 The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1863 See L<"Additional Atoms">.
1864
1865 =head1 DIAGNOSTICS
1866
1867 =over 4
1868
1869 =item ambiguous translation for character `%s' in `y' command
1870
1871 The indicated character appears twice, with different translations.
1872
1873 =item `[' cannot be last in pattern
1874
1875 A `[' in a BRE indicates the beginning of a I<bracket expression>.
1876
1877 =item `\' cannot be last in pattern
1878
1879 A `\' in a BRE is used to make the subsequent character literal.
1880
1881 =item `\' cannot be last in substitution
1882
1883 A `\' in a subsitution string is used to make the subsequent character literal.
1884
1885 =item conflicting flags `%s'
1886
1887 In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1888 multiple n-th occurrence flags are specified. Note that only the digits
1889 `1' through `9' are permitted.
1890
1891 =item duplicate label %s (first defined at %s)
1892
1893 =item excess address(es)
1894
1895 The command has more than the permitted number of addresses.
1896
1897 =item extra characters after command (%s)
1898
1899 =item illegal option `%s'
1900
1901 =item improper delimiter in s command
1902
1903 The BRE and substitution may not be delimited with `\' or newline.
1904
1905 =item invalid address after `,'
1906
1907 =item invalid backreference (%s)
1908
1909 The specified backreference number exceeds the number of backreferences
1910 in the BRE.
1911
1912 =item invalid repeat clause `\{%s\}'
1913
1914 The repeat clause does not contain a valid integer value, or pair of
1915 values.
1916
1917 =item malformed regex, 1st address
1918
1919 =item malformed regex, 2nd address
1920
1921 =item malformed regular expression
1922
1923 =item malformed substitution expression
1924
1925 =item malformed `y' command argument
1926
1927 The first or second string of a B<y> command  is syntactically incorrect.
1928
1929 =item maximum less than minimum in `\{%s\}'
1930
1931 =item no script command given
1932
1933 There must be at least one B<-e> or one B<-f> option specifying a
1934 script or script file.
1935
1936 =item `\' not valid as delimiter in `y' command
1937
1938 =item option -e requires an argument
1939
1940 =item option -f requires an argument
1941
1942 =item `s' command requires argument
1943
1944 =item start of unterminated `{'
1945
1946 =item string lengths in `y' command differ
1947
1948 The translation table strings in a B<y> commanf must have equal lengths.
1949
1950 =item undefined label `%s'
1951
1952 =item unexpected `}'
1953
1954 A B<}> command without a preceding B<{> command was encountered.
1955
1956 =item unexpected end of script
1957
1958 The end of the script was reached although a text line after a
1959 B<a>, B<c> or B<i> command indicated another line.
1960
1961 =item unknown command `%s'
1962
1963 =item unterminated `['
1964
1965 A BRE contains an unterminated bracket expression.
1966
1967 =item unterminated `\('
1968
1969 A BRE contains an unterminated backreference.
1970
1971 =item `\{' without closing `\}'
1972
1973 A BRE contains an unterminated bounds specification.
1974
1975 =item `\)' without preceding `\('
1976
1977 =item `y' command requires argument
1978
1979 =back
1980
1981 =head1 EXAMPLE
1982
1983 The basic material for the preceding section was generated by running
1984 the sed script
1985
1986    #no autoprint
1987    s/^.*Warn( *"\([^"]*\)".*$/\1/
1988    t process
1989    b
1990    :process
1991    s/$!/%s/g
1992    s/$[_[:alnum:]]\{1,\}/%s/g
1993    s/\\\\/\\/g
1994    s/^/=item /
1995    p
1996
1997 on the program's own text, and piping the output into C<sort -u>.
1998
1999
2000 =head1 SED SCRIPT TRANSLATION
2001
2002 If this program is invoked with the name F<s2p> it will act as a
2003 sed-to-Perl translator. After option processing (all other
2004 arguments are ignored), a Perl program is printed on standard
2005 output, which will process the input stream (as read from all
2006 arguments) in the way defined by the sed script and the option setting
2007 used for the translation.
2008
2009 =head1 SEE ALSO
2010
2011 perl(1), re_format(7)
2012
2013 =head1 BUGS
2014
2015 The B<l> command will show escape characters (ESC) as `C<\e>', but
2016 a vertical tab (VT) in octal.
2017
2018 Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
2019
2020 The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
2021 is "the last pattern used, at run time". This deviates from the Perl
2022 interpretation, which will re-use the "last last successfully executed
2023 regular expression". Since keeping track of pattern usage would create
2024 terribly cluttered code, and differences would only appear in obscure
2025 context (where other B<sed> implementations appear to deviate, too),
2026 the Perl semantics was adopted. Note that common usage of this feature,
2027 such as in C</abc/s//xyz/>, will work as expected.
2028
2029 Collating elements (of bracket expressions in BREs) are not implemented.
2030
2031 =head1 STANDARDS
2032
2033 This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
2034 definition of B<sed>, and is compatible with the I<OpenBSD>
2035 implementation, except where otherwise noted (see L<"BUGS">).
2036
2037 =head1 AUTHOR
2038
2039 This Perl implementation of I<sed> was written by Wolfgang Laun,
2040 I<Wolfgang.Laun@alcatel.at>.
2041
2042 =head1 COPYRIGHT and LICENSE
2043
2044 This program is free and open software. You may use, modify,
2045 distribute, and sell this program (and any modified variants) in any
2046 way you wish, provided you do not restrict others from doing the same.
2047
2048 =cut
2049
2050 !NO!SUBS!
2051
2052 close OUT or die "Can't close $file: $!";
2053 chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
2054 unlink 'psed';
2055 print "Linking $file to psed.\n";
2056 if (defined $Config{d_link}) {
2057   link $file, 'psed';
2058 } else {
2059   unshift @INC, '../lib';
2060   require File::Copy;
2061   File::Copy::syscopy('s2p', 'psed');
2062 }
2063 exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
2064 chdir $origdir;