x2p/s2p.PL

   1 #!/usr/bin/perl
   2
   3 use Config;
   4 use File::Basename qw(&basename &dirname);
   5 use Cwd;
   6
   7 # List explicitly here the variables you want Configure to
   8 # generate.  Metaconfig only looks for shell variables, so you
   9 # have to mention them as if they were shell variables, not
  10 # %Config entries.  Thus you write
  11 #  $startperl
  12 # to ensure Configure will look for $Config{startperl}.
  13
  14 # This forces PL files to create target in same directory as PL file.
  15 # This is so that make depend always knows where to find PL derivatives.
  16 $origdir = cwd;
  17 chdir dirname($0);
  18 $file = basename($0, '.PL');
  19 $file .= '.com' if $^O eq 'VMS';
  20
  21 open OUT,">$file" or die "Can't create $file: $!";
  22
  23 print "Extracting $file (with variable substitutions)\n";
  24
  25 # In this section, perl variables will be expanded during extraction.
  26 # You can use $Config{...} to use Configure variables.
  27
  28 print OUT <<"!GROK!THIS!";
  29 $Config{startperl}
  30     eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}'
  31         if \$running_under_some_shell;
  32 my \$startperl;
  33 my \$perlpath;
  34 (\$startperl = <<'/../') =~ s/\\s*\\z//;
  35 $Config{startperl}
  36 /../
  37 (\$perlpath = <<'/../') =~ s/\\s*\\z//;
  38 $Config{perlpath}
  39 /../
  40 !GROK!THIS!
  41
  42 # In the following, perl variables are not expanded during extraction.
  43
  44 print OUT <<'!NO!SUBS!';
  45
  46 $0 =~ s/^.*?(\w+)$/$1/;
  47
  48 # (p)sed - a stream editor
  49 # History:  Aug 12 2000: Original version.
  50
  51 use strict;
  52 use integer;
  53 use Symbol;
  54
  55 =head1 NAME
  56
  57 psed - a stream editor
  58
  59 =head1 SYNOPSIS
  60
  61    psed [-an] script [file ...]
  62    psed [-an] [-e script] [-f script-file] [file ...]
  63
  64    s2p  [-an] [-e script] [-f script-file]
  65
  66 =head1 DESCRIPTION
  67
  68 A stream editor reads the input stream consisting of the specified files
  69 (or standard input, if none are given), processes is line by line by
  70 applying a script consisting of edit commands, and writes resulting lines
  71 to standard output. The filename `C<->' may be used to read standard input.
  72
  73 The edit script is composed from arguments of B<-e> options and
  74 script-files, in the given order. A single script argument may be specified
  75 as the first parameter.
  76
  77 If this program is invoked with the name F<s2p>, it will act as a
  78 sed-to-Perl translator. See L<"sed Script Translation">.
  79
  80 B<sed> returns an exit code of 0 on success or >0 if an error occurred.
  81
  82 =head1 OPTIONS
  83
  84 =over 4
  85
  86 =item B<-a>
  87
  88 A file specified as argument to the B<w> edit command is by default
  89 opened before input processing starts. Using B<-a>, opening of such
  90 files is delayed until the first line is actually written to the file.
  91
  92 =item B<-e> I<script>
  93
  94 The editing commands defined by I<script> are appended to the script.
  95 Multiple commands must be separated by newlines.
  96
  97 =item B<-f> I<script-file>
  98
  99 Editing commands from the specified I<script-file> are read and appended
 100 to the script.
 101
 102 =item B<-n>
 103
 104 By default, a line is written to standard output after the editing script
 105 has been applied to it. The B<-n> option suppresses automatic printing.
 106
 107 =back
 108
 109 =head1 COMMANDS
 110
 111 B<sed> command syntax is defined as
 112
 113 Z<> Z<> Z<> Z<>[I<address>[B<,>I<address>]][B<!>]I<function>[I<argument>]
 114
 115 with whitespace being permitted before or after addresses, and between
 116 the function character and the argument. The I<address>es and the
 117 address inverter (C<!>) are used to restrict the application of a
 118 command to the selected line(s) of input.
 119
 120 Each command must be on a line of its own, except where noted in
 121 the synopses below.
 122
 123 The edit cycle performed on each input line consist of reading the line
 124 (without its trailing newline character) into the I<pattern space>,
 125 applying the applicable commands of the edit script, writing the final
 126 contents of the pattern space and a newline to the standard output.
 127 A I<hold space> is provided for saving the contents of the
 128 pattern space for later use.
 129
 130 =head2 Addresses
 131
 132 A sed address is either a line number or a pattern, which may be combined
 133 arbitrarily to construct ranges. Lines are numbered across all input files.
 134
 135 Any address may be followed by an exclamation mark (`C<!>'), selecting
 136 all lines not matching that address.
 137
 138 =over 4
 139
 140 =item I<number>
 141
 142 The line with the given number is selected.
 143
 144 =item B<$>
 145
 146 A dollar sign (C<$>) is the line number of the last line of the input stream.
 147
 148 =item B</>I<regular expression>B</>
 149
 150 A pattern address is a basic regular expression (see
 151 L<"Basic Regular Expressions">), between the delimiting character C</>.
 152 Any other character except C<\> or newline may be used to delimit a
 153 pattern address when the initial delimiter is prefixed with a
 154 backslash (`C<\>').
 155
 156 =back
 157
 158 If no address is given, the command selects every line.
 159
 160 If one address is given, it selects the line (or lines) matching the
 161 address.
 162
 163 Two addresses select a range that begins whenever the first address
 164 matches, and ends (including that line) when the second address matches.
 165 If the first (second) address is a matching pattern, the second
 166 address is not applied to the very same line to determine the end of
 167 the range. Likewise, if the second address is a matching pattern, the
 168 first address is not applied to the very same line to determine the
 169 begin of another range. If both addresses are line numbers,
 170 and the second line number is less than the first line number, then
 171 only the first line is selected.
 172
 173
 174 =head2 Functions
 175
 176 The maximum permitted number of addresses is indicated with each
 177 function synopsis below.
 178
 179 The argument I<text> consists of one or more lines following the command.
 180 Embedded newlines in I<text> must be preceded with a backslash.  Other
 181 backslashes in I<text> are deleted and the following character is taken
 182 literally.
 183
 184 =over 4
 185
 186 =cut
 187
 188 my %ComTab;
 189 #--------------------------------------------------------------------------
 190 $ComTab{'a'}=[ 1, 'txt', \&Emit,       '{ push( @Q, <<'."'TheEnd' ) }\n" ]; #ok
 191
 192 =item [1addr]B<a\> I<text>
 193
 194 Write I<text> (which must start on the line following the command)
 195 to standard output immediately before reading the next line
 196 of input, either by executing the B<N> function or by beginning a new cycle.
 197
 198 =cut
 199
 200 #--------------------------------------------------------------------------
 201 $ComTab{'b'}=[ 2, 'str', \&Branch,     '{ goto XXX; }'                   ]; #ok
 202
 203 =item [2addr]B<b> [I<label>]
 204
 205 Branch to the B<:> function with the specified I<label>. If no label
 206 is given, branch to the end of the script.
 207
 208 =cut
 209
 210 #--------------------------------------------------------------------------
 211 $ComTab{'c'}=[ 2, 'txt', \&Change,     <<'-X-'                           ]; #ok
 212 { print <<'TheEnd'; } $doPrint = 0; goto EOS;
 213 -X-
 214 ### continue OK => next CYCLE;
 215
 216 =item [2addr]B<c\> I<text>
 217
 218 The line, or range of lines, selected by the address is deleted.
 219 The I<text> (which must start on the line following the command)
 220 is written to standard output. With an address range, this occurs at
 221 the end of the range.
 222
 223 =cut
 224
 225 #--------------------------------------------------------------------------
 226 $ComTab{'d'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 227 { $doPrint = 0;
 228   goto EOS;
 229 }
 230 -X-
 231 ### continue OK => next CYCLE;
 232
 233 =item [2addr]B<d>
 234
 235 Deletes the pattern space and starts the next cycle.
 236
 237 =cut
 238
 239 #--------------------------------------------------------------------------
 240 $ComTab{'D'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 241 { s/^.*\n?//;
 242   if(length($_)){ goto BOS } else { goto EOS }
 243 }
 244 -X-
 245 ### continue OK => next CYCLE;
 246
 247 =item [2addr]B<D>
 248
 249 Deletes the pattern space through the first embedded newline or to the end.
 250 If the pattern space becomes empty, a new cycle is started, otherwise
 251 execution of the script is restarted.
 252
 253 =cut
 254
 255 #--------------------------------------------------------------------------
 256 $ComTab{'g'}=[ 2, '',    \&Emit,       '{ $_ = $Hold };'                 ]; #ok
 257
 258 =item [2addr]B<g>
 259
 260 Replace the contents of the pattern space with the hold space.
 261
 262 =cut
 263
 264 #--------------------------------------------------------------------------
 265 $ComTab{'G'}=[ 2, '',    \&Emit,       '{ $_ .= "\n"; $_ .= $Hold };'    ]; #ok
 266
 267 =item [2addr]B<G>
 268
 269 Append a newline and the contents of the hold space to the pattern space.
 270
 271 =cut
 272
 273 #--------------------------------------------------------------------------
 274 $ComTab{'h'}=[ 2, '',    \&Emit,       '{ $Hold = $_ }'                  ]; #ok
 275
 276 =item [2addr]B<h>
 277
 278 Replace the contents of the hold space with the pattern space.
 279
 280 =cut
 281
 282 #--------------------------------------------------------------------------
 283 $ComTab{'H'}=[ 2, '',    \&Emit,       '{ $Hold .= "\n"; $Hold .= $_; }' ]; #ok
 284
 285 =item [2addr]B<H>
 286
 287 Append a newline and the contents of the pattern space to the hold space.
 288
 289 =cut
 290
 291 #--------------------------------------------------------------------------
 292 $ComTab{'i'}=[ 1, 'txt', \&Emit,       '{ print <<'."'TheEnd' }\n"       ]; #ok
 293
 294 =item [1addr]B<i\> I<text>
 295
 296 Write the I<text> (which must start on the line following the command)
 297 to standard output.
 298
 299 =cut
 300
 301 #--------------------------------------------------------------------------
 302 $ComTab{'l'}=[ 2, '',    \&Emit,       '{ _l() }'                        ]; #okUTF8
 303
 304 =item [2addr]B<l>
 305
 306 Print the contents of the pattern space: non-printable characters are
 307 shown in C-style escaped form; long lines are split and have a trailing
 308 `C<\>' at the point of the split; the true end of a line is marked with
 309 a `C<$>'. Escapes are: `\a', `\t', `\n', `\f', `\r', `\e' for
 310 BEL, HT, LF, FF, CR, ESC, respectively, and `\' followed by a three-digit
 311 octal number for all other non-printable characters.
 312
 313 =cut
 314
 315 #--------------------------------------------------------------------------
 316 $ComTab{'n'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 317 { print $_, "\n" if $doPrint;
 318   printQ if @Q;
 319   $CondReg = 0;
 320   last CYCLE unless getsARGV();
 321   chomp();
 322 }
 323 -X-
 324
 325 =item [2addr]B<n>
 326
 327 If automatic printing is enabled, write the pattern space to the standard
 328 output. Replace the pattern space with the next line of input. If
 329 there is no more input, processing is terminated.
 330
 331 =cut
 332
 333 #--------------------------------------------------------------------------
 334 $ComTab{'N'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 335 { printQ if @Q;
 336   $CondReg = 0;
 337   last CYCLE unless getsARGV( $h );
 338   chomp( $h );
 339   $_ .= "\n$h";
 340 }
 341 -X-
 342
 343 =item [2addr]B<N>
 344
 345 Append a newline and the next line of input to the pattern space. If
 346 there is no more input, processing is terminated.
 347
 348 =cut
 349
 350 #--------------------------------------------------------------------------
 351 $ComTab{'p'}=[ 2, '',    \&Emit,       '{ print $_, "\n"; }'             ]; #ok
 352
 353 =item [2addr]B<p>
 354
 355 Print the pattern space to the standard output. (Use the B<-n> option
 356 to suppress automatic printing at the end of a cycle if you want to
 357 avoid double printing of lines.)
 358
 359 =cut
 360
 361 #--------------------------------------------------------------------------
 362 $ComTab{'P'}=[ 2, '',    \&Emit,       <<'-X-'                           ]; #ok
 363 { if( /^(.*)/ ){ print $1, "\n"; } }
 364 -X-
 365
 366 =item [2addr]B<P>
 367
 368 Prints the pattern space through the first embedded newline or to the end.
 369
 370 =cut
 371
 372 #--------------------------------------------------------------------------
 373 $ComTab{'q'}=[ 1, '',    \&Emit,       <<'-X-'                           ]; #ok
 374 { print $_, "\n" if $doPrint;
 375   last CYCLE;
 376 }
 377 -X-
 378
 379 =item [1addr]B<q>
 380
 381 Branch to the end of the script and quit without starting a new cycle.
 382
 383 =cut
 384
 385 #--------------------------------------------------------------------------
 386 $ComTab{'r'}=[ 1, 'str', \&Emit,       "{ _r( '-X-' ) }"                 ]; #ok
 387 ### FIXME: lazy reading - big files???
 388
 389 =item [1addr]B<r> I<file>
 390
 391 Copy the contents of the I<file> to standard output immediately before
 392 the next attempt to read a line of input. Any error encountered while
 393 reading I<file> is silently ignored.
 394
 395 =cut
 396
 397 #--------------------------------------------------------------------------
 398 $ComTab{'s'}=[ 2, 'sub', \&Emit,       ''                                ]; #ok
 399
 400 =item [2addr]B<s/>I<regular expression>B</>I<replacement>B</>I<flags>
 401
 402 Substitute the I<replacement> string for the first substring in
 403 the pattern space that matches the I<regular expression>.
 404 Any character other than backslash or newline can be used instead of a
 405 slash to delimit the regular expression and the replacement.
 406 To use the delimiter as a literal character within the regular expression
 407 and the replacement, precede the character by a backslash (`C<\>').
 408
 409 Literal newlines may be embedded in the replacement string by
 410 preceding a newline with a backslash.
 411
 412 Within the replacement, an ampersand (`C<&>') is replaced by the string
 413 matching the regular expression. The strings `C<\1>' through `C<\9>' are
 414 replaced by the corresponding subpattern (see L<"Basic Regular Expressions">).
 415 To get a literal `C<&>' or `C<\>' in the replacement text, precede it
 416 by a backslash.
 417
 418 The following I<flags> modify the behaviour of the B<s> command:
 419
 420 =over 8
 421
 422 =item B<g>
 423
 424 The replacement is performed for all matching, non-overlapping substrings
 425 of the pattern space.
 426
 427 =item B<1>..B<9>
 428
 429 Replace only the n-th matching substring of the pattern space.
 430
 431 =item B<p>
 432
 433 If the substitution was made, print the new value of the pattern space.
 434
 435 =item B<w> I<file>
 436
 437 If the substitution was made, write the new value of the pattern space
 438 to the specified file.
 439
 440 =back
 441
 442 =cut
 443
 444 #--------------------------------------------------------------------------
 445 $ComTab{'t'}=[ 2, 'str', \&Branch,     '{ goto XXX if _t() }'            ]; #ok
 446
 447 =item [2addr]B<t> [I<label>]
 448
 449 Branch to the B<:> function with the specified I<label> if any B<s>
 450 substitutions have been made since the most recent reading of an input line
 451 or execution of a B<t> function. If no label is given, branch to the end of
 452 the script.
 453
 454
 455 =cut
 456
 457 #--------------------------------------------------------------------------
 458 $ComTab{'w'}=[ 2, 'str', \&Write,      "{ _w( '-X-' ) }"                 ]; #ok
 459
 460 =item [2addr]B<w> I<file>
 461
 462 The contents of the pattern space are written to the I<file>.
 463
 464 =cut
 465
 466 #--------------------------------------------------------------------------
 467 $ComTab{'x'}=[ 2, '',    \&Emit,       '{ ($Hold, $_) = ($_, $Hold) }'   ]; #ok
 468
 469 =item [2addr]B<x>
 470
 471 Swap the contents of the pattern space and the hold space.
 472
 473 =cut
 474
 475 #--------------------------------------------------------------------------
 476 $ComTab{'y'}=[ 2, 'tra', \&Emit,       ''                                ]; #ok
 477 =item [2addr]B<y>B</>I<string1>B</>I<string2>B</>
 478
 479 In the pattern space, replace all characters occuring in I<string1> by the
 480 character at the corresponding position in I<string2>. It is possible
 481 to use any character (other than a backslash or newline) instead of a
 482 slash to delimit the strings.  Within I<string1> and I<string2>, a
 483 backslash followed by any character other than a newline is that literal
 484 character, and a backslash followed by an `n' is replaced by a newline
 485 character.
 486
 487 =cut
 488
 489 #--------------------------------------------------------------------------
 490 $ComTab{'='}=[ 1, '',    \&Emit,       '{ print "$.\n" }'                ]; #ok
 491
 492 =item [1addr]B<=>
 493
 494 Prints the current line number on the standard output.
 495
 496 =cut
 497
 498 #--------------------------------------------------------------------------
 499 $ComTab{':'}=[ 0, 'str', \&Label,      ''                                ]; #ok
 500
 501 =item [0addr]B<:> [I<label>]
 502
 503 The command specifies the position of the I<label>. It has no other effect.
 504
 505 =cut
 506
 507 #--------------------------------------------------------------------------
 508 $ComTab{'{'}=[ 2, '',    \&BeginBlock, '{'                               ]; #ok
 509 $ComTab{'}'}=[ 0, '',    \&EndBlock,   ';}'                              ]; #ok
 510 # ';' to avoid warning on empty {}-block
 511
 512 =item [2addr]B<{> [I<command>]
 513
 514 =item [0addr]B<}>
 515
 516 These two commands begin and end a command list. The first command may
 517 be given on the same line as the opening B<{> command. The commands
 518 within the list are jointly selected by the address(es) given on the
 519 B<{> command (but may still have individual addresses).
 520
 521 =cut
 522
 523 #--------------------------------------------------------------------------
 524 $ComTab{'#'}=[ 0, 'str', \&Comment,    ''                                ]; #ok
 525
 526 =item [0addr]B<#> [I<comment>]
 527
 528 The entire line is ignored (treated as a comment). If, however, the first
 529 two characters in the script are `C<#n>', automatic printing of output is
 530 suppressed, as if the B<-n> option were given on the command line.
 531
 532 =back
 533
 534 =cut
 535
 536 use vars qw{ $isEOF $Hold %wFiles @Q $CondReg $doPrint };
 537
 538 my $useDEBUG    = exists( $ENV{PSEDDEBUG} );
 539 my $useEXTBRE   = $ENV{PSEDEXTBRE} || '';
 540 $useEXTBRE =~ s/[^<>wWyB]//g; # gawk RE's handle these
 541
 542 my $doAutoPrint = 1;          # automatic printing of pattern space (-n => 0)
 543 my $doOpenWrite = 1;          # open w command output files at start (-a => 0)
 544 my $svOpenWrite = 0;          # save $doOpenWrite
 545 my $doGenerate  = $0 eq 's2p';
 546
 547 # Collected and compiled script
 548 #
 549 my( @Commands, %Defined, @BlockStack, %Label, $labNum, $Code );
 550
 551 ##################
 552 #  Compile Time
 553 #
 554 # Labels
 555 #
 556 # Error handling
 557 #
 558 sub Warn($;$){
 559     my( $msg, $loc ) = @_;
 560     $loc ||= '';
 561     $loc .= ': ' if length( $loc );
 562     warn( "$0: $loc$msg\n" );
 563 }
 564
 565 $labNum = 0;
 566 sub newLabel(){
 567     return 'L_'.++$labNum;
 568 }
 569
 570 # safeHere: create safe here delimiter and  modify opcode and argument
 571 #
 572 sub safeHere($$){
 573     my( $codref, $argref ) = @_;
 574     my $eod = 'EOD000';
 575     while( $$argref =~ /^$eod$/m ){
 576         $eod++;
 577     }
 578     $$codref =~ s/TheEnd/$eod/e;
 579     $$argref .= "$eod\n";
 580 }
 581
 582 # Emit: create address logic and emit command
 583 #
 584 sub Emit($$$$$$){
 585     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 586     my $cond = '';
 587     if( defined( $addr1 ) ){
 588         if( defined( $addr2 ) ){
 589             $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 590         } else {
 591             $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 592         }
 593         $cond = $negated ? "unless( $addr1 )\n" : "if( $addr1 )\n";
 594     }
 595
 596     if( $opcode eq '' ){
 597         $Code .= "$cond$arg\n";
 598
 599     } elsif( $opcode =~ s/-X-/$arg/e ){
 600         $Code .= "$cond$opcode\n";
 601
 602     } elsif( $opcode =~ /TheEnd/ ){
 603         safeHere( \$opcode, \$arg );
 604         $Code .= "$cond$opcode$arg";
 605
 606     } else {
 607         $Code .= "$cond$opcode\n";
 608     }
 609     0;
 610 }
 611
 612 # Write (w command, w flag): store pathname
 613 #
 614 sub Write($$$$$$){
 615     my( $addr1, $addr2, $negated, $opcode, $path, $fl ) = @_;
 616     $wFiles{$path} = '';
 617     Emit( $addr1, $addr2, $negated, $opcode, $path, $fl );
 618 }
 619
 620
 621 # Label (: command): label definition
 622 #
 623 sub Label($$$$$$){
 624     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 625     my $rc = 0;
 626     $lab =~ s/\s+//;
 627     if( length( $lab ) ){
 628         my $h;
 629         if( ! exists( $Label{$lab} ) ){
 630             $h = $Label{$lab}{name} = newLabel();
 631         } else {
 632             $h = $Label{$lab}{name};
 633             if( exists( $Label{$lab}{defined} ) ){
 634                 my $dl = $Label{$lab}{defined};
 635                 Warn( "duplicate label $lab (first defined at $dl)", $fl );
 636                 $rc = 1;
 637             }
 638         }
 639         $Label{$lab}{defined} = $fl;
 640         $Code .= "$h:;\n";
 641     }
 642     $rc;
 643 }
 644
 645 # BeginBlock ({ command): push block start
 646 #
 647 sub BeginBlock($$$$$$){
 648     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 649     push( @BlockStack, [ $fl, $addr1, $addr2, $negated ] );
 650     Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 651 }
 652
 653 # EndBlock (} command): check proper nesting
 654 #
 655 sub EndBlock($$$$$$){
 656     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 657     my $rc;
 658     my $jcom = pop( @BlockStack );
 659     if( defined( $jcom ) ){
 660         $rc = Emit( $addr1, $addr2, $negated, $opcode, $arg, $fl );
 661     } else {
 662         Warn( "unexpected `}'", $fl );
 663         $rc = 1;
 664     }
 665     $rc;
 666 }
 667
 668 # Branch (t, b commands): check or create label, substitute default
 669 #
 670 sub Branch($$$$$$){
 671     my( $addr1, $addr2, $negated, $opcode, $lab, $fl ) = @_;
 672     $lab =~ s/\s+//; # no spaces at end
 673     my $h;
 674     if( length( $lab ) ){
 675         if( ! exists( $Label{$lab} ) ){
 676             $h = $Label{$lab}{name} = newLabel();
 677         } else {
 678             $h = $Label{$lab}{name};
 679         }
 680         push( @{$Label{$lab}{used}}, $fl );
 681     } else {
 682         $h = 'EOS';
 683     }
 684     $opcode =~ s/XXX/$h/e;
 685     Emit( $addr1, $addr2, $negated, $opcode, '', $fl );
 686 }
 687
 688 # Change (c command): is special due to range end watching
 689 #
 690 sub Change($$$$$$){
 691     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 692     my $kwd = $negated ? 'unless' : 'if';
 693     if( defined( $addr2 ) ){
 694         $addr1 .= $addr2 =~ /^\d+$/ ? "..$addr2" : "...$addr2";
 695         if( ! $negated ){
 696             $addr1  = '$icnt = ('.$addr1.')';
 697             $opcode = 'if( $icnt =~ /E0$/ )' . $opcode;
 698         }
 699     } else {
 700         $addr1 .= ' == $.' if $addr1 =~ /^\d+$/;
 701     }
 702     safeHere( \$opcode, \$arg );
 703     $Code .= "$kwd( $addr1 ){\n  $opcode$arg}\n";
 704     0;
 705 }
 706
 707
 708 # Comment (# command): A no-op. Who would've thought that!
 709 #
 710 sub Comment($$$$$$){
 711     my( $addr1, $addr2, $negated, $opcode, $arg, $fl ) = @_;
 712 ### $Code .= "# $arg\n";
 713     0;
 714 }
 715
 716
 717 sub stripRegex($$){
 718     my( $del, $sref ) = @_;
 719     my $regex = $del;
 720     print "stripRegex:$del:$$sref:\n" if $useDEBUG;
 721     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E(\s*)}{}s ){
 722         my $sl = $2;
 723         $regex .= $1.$sl.$del;
 724         if( length( $sl ) % 2 == 0 ){
 725             return $regex;
 726         }
 727         $regex .= $3;
 728     }
 729     undef();
 730 }
 731
 732 # stripTrans: take a <del> terminated string from y command
 733 #   honoring and cleaning up of \-escaped <del>'s
 734 #
 735 sub stripTrans($$){
 736     my( $del, $sref ) = @_;
 737     my $t = '';
 738     print "stripTrans:$del:$$sref:\n" if $useDEBUG;
 739     while( $$sref =~ s{^(.*?)(\\*)\Q$del\E}{}s ){
 740         my $sl = $2;
 741         $t .= $1;
 742         if( length( $sl ) % 2 == 0 ){
 743             $t .= $sl;
 744             $t =~ s/\\\\/\\/g;
 745             return $t;
 746         }
 747         chop( $sl );
 748         $t .= $sl.$del.$3;
 749     }
 750     undef();
 751 }
 752
 753 # makey - construct Perl y/// from sed y///
 754 #
 755 sub makey($$$){
 756     my( $fr, $to, $fl ) = @_;
 757     my $error = 0;
 758
 759     # Ensure that any '-' is up front.
 760     # Diagnose duplicate contradicting mappings
 761     my %tr;
 762     for( my $i = 0; $i < length($fr); $i++ ){
 763         my $fc = substr($fr,$i,1);
 764         my $tc = substr($to,$i,1);
 765         if( exists( $tr{$fc} ) && $tr{$fc} ne $tc ){
 766             Warn( "ambiguos translation for character `$fc' in `y' command",
 767                   $fl );
 768             $error++;
 769         }
 770         $tr{$fc} = $tc;
 771     }
 772     $fr = $to = '';
 773     if( exists( $tr{'-'} ) ){
 774         ( $fr, $to ) = ( '-', $tr{'-'} );
 775         delete( $tr{'-'} );
 776     } else {
 777         $fr = $to = '';
 778     }
 779     # might just as well sort it...
 780     for my $fc ( sort keys( %tr ) ){
 781         $fr .= $fc;
 782         $to .= $tr{$fc};
 783     }
 784     # make embedded delimiters and newlines safe
 785     $fr =~ s/([{}])/\$1/g;
 786     $to =~ s/([{}])/\$1/g;
 787     $fr =~ s/\n/\\n/g;
 788     $to =~ s/\n/\\n/g;
 789     return $error ? undef() : "{ y{$fr}{$to}; }";
 790 }
 791
 792 ######
 793 # makes - construct Perl s/// from sed s///
 794 #
 795 sub makes($$$$$$$){
 796     my( $regex, $subst, $path, $global, $print, $nmatch, $fl ) = @_;
 797
 798     # make embedded newlines safe
 799     $regex =~ s/\n/\\n/g;
 800     $subst =~ s/\n/\\n/g;
 801
 802     my $code;
 803     # n-th occurrence
 804     #
 805     if( length( $nmatch ) ){
 806         $code = <<TheEnd;
 807 { \$n = $nmatch;
 808   while( --\$n && ( \$s = m ${regex}g ) ){}
 809   \$s = ( substr( \$_, pos() ) =~ s ${regex}${subst}s ) if \$s;
 810   \$CondReg ||= \$s;
 811 TheEnd
 812     } else {
 813         $code = <<TheEnd;
 814 { \$s = s ${regex}${subst}s${global};
 815   \$CondReg ||= \$s;
 816 TheEnd
 817     }
 818     if( $print ){
 819         $code .= '  print $_, "\n" if $s;'."\n";
 820     }
 821     if( defined( $path ) ){
 822         $wFiles{$path} = '';
 823         $code .= " _w( '$path' ) if \$s;\n";
 824     }
 825     $code .= "}";
 826 }
 827
 828 =head1 BASIC REGULAR EXPRESSIONS
 829
 830 A I<Basic Regular Expression> (BRE), as defined in POSIX 1003.2, consists
 831 of I<atoms>, for matching parts of a string, and I<bounds>, specifying
 832 repetitions of a preceding atom.
 833
 834 =head2 Atoms
 835
 836 The possible atoms of a BRE are: B<.>, matching any single character;
 837 B<^> and B<$>, matching the null string at the beginning or end
 838 of a string, respectively; a I<bracket expressions>, enclosed
 839 in B<[> and B<]> (see below); and any single character with no
 840 other significance (matching that character). A B<\> before one
 841 of: B<.>, B<^>, B<$>, B<[>, B<*>, B<\>, matching the character
 842 after the backslash. A sequence of atoms enclosed in B<\(> and B<\)>
 843 becomes an atom and establishes the target for a I<backreference>,
 844 consisting of the substring that actually matches the enclosed atoms.
 845 Finally, B<\> followed by one of the digits B<0> through B<9> is a
 846 backreference.
 847
 848 A B<^> that is not first, or a B<$> that is not last does not have
 849 a special significance and need not be preceded by a backslash to
 850 become literal. The same is true for a B<]>, that does not terminate
 851 a bracket expression.
 852
 853 An unescaped backslash cannot be last in a BRE.
 854
 855 =head2 Bounds
 856
 857 The BRE bounds are: B<*>, specifying 0 or more matches of the preceding
 858 atom; B<\{>I<count>B<\}>, specifying that many repetitions;
 859 B<\{>I<minimum>B<,\}>, giving a lower limit; and
 860 B<\{>I<minimum>B<,>I<maximum>B<\}> finally defines a lower and upper
 861 bound.
 862
 863 A bound appearing as the first item in a BRE is taken literally.
 864
 865 =head2 Bracket Expressions
 866
 867 A I<bracket expression> is a list of characters, character ranges
 868 and character classes enclosed in B<[> and B<]> and matches any
 869 single character from the represented set of characters.
 870
 871 A character range is written as two characters separated by B<-> and
 872 represents all characters (according to the character collating sequence)
 873 that are not less than the first and not greater than the second.
 874 (Ranges are very collating-sequence-dependent, and portable programs
 875 should avoid relying on them.)
 876
 877 A character class is one of the class names
 878
 879    alnum     digit     punct
 880    alpha     graph     space
 881    blank     lower     upper
 882    cntrl     print     xdigit
 883
 884 enclosed in B<[:> and B<:]> and represents the set of characters
 885 as defined in ctype(3).
 886
 887 If the first character after B<[> is B<^>, the sense of matching is
 888 inverted.
 889
 890 To include a literal `C<^>', place it anywhere else but first. To
 891 include a literal 'C<]>' place it first or immediately after an
 892 initial B<^>. To include a literal `C<->' make it the first (or
 893 second after B<^>) or last character, or the second endpoint of
 894 a range.
 895
 896 The special bracket expression constructs C<[[:E<lt>:]]> and C<[[:E<gt>:]]>
 897 match the null string at the beginning and end of a word respectively.
 898 (Note that neither is identical to Perl's `\b' atom.)
 899
 900 =head2 Additional Atoms
 901
 902 Since some sed implementations provide additional regular expression
 903 atoms (not defined in POSIX 1003.2), B<psed> is capable of translating
 904 the following backslash escapes:
 905
 906 =over 4
 907
 908 =item B<\E<lt>> This is the same as C<[[:E<gt>:]]>.
 909
 910 =item B<\E<gt>> This is the same as C<[[:E<lt>:]]>.
 911
 912 =item B<\w> This is an abbreviation for C<[[:alnum:]_]>.
 913
 914 =item B<\W> This is an abbreviation for C<[^[:alnum:]_]>.
 915
 916 =item B<\y> Match the empty string at a word boundary.
 917
 918 =item B<\B> Match the empty string between any two either word or non-word characters.
 919
 920 =back
 921
 922 To enable this feature, the environment variable PSEDEXTBRE must be set
 923 to a string containing the requested characters, e.g.:
 924 C<PSEDEXTBRE='E<lt>E<gt>wW'>.
 925
 926 =cut
 927
 928 #####
 929 # bre2p - convert BRE to Perl RE
 930 #
 931 sub peek(\$$){
 932     my( $pref, $ic ) = @_;
 933     $ic < length($$pref)-1 ? substr( $$pref, $ic+1, 1 ) : '';
 934 }
 935
 936 sub bre2p($$$){
 937     my( $del, $pat, $fl ) = @_;
 938     my $led = $del;
 939     $led =~ tr/{([</})]>/;
 940     $led = '' if $led eq $del;
 941
 942     $pat = substr( $pat, 1, length($pat) - 2 );
 943     my $res = '';
 944     my $bracklev = 0;
 945     my $backref  = 0;
 946     my $parlev = 0;
 947     for( my $ic = 0; $ic < length( $pat ); $ic++ ){
 948         my $c = substr( $pat, $ic, 1 );
 949         if( $c eq '\\' ){
 950             ### backslash escapes
 951             my $nc = peek($pat,$ic);
 952             if( $nc eq '' ){
 953                 Warn( "`\\' cannot be last in pattern", $fl );
 954                 return undef();
 955             }
 956             $ic++;
 957             if( $nc eq $del ){ ## \<pattern del> => \<pattern del>
 958                 $res .= "\\$del";
 959
 960             } elsif( $nc =~ /([[.*\\n])/ ){
 961                 ## check for \-escaped magics and \n:
 962                 ## \[ \. \* \\ \n stay as they are
 963                 $res .= '\\'.$nc;
 964
 965             } elsif( $nc eq '(' ){ ## \( => (
 966                 $parlev++;
 967                 $res .= '(';
 968
 969             } elsif( $nc eq ')' ){ ## \) => )
 970                 $parlev--;
 971                 $backref++;
 972                 if( $parlev < 0 ){
 973                     Warn( "unmatched `\\)'", $fl );
 974                     return undef();
 975                 }
 976                 $res .= ')';
 977
 978             } elsif( $nc eq '{' ){ ## repetition factor \{<i>[,[<j>]]\}
 979                 my $endpos = index( $pat, '\\}', $ic );
 980                 if( $endpos < 0 ){
 981                     Warn( "unmatched `\\{'", $fl );
 982                     return undef();
 983                 }
 984                 my $rep = substr( $pat, $ic+1, $endpos-($ic+1) );
 985                 $ic = $endpos + 1;
 986
 987                 if( $res =~ /^\^?$/ ){
 988                     $res .= "\\{$rep\}";
 989                 } elsif( $rep =~ /^(\d+)(,?)(\d*)?$/ ){
 990                     my $min = $1;
 991                     my $com = $2 || '';
 992                     my $max = $3;
 993                     if( length( $max ) ){
 994                         if( $max < $min ){
 995                             Warn( "maximum less than minimum in `\\{$rep\\}'",
 996                                   $fl );
 997                             return undef();
 998                         }
 999                     } else {
1000                         $max = '';
1001                     }
1002                     # simplify some
1003                     if( $min == 0 && $max eq '1' ){
1004                         $res .= '?';
1005                     } elsif( $min == 1 && "$com$max" eq ',' ){
1006                         $res .= '+';
1007                     } elsif( $min == 0 && "$com$max" eq ',' ){
1008                         $res .= '*';
1009                     } else {
1010                         $res .= "{$min$com$max}";
1011                     }
1012                 } else {
1013                     Warn( "invalid repeat clause `\\{$rep\\}'", $fl );
1014                     return undef();
1015                 }
1016
1017             } elsif( $nc =~ /^[1-9]$/ ){
1018                 ## \1 .. \9 => \1 .. \9, but check for a following digit
1019                 if( $nc > $backref ){
1020                     Warn( "invalid backreference ($nc)", $fl );
1021                     return undef();
1022                 }
1023                 $res .= "\\$nc";
1024                 if( peek($pat,$ic) =~ /[0-9]/ ){
1025                     $res .= '(?:)';
1026                 }
1027
1028             } elsif( $useEXTBRE && ( $nc =~ /[$useEXTBRE]/ ) ){
1029                 ## extensions - at most <>wWyB - not in POSIX
1030                 if(      $nc eq '<' ){ ## \< => \b(?=\w), be precise
1031                     $res .= '\\b(?<=\\W)';
1032                 } elsif( $nc eq '>' ){ ## \> => \b(?=\W), be precise
1033                     $res .= '\\b(?=\\W)';
1034                 } elsif( $nc eq 'y' ){ ## \y => \b
1035                     $res .= '\\b';
1036                 } else {               ## \B, \w, \W remain the same
1037                     $res .= "\\$nc";
1038                 }
1039             } elsif( $nc eq $led ){
1040                 ## \<closing bracketing-delimiter> - keep '\'
1041                 $res .= "\\$nc";
1042
1043             } else { ## \ <char> => <char> ("as if `\' were not present")
1044                 $res .= $nc;
1045             }
1046
1047         } elsif( $c eq '.' ){ ## . => .
1048             $res .= $c;
1049
1050         } elsif( $c eq '*' ){ ## * => * but \* if there's nothing preceding it
1051             if( $res =~ /^\^?$/ ){
1052                 $res .= '\\*';
1053             } elsif( substr( $res, -1, 1 ) ne '*' ){
1054                 $res .= $c;
1055             }
1056
1057         } elsif( $c eq '[' ){
1058             ## parse []: [^...] [^]...] [-...]
1059             my $add = '[';
1060             if( peek($pat,$ic) eq '^' ){
1061                 $ic++;
1062                 $add .= '^';
1063             }
1064             my $nc = peek($pat,$ic);
1065             if( $nc eq ']' || $nc eq '-' ){
1066                 $add .= $nc;
1067                 $ic++;
1068             }
1069             # check that [ is not trailing
1070             if( $ic >= length( $pat ) - 1 ){
1071                 Warn( "unmatched `['", $fl );
1072                 return undef();
1073             }
1074             # look for [:...:] and x-y
1075             my $rstr = substr( $pat, $ic+1 );
1076             if( $rstr =~ /^((?:\[:\(\w+|[><]\):\]|[^]-](?:-[^]])?)*)/ ){
1077                 my $cnt = $1;
1078                 $ic += length( $cnt );
1079                 $cnt =~ s/([\\\$])/\\$1/g; # `\', `$' are magic in Perl []
1080                 # try some simplifications
1081                 my $red = $cnt;
1082                 if( $red =~ s/0-9// ){
1083                     $cnt = $red.'\d';
1084                     if( $red =~ s/A-Z// && $red =~ s/a-z// && $red =~ s/_// ){
1085                         $cnt = $red.'\w';
1086                     }
1087                 }
1088                 $add .= $cnt;
1089
1090                 # POSIX 1003.2 has this (optional) for begin/end word
1091                 $add = '\\b(?=\\W)'  if $add eq '[[:<:]]';
1092                 $add = '\\b(?<=\\W)' if $add eq '[[:>:]]';
1093
1094             }
1095
1096             ## may have a trailing `-' before `]'
1097             if( $ic < length($pat) - 1 &&
1098                 substr( $pat, $ic+1 ) =~ /^(-?])/ ){
1099                 $ic += length( $1 );
1100                 $add .= $1;
1101                 # another simplification
1102                 $add =~ s/^\[(\^?)(\\[dw])]$/ $1 eq '^' ? uc($2) : $2 /e;
1103                 $res .= $add;
1104             } else {
1105                 Warn( "unmatched `['", $fl );
1106                 return undef();
1107             }
1108
1109         } elsif( $c eq $led ){ ## unescaped <closing bracketing-delimiter>
1110             $res .= "\\$c";
1111
1112         } elsif( $c eq ']' ){ ## unmatched ] is not magic
1113             $res .= ']';
1114
1115         } elsif( $c =~ /[|+?{}()]/ ){ ## not magic in BRE, but in Perl: \-quote
1116             $res .= "\\$c";
1117
1118         } elsif( $c eq '^' ){ ## not magic unless 1st, but in Perl: \-quote
1119             $res .= length( $res ) ? '\\^' : '^';
1120
1121         } elsif( $c eq '$' ){ ## not magic unless last, but in Perl: \-quote
1122             $res .= $ic == length( $pat ) - 1 ? '$' : '\\$';
1123
1124         } else {
1125             $res .= $c;
1126         }
1127     }
1128
1129     if( $parlev ){
1130        Warn( "unmatched `\\('", $fl );
1131        return undef();
1132     }
1133
1134     # final cleanup: eliminate raw HTs
1135     $res =~ s/\t/\\t/g;
1136     return $del . $res . ( $led ? $led : $del );
1137 }
1138
1139
1140 #####
1141 # sub2p - convert sed substitution to Perl substitution
1142 #
1143 sub sub2p($$$){
1144     my( $del, $subst, $fl ) = @_;
1145     my $led = $del;
1146     $led =~ tr/{([</})]>/;
1147     $led = '' if $led eq $del;
1148
1149     $subst = substr( $subst, 1, length($subst) - 2 );
1150     my $res = '';
1151
1152     for( my $ic = 0; $ic < length( $subst ); $ic++ ){
1153         my $c = substr( $subst, $ic, 1 );
1154         if( $c eq '\\' ){
1155             ### backslash escapes
1156             my $nc = peek($subst,$ic);
1157             if( $nc eq '' ){
1158                 Warn( "`\\' cannot be last in substitution", $fl );
1159                 return undef();
1160             }
1161             $ic++;
1162             if( $nc =~ /[\\$del$led]/ ){ ## \ and delimiter
1163                 $res .= '\\' . $nc;
1164             } elsif( $nc =~ /[1-9]/ ){ ## \1 - \9 => ${1} - ${9}
1165                 $res .= '${' . $nc . '}';
1166             } else { ## everything else (includes &): omit \
1167                 $res .= $nc;
1168             }
1169         } elsif( $c eq '&' ){ ## & => $&
1170             $res .= '$&';
1171         } elsif( $c =~ /[\$\@$led]/ ){ ## magic in Perl's substitution string
1172             $res .= '\\' . $c;
1173         } else {
1174             $res .= $c;
1175         }
1176     }
1177
1178     # final cleanup: eliminate raw HTs
1179     $res =~ s/\t/\\t/g;
1180     return ( $led ? $del : $led ) . $res . ( $led ? $led : $del );
1181 }
1182
1183
1184 sub Parse(){
1185     my $error = 0;
1186     my( $pdef, $pfil, $plin );
1187     for( my $icom = 0; $icom < @Commands; $icom++ ){
1188         my $cmd = $Commands[$icom];
1189         print "Parse:$cmd:\n" if $useDEBUG;
1190         $cmd =~ s/^\s+//;
1191         next unless length( $cmd );
1192         my $scom = $icom;
1193         if( exists( $Defined{$icom} ) ){
1194             $pdef = $Defined{$icom};
1195             if( $pdef =~ /^ #(\d+)/ ){
1196                 $pfil = 'expression #';
1197                 $plin = $1;
1198             } else {
1199                 $pfil = "$pdef l.";
1200                 $plin = 1;
1201             }
1202         } else {
1203             $plin++;
1204         }
1205         my $fl = "$pfil$plin";
1206
1207         # insert command as comment in gnerated code
1208         #
1209         $Code .= "# $cmd\n" if $doGenerate;
1210
1211         # The Address(es)
1212         #
1213         my( $negated, $naddr, $addr1, $addr2 );
1214         $naddr = 0;
1215         if(      $cmd =~ s/^(\d+)\s*// ){
1216             $addr1 = "$1"; $naddr++;
1217         } elsif( $cmd =~ s/^\$\s*// ){
1218             $addr1 = 'eofARGV()'; $naddr++;
1219         } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1220             my $del = $1;
1221             my $regex = stripRegex( $del, \$cmd );
1222             if( defined( $regex ) ){
1223                 $addr1 = 'm '.bre2p( $del, $regex, $fl ).'s';
1224                 $naddr++;
1225             } else {
1226                 Warn( "malformed regex, 1st address", $fl );
1227                 $error++;
1228                 next;
1229             }
1230         }
1231         if( defined( $addr1 ) && $cmd =~ s/,\s*// ){
1232             if(      $cmd =~ s/^(\d+)\s*// ){
1233                 $addr2 = "$1"; $naddr++;
1234             } elsif( $cmd =~ s/^\$\s*// ){
1235                 $addr2 = 'eofARGV()'; $naddr++;
1236             } elsif( $cmd =~ s{^(/)}{} || $cmd =~ s{^\\(.)}{} ){
1237                 my $del = $1;
1238                 my $regex = stripRegex( $del, \$cmd );
1239                 if( defined( $regex ) ){
1240                     $addr2 = 'm '. bre2p( $del, $regex, $fl ).'s';
1241                     $naddr++;
1242                 } else {
1243                     Warn( "malformed regex, 2nd address", $fl );
1244                     $error++;
1245                     next;
1246                 }
1247             } else {
1248                 Warn( "invalid address after `,'", $fl );
1249                 $error++;
1250                 next;
1251             }
1252         }
1253
1254         # address modifier `!'
1255         #
1256         $negated = $cmd =~ s/^!\s*//;
1257         if( defined( $addr1 ) ){
1258             print "Parse: addr1=$addr1" if $useDEBUG;
1259             if( defined( $addr2 ) ){
1260                 print ", addr2=$addr2 " if $useDEBUG;
1261                 # both numeric and addr1 > addr2 => eliminate addr2
1262                 undef( $addr2 ) if $addr1 =~ /^\d+$/ &&
1263                                    $addr2 =~ /^\d+$/ && $addr1 > $addr2;
1264             }
1265         }
1266         print 'negated' if $useDEBUG && $negated;
1267         print " command:$cmd\n" if $useDEBUG;
1268
1269         # The Command
1270         #
1271         if( $cmd !~ s/^([:#={}abcdDgGhHilnNpPqrstwxy])\s*// ){
1272             my $h = substr( $cmd, 0, 1 );
1273             Warn( "unknown command `$h'", $fl );
1274             $error++;
1275             next;
1276         }
1277         my $key = $1;
1278
1279         my $tabref = $ComTab{$key};
1280         if( $naddr > $tabref->[0] ){
1281             Warn( "excess address(es)", $fl );
1282             $error++;
1283             next;
1284         }
1285
1286         my $arg = '';
1287         if(      $tabref->[1] eq 'str' ){
1288             # take remainder - don't care if it is empty
1289             $arg = $cmd;
1290             $cmd = '';
1291
1292         } elsif( $tabref->[1] eq 'txt' ){
1293             # multi-line text
1294             my $goon = $cmd =~ /(.*)\\$/;
1295             if( length( $1 ) ){
1296                 Warn( "extra characters after command ($cmd)", $fl );
1297                 $error++;
1298             }
1299             while( $goon ){
1300                 $icom++;
1301                 if( $icom > $#Commands ){
1302                     Warn( "unexpected end of script", $fl );
1303                     $error++;
1304                     last;
1305                 }
1306                 $cmd = $Commands[$icom];
1307                 $Code .= "# $cmd\n" if $doGenerate;
1308                 $goon = $cmd =~ s/\\$//;
1309                 $cmd =~ s/\\(.)/$1/g;
1310                 $arg .= "\n" if length( $arg );
1311                 $arg .= $cmd;
1312             }
1313             $arg .= "\n" if length( $arg );
1314             $cmd = '';
1315
1316         } elsif( $tabref->[1] eq 'sub' ){
1317             # s///
1318             if( ! length( $cmd ) ){
1319                 Warn( "`s' command requires argument", $fl );
1320                 $error++;
1321                 next;
1322             }
1323             if( $cmd =~ s{^([^\\\n])}{} ){
1324                 my $del = $1;
1325                 my $regex = stripRegex( $del, \$cmd );
1326                 if( ! defined( $regex ) ){
1327                     Warn( "malformed regular expression", $fl );
1328                     $error++;
1329                     next;
1330                 }
1331                 $regex = bre2p( $del, $regex, $fl );
1332
1333                 # a trailing \ indicates embedded NL (in replacement string)
1334                 while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1335                     $icom++;
1336                     if( $icom > $#Commands ){
1337                         Warn( "unexpected end of script", $fl );
1338                         $error++;
1339                         last;
1340                     }
1341                     $cmd .= $Commands[$icom];
1342                     $Code .= "# $Commands[$icom]\n" if $doGenerate;
1343                 }
1344
1345                 my $subst = stripRegex( $del, \$cmd );
1346                 if( ! defined( $regex ) ){
1347                     Warn( "malformed substitution expression", $fl );
1348                     $error++;
1349                     next;
1350                 }
1351                 $subst = sub2p( $del, $subst, $fl );
1352
1353                 # parse s/// modifier: g|p|0-9|w <file>
1354                 my( $global, $nmatch, $print, $write ) =
1355                   ( '',      '',      0,      undef );
1356                 while( $cmd =~ s/^([gp0-9])// ){
1357                     $1 eq 'g' ? ( $global = 'g' ) :
1358                     $1 eq 'p' ? ( $print  = $1  ) : ( $nmatch .= $1 );
1359                 }
1360                 $write = $1 if $cmd =~ s/w\s*(.*)$//;
1361                 ### $nmatch =~ s/^(\d)\1*$/$1/; ### may be dangerous?
1362                 if( $global && length( $nmatch ) || length( $nmatch ) > 1 ){
1363                     Warn( "conflicting flags `$global$nmatch'", $fl );
1364                     $error++;
1365                     next;
1366                 }
1367
1368                 $arg = makes( $regex, $subst,
1369                               $write, $global, $print, $nmatch, $fl );
1370                 if( ! defined( $arg ) ){
1371                     $error++;
1372                     next;
1373                 }
1374
1375             } else {
1376                 Warn( "improper delimiter in s command", $fl );
1377                 $error++;
1378                 next;
1379             }
1380
1381         } elsif( $tabref->[1] eq 'tra' ){
1382             # y///
1383             # a trailing \ indicates embedded newline
1384             while( $cmd =~ s/(?<!\\)\\$/\n/ ){
1385                 $icom++;
1386                 if( $icom > $#Commands ){
1387                     Warn( "unexpected end of script", $fl );
1388                     $error++;
1389                     last;
1390                 }
1391                 $cmd .= $Commands[$icom];
1392                 $Code .= "# $Commands[$icom]\n" if $doGenerate;
1393             }
1394             if( ! length( $cmd ) ){
1395                 Warn( "`y' command requires argument", $fl );
1396                 $error++;
1397                 next;
1398             }
1399             my $d = substr( $cmd, 0, 1 ); $cmd = substr( $cmd, 1 );
1400             if( $d eq '\\' ){
1401                 Warn( "`\\' not valid as delimiter in `y' command", $fl );
1402                 $error++;
1403                 next;
1404             }
1405             my $fr = stripTrans( $d, \$cmd );
1406             if( ! defined( $fr ) || ! length( $cmd ) ){
1407                 Warn( "malformed `y' command argument", $fl );
1408                 $error++;
1409                 next;
1410             }
1411             my $to = stripTrans( $d, \$cmd );
1412             if( ! defined( $to ) ){
1413                 Warn( "malformed `y' command argument", $fl );
1414                 $error++;
1415                 next;
1416             }
1417             if( length($fr) != length($to) ){
1418                 Warn( "string lengths in `y' command differ", $fl );
1419                 $error++;
1420                 next;
1421             }
1422             if( ! defined( $arg = makey( $fr, $to, $fl ) ) ){
1423                 $error++;
1424                 next;
1425             }
1426
1427         }
1428
1429         # $cmd must be now empty - exception is {
1430         if( $cmd !~ /^\s*$/ ){
1431             if( $key eq '{' ){
1432                 # dirty hack to process command on '{' line
1433                 $Commands[$icom--] = $cmd;
1434             } else {
1435                 Warn( "extra characters after command ($cmd)", $fl );
1436                 $error++;
1437                 next;
1438             }
1439         }
1440
1441         # Make Code
1442         #
1443         if( &{$tabref->[2]}( $addr1, $addr2, $negated,
1444                              $tabref->[3], $arg, $fl ) ){
1445             $error++;
1446         }
1447     }
1448
1449     while( @BlockStack ){
1450         my $bl = pop( @BlockStack );
1451         Warn( "start of unterminated `{'", $bl );
1452         $error++;
1453     }
1454
1455     for my $lab ( keys( %Label ) ){
1456         if( ! exists( $Label{$lab}{defined} ) ){
1457             for my $used ( @{$Label{$lab}{used}} ){
1458                 Warn( "undefined label `$lab'", $used );
1459                 $error++;
1460             }
1461         }
1462     }
1463
1464     exit( 1 ) if $error;
1465 }
1466
1467
1468 ##############
1469 #### MAIN ####
1470 ##############
1471
1472 sub usage(){
1473     print STDERR "Usage: sed [-an] command [file...]\n";
1474     print STDERR "           [-an] [-e command] [-f script-file] [file...]\n";
1475 }
1476
1477 ###################
1478 # Here we go again...
1479 #
1480 my $expr = 0;
1481 while( @ARGV && $ARGV[0] =~ /^-(.)(.*)$/ ){
1482     my $opt = $1;
1483     my $arg = $2;
1484     shift( @ARGV );
1485     if(      $opt eq 'e' ){
1486         if( length( $arg ) ){
1487             push( @Commands, split( "\n", $arg ) );
1488         } elsif( @ARGV ){
1489             push( @Commands, shift( @ARGV ) );
1490         } else {
1491             Warn( "option -e requires an argument" );
1492             usage();
1493             exit( 1 );
1494         }
1495         $expr++;
1496         $Defined{$#Commands} = " #$expr";
1497         next;
1498     }
1499     if( $opt eq 'f' ){
1500         my $path;
1501         if( length( $arg ) ){
1502             $path = $arg;
1503         } elsif( @ARGV ){
1504             $path = shift( @ARGV );
1505         } else {
1506             Warn( "option -f requires an argument" );
1507             usage();
1508             exit( 1 );
1509         }
1510         my $fst = $#Commands + 1;
1511         open( SCRIPT, "<$path" ) || die( "$0: $path: could not open ($!)\n" );
1512         my $cmd;
1513         while( defined( $cmd = <SCRIPT> ) ){
1514             chomp( $cmd );
1515             push( @Commands, $cmd );
1516         }
1517         close( SCRIPT );
1518         if( $#Commands >= $fst ){
1519             $Defined{$fst} = "$path";
1520         }
1521         next;
1522     }
1523     if( $opt eq '-' && $arg eq '' ){
1524         last;
1525     }
1526     if( $opt eq 'h' || $opt eq '?' ){
1527         usage();
1528         exit( 0 );
1529     }
1530     if( $opt eq 'n' ){
1531         $doAutoPrint = 0;
1532     } elsif( $opt eq 'a' ){
1533         $doOpenWrite = 0;
1534     } else {
1535         Warn( "illegal option `$opt'" );
1536         usage();
1537         exit( 1 );
1538     }
1539     if( length( $arg ) ){
1540         unshift( @ARGV, "-$arg" );
1541     }
1542 }
1543
1544 # A singleton command may be the 1st argument when there are no options.
1545 #
1546 if( @Commands == 0 ){
1547     if( @ARGV == 0 ){
1548         Warn( "no script command given" );
1549         usage();
1550         exit( 1 );
1551     }
1552     push( @Commands, split( "\n", shift( @ARGV ) ) );
1553     $Defined{0} = ' #1';
1554 }
1555
1556 print STDERR "Files: @ARGV\n" if $useDEBUG;
1557
1558 # generate leading code
1559 #
1560     $Code = <<'[TheEnd]';
1561
1562 sub openARGV(){
1563     unshift( @ARGV, '-' ) unless @ARGV;
1564     my $file = shift( @ARGV );
1565     open( ARG, "<$file" )
1566     || die( "$0: can't open $file for reading ($!)\n" );
1567     $isEOF = 0;
1568 }
1569
1570 sub getsARGV(;\$){
1571     my $argref = @_ ? shift() : \$_;
1572     while( $isEOF || ! defined( $$argref = <ARG> ) ){
1573         close( ARG );
1574         return 0 unless @ARGV;
1575         my $file = shift( @ARGV );
1576         open( ARG, "<$file" )
1577         || die( "$0: can't open $file for reading ($!)\n" );
1578         $isEOF = 0;
1579     }
1580     1;
1581 }
1582
1583 sub eofARGV(){
1584     return @ARGV == 0 && ( $isEOF = eof( ARG ) );
1585 }
1586
1587 sub makeHandle($){
1588     my( $path ) = @_;
1589     my $handle;
1590     if( ! exists( $wFiles{$path} ) || $wFiles{$path} eq '' ){
1591         $handle = $wFiles{$path} = gensym();
1592         if( $doOpenWrite ){
1593             if( ! open( $handle, ">$path" ) ){
1594                 die( "$0: can't open $path for writing: ($!)\n" );
1595             }
1596         }
1597     } else {
1598         $handle = $wFiles{$path};
1599     }
1600     return $handle;
1601 }
1602
1603 sub _r($){
1604     my $path = shift();
1605     push( @Q, \$path );
1606 }
1607
1608 sub _l(){
1609     my $h = $_;
1610     my $mcpl = 70;
1611     $h =~ s/\\/\\\\/g;
1612     if( $h =~ /[^[:print:]]/ ){
1613         $h =~ s/\a/\\a/g;
1614         $h =~ s/\f/\\f/g;
1615         $h =~ s/\n/\\n/g;
1616         $h =~ s/\t/\\t/g;
1617         $h =~ s/\r/\\r/g;
1618         $h =~ s/\e/\\e/g;
1619         $h =~ s/([^[:print:]])/sprintf("\\%03o", ord($1))/ge;
1620     }
1621     while( length( $h ) > $mcpl ){
1622         my $l = substr( $h, 0, $mcpl-1 );
1623         $h = substr( $h, $mcpl );
1624         # remove incomplete \-escape from end of line
1625         if( $l =~ s/(?<!\\)(\\[0-7]{0,2})$// ){
1626             $h = $1 . $h;
1627         }
1628         print $l, "\\\n";
1629     }
1630     print "$h\$\n";
1631 }
1632
1633 sub _w($){
1634     my $path   = shift();
1635     my $handle = $wFiles{$path};
1636     if( ! $doOpenWrite &&
1637         ! defined( fileno( $handle ) ) ){
1638         open( $handle, ">$path" )
1639         || die( "$0: $path: cannot open ($!)\n" );
1640     }
1641     print $handle $_, "\n";
1642 }
1643
1644 # condition register test/reset
1645 #
1646 sub _t(){
1647     my $res = $CondReg;
1648     $CondReg = 0;
1649     $res;
1650 }
1651
1652 # printQ
1653 #
1654 sub printQ(){
1655     for my $q ( @Q ){
1656         if( ref( $q ) ){
1657             if( exists( $wFiles{$$q} ) && $wFiles{$$q} ne '' ){
1658                 open( $wFiles{$$q}, ">>$$q" );
1659             }
1660             if( open( RF, "<$$q" ) ){
1661                 my $line;
1662                 while( defined( $line = <RF> ) ){
1663                     print $line;
1664                 }
1665                 close( RF );
1666             }
1667         } else {
1668             print $q;
1669         }
1670     }
1671     undef( @Q );
1672 }
1673
1674 sub Run(){
1675     my( $h, $icnt, $s, $n );
1676     # hack (not unbreakable :-/) to avoid // matching an empty string
1677     my $z = "\000"; $z =~ /$z/;
1678     # Initialize.
1679     openARGV();
1680     $Hold    = '';
1681     $CondReg = 0;
1682     $doPrint = $doAutoPrint;
1683 CYCLE:
1684     while( getsARGV() ){
1685         chomp();
1686         $CondReg = 0;   # cleared on t
1687 BOS:;
1688 [TheEnd]
1689
1690     # parse - avoid opening files when doing s2p
1691     #
1692     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1693       if $doGenerate;
1694     Parse();
1695     ( $svOpenWrite, $doOpenWrite ) = (  $doOpenWrite, $svOpenWrite )
1696       if $doGenerate;
1697
1698     # append trailing code
1699     #
1700     $Code .= <<'[TheEnd]';
1701 EOS:    if( $doPrint ){
1702             print $_, "\n";
1703         } else {
1704             $doPrint = $doAutoPrint;
1705         }
1706         printQ() if @Q;
1707     }
1708
1709     exit( 0 );
1710 }
1711 [TheEnd]
1712
1713 # magic "#n" - same as -n option
1714 #
1715 $doAutoPrint = 0 if substr( $Commands[0], 0, 2 ) eq '#n';
1716
1717 # eval code - check for errors
1718 #
1719 print "Code:\n$Code" if $useDEBUG;
1720 eval $Code;
1721 if( $@ ){
1722     print "Code:\n$Code";
1723     die( "$0: internal error - generated incorrect Perl code: $@\n" );
1724 }
1725
1726 if( $doGenerate ){
1727
1728     # write full Perl program
1729     #
1730
1731     # bang line, declarations
1732     print <<TheEnd;
1733 #!$perlpath -w
1734 eval 'exec $perlpath -S \$0 \${1+"\$@"}'
1735   if 0;
1736 \$0 =~ s/^.*?(\\w+)\$/\$1/;
1737
1738 use strict;
1739 use Symbol;
1740 use vars qw{ \$isEOF \$Hold \%wFiles \@Q \$CondReg
1741              \$doAutoPrint \$doOpenWrite \$doPrint };
1742 \$doAutoPrint = $doAutoPrint;
1743 \$doOpenWrite = $doOpenWrite;
1744 TheEnd
1745
1746     my $wf = "'" . join( "', '",  keys( %wFiles ) ) . "'";
1747     if( $wf ne "''" ){
1748         print <<TheEnd;
1749 sub makeHandle(\$);
1750 for my \$p ( $wf ){
1751    exit( 1 ) unless makeHandle( \$p );
1752 }
1753 TheEnd
1754    }
1755
1756    print $Code;
1757    print "&Run()\n";
1758    exit( 0 );
1759
1760 } else {
1761
1762     # execute: make handles (and optionally open) all w files; run!
1763
1764     for my $p ( keys( %wFiles ) ){
1765         exit( 1 ) unless makeHandle( $p );
1766     }
1767     &Run();
1768 }
1769
1770
1771 =head1 ENVIRONMENT
1772
1773 The environment variable C<PSEDEXTBRE> may be set to extend BREs.
1774 See L<"Additional Atoms">.
1775
1776 =head1 DIAGNOSTICS
1777
1778 =over 4
1779
1780 =item ambiguos translation for character `%s' in `y' command
1781
1782 The indicated character appears twice, with different translations.
1783
1784 =item `[' cannot be last in pattern
1785
1786 A `[' in a BRE indicates the beginning of a I<bracket expression>.
1787
1788 =item `\' cannot be last in pattern
1789
1790 A `\' in a BRE is used to make the subsequent character literal.
1791
1792 =item `\' cannot be last in substitution
1793
1794 A `\' in a subsitution string is used to make the subsequent character literal.
1795
1796 =item conflicting flags `%s'
1797
1798 In an B<s> command, either the `g' flag and an n-th occurrence flag, or
1799 multiple n-th occurrence flags are specified. Note that only the digits
1800 `1' through `9' are permitted.
1801
1802 =item duplicate label %s (first defined at %s)
1803
1804 =item excess address(es)
1805
1806 The command has more than the permitted number of addresses.
1807
1808 =item extra characters after command (%s)
1809
1810 =item illegal option `%s'
1811
1812 =item improper delimiter in s command
1813
1814 The BRE and substitution may not be delimited with `\' or newline.
1815
1816 =item invalid address after `,'
1817
1818 =item invalid backreference (%s)
1819
1820 The specified backreference number exceeds the number of backreferences
1821 in the BRE.
1822
1823 =item invalid repeat clause `\{%s\}'
1824
1825 The repeat clause does not contain a valid integer value, or pair of
1826 values.
1827
1828 =item malformed regex, 1st address
1829
1830 =item malformed regex, 2nd address
1831
1832 =item malformed regular expression
1833
1834 =item malformed substitution expression
1835
1836 =item malformed `y' command argument
1837
1838 The first or second string of a B<y> command  is syntactically incorrect.
1839
1840 =item maximum less than minimum in `\{%s\}'
1841
1842 =item no script command given
1843
1844 There must be at least one B<-e> or one B<-f> option specifying a
1845 script or script file.
1846
1847 =item `\' not valid as delimiter in `y' command
1848
1849 =item option -e requires an argument
1850
1851 =item option -f requires an argument
1852
1853 =item `s' command requires argument
1854
1855 =item start of unterminated `{'
1856
1857 =item string lengths in `y' command differ
1858
1859 The translation table strings in a B<y> commanf must have equal lengths.
1860
1861 =item undefined label `%s'
1862
1863 =item unexpected `}'
1864
1865 A B<}> command without a preceding B<{> command was encountered.
1866
1867 =item unexpected end of script
1868
1869 The end of the script was reached although a text line after a
1870 B<a>, B<c> or B<i> command indicated another line.
1871
1872 =item unknown command `%s'
1873
1874 =item unterminated `['
1875
1876 A BRE contains an unterminated bracket expression.
1877
1878 =item unterminated `\('
1879
1880 A BRE contains an unterminated backreference.
1881
1882 =item `\{' without closing `\}'
1883
1884 A BRE contains an unterminated bounds specification.
1885
1886 =item `\)' without preceding `\('
1887
1888 =item `y' command requires argument
1889
1890 =back
1891
1892 =head1 EXAMPLE
1893
1894 The basic material for the preceding section was generated by running
1895 the sed script
1896
1897    #no autoprint
1898    s/^.*Warn( *"\([^"]*\)".*$/\1/
1899    t process
1900    b
1901    :process
1902    s/$!/%s/g
1903    s/$[_[:alnum:]]\{1,\}/%s/g
1904    s/\\\\/\\/g
1905    s/^/=item /
1906    p
1907
1908 on the program's own text, and piping the output into C<sort -u>.
1909
1910
1911 =head1 SED SCRIPT TRANSLATION
1912
1913 If this program is invoked with the name F<s2p> it will act as a
1914 sed-to-Perl translator. After option processing (all other
1915 arguments are ignored), a Perl program is printed on standard
1916 output, which will process the input stream (as read from all
1917 arguments) in the way defined by the sed script and the option setting
1918 used for the translation.
1919
1920 =head1 SEE ALSO
1921
1922 perl(1), re_format(7)
1923
1924 =head1 BUGS
1925
1926 The B<l> command will show escape characters (ESC) as `C<\e>', but
1927 a vertical tab (VT) in octal.
1928
1929 Trailing spaces are truncated from labels in B<:>, B<t> and B<b> commands.
1930
1931 The meaning of an empty regular expression (`C<//>'), as defined by B<sed>,
1932 is "the last pattern used, at run time". This deviates from the Perl
1933 interpretation, which will re-use the "last last successfully executed
1934 regular expression". Since keeping track of pattern usage would create
1935 terribly cluttered code, and differences would only appear in obscure
1936 context (where other B<sed> implementations appear to deviate, too),
1937 the Perl semantics was adopted. Note that common usage of this feature,
1938 such as in C</abc/s//xyz/>, will work as expected.
1939
1940 Collating elements (of bracket expressions in BREs) are not implemented.
1941
1942 =head1 STANDARDS
1943
1944 This B<sed> implementation conforms to the IEEE Std1003.2-1992 ("POSIX.2")
1945 definition of B<sed>, and is compatible with the I<OpenBSD>
1946 implementation, except where otherwise noted (see L<"BUGS">).
1947
1948 =head1 AUTHOR
1949
1950 This Perl implementation of I<sed> was written by Wolfgang Laun,
1951 I<Wolfgang.Laun@alcatel.at>.
1952
1953 =head1 COPYRIGHT and LICENSE
1954
1955 This program is free and open software. You may use, modify,
1956 distribute, and sell this program (and any modified variants) in any
1957 way you wish, provided you do not restrict others from doing the same.
1958
1959 =cut
1960
1961 !NO!SUBS!
1962
1963 close OUT or die "Can't close $file: $!";
1964 chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
1965 unlink 'psed';
1966 print "Linking s2p to psed.\n";
1967 if (defined $Config{d_link}) {
1968   link 's2p', 'psed';
1969 } else {
1970   unshift @INC, '../lib';
1971   require File::Copy;
1972   File::Copy::syscopy('s2p', 'psed');
1973 }
1974 exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
1975 chdir $origdir;