regen/regcomp.pl

   1 #!/usr/bin/perl -w
   2 #
   3 #
   4 # Regenerate (overwriting only if changed):
   5 #
   6 #    pod/perldebguts.pod
   7 #    regnodes.h
   8 #
   9 # from information stored in
  10 #
  11 #    regcomp.sym
  12 #    op_reg_common.h
  13 #    regexp.h
  14 #
  15 # pod/perldebguts.pod is not completely regenerated.  Only the table of
  16 # regexp nodes is replaced; other parts remain unchanged.
  17 #
  18 # Accepts the standard regen_lib -q and -v args.
  19 #
  20 # This script is normally invoked from regen.pl.
  21 #
  22 # F<regcomp.sym> defines the opcodes and states used in the regex
  23 # engine, it also includes documentation on the opcodes. This script
  24 # parses those definitions out and turns them into typedefs, defines,
  25 # and data structures, and maybe even code which the regex engine can
  26 # use to operate.
  27 #
  28 # F<regexp.h> and op_reg_common.h contain defines C<RXf_xxx> and
  29 # C<PREGf_xxx> that are used in flags in our code. These defines are
  30 # parsed out and data structures are created to allow the debug mode of
  31 # the regex engine to show things such as which flags were set during
  32 # compilation. In some cases we transform the C code in the header files
  33 # into perl code which we execute to C<eval()> the contents. For instance
  34 # in a situation like this:
  35 #
  36 #   #define RXf_X 0x1   /* the X mode */
  37 #   #define RXf_Y 0x2   /* the Y mode */
  38 #   #define RXf_Z (X|Y) /* the Z mode */
  39 #
  40 # this script might end up eval()ing something like C<0x1> and then
  41 # C<0x2> and then C<(0x1|0x2)> the results of which it then might use in
  42 # constructing a data structure, or pod in perldebguts, or a comment in
  43 # C<regnodes.h>. It also would separate out the "X", "Y", and "Z" and
  44 # use them, and would also use the data in the line comment if present.
  45 #
  46 # If you compile a regex under perl -Mre=Debug,ALL you can see much
  47 # of the content that this file generates and parses out of its input
  48 # files.
  49
  50 BEGIN {
  51     # Get function prototypes
  52     require './regen/regen_lib.pl';
  53 }
  54
  55 use strict;
  56
  57 # NOTE I don't think anyone actually knows what all of these properties mean,
  58 # and I suspect some of them are outright unused. This is a first attempt to
  59 # clean up the generation so maybe one day we can move to something more self
  60 # documenting. (One might argue that an array of hashes of properties would
  61 # be easier to use.)
  62 #
  63 # Why we use the term regnode and nodes, and not say, opcodes, I am not sure.
  64
  65 # General thoughts:
  66 # 1. We use a single continuum to represent both opcodes and states,
  67 #    and in regexec.c we switch on the combined set.
  68 # 2. Opcodes have more information associated to them, states are simpler,
  69 #    basically just an identifier/number that can be used to switch within
  70 #    the state machine.
  71 # 3. Some opcode are order dependent.
  72 # 4. Output files often use "tricks" to reduce diff effects. Some of what
  73 #    we do below is more clumsy looking than it could be because of this.
  74
  75 # Op/state properties:
  76 #
  77 # Property      In      Descr
  78 # ----------------------------------------------------------------------------
  79 # name          Both    Name of op/state
  80 # id            Both    integer value for this opcode/state
  81 # optype        Both    Either 'op' or 'state'
  82 # line_num      Both    line_num number of the input file for this item.
  83 # type          Op      Type of node (aka regnode_kind)
  84 # code          Op      Meta about the node, used to detect variable length nodes
  85 # suffix        Op      which regnode struct this uses, so if this is '1', it
  86 #                       uses 'struct regnode_1'
  87 # flags         Op      S for simple; V for varies
  88 # longj         Op      Boolean as to if this node is a longjump
  89 # comment       Both    Comment about node, if any.  Placed in perlredebguts
  90 #                       as its description
  91 # pod_comment   Both    Special comments for pod output (preceding lines in def)
  92 #                       Such lines begin with '#*'
  93
  94 # Global State
  95 my @all;    # all opcodes/state
  96 my %all;    # hash of all opcode/state names
  97
  98 my @ops;    # array of just opcodes
  99 my @states; # array of just states
 100
 101 my $longest_name_length= 0; # track lengths of names for nicer reports
 102 my (%type_alias);           # map the type (??)
 103
 104 # register a newly constructed node into our state tables.
 105 # ensures that we have no name collisions (on name anyway),
 106 # and issues the "id" for the node.
 107 sub register_node {
 108     my ($node)= @_;
 109
 110     if ( $all{ $node->{name} } ) {
 111         die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} "
 112             . "previously defined on line $all{ $node->{name} }{line_num}\n";
 113     } elsif (!$node->{optype}) {
 114         die "must have an optype in node ", Dumper($node);
 115     } elsif ($node->{optype} eq "op") {
 116         push @ops, $node;
 117     } elsif ($node->{optype} eq "state") {
 118         push @states, $node;
 119     } else {
 120         die "Uknown optype '$node->{optype}' in ", Dumper($node);
 121     }
 122     $node->{id}= 0 + @all;
 123     push @all, $node;
 124     $all{ $node->{name} }= $node;
 125
 126     if ($node->{longj} && $node->{longj} != 1) {
 127         die "longj field must be in [01] if present in ", Dumper($node);
 128     }
 129
 130 }
 131
 132 # Parse and add an opcode definition to the global state.
 133 # What an opcode definition looks like is given in regcomp.sym.
 134 #
 135 # Not every opcode definition has all of the components. We should maybe make
 136 # this nicer/easier to read in the future. Also note that the above is tab
 137 # sensitive.
 138
 139 # Special comments for an entry precede it, and begin with '#*' and are placed
 140 # in the generated pod file just before the entry.
 141
 142 sub parse_opcode_def {
 143     my ( $text, $line_num, $pod_comment )= @_;
 144     my $node= {
 145         line_num    => $line_num,
 146         pod_comment => $pod_comment,
 147         optype      => "op",
 148     };
 149
 150     # first split the line into three, the initial NAME, a middle part
 151     # that we call "desc" which contains various (not well documented) things,
 152     # and a comment section.
 153     @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/
 154         or die "Failed to match $_";
 155
 156     # the content of the "desc" field from the first step is extracted here:
 157     @{$node}{qw(type code suffix flags longj)}= split /[,\s]\s*/, $node->{desc};
 158
 159     defined $node->{$_} or $node->{$_} = ""
 160         for qw(type code suffix flags longj);
 161
 162     register_node($node); # has to be before the type_alias code below
 163
 164     if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) {
 165
 166         #warn "Regop type '$node->{type}' from regcomp.sym line $line_num"
 167         #     ." is not an existing regop, and will be aliased to $node->{name}\n"
 168         #    if -t STDERR;
 169         $type_alias{ $node->{type} }= $node->{name};
 170     }
 171
 172     $longest_name_length= length $node->{name}
 173         if length $node->{name} > $longest_name_length;
 174 }
 175
 176 # parse out a state definition and add the resulting data
 177 # into the global state. may create multiple new states from
 178 # a single definition (this is part of the point).
 179 # Format for states:
 180 # REGOP \t typelist [ \t typelist]
 181 # typelist= namelist
 182 #         = namelist:FAIL
 183 #         = name:count
 184 # Eg:
 185 # WHILEM          A_pre,A_min,A_max,B_min,B_max:FAIL
 186 # BRANCH          next:FAIL
 187 # CURLYM          A,B:FAIL
 188 #
 189 # The CURLYM definition would create the states:
 190 # CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail
 191 sub parse_state_def {
 192     my ( $text, $line_num, $pod_comment )= @_;
 193     my ( $type, @lists )= split /\s+/, $text;
 194     die "No list? $type" if !@lists;
 195     foreach my $list (@lists) {
 196         my ( $names, $special )= split /:/, $list, 2;
 197         $special ||= "";
 198         foreach my $name ( split /,/, $names ) {
 199             my $real=
 200                 $name eq 'resume'
 201                 ? "resume_$type"
 202                 : "${type}_$name";
 203             my @suffix;
 204             if ( !$special ) {
 205                 @suffix= ("");
 206             }
 207             elsif ( $special =~ /\d/ ) {
 208                 @suffix= ( 1 .. $special );
 209             }
 210             elsif ( $special eq 'FAIL' ) {
 211                 @suffix= ( "", "_fail" );
 212             }
 213             else {
 214                 die "unknown :type ':$special'";
 215             }
 216             foreach my $suffix (@suffix) {
 217                 my $node= {
 218                     name        => "$real$suffix",
 219                     optype      => "state",
 220                     type        => $type || "",
 221                     comment     => "state for $type",
 222                     line_num    => $line_num,
 223                 };
 224                 register_node($node);
 225             }
 226         }
 227     }
 228 }
 229
 230 sub process_flags {
 231     my ( $flag, $varname, $comment )= @_;
 232     $comment= '' unless defined $comment;
 233
 234     my @selected;
 235     my $bitmap= '';
 236     for my $node (@ops) {
 237         my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0;
 238
 239         # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic
 240         # ops in the C code.
 241         my $current= do {
 242             no warnings;
 243             ord substr $bitmap, ( $node->{id} >> 3 );
 244         };
 245         substr( $bitmap, ( $node->{id} >> 3 ), 1 )=
 246             chr( $current | ( $set << ( $node->{id} & 7 ) ) );
 247
 248         push @selected, $node->{name} if $set;
 249     }
 250     my $out_string= join ', ', @selected, 0;
 251     $out_string =~ s/(.{1,70},) /$1\n    /g;
 252
 253     my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap;
 254
 255     return $comment . <<"EOP";
 256 #define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7)))
 257
 258 #ifndef DOINIT
 259 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__;
 260 #else
 261 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = {
 262     $out_string
 263 };
 264 #endif /* DOINIT */
 265
 266 #ifndef DOINIT
 267 EXTCONST U8 PL_${varname}_bitmask[];
 268 #else
 269 EXTCONST U8 PL_${varname}_bitmask[] = {
 270     $out_mask
 271 };
 272 #endif /* DOINIT */
 273 EOP
 274 }
 275
 276 sub print_process_EXACTish {
 277     my ($out)= @_;
 278
 279     # Creates some bitmaps for EXACTish nodes.
 280
 281     my @folded;
 282     my @req8;
 283
 284     my $base;
 285     for my $node (@ops) {
 286         next unless $node->{type} eq 'EXACT';
 287         my $name = $node->{name};
 288         $base = $node->{id} if $name eq 'EXACT';
 289
 290         my $index = $node->{id} - $base;
 291
 292         # This depends entirely on naming conventions in regcomp.sym
 293         $folded[$index] = $name =~ /^EXACTF/ || 0;
 294         $req8[$index] = $name =~ /8/ || 0;
 295     }
 296
 297     die "Can't cope with > 32 EXACTish nodes" if @folded > 32;
 298
 299     my $exactf = sprintf "%X", oct("0b" . join "", reverse @folded);
 300     my $req8 =   sprintf "%X", oct("0b" . join "", reverse @req8);
 301     print $out <<EOP,
 302
 303 /* Is 'op', known to be of type EXACT, folding? */
 304 #define isEXACTFish(op) (__ASSERT_(REGNODE_TYPE(op) == EXACT) (PL_EXACTFish_bitmask & (1U << (op - EXACT))))
 305
 306 /* Do only UTF-8 target strings match 'op', known to be of type EXACT? */
 307 #define isEXACT_REQ8(op) (__ASSERT_(REGNODE_TYPE(op) == EXACT) (PL_EXACT_REQ8_bitmask & (1U << (op - EXACT))))
 308
 309 #ifndef DOINIT
 310 EXTCONST U32 PL_EXACTFish_bitmask;
 311 EXTCONST U32 PL_EXACT_REQ8_bitmask;
 312 #else
 313 EXTCONST U32 PL_EXACTFish_bitmask = 0x$exactf;
 314 EXTCONST U32 PL_EXACT_REQ8_bitmask = 0x$req8;
 315 #endif /* DOINIT */
 316 EOP
 317 }
 318
 319 sub read_definition {
 320     my ( $file )= @_;
 321     my ( $seen_sep, $pod_comment )= "";
 322     open my $in_fh, "<", $file
 323         or die "Failed to open '$file' for reading: $!";
 324     while (<$in_fh>) {
 325
 326         # Special pod comments
 327         if (/^#\* ?/) { $pod_comment .= "# $'"; }
 328
 329         # Truly blank lines possibly surrounding pod comments
 330         elsif (/^\s*$/) { $pod_comment .= "\n" }
 331
 332         next if /\A\s*#/ || /\A\s*\z/;
 333
 334         s/\s*\z//;
 335         if (/^-+\s*$/) {
 336             $seen_sep= 1;
 337             next;
 338         }
 339
 340         if ($seen_sep) {
 341             parse_state_def( $_, $., $pod_comment );
 342         }
 343         else {
 344             parse_opcode_def( $_, $., $pod_comment );
 345         }
 346         $pod_comment= "";
 347     }
 348     close $in_fh;
 349     die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all,
 350         " in file!"
 351         if @all > 256;
 352 }
 353
 354 # use fixed width to keep the diffs between regcomp.pl recompiles
 355 # as small as possible.
 356 my ( $base_name_width, $rwidth, $twidth )= ( 22, 12, 9 );
 357
 358 sub print_state_defs {
 359     my ($out)= @_;
 360     printf $out <<EOP,
 361 /* Regops and State definitions */
 362
 363 #define %*s\t%d
 364 #define %*s\t%d
 365
 366 EOP
 367         -$base_name_width,
 368         REGNODE_MAX => $#ops,
 369         -$base_name_width, REGMATCH_STATE_MAX => $#all;
 370
 371     my %rev_type_alias= reverse %type_alias;
 372     my $base_format = "#define %*s\t%d\t/* %#04x %s */\n";
 373     my @withs;
 374     my $in_states = 0;
 375
 376     my $max_name_width = 0;
 377     for my $ref (\@ops, \@states) {
 378         for my $node ($ref->@*) {
 379             my $len = length $node->{name};
 380             $max_name_width = $len if $max_name_width < $len;
 381         }
 382     }
 383
 384     die "Do a white-space only commit to increase \$base_name_width to"
 385      .  " $max_name_width; then re-run"  if $base_name_width < $max_name_width;
 386
 387     print $out <<EOT;
 388 /* -- For regexec.c to switch on target being utf8 (t8) or not (tb, b='byte'); */
 389 #define with_t_UTF8ness(op, t_utf8) (((op) << 1) + (cBOOL(t_utf8)))
 390 /* -- same, but also with pattern (p8, pb) -- */
 391 #define with_tp_UTF8ness(op, t_utf8, p_utf8)                        \\
 392 \t\t(((op) << 2) + (cBOOL(t_utf8) << 1) + cBOOL(p_utf8))
 393
 394 /* The #defines below give both the basic regnode and the expanded version for
 395    switching on utf8ness */
 396 EOT
 397
 398     for my $node (@ops) {
 399         print_state_def_line($out, $node->{name}, $node->{id}, $node->{comment});
 400         if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) {
 401             print_state_def_line($out, $alias, $node->{id}, $node->{comment});
 402         }
 403     }
 404
 405     print $out "\t/* ------------ States ------------- */\n";
 406     for my $node (@states) {
 407         print_state_def_line($out, $node->{name}, $node->{id}, $node->{comment});
 408     }
 409 }
 410
 411 sub print_state_def_line
 412 {
 413     my ($fh, $name, $id, $comment) = @_;
 414
 415     # The sub-names are like '_tb' or '_tb_p8' = max 6 chars wide
 416     my $name_col_width = $base_name_width + 6;
 417     my $base_id_width = 3;  # Max is '255' or 3 cols
 418     my $mid_id_width  = 3;  # Max is '511' or 3 cols
 419     my $full_id_width = 3;  # Max is '1023' but not close to using the 4th
 420
 421     my $line = "#define " . $name;
 422     $line .= " " x ($name_col_width - length($name));
 423
 424     $line .= sprintf "%*s", $base_id_width, $id;
 425     $line .= " " x $mid_id_width;
 426     $line .= " " x ($full_id_width + 2);
 427
 428     $line .= "/* ";
 429     my $hanging = length $line;     # Indent any subsequent line to this pos
 430     $line .= sprintf "0x%02x", $id;
 431
 432     my $columns = 78;
 433
 434     # From the documentation: 'In fact, every resulting line will have length
 435     # of no more than "$columns - 1"'
 436     $line = wrap($columns + 1, "", " " x $hanging, "$line $comment");
 437     chomp $line;            # wrap always adds a trailing \n
 438     $line =~ s/ \s+ $ //x;  # trim, just in case.
 439
 440     # The comment may have wrapped.  Find the final \n and measure the length
 441     # to the end.  If it is short enough, just append the ' */' to the line.
 442     # If it is too close to the end of the space available, add an extra line
 443     # that consists solely of blanks and the ' */'
 444     my $len = length($line); my $rindex = rindex($line, "\n");
 445     if (length($line) - rindex($line, "\n") - 1 <= $columns - 3) {
 446         $line .= " */\n";
 447     }
 448     else {
 449         $line .= "\n" . " " x ($hanging - 3) . "*/\n";
 450     }
 451
 452     print $fh $line;
 453
 454     # And add the 2 subsidiary #defines used when switching on
 455     # with_t_UTF8nes()
 456     my $with_id_t = $id * 2;
 457     for my $with (qw(tb  t8)) {
 458         my $with_name = "${name}_$with";
 459         print  $fh "#define ", $with_name;
 460         print  $fh " " x ($name_col_width - length($with_name) + $base_id_width);
 461         printf $fh "%*s", $mid_id_width, $with_id_t;
 462         print  $fh " " x $full_id_width;
 463         printf $fh "  /*";
 464         print  $fh " " x (4 + 2);  # 4 is width of 0xHH that the base entry uses
 465         printf $fh "0x%03x */\n", $with_id_t;
 466
 467         $with_id_t++;
 468     }
 469
 470     # Finally add the 4 subsidiary #defines used when switching on
 471     # with_tp_UTF8nes()
 472     my $with_id_tp = $id * 4;
 473     for my $with (qw(tb_pb  tb_p8  t8_pb  t8_p8)) {
 474         my $with_name = "${name}_$with";
 475         print  $fh "#define ", $with_name;
 476         print  $fh " " x ($name_col_width - length($with_name) + $base_id_width + $mid_id_width);
 477         printf $fh "%*s", $full_id_width, $with_id_tp;
 478         printf $fh "  /*";
 479         print  $fh " " x (4 + 2);  # 4 is width of 0xHH that the base entry uses
 480         printf $fh "0x%03x */\n", $with_id_tp;
 481
 482         $with_id_tp++;
 483     }
 484
 485     print $fh "\n"; # Blank line separates groups for clarity
 486 }
 487
 488 sub print_typedefs {
 489     my ($out)= @_;
 490     print $out <<EOP;
 491
 492 /* typedefs for regex nodes - one typedef per node type */
 493
 494 EOP
 495     my $len= 0;
 496     foreach my $node (@ops) {
 497         if ($node->{suffix} and $len < length($node->{suffix})) {
 498             $len= length $node->{suffix};
 499         }
 500     }
 501     $len += length "struct regnode_";
 502     $len = (int($len/5)+2)*5;
 503     my $prefix= "tregnode";
 504
 505     foreach my $node (sort { $a->{name} cmp $b->{name} } @ops) {
 506         my $struct_name= "struct regnode";
 507         if (my $suffix= $node->{suffix}) {
 508             $struct_name .= "_$suffix";
 509         }
 510         $node->{typedef}= $prefix . "_" . $node->{name};
 511         printf $out "typedef %*s %s;\n", -$len, $struct_name, $node->{typedef};
 512     }
 513     print $out <<EOP;
 514
 515 /* end typedefs */
 516
 517 EOP
 518
 519 }
 520
 521
 522
 523
 524 sub print_regnode_info {
 525     my ($out)= @_;
 526     print $out <<EOP;
 527
 528 /* PL_regnode_info[] - Opcode/state names in string form, for debugging */
 529
 530 #ifndef DOINIT
 531 EXTCONST struct regnode_meta PL_regnode_info[];
 532 #else
 533 EXTCONST struct regnode_meta PL_regnode_info[] = {
 534 EOP
 535     my @fields= qw(type arg_len arg_len_varies off_by_arg);
 536     foreach my $node_idx (0..$#all) {
 537         my $node= $all[$node_idx];
 538         {
 539             my $size= 0;
 540             $size= "EXTRA_SIZE($node->{typedef})" if $node->{suffix};
 541             $node->{arg_len}= $size;
 542
 543         }
 544         {
 545             my $varies= 0;
 546             $varies= 1 if $node->{code} and $node->{code}=~"str";
 547             $node->{arg_len_varies}= $varies;
 548         }
 549         $node->{off_by_arg}= $node->{longj} || 0;
 550         print $out "    {\n";
 551         print $out "        /* #$node_idx $node->{optype} $node->{name} */\n";
 552         foreach my $f_idx (0..$#fields) {
 553             my $field= $fields[$f_idx];
 554             printf $out  "        .%s = %s", $field, $node->{$field} // 0;
 555             printf $out $f_idx == $#fields ? "\n" : ",\n";
 556         }
 557         print $out "    }";
 558         print $out $node_idx==$#all ? "\n" : ",\n";
 559     }
 560
 561     print $out <<EOP;
 562 };
 563 #endif /* DOINIT */
 564
 565 EOP
 566 }
 567
 568
 569 sub print_regnode_name {
 570     my ($out)= @_;
 571     print $out <<EOP;
 572
 573 /* PL_regnode_name[] - Opcode/state names in string form, for debugging */
 574
 575 #ifndef DOINIT
 576 EXTCONST char * PL_regnode_name[];
 577 #else
 578 EXTCONST char * const PL_regnode_name[] = {
 579 EOP
 580
 581     my $ofs= 0;
 582     my $sym= "";
 583     foreach my $node (@all) {
 584         printf $out "\t%*s\t/* $sym%#04x */\n",
 585             -3 - $base_name_width, qq("$node->{name}",), $node->{id} - $ofs;
 586         if ( $node->{id} == $#ops and @ops != @all ) {
 587             print $out "\t/* ------------ States ------------- */\n";
 588             $ofs= $#ops;
 589             $sym= 'REGNODE_MAX +';
 590         }
 591     }
 592
 593     print $out <<EOP;
 594 };
 595 #endif /* DOINIT */
 596
 597 EOP
 598 }
 599
 600 sub print_reg_extflags_name {
 601     my ($out)= @_;
 602     print $out <<EOP;
 603 /* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */
 604
 605 #ifndef DOINIT
 606 EXTCONST char * PL_reg_extflags_name[];
 607 #else
 608 EXTCONST char * const PL_reg_extflags_name[] = {
 609 EOP
 610
 611     my %rxfv;
 612     my %definitions;    # Remember what the symbol definitions are
 613     my $val= 0;
 614     my %reverse;
 615     my $REG_EXTFLAGS_NAME_SIZE= 0;
 616     foreach my $file ( "op_reg_common.h", "regexp.h" ) {
 617         open my $in_fh, "<", $file or die "Can't read '$file': $!";
 618         while (<$in_fh>) {
 619
 620             # optional leading '_'.  Return symbol in $1, and strip it from
 621             # comment of line.  Currently doesn't handle comments running onto
 622             # next line
 623             if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) {
 624                 chomp;
 625                 my $define= $1;
 626                 my $orig= $_;
 627                 s{ /\* .*? \*/ }{ }x;    # Replace comments by a blank
 628
 629                 # Replace any prior defined symbols by their values
 630                 foreach my $key ( keys %definitions ) {
 631                     s/\b$key\b/$definitions{$key}/g;
 632                 }
 633
 634                 # Remove the U suffix from unsigned int literals
 635                 s/\b([0-9]+)U\b/$1/g;
 636
 637                 my $newval= eval $_;     # Get numeric definition
 638
 639                 $definitions{$define}= $newval;
 640
 641                 next unless $_ =~ /<</;    # Bit defines use left shift
 642                 if ( $val & $newval ) {
 643                     my @names= ( $define, $reverse{$newval} );
 644                     s/PMf_// for @names;
 645                     if ( $names[0] ne $names[1] ) {
 646                         die sprintf
 647                             "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)",
 648                             $newval, $orig, $_;
 649                     }
 650                     next;
 651                 }
 652                 $val |= $newval;
 653                 $rxfv{$define}= $newval;
 654                 $reverse{$newval}= $define;
 655             }
 656         }
 657     }
 658     my %vrxf= reverse %rxfv;
 659     printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N',
 660         $val;
 661     my %multibits;
 662     for ( 0 .. 31 ) {
 663         my $power_of_2= 2**$_;
 664         my $n= $vrxf{$power_of_2};
 665         my $extra= "";
 666         if ( !$n ) {
 667
 668             # Here, there was no name that matched exactly the bit.  It could be
 669             # either that it is unused, or the name matches multiple bits.
 670             if ( !( $val & $power_of_2 ) ) {
 671                 $n= "UNUSED_BIT_$_";
 672             }
 673             else {
 674
 675                 # Here, must be because it matches multiple bits.  Look through
 676                 # all possibilities until find one that matches this one.  Use
 677                 # that name, and all the bits it matches
 678                 foreach my $name ( keys %rxfv ) {
 679                     if ( $rxfv{$name} & $power_of_2 ) {
 680                         $n= $name . ( $multibits{$name}++ );
 681                         $extra= sprintf qq{ : "%s" - 0x%08x}, $name,
 682                             $rxfv{$name}
 683                             if $power_of_2 != $rxfv{$name};
 684                         last;
 685                     }
 686                 }
 687             }
 688         }
 689         s/\bRXf_(PMf_)?// for $n, $extra;
 690         printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra;
 691         $REG_EXTFLAGS_NAME_SIZE++;
 692     }
 693
 694     print $out <<EOP;
 695 };
 696 #endif /* DOINIT */
 697
 698 #ifdef DEBUGGING
 699 #  define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE
 700 #endif
 701 EOP
 702
 703 }
 704
 705 sub print_reg_intflags_name {
 706     my ($out)= @_;
 707     print $out <<EOP;
 708
 709 /* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */
 710
 711 #ifndef DOINIT
 712 EXTCONST char * PL_reg_intflags_name[];
 713 #else
 714 EXTCONST char * const PL_reg_intflags_name[] = {
 715 EOP
 716
 717     my %rxfv;
 718     my %definitions;    # Remember what the symbol definitions are
 719     my $val= 0;
 720     my %reverse;
 721     my $REG_INTFLAGS_NAME_SIZE= 0;
 722     foreach my $file ("regcomp.h") {
 723         open my $fh, "<", $file or die "Can't read $file: $!";
 724         while (<$fh>) {
 725
 726             # optional leading '_'.  Return symbol in $1, and strip it from
 727             # comment of line
 728             if (
 729                 m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi
 730                 )
 731             {
 732                 chomp;
 733                 my $define= $1;
 734                 my $abbr= $2;
 735                 my $hex= $3;
 736                 my $comment= $4;
 737                 my $val= hex($hex);
 738                 $comment= $comment ? " - $comment" : "";
 739
 740                 printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",),
 741                     $val, $define, $comment;
 742                 $REG_INTFLAGS_NAME_SIZE++;
 743             }
 744         }
 745     }
 746
 747     print $out <<EOP;
 748 };
 749 #endif /* DOINIT */
 750
 751 EOP
 752     print $out <<EOQ;
 753 #ifdef DEBUGGING
 754 #  define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE
 755 #endif
 756
 757 EOQ
 758 }
 759
 760 sub print_process_flags {
 761     my ($out)= @_;
 762
 763     print $out process_flags( 'V', 'varies', <<'EOC');
 764 /* The following have no fixed length. U8 so we can do strchr() on it. */
 765 EOC
 766
 767     print $out process_flags( 'S', 'simple', <<'EOC');
 768
 769 /* The following always have a length of 1. U8 we can do strchr() on it. */
 770 /* (Note that length 1 means "one character" under UTF8, not "one octet".) */
 771 EOC
 772
 773 }
 774
 775 sub do_perldebguts {
 776     my $guts= open_new( 'pod/perldebguts.pod', '>' );
 777
 778     my $node;
 779     my $code;
 780     my $name_fmt= '<' x  ( $longest_name_length - 1 );
 781     my $descr_fmt= '<' x ( 58 - $longest_name_length );
 782     eval <<EOD or die $@;
 783 format GuTS =
 784  ^*~~
 785  \$node->{pod_comment}
 786  ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~
 787  \$node->{name}, \$code, defined \$node->{comment} ? \$node->{comment} : ''
 788 .
 789 1;
 790 EOD
 791
 792     my $old_fh= select($guts);
 793     $~= "GuTS";
 794
 795     open my $oldguts, '<', 'pod/perldebguts.pod'
 796         or die "$0 cannot open pod/perldebguts.pod for reading: $!";
 797     while (<$oldguts>) {
 798         print;
 799         last if /=for regcomp.pl begin/;
 800     }
 801
 802     print <<'END_OF_DESCR';
 803
 804  # TYPE arg-description [regnode-struct-suffix] [longjump-len] DESCRIPTION
 805 END_OF_DESCR
 806     for my $n (@ops) {
 807         $node= $n;
 808         $code= "$node->{code} " . ( $node->{suffix} || "" );
 809         $code .= " $node->{longj}" if $node->{longj};
 810         if ( $node->{pod_comment} ||= "" ) {
 811
 812             # Trim multiple blanks
 813             $node->{pod_comment} =~ s/^\n\n+/\n/;
 814             $node->{pod_comment} =~ s/\n\n+$/\n\n/;
 815         }
 816         write;
 817     }
 818     print "\n";
 819
 820     while (<$oldguts>) {
 821         last if /=for regcomp.pl end/;
 822     }
 823     do { print } while <$oldguts>; #win32 can't unlink an open FH
 824     close $oldguts or die "Error closing pod/perldebguts.pod: $!";
 825     select $old_fh;
 826     close_and_rename($guts);
 827 }
 828
 829 my $confine_to_core = 'defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD)';
 830 read_definition("regcomp.sym");
 831 if ($ENV{DUMP}) {
 832     require Data::Dumper;
 833     print Data::Dumper::Dumper(\@all);
 834     exit(1);
 835 }
 836 my $out= open_new( 'regnodes.h', '>',
 837     {
 838         by      => 'regen/regcomp.pl',
 839         from    => [ 'regcomp.sym', 'op_reg_common.h', 'regexp.h' ],
 840     },
 841 );
 842 print $out "#if $confine_to_core\n\n";
 843 print_typedefs($out);
 844 print_state_defs($out);
 845
 846 print_regnode_name($out);
 847 print_regnode_info($out);
 848
 849
 850 print_reg_extflags_name($out);
 851 print_reg_intflags_name($out);
 852 print_process_flags($out);
 853 print_process_EXACTish($out);
 854 print $out "\n#endif /* $confine_to_core */\n";
 855 read_only_bottom_close_and_rename($out);
 856
 857 do_perldebguts();