4 # Regenerate (overwriting only if changed):
9 # from information stored in
14 # pod/perldebguts.pod is not completely regenerated. Only the table of
15 # regexp nodes is replaced; other parts remain unchanged.
17 # Accepts the standard regen_lib -q and -v args.
19 # This script is normally invoked from regen.pl.
22 # Get function prototypes
23 require './regen/regen_lib.pl';
27 # NOTE I don't think anyone actually knows what all of these properties mean,
28 # and I suspect some of them are outright unused. This is a first attempt to
29 # clean up the generation so maybe one day we can move to something more self
30 # documenting. (One might argue that an array of hashes of properties would
33 # Why we use the term regnode and nodes, and not say, opcodes, I am not sure.
36 # 1. We use a single continuum to represent both opcodes and states,
37 # and in regexec.c we switch on the combined set.
38 # 2. Opcodes have more information associated to them, states are simpler,
39 # basically just an identifier/number that can be used to switch within
41 # 3. Some opcode are order dependent.
42 # 4. Output files often use "tricks" to reduce diff effects. Some of what
43 # we do below is more clumsy looking than it could be because of this.
45 # Op/state properties:
48 # ----------------------------------------------------------------------------
49 # name Both Name of op/state
50 # id Both integer value for this opcode/state
51 # optype Both Either 'op' or 'state'
52 # line_num Both line_num number of the input file for this item.
53 # type Op Type of node (aka regkind)
54 # code Op Apparently not used
55 # suffix Op which regnode struct this uses, so if this is '1', it
56 # uses 'struct regnode_1'
57 # flags Op S for simple; V for varies
58 # longj Op Boolean as to if this node is a longjump
59 # comment Both Comment about node, if any. Placed in perlredebguts
61 # pod_comment Both Special comments for pod output (preceding lines in def)
62 # Such lines begin with '#*'
65 my @all; # all opcodes/state
66 my %all; # hash of all opcode/state names
68 my @ops; # array of just opcodes
69 my @states; # array of just states
71 my $longest_name_length= 0; # track lengths of names for nicer reports
72 my (%type_alias); # map the type (??)
74 # register a newly constructed node into our state tables.
75 # ensures that we have no name collisions (on name anyway),
76 # and issues the "id" for the node.
80 if ( $all{ $node->{name} } ) {
81 die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} "
82 . "previously defined on line $all{ $node->{name} }{line_num}\n";
83 } elsif (!$node->{optype}) {
84 die "must have an optype in node ", Dumper($node);
85 } elsif ($node->{optype} eq "op") {
87 } elsif ($node->{optype} eq "state") {
90 die "Uknown optype '$node->{optype}' in ", Dumper($node);
92 $node->{id}= 0 + @all;
94 $all{ $node->{name} }= $node;
96 if ($node->{longj} && $node->{longj} != 1) {
97 die "longj field must be in [01] if present in ", Dumper($node);
102 # Parse and add an opcode definition to the global state.
103 # What an opcode definition looks like is given in regcomp.sym.
105 # Not every opcode definition has all of the components. We should maybe make
106 # this nicer/easier to read in the future. Also note that the above is tab
109 # Special comments for an entry precede it, and begin with '#*' and are placed
110 # in the generated pod file just before the entry.
112 sub parse_opcode_def {
113 my ( $text, $line_num, $pod_comment )= @_;
115 line_num => $line_num,
116 pod_comment => $pod_comment,
120 # first split the line into three, the initial NAME, a middle part
121 # that we call "desc" which contains various (not well documented) things,
122 # and a comment section.
123 @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/
124 or die "Failed to match $_";
126 # the content of the "desc" field from the first step is extracted here:
127 @{$node}{qw(type code suffix flags longj)}= split /[,\s]\s*/, $node->{desc};
129 defined $node->{$_} or $node->{$_} = ""
130 for qw(type code suffix flags longj);
132 register_node($node); # has to be before the type_alias code below
134 if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) {
136 #warn "Regop type '$node->{type}' from regcomp.sym line $line_num"
137 # ." is not an existing regop, and will be aliased to $node->{name}\n"
139 $type_alias{ $node->{type} }= $node->{name};
142 $longest_name_length= length $node->{name}
143 if length $node->{name} > $longest_name_length;
146 # parse out a state definition and add the resulting data
147 # into the global state. may create multiple new states from
148 # a single definition (this is part of the point).
150 # REGOP \t typelist [ \t typelist]
155 # WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL
159 # The CURLYM definition would create the states:
160 # CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail
161 sub parse_state_def {
162 my ( $text, $line_num, $pod_comment )= @_;
163 my ( $type, @lists )= split /\s+/, $text;
164 die "No list? $type" if !@lists;
165 foreach my $list (@lists) {
166 my ( $names, $special )= split /:/, $list, 2;
168 foreach my $name ( split /,/, $names ) {
177 elsif ( $special =~ /\d/ ) {
178 @suffix= ( 1 .. $special );
180 elsif ( $special eq 'FAIL' ) {
181 @suffix= ( "", "_fail" );
184 die "unknown :type ':$special'";
186 foreach my $suffix (@suffix) {
188 name => "$real$suffix",
191 comment => "state for $type",
192 line_num => $line_num,
194 register_node($node);
201 my ( $flag, $varname, $comment )= @_;
202 $comment= '' unless defined $comment;
206 for my $node (@ops) {
207 my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0;
209 # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic
213 ord substr $bitmap, ( $node->{id} >> 3 );
215 substr( $bitmap, ( $node->{id} >> 3 ), 1 )=
216 chr( $current | ( $set << ( $node->{id} & 7 ) ) );
218 push @selected, $node->{name} if $set;
220 my $out_string= join ', ', @selected, 0;
221 $out_string =~ s/(.{1,70},) /$1\n /g;
223 my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap;
225 return $comment . <<"EOP";
226 #define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7)))
229 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__;
231 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = {
237 EXTCONST U8 PL_${varname}_bitmask[];
239 EXTCONST U8 PL_${varname}_bitmask[] = {
246 sub read_definition {
248 my ( $seen_sep, $pod_comment )= "";
249 open my $in_fh, "<", $file
250 or die "Failed to open '$file' for reading: $!";
253 # Special pod comments
254 if (/^#\* ?/) { $pod_comment .= "# $'"; }
256 # Truly blank lines possibly surrounding pod comments
257 elsif (/^\s*$/) { $pod_comment .= "\n" }
259 next if /\A\s*#/ || /\A\s*\z/;
268 parse_state_def( $_, $., $pod_comment );
271 parse_opcode_def( $_, $., $pod_comment );
276 die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all,
281 # use fixed width to keep the diffs between regcomp.pl recompiles
282 # as small as possible.
283 my ( $width, $rwidth, $twidth )= ( 22, 12, 9 );
285 sub print_state_defs {
288 /* Regops and State definitions */
295 REGNODE_MAX => $#ops,
296 -$width, REGMATCH_STATE_MAX => $#all;
298 my %rev_type_alias= reverse %type_alias;
299 for my $node (@ops) {
300 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
301 -$width, $node->{name}, $node->{id}, $node->{id}, $node->{comment};
302 if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) {
303 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
304 -$width, $alias, $node->{id}, $node->{id}, "type alias";
308 print $out "\t/* ------------ States ------------- */\n";
309 for my $node (@states) {
310 printf $out "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n",
311 -$width, $node->{name}, $node->{id} - $#ops, $node->{comment};
319 /* PL_regkind[] What type of regop or state is this. */
322 EXTCONST U8 PL_regkind[];
324 EXTCONST U8 PL_regkind[] = {
327 foreach my $node (@all) {
328 print Dumper($node) if !defined $node->{type} or !defined( $node->{name} );
329 printf $out "\t%*s\t/* %*s */\n",
330 -1 - $twidth, "$node->{type},", -$width, $node->{name};
331 print $out "\t/* ------------ States ------------- */\n"
332 if $node->{id} == $#ops and $node->{id} != $#all;
341 sub wrap_ifdef_print {
355 sub print_regarglen {
359 /* regarglen[] - How large is the argument part of the node (in regnodes) */
361 static const U8 regarglen[] = {
364 foreach my $node (@ops) {
366 $size= "EXTRA_SIZE(struct regnode_$node->{suffix})" if $node->{suffix};
368 printf $out "\t%*s\t/* %*s */\n", -37, "$size,", -$rwidth, $node->{name};
376 sub print_reg_off_by_arg {
380 /* reg_off_by_arg[] - Which argument holds the offset to the next node */
382 static const char reg_off_by_arg[] = {
385 foreach my $node (@ops) {
386 my $size= $node->{longj} || 0;
388 printf $out "\t%d,\t/* %*s */\n", $size, -$rwidth, $node->{name};
401 /* reg_name[] - Opcode/state names in string form, for debugging */
404 EXTCONST char * PL_reg_name[];
406 EXTCONST char * const PL_reg_name[] = {
411 foreach my $node (@all) {
412 my $size= $node->{longj} || 0;
414 printf $out "\t%*s\t/* $sym%#04x */\n",
415 -3 - $width, qq("$node->{name}",), $node->{id} - $ofs;
416 if ( $node->{id} == $#ops and @ops != @all ) {
417 print $out "\t/* ------------ States ------------- */\n";
419 $sym= 'REGNODE_MAX +';
430 sub print_reg_extflags_name {
433 /* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */
436 EXTCONST char * PL_reg_extflags_name[];
438 EXTCONST char * const PL_reg_extflags_name[] = {
442 my %definitions; # Remember what the symbol definitions are
445 my $REG_EXTFLAGS_NAME_SIZE= 0;
446 foreach my $file ( "op_reg_common.h", "regexp.h" ) {
447 open my $in_fh, "<", $file or die "Can't read '$file': $!";
450 # optional leading '_'. Return symbol in $1, and strip it from
451 # comment of line. Currently doesn't handle comments running onto
453 if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) {
457 s{ /\* .*? \*/ }{ }x; # Replace comments by a blank
459 # Replace any prior defined symbols by their values
460 foreach my $key ( keys %definitions ) {
461 s/\b$key\b/$definitions{$key}/g;
464 # Remove the U suffix from unsigned int literals
465 s/\b([0-9]+)U\b/$1/g;
467 my $newval= eval $_; # Get numeric definition
469 $definitions{$define}= $newval;
471 next unless $_ =~ /<</; # Bit defines use left shift
472 if ( $val & $newval ) {
473 my @names= ( $define, $reverse{$newval} );
475 if ( $names[0] ne $names[1] ) {
477 "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)",
483 $rxfv{$define}= $newval;
484 $reverse{$newval}= $define;
488 my %vrxf= reverse %rxfv;
489 printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N',
493 my $power_of_2= 2**$_;
494 my $n= $vrxf{$power_of_2};
498 # Here, there was no name that matched exactly the bit. It could be
499 # either that it is unused, or the name matches multiple bits.
500 if ( !( $val & $power_of_2 ) ) {
505 # Here, must be because it matches multiple bits. Look through
506 # all possibilities until find one that matches this one. Use
507 # that name, and all the bits it matches
508 foreach my $name ( keys %rxfv ) {
509 if ( $rxfv{$name} & $power_of_2 ) {
510 $n= $name . ( $multibits{$name}++ );
511 $extra= sprintf qq{ : "%s" - 0x%08x}, $name,
513 if $power_of_2 != $rxfv{$name};
519 s/\bRXf_(PMf_)?// for $n, $extra;
520 printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra;
521 $REG_EXTFLAGS_NAME_SIZE++;
529 # define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE
535 sub print_reg_intflags_name {
539 /* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */
542 EXTCONST char * PL_reg_intflags_name[];
544 EXTCONST char * const PL_reg_intflags_name[] = {
548 my %definitions; # Remember what the symbol definitions are
551 my $REG_INTFLAGS_NAME_SIZE= 0;
552 foreach my $file ("regcomp.h") {
553 open my $fh, "<", $file or die "Can't read $file: $!";
556 # optional leading '_'. Return symbol in $1, and strip it from
559 m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi
568 $comment= $comment ? " - $comment" : "";
570 printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",),
571 $val, $define, $comment;
572 $REG_INTFLAGS_NAME_SIZE++;
584 # define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE
590 sub print_process_flags {
593 print $out process_flags( 'V', 'varies', <<'EOC');
594 /* The following have no fixed length. U8 so we can do strchr() on it. */
597 print $out process_flags( 'S', 'simple', <<'EOC');
599 /* The following always have a length of 1. U8 we can do strchr() on it. */
600 /* (Note that length 1 means "one character" under UTF8, not "one octet".) */
606 my $guts= open_new( 'pod/perldebguts.pod', '>' );
610 my $name_fmt= '<' x ( $longest_name_length - 1 );
611 my $descr_fmt= '<' x ( 58 - $longest_name_length );
612 eval <<EOD or die $@;
615 \$node->{pod_comment}
616 ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~
617 \$node->{name}, \$code, defined \$node->{comment} ? \$node->{comment} : ''
622 my $old_fh= select($guts);
625 open my $oldguts, '<', 'pod/perldebguts.pod'
626 or die "$0 cannot open pod/perldebguts.pod for reading: $!";
629 last if /=for regcomp.pl begin/;
632 print <<'END_OF_DESCR';
634 # TYPE arg-description [regnode-struct-suffix] [longjump-len] DESCRIPTION
638 $code= "$node->{code} " . ( $node->{suffix} || "" );
639 $code .= " $node->{longj}" if $node->{longj};
640 if ( $node->{pod_comment} ||= "" ) {
642 # Trim multiple blanks
643 $node->{pod_comment} =~ s/^\n\n+/\n/;
644 $node->{pod_comment} =~ s/\n\n+$/\n\n/;
651 last if /=for regcomp.pl end/;
653 do { print } while <$oldguts>; #win32 can't unlink an open FH
654 close $oldguts or die "Error closing pod/perldebguts.pod: $!";
656 close_and_rename($guts);
659 read_definition("regcomp.sym");
660 my $out= open_new( 'regnodes.h', '>',
661 { by => 'regen/regcomp.pl', from => 'regcomp.sym' } );
662 print_state_defs($out);
668 \&print_reg_off_by_arg
670 print_reg_name($out);
671 print_reg_extflags_name($out);
672 print_reg_intflags_name($out);
673 print_process_flags($out);
674 read_only_bottom_close_and_rename($out);