4 # Regenerate (overwriting only if changed):
9 # from information stored in
14 # pod/perldebguts.pod is not completely regenerated. Only the table of
15 # regexp nodes is replaced; other parts remain unchanged.
17 # Accepts the standard regen_lib -q and -v args.
19 # This script is normally invoked from regen.pl.
22 # Get function prototypes
23 require './regen/regen_lib.pl';
27 # NOTE I don't think anyone actually knows what all of these properties mean,
28 # and I suspect some of them are outright unused. This is a first attempt to
29 # clean up the generation so maybe one day we can move to something more self
30 # documenting. (One might argue that an array of hashes of properties would
33 # Why we use the term regnode and nodes, and not say, opcodes, I am not sure.
36 # 1. We use a single continuum to represent both opcodes and states,
37 # and in regexec.c we switch on the combined set.
38 # 2. Opcodes have more information associated to them, states are simpler,
39 # basically just an identifier/number that can be used to switch within
41 # 3. Some opcode are order dependent.
42 # 4. Output files often use "tricks" to reduce diff effects. Some of what
43 # we do below is more clumsy looking than it could be because of this.
45 # Op/state properties:
48 # ----------------------------------------------------------------------------
49 # name Both Name of op/state
50 # id Both integer value for this opcode/state
51 # optype Both Either 'op' or 'state'
52 # line_num Both line_num number of the input file for this item.
53 # type Op Type of node (aka regkind)
54 # code Op what code is associated with this node (???)
55 # args Op what type of args the node has (which regnode struct)
57 # longj Op Boolean as to if this node is a longjump
58 # comment Both Comment about node, if any
59 # pod_comment Both Special comments for pod output (preceding lines in def)
62 my @all; # all opcodes/state
63 my %all; # hash of all opcode/state names
65 my @ops; # array of just opcodes
66 my @states; # array of just states
68 my $longest_name_length= 0; # track lengths of names for nicer reports
69 my (%type_alias); # map the type (??)
71 # register a newly constructed node into our state tables.
72 # ensures that we have no name collisions (on name anyway),
73 # and issues the "id" for the node.
77 if ( $all{ $node->{name} } ) {
78 die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} "
79 . "previously defined on line $all{ $node->{name} }{line_num}\n";
80 } elsif (!$node->{optype}) {
81 die "must have an optype in node ", Dumper($node);
82 } elsif ($node->{optype} eq "op") {
84 } elsif ($node->{optype} eq "state") {
87 die "Uknown optype '$node->{optype}' in ", Dumper($node);
89 $node->{id}= 0 + @all;
91 $all{ $node->{name} }= $node;
93 if ($node->{longj} && $node->{longj} != 1) {
94 die "longj field must be in [01] if present in ", Dumper($node);
99 # Parse and add an opcode definition to the global state.
100 # An opcode definition looks like this:
105 # Name Type code | | | ; comment
106 # --------------------------------------------------------------------------
107 # IFMATCH BRANCHJ, off 1 . 2 ; Succeeds if the following matches.
108 # UNLESSM BRANCHJ, off 1 . 2 ; Fails if the following matches.
109 # SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE.
110 # IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceded by switcher.
111 # GROUPP GROUPP, num 1 ; Whether the group matched.
113 # Not every opcode definition has all of these. We should maybe make this
114 # nicer/easier to read in the future. Also note that the above is tab
117 sub parse_opcode_def {
118 my ( $text, $line_num, $pod_comment )= @_;
120 line_num => $line_num,
121 pod_comment => $pod_comment,
125 # first split the line into three, the initial NAME, a middle part
126 # that we call "desc" which contains various (not well documented) things,
127 # and a comment section.
128 @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/
129 or die "Failed to match $_";
131 # the content of the "desc" field from the first step is extracted here:
132 @{$node}{qw(type code args flags longj)}= split /[,\s]\s*/, $node->{desc};
134 defined $node->{$_} or $node->{$_} = ""
135 for qw(type code args flags longj);
137 register_node($node); # has to be before the type_alias code below
139 if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) {
141 #warn "Regop type '$node->{type}' from regcomp.sym line $line_num"
142 # ." is not an existing regop, and will be aliased to $node->{name}\n"
144 $type_alias{ $node->{type} }= $node->{name};
147 $longest_name_length= length $node->{name}
148 if length $node->{name} > $longest_name_length;
151 # parse out a state definition and add the resulting data
152 # into the global state. may create multiple new states from
153 # a single definition (this is part of the point).
155 # REGOP \t typelist [ \t typelist]
160 # WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL
164 # The CURLYM definition would create the states:
165 # CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail
166 sub parse_state_def {
167 my ( $text, $line_num, $pod_comment )= @_;
168 my ( $type, @lists )= split /\s+/, $text;
169 die "No list? $type" if !@lists;
170 foreach my $list (@lists) {
171 my ( $names, $special )= split /:/, $list, 2;
173 foreach my $name ( split /,/, $names ) {
182 elsif ( $special =~ /\d/ ) {
183 @suffix= ( 1 .. $special );
185 elsif ( $special eq 'FAIL' ) {
186 @suffix= ( "", "_fail" );
189 die "unknown :type ':$special'";
191 foreach my $suffix (@suffix) {
193 name => "$real$suffix",
196 comment => "state for $type",
197 line_num => $line_num,
199 register_node($node);
206 my ( $flag, $varname, $comment )= @_;
207 $comment= '' unless defined $comment;
211 for my $node (@ops) {
212 my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0;
214 # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic
218 ord substr $bitmap, ( $node->{id} >> 3 );
220 substr( $bitmap, ( $node->{id} >> 3 ), 1 )=
221 chr( $current | ( $set << ( $node->{id} & 7 ) ) );
223 push @selected, $node->{name} if $set;
225 my $out_string= join ', ', @selected, 0;
226 $out_string =~ s/(.{1,70},) /$1\n /g;
228 my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap;
230 return $comment . <<"EOP";
231 #define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7)))
234 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__;
236 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = {
242 EXTCONST U8 PL_${varname}_bitmask[];
244 EXTCONST U8 PL_${varname}_bitmask[] = {
251 sub read_definition {
253 my ( $seen_sep, $pod_comment )= "";
254 open my $in_fh, "<", $file
255 or die "Failed to open '$file' for reading: $!";
258 # Special pod comments
259 if (/^#\* ?/) { $pod_comment .= "# $'"; }
261 # Truly blank lines possibly surrounding pod comments
262 elsif (/^\s*$/) { $pod_comment .= "\n" }
264 next if /\A\s*#/ || /\A\s*\z/;
273 parse_state_def( $_, $., $pod_comment );
276 parse_opcode_def( $_, $., $pod_comment );
281 die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all,
286 # use fixed width to keep the diffs between regcomp.pl recompiles
287 # as small as possible.
288 my ( $width, $rwidth, $twidth )= ( 22, 12, 9 );
290 sub print_state_defs {
293 /* Regops and State definitions */
300 REGNODE_MAX => $#ops,
301 -$width, REGMATCH_STATE_MAX => $#all;
303 my %rev_type_alias= reverse %type_alias;
304 for my $node (@ops) {
305 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
306 -$width, $node->{name}, $node->{id}, $node->{id}, $node->{comment};
307 if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) {
308 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
309 -$width, $alias, $node->{id}, $node->{id}, "type alias";
313 print $out "\t/* ------------ States ------------- */\n";
314 for my $node (@states) {
315 printf $out "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n",
316 -$width, $node->{name}, $node->{id} - $#ops, $node->{comment};
324 /* PL_regkind[] What type of regop or state is this. */
327 EXTCONST U8 PL_regkind[];
329 EXTCONST U8 PL_regkind[] = {
332 foreach my $node (@all) {
333 print Dumper($node) if !defined $node->{type} or !defined( $node->{name} );
334 printf $out "\t%*s\t/* %*s */\n",
335 -1 - $twidth, "$node->{type},", -$width, $node->{name};
336 print $out "\t/* ------------ States ------------- */\n"
337 if $node->{id} == $#ops and $node->{id} != $#all;
346 sub wrap_ifdef_print {
360 sub print_regarglen {
364 /* regarglen[] - How large is the argument part of the node (in regnodes) */
366 static const U8 regarglen[] = {
369 foreach my $node (@ops) {
371 $size= "EXTRA_SIZE(struct regnode_$node->{args})" if $node->{args};
373 printf $out "\t%*s\t/* %*s */\n", -37, "$size,", -$rwidth, $node->{name};
381 sub print_reg_off_by_arg {
385 /* reg_off_by_arg[] - Which argument holds the offset to the next node */
387 static const char reg_off_by_arg[] = {
390 foreach my $node (@ops) {
391 my $size= $node->{longj} || 0;
393 printf $out "\t%d,\t/* %*s */\n", $size, -$rwidth, $node->{name};
406 /* reg_name[] - Opcode/state names in string form, for debugging */
409 EXTCONST char * PL_reg_name[];
411 EXTCONST char * const PL_reg_name[] = {
416 foreach my $node (@all) {
417 my $size= $node->{longj} || 0;
419 printf $out "\t%*s\t/* $sym%#04x */\n",
420 -3 - $width, qq("$node->{name}",), $node->{id} - $ofs;
421 if ( $node->{id} == $#ops and @ops != @all ) {
422 print $out "\t/* ------------ States ------------- */\n";
424 $sym= 'REGNODE_MAX +';
435 sub print_reg_extflags_name {
438 /* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */
441 EXTCONST char * PL_reg_extflags_name[];
443 EXTCONST char * const PL_reg_extflags_name[] = {
447 my %definitions; # Remember what the symbol definitions are
450 my $REG_EXTFLAGS_NAME_SIZE= 0;
451 foreach my $file ( "op_reg_common.h", "regexp.h" ) {
452 open my $in_fh, "<", $file or die "Can't read '$file': $!";
455 # optional leading '_'. Return symbol in $1, and strip it from
457 if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) {
461 s{ /\* .*? \*/ }{ }x; # Replace comments by a blank
463 # Replace any prior defined symbols by their values
464 foreach my $key ( keys %definitions ) {
465 s/\b$key\b/$definitions{$key}/g;
468 # Remove the U suffix from unsigned int literals
469 s/\b([0-9]+)U\b/$1/g;
471 my $newval= eval $_; # Get numeric definition
473 $definitions{$define}= $newval;
475 next unless $_ =~ /<</; # Bit defines use left shift
476 if ( $val & $newval ) {
477 my @names= ( $define, $reverse{$newval} );
479 if ( $names[0] ne $names[1] ) {
481 "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)",
487 $rxfv{$define}= $newval;
488 $reverse{$newval}= $define;
492 my %vrxf= reverse %rxfv;
493 printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N',
497 my $power_of_2= 2**$_;
498 my $n= $vrxf{$power_of_2};
502 # Here, there was no name that matched exactly the bit. It could be
503 # either that it is unused, or the name matches multiple bits.
504 if ( !( $val & $power_of_2 ) ) {
509 # Here, must be because it matches multiple bits. Look through
510 # all possibilities until find one that matches this one. Use
511 # that name, and all the bits it matches
512 foreach my $name ( keys %rxfv ) {
513 if ( $rxfv{$name} & $power_of_2 ) {
514 $n= $name . ( $multibits{$name}++ );
515 $extra= sprintf qq{ : "%s" - 0x%08x}, $name,
517 if $power_of_2 != $rxfv{$name};
523 s/\bRXf_(PMf_)?// for $n, $extra;
524 printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra;
525 $REG_EXTFLAGS_NAME_SIZE++;
533 # define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE
539 sub print_reg_intflags_name {
543 /* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */
546 EXTCONST char * PL_reg_intflags_name[];
548 EXTCONST char * const PL_reg_intflags_name[] = {
552 my %definitions; # Remember what the symbol definitions are
555 my $REG_INTFLAGS_NAME_SIZE= 0;
556 foreach my $file ("regcomp.h") {
557 open my $fh, "<", $file or die "Can't read $file: $!";
560 # optional leading '_'. Return symbol in $1, and strip it from
563 m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi
572 $comment= $comment ? " - $comment" : "";
574 printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",),
575 $val, $define, $comment;
576 $REG_INTFLAGS_NAME_SIZE++;
588 # define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE
594 sub print_process_flags {
597 print $out process_flags( 'V', 'varies', <<'EOC');
598 /* The following have no fixed length. U8 so we can do strchr() on it. */
601 print $out process_flags( 'S', 'simple', <<'EOC');
603 /* The following always have a length of 1. U8 we can do strchr() on it. */
604 /* (Note that length 1 means "one character" under UTF8, not "one octet".) */
610 my $guts= open_new( 'pod/perldebguts.pod', '>' );
614 my $name_fmt= '<' x ( $longest_name_length - 1 );
615 my $descr_fmt= '<' x ( 58 - $longest_name_length );
616 eval <<EOD or die $@;
619 \$node->{pod_comment}
620 ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~
621 \$node->{name}, \$code, defined \$node->{comment} ? \$node->{comment} : ''
626 my $old_fh= select($guts);
629 open my $oldguts, '<', 'pod/perldebguts.pod'
630 or die "$0 cannot open pod/perldebguts.pod for reading: $!";
633 last if /=for regcomp.pl begin/;
636 print <<'END_OF_DESCR';
638 # TYPE arg-description [num-args] [longjump-len] DESCRIPTION
642 $code= "$node->{code} " . ( $node->{args} || "" );
643 $code .= " $node->{longj}" if $node->{longj};
644 if ( $node->{pod_comment} ||= "" ) {
646 # Trim multiple blanks
647 $node->{pod_comment} =~ s/^\n\n+/\n/;
648 $node->{pod_comment} =~ s/\n\n+$/\n\n/;
655 last if /=for regcomp.pl end/;
657 do { print } while <$oldguts>; #win32 can't unlink an open FH
658 close $oldguts or die "Error closing pod/perldebguts.pod: $!";
660 close_and_rename($guts);
663 read_definition("regcomp.sym");
664 my $out= open_new( 'regnodes.h', '>',
665 { by => 'regen/regcomp.pl', from => 'regcomp.sym' } );
666 print_state_defs($out);
672 \&print_reg_off_by_arg
674 print_reg_name($out);
675 print_reg_extflags_name($out);
676 print_reg_intflags_name($out);
677 print_process_flags($out);
678 read_only_bottom_close_and_rename($out);