4 # Regenerate (overwriting only if changed):
9 # from information stored in
14 # pod/perldebguts.pod is not completely regenerated. Only the table of
15 # regexp nodes is replaced; other parts remain unchanged.
17 # Accepts the standard regen_lib -q and -v args.
19 # This script is normally invoked from regen.pl.
22 # Get function prototypes
23 require 'regen/regen_lib.pl';
27 # NOTE I don't think anyone actually knows what all of these properties mean,
28 # and I suspect some of them are outright unused. This is a first attempt to
29 # clean up the generation so maybe one day we can move to something more self
30 # documenting. (One might argue that an array of hashes of properties would
33 # Why we use the term regnode and nodes, and not say, opcodes, I am not sure.
36 # 1. We use a single continuum to represent both opcodes and states,
37 # and in regexec.c we switch on the combined set.
38 # 2. Opcodes have more information associated to them, states are simpler,
39 # basically just an identifier/number that can be used to switch within
41 # 3. Some opcode are order dependent.
42 # 4. Output files often use "tricks" to reduce diff effects. Some of what
43 # we do below is more clumsy looking than it could be because of this.
45 # Op/state properties:
48 # ----------------------------------------------------------------------------
49 # name Both Name of op/state
50 # id Both integer value for this opcode/state
51 # optype Both Either 'op' or 'state'
52 # line_num Both line_num number of the input file for this item.
53 # type Op Type of node (aka regkind)
54 # code Op what code is associated with this node (???)
55 # args Op what type of args the node has (which regnode struct)
57 # longj Op Whether this node is a longjump
58 # comment Both Comment about node, if any
59 # pod_comment Both Special comments for pod output (preceding lines in def)
62 my @all; # all opcodes/state
63 my %all; # hash of all opcode/state names
65 my @ops; # array of just opcodes
66 my @states; # array of just states
68 my $longest_name_length= 0; # track lengths of names for nicer reports
69 my (%type_alias); # map the type (??)
71 # register a newly constructed node into our state tables.
72 # ensures that we have no name collisions (on name anyway),
73 # and issues the "id" for the node.
77 if ( $all{ $node->{name} } ) {
78 die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} "
79 . "previously defined on line $all{ $node->{name} }{line_num}\n";
80 } elsif (!$node->{optype}) {
81 die "must have an optype in node ", Dumper($node);
82 } elsif ($node->{optype} eq "op") {
84 } elsif ($node->{optype} eq "state") {
87 die "Uknown optype '$node->{optype}' in ", Dumper($node);
89 $node->{id}= 0 + @all;
91 $all{ $node->{name} }= $node;
94 # Parse and add an opcode definition to the global state.
95 # An opcode definition looks like this:
100 # Name Type code | | | ; comment
101 # --------------------------------------------------------------------------
102 # IFMATCH BRANCHJ, off 1 . 2 ; Succeeds if the following matches.
103 # UNLESSM BRANCHJ, off 1 . 2 ; Fails if the following matches.
104 # SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE.
105 # IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceded by switcher.
106 # GROUPP GROUPP, num 1 ; Whether the group matched.
108 # Not every opcode definition has all of these. We should maybe make this
109 # nicer/easier to read in the future. Also note that the above is tab
112 sub parse_opcode_def {
113 my ( $text, $line_num, $pod_comment )= @_;
115 line_num => $line_num,
116 pod_comment => $pod_comment,
120 # first split the line into three, the initial NAME, a middle part
121 # that we call "desc" which contains various (not well documented) things,
122 # and a comment section.
123 @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/
124 or die "Failed to match $_";
126 # the content of the "desc" field from the first step is extracted here:
127 @{$node}{qw(type code args flags longj)}= split /[,\s]\s*/, $node->{desc};
129 $node->{$_} //= "" for qw(type code args flags longj);
131 register_node($node); # has to be before the type_alias code below
133 if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) {
135 #warn "Regop type '$node->{type}' from regcomp.sym line $line_num"
136 # ." is not an existing regop, and will be aliased to $node->{name}\n"
138 $type_alias{ $node->{type} }= $node->{name};
141 $longest_name_length= length $node->{name}
142 if length $node->{name} > $longest_name_length;
145 # parse out a state definition and add the resulting data
146 # into the global state. may create multiple new states from
147 # a single definition (this is part of the point).
149 # REGOP \t typelist [ \t typelist]
154 # WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL
158 # The CURLYM definition would create the states:
159 # CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail
160 sub parse_state_def {
161 my ( $text, $line_num, $pod_comment )= @_;
162 my ( $type, @lists )= split /\s+/, $text;
163 die "No list? $type" if !@lists;
164 foreach my $list (@lists) {
165 my ( $names, $special )= split /:/, $list, 2;
167 foreach my $name ( split /,/, $names ) {
176 elsif ( $special =~ /\d/ ) {
177 @suffix= ( 1 .. $special );
179 elsif ( $special eq 'FAIL' ) {
180 @suffix= ( "", "_fail" );
183 die "unknown :type ':$special'";
185 foreach my $suffix (@suffix) {
187 name => "$real$suffix",
190 comment => "state for $type",
191 line_num => $line_num,
193 register_node($node);
200 my ( $flag, $varname, $comment )= @_;
201 $comment= '' unless defined $comment;
205 for my $node (@ops) {
206 my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0;
208 # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic
212 ord substr $bitmap, ( $node->{id} >> 3 );
214 substr( $bitmap, ( $node->{id} >> 3 ), 1 )=
215 chr( $current | ( $set << ( $node->{id} & 7 ) ) );
217 push @selected, $node->{name} if $set;
219 my $out_string= join ', ', @selected, 0;
220 $out_string =~ s/(.{1,70},) /$1\n /g;
222 my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap;
224 return $comment . <<"EOP";
225 #define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7)))
228 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__;
230 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = {
236 EXTCONST U8 PL_${varname}_bitmask[];
238 EXTCONST U8 PL_${varname}_bitmask[] = {
245 sub read_definition {
247 my ( $seen_sep, $pod_comment )= "";
248 open my $in_fh, "<", $file
249 or die "Failed to open '$file' for reading: $!";
252 # Special pod comments
253 if (/^#\* ?/) { $pod_comment .= "# $'"; }
255 # Truly blank lines possibly surrounding pod comments
256 elsif (/^\s*$/) { $pod_comment .= "\n" }
258 next if /\A\s*#/ || /\A\s*\z/;
267 parse_state_def( $_, $., $pod_comment );
270 parse_opcode_def( $_, $., $pod_comment );
275 die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all,
280 # use fixed width to keep the diffs between regcomp.pl recompiles
281 # as small as possible.
282 my ( $width, $rwidth, $twidth )= ( 22, 12, 9 );
284 sub print_state_defs {
287 /* Regops and State definitions */
294 REGNODE_MAX => $#ops,
295 -$width, REGMATCH_STATE_MAX => $#all;
297 my %rev_type_alias= reverse %type_alias;
298 for my $node (@ops) {
299 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
300 -$width, $node->{name}, $node->{id}, $node->{id}, $node->{comment};
301 if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) {
302 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
303 -$width, $alias, $node->{id}, $node->{id}, "type alias";
307 print $out "\t/* ------------ States ------------- */\n";
308 for my $node (@states) {
309 printf $out "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n",
310 -$width, $node->{name}, $node->{id} - $#ops, $node->{comment};
318 /* PL_regkind[] What type of regop or state is this. */
321 EXTCONST U8 PL_regkind[];
323 EXTCONST U8 PL_regkind[] = {
326 foreach my $node (@all) {
327 print Dumper($node) if !defined $node->{type} or !defined( $node->{name} );
328 printf $out "\t%*s\t/* %*s */\n",
329 -1 - $twidth, "$node->{type},", -$width, $node->{name};
330 print $out "\t/* ------------ States ------------- */\n"
331 if $node->{id} == $#ops and $node->{id} != $#all;
340 sub wrap_ifdef_print {
354 sub print_regarglen {
358 /* regarglen[] - How large is the argument part of the node (in regnodes) */
360 static const U8 regarglen[] = {
363 foreach my $node (@ops) {
365 $size= "EXTRA_SIZE(struct regnode_$node->{args})" if $node->{args};
367 printf $out "\t%*s\t/* %*s */\n", -37, "$size,", -$rwidth, $node->{name};
375 sub print_reg_off_by_arg {
379 /* reg_off_by_arg[] - Which argument holds the offset to the next node */
381 static const char reg_off_by_arg[] = {
384 foreach my $node (@ops) {
385 my $size= $node->{longj} || 0;
387 printf $out "\t%d,\t/* %*s */\n", $size, -$rwidth, $node->{name};
400 /* reg_name[] - Opcode/state names in string form, for debugging */
403 EXTCONST char * PL_reg_name[];
405 EXTCONST char * const PL_reg_name[] = {
410 foreach my $node (@all) {
411 my $size= $node->{longj} || 0;
413 printf $out "\t%*s\t/* $sym%#04x */\n",
414 -3 - $width, qq("$node->{name}",), $node->{id} - $ofs;
415 if ( $node->{id} == $#ops and @ops != @all ) {
416 print $out "\t/* ------------ States ------------- */\n";
418 $sym= 'REGNODE_MAX +';
429 sub print_reg_extflags_name {
432 /* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */
435 EXTCONST char * PL_reg_extflags_name[];
437 EXTCONST char * const PL_reg_extflags_name[] = {
441 my %definitions; # Remember what the symbol definitions are
444 my $REG_EXTFLAGS_NAME_SIZE= 0;
445 foreach my $file ( "op_reg_common.h", "regexp.h" ) {
446 open my $in_fh, "<", $file or die "Can't read '$file': $!";
449 # optional leading '_'. Return symbol in $1, and strip it from
451 if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) {
455 s{ /\* .*? \*/ }{ }x; # Replace comments by a blank
457 # Replace any prior defined symbols by their values
458 foreach my $key ( keys %definitions ) {
459 s/\b$key\b/$definitions{$key}/g;
462 # Remove the U suffix from unsigned int literals
463 s/\b([0-9]+)U\b/$1/g;
465 my $newval= eval $_; # Get numeric definition
467 $definitions{$define}= $newval;
469 next unless $_ =~ /<</; # Bit defines use left shift
470 if ( $val & $newval ) {
471 my @names= ( $define, $reverse{$newval} );
473 if ( $names[0] ne $names[1] ) {
475 "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)",
481 $rxfv{$define}= $newval;
482 $reverse{$newval}= $define;
486 my %vrxf= reverse %rxfv;
487 printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N',
491 my $power_of_2= 2**$_;
492 my $n= $vrxf{$power_of_2};
496 # Here, there was no name that matched exactly the bit. It could be
497 # either that it is unused, or the name matches multiple bits.
498 if ( !( $val & $power_of_2 ) ) {
503 # Here, must be because it matches multiple bits. Look through
504 # all possibilities until find one that matches this one. Use
505 # that name, and all the bits it matches
506 foreach my $name ( keys %rxfv ) {
507 if ( $rxfv{$name} & $power_of_2 ) {
508 $n= $name . ( $multibits{$name}++ );
509 $extra= sprintf qq{ : "%s" - 0x%08x}, $name,
511 if $power_of_2 != $rxfv{$name};
517 s/\bRXf_(PMf_)?// for $n, $extra;
518 printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra;
519 $REG_EXTFLAGS_NAME_SIZE++;
527 # define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE
533 sub print_reg_intflags_name {
537 /* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */
540 EXTCONST char * PL_reg_intflags_name[];
542 EXTCONST char * const PL_reg_intflags_name[] = {
546 my %definitions; # Remember what the symbol definitions are
549 my $REG_INTFLAGS_NAME_SIZE= 0;
550 foreach my $file ("regcomp.h") {
551 open my $fh, "<", $file or die "Can't read $file: $!";
554 # optional leading '_'. Return symbol in $1, and strip it from
557 m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi
566 $comment= $comment ? " - $comment" : "";
568 printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",),
569 $val, $define, $comment;
570 $REG_INTFLAGS_NAME_SIZE++;
582 # define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE
588 sub print_process_flags {
591 print $out process_flags( 'V', 'varies', <<'EOC');
592 /* The following have no fixed length. U8 so we can do strchr() on it. */
595 print $out process_flags( 'S', 'simple', <<'EOC');
597 /* The following always have a length of 1. U8 we can do strchr() on it. */
598 /* (Note that length 1 means "one character" under UTF8, not "one octet".) */
604 my $guts= open_new( 'pod/perldebguts.pod', '>' );
608 my $name_fmt= '<' x ( $longest_name_length - 1 );
609 my $descr_fmt= '<' x ( 58 - $longest_name_length );
610 eval <<EOD or die $@;
613 \$node->{pod_comment}
614 ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~
615 \$node->{name}, \$code, \$node->{comment}//''
620 my $old_fh= select($guts);
623 open my $oldguts, "pod/perldebguts.pod"
624 or die "$0 cannot open pod/perldebguts.pod for reading: $!";
627 last if /=for regcomp.pl begin/;
630 print <<'END_OF_DESCR';
632 # TYPE arg-description [num-args] [longjump-len] DESCRIPTION
636 $code= "$node->{code} " . ( $node->{args} || "" );
637 $code .= " $node->{longj}" if $node->{longj};
638 if ( $node->{pod_comment} ||= "" ) {
640 # Trim multiple blanks
641 $node->{pod_comment} =~ s/^\n\n+/\n/;
642 $node->{pod_comment} =~ s/\n\n+$/\n\n/;
649 last if /=for regcomp.pl end/;
651 do { print } while <$oldguts>;
653 close_and_rename($guts);
656 read_definition("regcomp.sym");
657 my $out= open_new( 'regnodes.h', '>',
658 { by => 'regen/regcomp.pl', from => 'regcomp.sym' } );
659 print_state_defs($out);
665 \&print_reg_off_by_arg
667 print_reg_name($out);
668 print_reg_extflags_name($out);
669 print_reg_intflags_name($out);
670 print_process_flags($out);
671 read_only_bottom_close_and_rename($out);