X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/65aa4ca74a9ca9deb18dd3f64021e628289096c4..ffd62fc2fb74955cac5af41e7b3820e09877c3b4:/regen/regcomp.pl diff --git a/regen/regcomp.pl b/regen/regcomp.pl index 843ef1b..cb98613 100644 --- a/regen/regcomp.pl +++ b/regen/regcomp.pl @@ -1,5 +1,6 @@ #!/usr/bin/perl -w -# +# +# # Regenerate (overwriting only if changed): # # pod/perldebguts.pod @@ -19,102 +20,214 @@ BEGIN { # Get function prototypes - require 'regen/regen_lib.pl'; + require './regen/regen_lib.pl'; } use strict; -open DESC, 'regcomp.sym'; - -my $ind = 0; -my (@name,@rest,@type,@code,@args,@flags,@longj,@cmnt); -my ($longest_name_length,$desc,$lastregop) = 0; -while () { - # Special pod comments - if (/^#\* ?/) { $cmnt[$ind] .= "# $'"; } - # Truly blank lines possibly surrounding pod comments - elsif (/^\s*$/) { $cmnt[$ind] .= "\n" } - - next if /^(?:#|\s*$)/; - chomp; # No \z in 5.004 - s/\s*$//; - if (/^-+\s*$/) { - $lastregop= $ind; - next; - } - unless ($lastregop) { - ($name[$ind], $desc, $rest[$ind]) = /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/; - ($type[$ind], $code[$ind], $args[$ind], $flags[$ind], $longj[$ind]) - = split /[,\s]\s*/, $desc; - $longest_name_length = length $name[$ind] - if length $name[$ind] > $longest_name_length; - ++$ind; +# NOTE I don't think anyone actually knows what all of these properties mean, +# and I suspect some of them are outright unused. This is a first attempt to +# clean up the generation so maybe one day we can move to something more self +# documenting. (One might argue that an array of hashes of properties would +# be easier to use.) +# +# Why we use the term regnode and nodes, and not say, opcodes, I am not sure. + +# General thoughts: +# 1. We use a single continuum to represent both opcodes and states, +# and in regexec.c we switch on the combined set. +# 2. Opcodes have more information associated to them, states are simpler, +# basically just an identifier/number that can be used to switch within +# the state machine. +# 3. Some opcode are order dependent. +# 4. Output files often use "tricks" to reduce diff effects. Some of what +# we do below is more clumsy looking than it could be because of this. + +# Op/state properties: +# +# Property In Descr +# ---------------------------------------------------------------------------- +# name Both Name of op/state +# id Both integer value for this opcode/state +# optype Both Either 'op' or 'state' +# line_num Both line_num number of the input file for this item. +# type Op Type of node (aka regkind) +# code Op what code is associated with this node (???) +# args Op what type of args the node has (which regnode struct) +# flags Op (???) +# longj Op Boolean as to if this node is a longjump +# comment Both Comment about node, if any +# pod_comment Both Special comments for pod output (preceding lines in def) + +# Global State +my @all; # all opcodes/state +my %all; # hash of all opcode/state names + +my @ops; # array of just opcodes +my @states; # array of just states + +my $longest_name_length= 0; # track lengths of names for nicer reports +my (%type_alias); # map the type (??) + +# register a newly constructed node into our state tables. +# ensures that we have no name collisions (on name anyway), +# and issues the "id" for the node. +sub register_node { + my ($node)= @_; + + if ( $all{ $node->{name} } ) { + die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} " + . "previously defined on line $all{ $node->{name} }{line_num}\n"; + } elsif (!$node->{optype}) { + die "must have an optype in node ", Dumper($node); + } elsif ($node->{optype} eq "op") { + push @ops, $node; + } elsif ($node->{optype} eq "state") { + push @states, $node; } else { - my ($type,@lists)=split /\s+/, $_; - die "No list? $type" if !@lists; - foreach my $list (@lists) { - my ($names,$special)=split /:/, $list , 2; - $special ||= ""; - foreach my $name (split /,/,$names) { - my $real= $name eq 'resume' - ? "resume_$type" - : "${type}_$name"; - my @suffix; - if (!$special) { - @suffix=(""); - } elsif ($special=~/\d/) { - @suffix=(1..$special); - } elsif ($special eq 'FAIL') { - @suffix=("","_fail"); - } else { - die "unknown :type ':$special'"; - } - foreach my $suffix (@suffix) { - $name[$ind]="$real$suffix"; - $type[$ind]=$type; - $rest[$ind]="state for $type"; - ++$ind; - } + die "Uknown optype '$node->{optype}' in ", Dumper($node); + } + $node->{id}= 0 + @all; + push @all, $node; + $all{ $node->{name} }= $node; + + if ($node->{longj} && $node->{longj} != 1) { + die "longj field must be in [01] if present in ", Dumper($node); + } + +} + +# Parse and add an opcode definition to the global state. +# An opcode definition looks like this: +# +# +- args +# | +- flags +# | | +- longjmp +# Name Type code | | | ; comment +# -------------------------------------------------------------------------- +# IFMATCH BRANCHJ, off 1 . 2 ; Succeeds if the following matches. +# UNLESSM BRANCHJ, off 1 . 2 ; Fails if the following matches. +# SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE. +# IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceded by switcher. +# GROUPP GROUPP, num 1 ; Whether the group matched. +# +# Not every opcode definition has all of these. We should maybe make this +# nicer/easier to read in the future. Also note that the above is tab +# sensitive. + +sub parse_opcode_def { + my ( $text, $line_num, $pod_comment )= @_; + my $node= { + line_num => $line_num, + pod_comment => $pod_comment, + optype => "op", + }; + + # first split the line into three, the initial NAME, a middle part + # that we call "desc" which contains various (not well documented) things, + # and a comment section. + @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/ + or die "Failed to match $_"; + + # the content of the "desc" field from the first step is extracted here: + @{$node}{qw(type code args flags longj)}= split /[,\s]\s*/, $node->{desc}; + + defined $node->{$_} or $node->{$_} = "" + for qw(type code args flags longj); + + register_node($node); # has to be before the type_alias code below + + if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) { + + #warn "Regop type '$node->{type}' from regcomp.sym line $line_num" + # ." is not an existing regop, and will be aliased to $node->{name}\n" + # if -t STDERR; + $type_alias{ $node->{type} }= $node->{name}; + } + + $longest_name_length= length $node->{name} + if length $node->{name} > $longest_name_length; +} + +# parse out a state definition and add the resulting data +# into the global state. may create multiple new states from +# a single definition (this is part of the point). +# Format for states: +# REGOP \t typelist [ \t typelist] +# typelist= namelist +# = namelist:FAIL +# = name:count +# Eg: +# WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL +# BRANCH next:FAIL +# CURLYM A,B:FAIL +# +# The CURLYM definition would create the states: +# CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail +sub parse_state_def { + my ( $text, $line_num, $pod_comment )= @_; + my ( $type, @lists )= split /\s+/, $text; + die "No list? $type" if !@lists; + foreach my $list (@lists) { + my ( $names, $special )= split /:/, $list, 2; + $special ||= ""; + foreach my $name ( split /,/, $names ) { + my $real= + $name eq 'resume' + ? "resume_$type" + : "${type}_$name"; + my @suffix; + if ( !$special ) { + @suffix= (""); + } + elsif ( $special =~ /\d/ ) { + @suffix= ( 1 .. $special ); + } + elsif ( $special eq 'FAIL' ) { + @suffix= ( "", "_fail" ); + } + else { + die "unknown :type ':$special'"; + } + foreach my $suffix (@suffix) { + my $node= { + name => "$real$suffix", + optype => "state", + type => $type || "", + comment => "state for $type", + line_num => $line_num, + }; + register_node($node); } } - } } -# use fixed width to keep the diffs between regcomp.pl recompiles -# as small as possible. -my ($width,$rwidth,$twidth)=(22,12,9); -$lastregop ||= $ind; -my $tot = $ind; -close DESC; -die "Too many regexp/state opcodes! Maximum is 256, but there are $lastregop in file!" - if $lastregop>256; sub process_flags { - my ($flag, $varname, $comment) = @_; - $comment = '' unless defined $comment; - - $ind = 0; - my @selected; - my $bitmap = ''; - do { - my $set = $flags[$ind] && $flags[$ind] eq $flag ? 1 : 0; - # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic - # ops in the C code. - my $current = do { - local $^W; - ord do { - substr $bitmap, ($ind >> 3); - } - }; - substr($bitmap, ($ind >> 3), 1) = chr($current | ($set << ($ind & 7))); - - push @selected, $name[$ind] if $set; - } while (++$ind < $lastregop); - my $out_string = join ', ', @selected, 0; - $out_string =~ s/(.{1,70},) /$1\n /g; + my ( $flag, $varname, $comment )= @_; + $comment= '' unless defined $comment; + + my @selected; + my $bitmap= ''; + for my $node (@ops) { + my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0; + + # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic + # ops in the C code. + my $current= do { + no warnings; + ord substr $bitmap, ( $node->{id} >> 3 ); + }; + substr( $bitmap, ( $node->{id} >> 3 ), 1 )= + chr( $current | ( $set << ( $node->{id} & 7 ) ) ); + + push @selected, $node->{name} if $set; + } + my $out_string= join ', ', @selected, 0; + $out_string =~ s/(.{1,70},) /$1\n /g; - my $out_mask = join ', ', map {sprintf "0x%02X", ord $_} split '', $bitmap; + my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap; - return $comment . <<"EOP"; + return $comment . <<"EOP"; #define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7))) #ifndef DOINIT @@ -135,31 +248,78 @@ EXTCONST U8 PL_${varname}_bitmask[] = { EOP } -my $out = open_new('regnodes.h', '>', - { by => 'regen/regcomp.pl', from => 'regcomp.sym' }); -printf $out <) { + + # Special pod comments + if (/^#\* ?/) { $pod_comment .= "# $'"; } + + # Truly blank lines possibly surrounding pod comments + elsif (/^\s*$/) { $pod_comment .= "\n" } + + next if /\A\s*#/ || /\A\s*\z/; + + s/\s*\z//; + if (/^-+\s*$/) { + $seen_sep= 1; + next; + } + + if ($seen_sep) { + parse_state_def( $_, $., $pod_comment ); + } + else { + parse_opcode_def( $_, $., $pod_comment ); + } + $pod_comment= ""; + } + close $in_fh; + die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all, + " in file!" + if @all > 256; +} + +# use fixed width to keep the diffs between regcomp.pl recompiles +# as small as possible. +my ( $width, $rwidth, $twidth )= ( 22, 12, 9 ); + +sub print_state_defs { + my ($out)= @_; + printf $out < $lastregop - 1, - -$width, REGMATCH_STATE_MAX => $tot - 1 -; - + -$width, + REGNODE_MAX => $#ops, + -$width, REGMATCH_STATE_MAX => $#all; + + my %rev_type_alias= reverse %type_alias; + for my $node (@ops) { + printf $out "#define\t%*s\t%d\t/* %#04x %s */\n", + -$width, $node->{name}, $node->{id}, $node->{id}, $node->{comment}; + if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) { + printf $out "#define\t%*s\t%d\t/* %#04x %s */\n", + -$width, $alias, $node->{id}, $node->{id}, "type alias"; + } + } -for ($ind=0; $ind < $lastregop ; ++$ind) { - printf $out "#define\t%*s\t%d\t/* %#04x %s */\n", - -$width, $name[$ind], $ind, $ind, $rest[$ind]; -} -print $out "\t/* ------------ States ------------- */\n"; -for ( ; $ind < $tot ; $ind++) { - printf $out "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n", - -$width, $name[$ind], $ind - $lastregop + 1, $rest[$ind]; + print $out "\t/* ------------ States ------------- */\n"; + for my $node (@states) { + printf $out "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n", + -$width, $node->{name}, $node->{id} - $#ops, $node->{comment}; + } } -print $out <{type} or !defined( $node->{name} ); + printf $out "\t%*s\t/* %*s */\n", + -1 - $twidth, "$node->{type},", -$width, $node->{name}; + print $out "\t/* ------------ States ------------- */\n" + if $node->{id} == $#ops and $node->{id} != $#all; + } -$ind = 0; -do { - printf $out "\t%*s\t/* %*s */\n", - -1-$twidth, "$type[$ind],", -$width, $name[$ind]; - print $out "\t/* ------------ States ------------- */\n" - if $ind + 1 == $lastregop and $lastregop != $tot; -} while (++$ind < $tot); - -print $out <($out) for @_; + print $out <{args}; -print $out <{name}; + } + + print $out <{longj} || 0; - printf $out "\t%d,\t/* %*s */\n", - $size, -$rwidth, $name[$ind] -} while (++$ind < $lastregop); + printf $out "\t%d,\t/* %*s */\n", $size, -$rwidth, $node->{name}; + } -print $out <{longj} || 0; + + printf $out "\t%*s\t/* $sym%#04x */\n", + -3 - $width, qq("$node->{name}",), $node->{id} - $ofs; + if ( $node->{id} == $#ops and @ops != @all ) { + print $out "\t/* ------------ States ------------- */\n"; + $ofs= $#ops; + $sym= 'REGNODE_MAX +'; + } + } -print $out <) { - - # optional leading '_'. Return symbol in $1, and strip it from - # rest of line - if (s/ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) { - chomp; - my $define = $1; - s: / \s* \* .*? \* \s* / : :x; # Replace comments by a blank - - # Replace any prior defined symbols by their values - foreach my $key (keys %definitions) { - s/\b$key\b/$definitions{$key}/g; - } + my %rxfv; + my %definitions; # Remember what the symbol definitions are + my $val= 0; + my %reverse; + my $REG_EXTFLAGS_NAME_SIZE= 0; + foreach my $file ( "op_reg_common.h", "regexp.h" ) { + open my $in_fh, "<", $file or die "Can't read '$file': $!"; + while (<$in_fh>) { + + # optional leading '_'. Return symbol in $1, and strip it from + # comment of line + if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) { + chomp; + my $define= $1; + my $orig= $_; + s{ /\* .*? \*/ }{ }x; # Replace comments by a blank + + # Replace any prior defined symbols by their values + foreach my $key ( keys %definitions ) { + s/\b$key\b/$definitions{$key}/g; + } - # Remove the U suffix from unsigned int literals - s/\b([0-9]+)U\b/$1/g; + # Remove the U suffix from unsigned int literals + s/\b([0-9]+)U\b/$1/g; - my $newval = eval $_; # Get numeric definition + my $newval= eval $_; # Get numeric definition - $definitions{$define} = $newval; + $definitions{$define}= $newval; - next unless $_ =~ /<) { + + # optional leading '_'. Return symbol in $1, and strip it from + # comment of line + if ( + m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi + ) + { + chomp; + my $define= $1; + my $abbr= $2; + my $hex= $3; + my $comment= $4; + my $val= hex($hex); + $comment= $comment ? " - $comment" : ""; + + printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",), + $val, $define, $comment; + $REG_INTFLAGS_NAME_SIZE++; } } } - $n=~s/^RXf_(PMf_)?//; - printf $out qq(\t%-20s/* 0x%08x */\n), - qq("$n",),$power_of_2; -} - -print $out <'); +sub do_perldebguts { + my $guts= open_new( 'pod/perldebguts.pod', '>' ); -my $code; -my $name_fmt = '<' x ($longest_name_length-1); -my $descr_fmt = '<' x (58-$longest_name_length); -eval <{pod_comment} + ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~ + \$node->{name}, \$code, defined \$node->{comment} ? \$node->{comment} : '' . +1; EOD + + my $old_fh= select($guts); + $~= "GuTS"; -select +(select($guts), do { - $~ = "GuTS"; - - open my $oldguts, "pod/perldebguts.pod" + open my $oldguts, '<', 'pod/perldebguts.pod' or die "$0 cannot open pod/perldebguts.pod for reading: $!"; - while(<$oldguts>) { + while (<$oldguts>) { print; last if /=for regcomp.pl begin/; } - print <<'end'; + print <<'END_OF_DESCR'; # TYPE arg-description [num-args] [longjump-len] DESCRIPTION -end - for (0..$lastregop-1) { - $code = "$code[$_] ".($args[$_]||""); - $code .= " $longj[$_]" if $longj[$_]; - if ($cmnt[$_] ||= "") { +END_OF_DESCR + for my $n (@ops) { + $node= $n; + $code= "$node->{code} " . ( $node->{args} || "" ); + $code .= " $node->{longj}" if $node->{longj}; + if ( $node->{pod_comment} ||= "" ) { + # Trim multiple blanks - $cmnt[$_] =~ s/^\n\n+/\n/; $cmnt[$_] =~ s/\n\n+$/\n\n/ + $node->{pod_comment} =~ s/^\n\n+/\n/; + $node->{pod_comment} =~ s/\n\n+$/\n\n/; } write; } print "\n"; - while(<$oldguts>) { + while (<$oldguts>) { last if /=for regcomp.pl end/; } - do { print } while <$oldguts>; + do { print } while <$oldguts>; #win32 can't unlink an open FH + close $oldguts or die "Error closing pod/perldebguts.pod: $!"; + select $old_fh; + close_and_rename($guts); +} -})[0]; +read_definition("regcomp.sym"); +my $out= open_new( 'regnodes.h', '>', + { by => 'regen/regcomp.pl', from => 'regcomp.sym' } ); +print_state_defs($out); +print_regkind($out); +wrap_ifdef_print( + $out, + "REG_COMP_C", + \&print_regarglen, + \&print_reg_off_by_arg +); +print_reg_name($out); +print_reg_extflags_name($out); +print_reg_intflags_name($out); +print_process_flags($out); +read_only_bottom_close_and_rename($out); -close_and_rename($guts); +do_perldebguts();