4 # Regenerate (overwriting only if changed):
9 # from information stored in
15 # pod/perldebguts.pod is not completely regenerated. Only the table of
16 # regexp nodes is replaced; other parts remain unchanged.
18 # Accepts the standard regen_lib -q and -v args.
20 # This script is normally invoked from regen.pl.
22 # F<regcomp.sym> defines the opcodes and states used in the regex
23 # engine, it also includes documentation on the opcodes. This script
24 # parses those definitions out and turns them into typedefs, defines,
25 # and data structures, and maybe even code which the regex engine can
28 # F<regexp.h> and op_reg_common.h contain defines C<RXf_xxx> and
29 # C<PREGf_xxx> that are used in flags in our code. These defines are
30 # parsed out and data structures are created to allow the debug mode of
31 # the regex engine to show things such as which flags were set during
32 # compilation. In some cases we transform the C code in the header files
33 # into perl code which we execute to C<eval()> the contents. For instance
34 # in a situation like this:
36 # #define RXf_X 0x1 /* the X mode */
37 # #define RXf_Y 0x2 /* the Y mode */
38 # #define RXf_Z (X|Y) /* the Z mode */
40 # this script might end up eval()ing something like C<0x1> and then
41 # C<0x2> and then C<(0x1|0x2)> the results of which it then might use in
42 # constructing a data structure, or pod in perldebguts, or a comment in
43 # C<regnodes.h>. It also would separate out the "X", "Y", and "Z" and
44 # use them, and would also use the data in the line comment if present.
46 # If you compile a regex under perl -Mre=Debug,ALL you can see much
47 # of the content that this file generates and parses out of its input
51 # Get function prototypes
52 require './regen/regen_lib.pl';
57 # NOTE I don't think anyone actually knows what all of these properties mean,
58 # and I suspect some of them are outright unused. This is a first attempt to
59 # clean up the generation so maybe one day we can move to something more self
60 # documenting. (One might argue that an array of hashes of properties would
63 # Why we use the term regnode and nodes, and not say, opcodes, I am not sure.
66 # 1. We use a single continuum to represent both opcodes and states,
67 # and in regexec.c we switch on the combined set.
68 # 2. Opcodes have more information associated to them, states are simpler,
69 # basically just an identifier/number that can be used to switch within
71 # 3. Some opcode are order dependent.
72 # 4. Output files often use "tricks" to reduce diff effects. Some of what
73 # we do below is more clumsy looking than it could be because of this.
75 # Op/state properties:
78 # ----------------------------------------------------------------------------
79 # name Both Name of op/state
80 # id Both integer value for this opcode/state
81 # optype Both Either 'op' or 'state'
82 # line_num Both line_num number of the input file for this item.
83 # type Op Type of node (aka regnode_kind)
84 # code Op Meta about the node, used to detect variable length nodes
85 # suffix Op which regnode struct this uses, so if this is '1', it
86 # uses 'struct regnode_1'
87 # flags Op S for simple; V for varies
88 # longj Op Boolean as to if this node is a longjump
89 # comment Both Comment about node, if any. Placed in perlredebguts
91 # pod_comment Both Special comments for pod output (preceding lines in def)
92 # Such lines begin with '#*'
95 my @all; # all opcodes/state
96 my %all; # hash of all opcode/state names
98 my @ops; # array of just opcodes
99 my @states; # array of just states
101 my $longest_name_length= 0; # track lengths of names for nicer reports
102 my (%type_alias); # map the type (??)
104 # register a newly constructed node into our state tables.
105 # ensures that we have no name collisions (on name anyway),
106 # and issues the "id" for the node.
110 if ( $all{ $node->{name} } ) {
111 die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} "
112 . "previously defined on line $all{ $node->{name} }{line_num}\n";
113 } elsif (!$node->{optype}) {
114 die "must have an optype in node ", Dumper($node);
115 } elsif ($node->{optype} eq "op") {
117 } elsif ($node->{optype} eq "state") {
120 die "Uknown optype '$node->{optype}' in ", Dumper($node);
122 $node->{id}= 0 + @all;
124 $all{ $node->{name} }= $node;
126 if ($node->{longj} && $node->{longj} != 1) {
127 die "longj field must be in [01] if present in ", Dumper($node);
132 # Parse and add an opcode definition to the global state.
133 # What an opcode definition looks like is given in regcomp.sym.
135 # Not every opcode definition has all of the components. We should maybe make
136 # this nicer/easier to read in the future. Also note that the above is tab
139 # Special comments for an entry precede it, and begin with '#*' and are placed
140 # in the generated pod file just before the entry.
142 sub parse_opcode_def {
143 my ( $text, $line_num, $pod_comment )= @_;
145 line_num => $line_num,
146 pod_comment => $pod_comment,
150 # first split the line into three, the initial NAME, a middle part
151 # that we call "desc" which contains various (not well documented) things,
152 # and a comment section.
153 @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/
154 or die "Failed to match $_";
156 # the content of the "desc" field from the first step is extracted here:
157 @{$node}{qw(type code suffix flags longj)}= split /[,\s]\s*/, $node->{desc};
159 defined $node->{$_} or $node->{$_} = ""
160 for qw(type code suffix flags longj);
162 register_node($node); # has to be before the type_alias code below
164 if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) {
166 #warn "Regop type '$node->{type}' from regcomp.sym line $line_num"
167 # ." is not an existing regop, and will be aliased to $node->{name}\n"
169 $type_alias{ $node->{type} }= $node->{name};
172 $longest_name_length= length $node->{name}
173 if length $node->{name} > $longest_name_length;
176 # parse out a state definition and add the resulting data
177 # into the global state. may create multiple new states from
178 # a single definition (this is part of the point).
180 # REGOP \t typelist [ \t typelist]
185 # WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL
189 # The CURLYM definition would create the states:
190 # CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail
191 sub parse_state_def {
192 my ( $text, $line_num, $pod_comment )= @_;
193 my ( $type, @lists )= split /\s+/, $text;
194 die "No list? $type" if !@lists;
195 foreach my $list (@lists) {
196 my ( $names, $special )= split /:/, $list, 2;
198 foreach my $name ( split /,/, $names ) {
207 elsif ( $special =~ /\d/ ) {
208 @suffix= ( 1 .. $special );
210 elsif ( $special eq 'FAIL' ) {
211 @suffix= ( "", "_fail" );
214 die "unknown :type ':$special'";
216 foreach my $suffix (@suffix) {
218 name => "$real$suffix",
221 comment => "state for $type",
222 line_num => $line_num,
224 register_node($node);
231 my ( $flag, $varname, $comment )= @_;
232 $comment= '' unless defined $comment;
236 for my $node (@ops) {
237 my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0;
239 # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic
243 ord substr $bitmap, ( $node->{id} >> 3 );
245 substr( $bitmap, ( $node->{id} >> 3 ), 1 )=
246 chr( $current | ( $set << ( $node->{id} & 7 ) ) );
248 push @selected, $node->{name} if $set;
250 my $out_string= join ', ', @selected, 0;
251 $out_string =~ s/(.{1,70},) /$1\n /g;
253 my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap;
255 return $comment . <<"EOP";
256 #define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7)))
259 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__;
261 EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = {
267 EXTCONST U8 PL_${varname}_bitmask[];
269 EXTCONST U8 PL_${varname}_bitmask[] = {
276 sub print_process_EXACTish {
279 # Creates some bitmaps for EXACTish nodes.
285 for my $node (@ops) {
286 next unless $node->{type} eq 'EXACT';
287 my $name = $node->{name};
288 $base = $node->{id} if $name eq 'EXACT';
290 my $index = $node->{id} - $base;
292 # This depends entirely on naming conventions in regcomp.sym
293 $folded[$index] = $name =~ /^EXACTF/ || 0;
294 $req8[$index] = $name =~ /8/ || 0;
297 die "Can't cope with > 32 EXACTish nodes" if @folded > 32;
299 my $exactf = sprintf "%X", oct("0b" . join "", reverse @folded);
300 my $req8 = sprintf "%X", oct("0b" . join "", reverse @req8);
303 /* Is 'op', known to be of type EXACT, folding? */
304 #define isEXACTFish(op) (__ASSERT_(REGNODE_TYPE(op) == EXACT) (PL_EXACTFish_bitmask & (1U << (op - EXACT))))
306 /* Do only UTF-8 target strings match 'op', known to be of type EXACT? */
307 #define isEXACT_REQ8(op) (__ASSERT_(REGNODE_TYPE(op) == EXACT) (PL_EXACT_REQ8_bitmask & (1U << (op - EXACT))))
310 EXTCONST U32 PL_EXACTFish_bitmask;
311 EXTCONST U32 PL_EXACT_REQ8_bitmask;
313 EXTCONST U32 PL_EXACTFish_bitmask = 0x$exactf;
314 EXTCONST U32 PL_EXACT_REQ8_bitmask = 0x$req8;
319 sub read_definition {
321 my ( $seen_sep, $pod_comment )= "";
322 open my $in_fh, "<", $file
323 or die "Failed to open '$file' for reading: $!";
326 # Special pod comments
327 if (/^#\* ?/) { $pod_comment .= "# $'"; }
329 # Truly blank lines possibly surrounding pod comments
330 elsif (/^\s*$/) { $pod_comment .= "\n" }
332 next if /\A\s*#/ || /\A\s*\z/;
341 parse_state_def( $_, $., $pod_comment );
344 parse_opcode_def( $_, $., $pod_comment );
349 die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all,
354 # use fixed width to keep the diffs between regcomp.pl recompiles
355 # as small as possible.
356 my ( $base_name_width, $rwidth, $twidth )= ( 22, 12, 9 );
358 sub print_state_defs {
361 /* Regops and State definitions */
368 REGNODE_MAX => $#ops,
369 -$base_name_width, REGMATCH_STATE_MAX => $#all;
371 my %rev_type_alias= reverse %type_alias;
372 my $base_format = "#define %*s\t%d\t/* %#04x %s */\n";
376 my $max_name_width = 0;
377 for my $ref (\@ops, \@states) {
378 for my $node ($ref->@*) {
379 my $len = length $node->{name};
380 $max_name_width = $len if $max_name_width < $len;
384 die "Do a white-space only commit to increase \$base_name_width to"
385 . " $max_name_width; then re-run" if $base_name_width < $max_name_width;
388 /* -- For regexec.c to switch on target being utf8 (t8) or not (tb, b='byte'); */
389 #define with_t_UTF8ness(op, t_utf8) (((op) << 1) + (cBOOL(t_utf8)))
390 /* -- same, but also with pattern (p8, pb) -- */
391 #define with_tp_UTF8ness(op, t_utf8, p_utf8) \\
392 \t\t(((op) << 2) + (cBOOL(t_utf8) << 1) + cBOOL(p_utf8))
394 /* The #defines below give both the basic regnode and the expanded version for
395 switching on utf8ness */
398 for my $node (@ops) {
399 print_state_def_line($out, $node->{name}, $node->{id}, $node->{comment});
400 if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) {
401 print_state_def_line($out, $alias, $node->{id}, $node->{comment});
405 print $out "\t/* ------------ States ------------- */\n";
406 for my $node (@states) {
407 print_state_def_line($out, $node->{name}, $node->{id}, $node->{comment});
411 sub print_state_def_line
413 my ($fh, $name, $id, $comment) = @_;
415 # The sub-names are like '_tb' or '_tb_p8' = max 6 chars wide
416 my $name_col_width = $base_name_width + 6;
417 my $base_id_width = 3; # Max is '255' or 3 cols
418 my $mid_id_width = 3; # Max is '511' or 3 cols
419 my $full_id_width = 3; # Max is '1023' but not close to using the 4th
421 my $line = "#define " . $name;
422 $line .= " " x ($name_col_width - length($name));
424 $line .= sprintf "%*s", $base_id_width, $id;
425 $line .= " " x $mid_id_width;
426 $line .= " " x ($full_id_width + 2);
429 my $hanging = length $line; # Indent any subsequent line to this pos
430 $line .= sprintf "0x%02x", $id;
434 # From the documentation: 'In fact, every resulting line will have length
435 # of no more than "$columns - 1"'
436 $line = wrap($columns + 1, "", " " x $hanging, "$line $comment");
437 chomp $line; # wrap always adds a trailing \n
438 $line =~ s/ \s+ $ //x; # trim, just in case.
440 # The comment may have wrapped. Find the final \n and measure the length
441 # to the end. If it is short enough, just append the ' */' to the line.
442 # If it is too close to the end of the space available, add an extra line
443 # that consists solely of blanks and the ' */'
444 my $len = length($line); my $rindex = rindex($line, "\n");
445 if (length($line) - rindex($line, "\n") - 1 <= $columns - 3) {
449 $line .= "\n" . " " x ($hanging - 3) . "*/\n";
454 # And add the 2 subsidiary #defines used when switching on
456 my $with_id_t = $id * 2;
457 for my $with (qw(tb t8)) {
458 my $with_name = "${name}_$with";
459 print $fh "#define ", $with_name;
460 print $fh " " x ($name_col_width - length($with_name) + $base_id_width);
461 printf $fh "%*s", $mid_id_width, $with_id_t;
462 print $fh " " x $full_id_width;
464 print $fh " " x (4 + 2); # 4 is width of 0xHH that the base entry uses
465 printf $fh "0x%03x */\n", $with_id_t;
470 # Finally add the 4 subsidiary #defines used when switching on
472 my $with_id_tp = $id * 4;
473 for my $with (qw(tb_pb tb_p8 t8_pb t8_p8)) {
474 my $with_name = "${name}_$with";
475 print $fh "#define ", $with_name;
476 print $fh " " x ($name_col_width - length($with_name) + $base_id_width + $mid_id_width);
477 printf $fh "%*s", $full_id_width, $with_id_tp;
479 print $fh " " x (4 + 2); # 4 is width of 0xHH that the base entry uses
480 printf $fh "0x%03x */\n", $with_id_tp;
485 print $fh "\n"; # Blank line separates groups for clarity
492 /* typedefs for regex nodes - one typedef per node type */
496 foreach my $node (@ops) {
497 if ($node->{suffix} and $len < length($node->{suffix})) {
498 $len= length $node->{suffix};
501 $len += length "struct regnode_";
502 $len = (int($len/5)+2)*5;
503 my $prefix= "tregnode";
505 foreach my $node (sort { $a->{name} cmp $b->{name} } @ops) {
506 my $struct_name= "struct regnode";
507 if (my $suffix= $node->{suffix}) {
508 $struct_name .= "_$suffix";
510 $node->{typedef}= $prefix . "_" . $node->{name};
511 printf $out "typedef %*s %s;\n", -$len, $struct_name, $node->{typedef};
524 sub print_regnode_info {
528 /* PL_regnode_info[] - Opcode/state names in string form, for debugging */
531 EXTCONST struct regnode_meta PL_regnode_info[];
533 EXTCONST struct regnode_meta PL_regnode_info[] = {
535 my @fields= qw(type arg_len arg_len_varies off_by_arg);
536 foreach my $node_idx (0..$#all) {
537 my $node= $all[$node_idx];
540 $size= "EXTRA_SIZE($node->{typedef})" if $node->{suffix};
541 $node->{arg_len}= $size;
546 $varies= 1 if $node->{code} and $node->{code}=~"str";
547 $node->{arg_len_varies}= $varies;
549 $node->{off_by_arg}= $node->{longj} || 0;
551 print $out " /* #$node_idx $node->{optype} $node->{name} */\n";
552 foreach my $f_idx (0..$#fields) {
553 my $field= $fields[$f_idx];
554 printf $out " .%s = %s", $field, $node->{$field} // 0;
555 printf $out $f_idx == $#fields ? "\n" : ",\n";
558 print $out $node_idx==$#all ? "\n" : ",\n";
569 sub print_regnode_name {
573 /* PL_regnode_name[] - Opcode/state names in string form, for debugging */
576 EXTCONST char * PL_regnode_name[];
578 EXTCONST char * const PL_regnode_name[] = {
583 foreach my $node (@all) {
584 printf $out "\t%*s\t/* $sym%#04x */\n",
585 -3 - $base_name_width, qq("$node->{name}",), $node->{id} - $ofs;
586 if ( $node->{id} == $#ops and @ops != @all ) {
587 print $out "\t/* ------------ States ------------- */\n";
589 $sym= 'REGNODE_MAX +';
600 sub print_reg_extflags_name {
603 /* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */
606 EXTCONST char * PL_reg_extflags_name[];
608 EXTCONST char * const PL_reg_extflags_name[] = {
612 my %definitions; # Remember what the symbol definitions are
615 my $REG_EXTFLAGS_NAME_SIZE= 0;
616 foreach my $file ( "op_reg_common.h", "regexp.h" ) {
617 open my $in_fh, "<", $file or die "Can't read '$file': $!";
620 # optional leading '_'. Return symbol in $1, and strip it from
621 # comment of line. Currently doesn't handle comments running onto
623 if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) {
627 s{ /\* .*? \*/ }{ }x; # Replace comments by a blank
629 # Replace any prior defined symbols by their values
630 foreach my $key ( keys %definitions ) {
631 s/\b$key\b/$definitions{$key}/g;
634 # Remove the U suffix from unsigned int literals
635 s/\b([0-9]+)U\b/$1/g;
637 my $newval= eval $_; # Get numeric definition
639 $definitions{$define}= $newval;
641 next unless $_ =~ /<</; # Bit defines use left shift
642 if ( $val & $newval ) {
643 my @names= ( $define, $reverse{$newval} );
645 if ( $names[0] ne $names[1] ) {
647 "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)",
653 $rxfv{$define}= $newval;
654 $reverse{$newval}= $define;
658 my %vrxf= reverse %rxfv;
659 printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N',
663 my $power_of_2= 2**$_;
664 my $n= $vrxf{$power_of_2};
668 # Here, there was no name that matched exactly the bit. It could be
669 # either that it is unused, or the name matches multiple bits.
670 if ( !( $val & $power_of_2 ) ) {
675 # Here, must be because it matches multiple bits. Look through
676 # all possibilities until find one that matches this one. Use
677 # that name, and all the bits it matches
678 foreach my $name ( keys %rxfv ) {
679 if ( $rxfv{$name} & $power_of_2 ) {
680 $n= $name . ( $multibits{$name}++ );
681 $extra= sprintf qq{ : "%s" - 0x%08x}, $name,
683 if $power_of_2 != $rxfv{$name};
689 s/\bRXf_(PMf_)?// for $n, $extra;
690 printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra;
691 $REG_EXTFLAGS_NAME_SIZE++;
699 # define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE
705 sub print_reg_intflags_name {
709 /* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */
712 EXTCONST char * PL_reg_intflags_name[];
714 EXTCONST char * const PL_reg_intflags_name[] = {
718 my %definitions; # Remember what the symbol definitions are
721 my $REG_INTFLAGS_NAME_SIZE= 0;
722 foreach my $file ("regcomp.h") {
723 open my $fh, "<", $file or die "Can't read $file: $!";
726 # optional leading '_'. Return symbol in $1, and strip it from
729 m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi
738 $comment= $comment ? " - $comment" : "";
740 printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",),
741 $val, $define, $comment;
742 $REG_INTFLAGS_NAME_SIZE++;
754 # define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE
760 sub print_process_flags {
763 print $out process_flags( 'V', 'varies', <<'EOC');
764 /* The following have no fixed length. U8 so we can do strchr() on it. */
767 print $out process_flags( 'S', 'simple', <<'EOC');
769 /* The following always have a length of 1. U8 we can do strchr() on it. */
770 /* (Note that length 1 means "one character" under UTF8, not "one octet".) */
776 my $guts= open_new( 'pod/perldebguts.pod', '>' );
780 my $name_fmt= '<' x ( $longest_name_length - 1 );
781 my $descr_fmt= '<' x ( 58 - $longest_name_length );
782 eval <<EOD or die $@;
785 \$node->{pod_comment}
786 ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~
787 \$node->{name}, \$code, defined \$node->{comment} ? \$node->{comment} : ''
792 my $old_fh= select($guts);
795 open my $oldguts, '<', 'pod/perldebguts.pod'
796 or die "$0 cannot open pod/perldebguts.pod for reading: $!";
799 last if /=for regcomp.pl begin/;
802 print <<'END_OF_DESCR';
804 # TYPE arg-description [regnode-struct-suffix] [longjump-len] DESCRIPTION
808 $code= "$node->{code} " . ( $node->{suffix} || "" );
809 $code .= " $node->{longj}" if $node->{longj};
810 if ( $node->{pod_comment} ||= "" ) {
812 # Trim multiple blanks
813 $node->{pod_comment} =~ s/^\n\n+/\n/;
814 $node->{pod_comment} =~ s/\n\n+$/\n\n/;
821 last if /=for regcomp.pl end/;
823 do { print } while <$oldguts>; #win32 can't unlink an open FH
824 close $oldguts or die "Error closing pod/perldebguts.pod: $!";
826 close_and_rename($guts);
829 my $confine_to_core = 'defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD)';
830 read_definition("regcomp.sym");
832 require Data::Dumper;
833 print Data::Dumper::Dumper(\@all);
836 my $out= open_new( 'regnodes.h', '>',
838 by => 'regen/regcomp.pl',
839 from => [ 'regcomp.sym', 'op_reg_common.h', 'regexp.h' ],
842 print $out "#if $confine_to_core\n\n";
843 print_typedefs($out);
844 print_state_defs($out);
846 print_regnode_name($out);
847 print_regnode_info($out);
850 print_reg_extflags_name($out);
851 print_reg_intflags_name($out);
852 print_process_flags($out);
853 print_process_EXACTish($out);
854 print $out "\n#endif /* $confine_to_core */\n";
855 read_only_bottom_close_and_rename($out);