Porting/bench.pl

   1 #!/usr/bin/perl
   2 #
   3 # A tool for analysing the performance of the code snippets found in
   4 # t/perf/benchmarks or similar
   5
   6
   7 =head1 NAME
   8
   9 bench.pl - Compare the performance of perl code snippets across multiple
  10 perls.
  11
  12 =head1 SYNOPSIS
  13
  14     # Basic: run the tests in t/perf/benchmarks against two or
  15     # more perls
  16
  17     bench.pl [options] perl1[=label1] perl2[=label2] ...
  18
  19     # Run bench.pl's own built-in sanity tests
  20
  21     bench.pl --action=selftest
  22
  23 =head1 DESCRIPTION
  24
  25 By default, F<bench.pl> will run code snippets found in
  26 F<t/perf/benchmarks> (or similar) under cachegrind, in order to calculate
  27 how many instruction reads, data writes, branches, cache misses, etc. that
  28 one execution of the snippet uses. It will run them against two or more
  29 perl executables and show how much each test has gotten better or worse.
  30
  31 It is modelled on the F<perlbench> tool, but since it measures instruction
  32 reads etc., rather than timings, it is much more precise and reproducible.
  33 It is also considerably faster, and is capable or running tests in
  34 parallel (with C<-j>). Rather than  displaying a single relative
  35 percentage per test/perl combination, it displays values for 13 different
  36 measurements, such as instruction reads, conditional branch misses etc.
  37
  38 There are options to write the raw data to a file, and to read it back.
  39 This means that you can view the same run data in different views with
  40 different selection and sort options.
  41
  42 The optional C<=label> after each perl executable is used in the display
  43 output.
  44
  45 =head1 OPTIONS
  46
  47 =over 4
  48
  49 =item *
  50
  51 --action=I<foo>
  52
  53 What action to perform. The default is  I<grind>, which runs the benchmarks
  54 using I<cachegrind> as the back end. The only other action at the moment is
  55 I<selftest>, which runs some basic sanity checks and produces TAP output.
  56
  57 =item *
  58
  59 --average
  60
  61 Only display the overall average, rather than the results for each
  62 individual test.
  63
  64 =item *
  65
  66 --benchfile=I<foo>
  67
  68 The path of the file which contains the benchmarks (F<t/perf/benchmarks>
  69 by default).
  70
  71 =item *
  72
  73 --bisect=I<field,minval,maxval>
  74
  75 Run a single test against one perl and exit with a zero status if the
  76 named field is in the specified range; exit 1 otherwise. It will complain
  77 if more than one test or perl has been specified. It is intended to be
  78 called as part of a bisect run, to determine when something changed.
  79 For example,
  80
  81     bench.pl -j 8 --tests=foo --bisect=Ir,100,105 --perlargs=-Ilib \
  82         ./miniperl
  83
  84 might be called from bisect to find when the number of instruction reads
  85 for test I<foo> falls outside the range 100..105.
  86
  87 =item *
  88
  89 --debug
  90
  91 Enable verbose debugging output.
  92
  93 =item *
  94
  95 --fields=I<a,b,c>
  96
  97 Display only the specified fields; for example,
  98
  99     --fields=Ir,Ir_m,Ir_mm
 100
 101 If only one field is selected, the output is in more compact form.
 102
 103 =item *
 104
 105 --grindargs=I<foo>
 106
 107 Optional command-line arguments to pass to cachegrind invocations.
 108
 109 =item *
 110
 111 ---help
 112
 113 Display basic usage information.
 114
 115 =item *
 116
 117 -j I<N>
 118 --jobs=I<N>
 119
 120 Run I<N> jobs in parallel (default 1). This determines how many cachegrind
 121 process will running at a time, and should generally be set to the number
 122 of CPUs available.
 123
 124 =item *
 125
 126 --norm=I<foo>
 127
 128 Specify which perl column in the output to treat as the 100% norm.
 129 It may be a column number (0..N-1) or a perl executable name or label.
 130 It defaults to the leftmost column.
 131
 132 =item *
 133
 134 --perlargs=I<foo>
 135
 136 Optional command-line arguments to pass to each perl that is run as part of
 137 a cachegrind session. For example, C<--perlargs=-Ilib>.
 138
 139 =item *
 140
 141 --raw
 142
 143 Display raw data counts rather than percentages in the outputs. This
 144 allows you to see the exact number of intruction reads, branch misses etc.
 145 for each test/perl combination. It also causes the C<AVERAGE> display
 146 per field to be calculated based on the average of each tests's count
 147 rather than average of each percentage. This means that tests with very
 148 high counts will dominate.
 149
 150 =item *
 151
 152 --sort=I<field:perl>
 153
 154 Order the tests in the output based on the value of I<field> in the
 155 column I<perl>. The I<perl> value is as per C<--norm>. For example
 156
 157     bench.pl --sort=Dw:perl-5.20.0 \
 158         perl-5.16.0 perl-5.18.0 perl-5.20.0
 159
 160 =item *
 161
 162 -r I<file>
 163 --read=I<file>
 164
 165 Read in saved data from a previous C<--write> run from the specified file.
 166
 167 Requires C<JSON::PP> to be available.
 168
 169 =item *
 170
 171 --tests=I<FOO>
 172
 173 Specify a subset of tests to run (or in the case of C<--read>, to display).
 174 It may be either a comma-separated list of test names, or a regular
 175 expression. For example
 176
 177     --tests=expr::assign::scalar_lex,expr::assign::2list_lex
 178     --tests=/^expr::/
 179
 180 =item *
 181
 182 --verbose
 183
 184 Display progress information.
 185
 186 =item *
 187
 188 -w I<file>
 189 --write=I<file>
 190
 191 Save the raw data to the specified file. It can be read back later with
 192 C<--read>.
 193
 194 Requires C<JSON::PP> to be available.
 195
 196 =back
 197
 198 =cut
 199
 200
 201
 202 use 5.010000;
 203 use warnings;
 204 use strict;
 205 use Getopt::Long qw(:config no_auto_abbrev);
 206 use IPC::Open2 ();
 207 use IO::Select;
 208 use IO::File;
 209 use POSIX ":sys_wait_h";
 210
 211 # The version of the file format used to save data. We refuse to process
 212 # the file if the integer component differs.
 213
 214 my $FORMAT_VERSION = 1.0;
 215
 216 # The fields we know about
 217
 218 my %VALID_FIELDS = map { $_ => 1 }
 219     qw(Ir Ir_m1 Ir_mm Dr Dr_m1 Dr_mm Dw Dw_m1 Dw_mm COND COND_m IND IND_m);
 220
 221 sub usage {
 222     die <<EOF;
 223 usage: $0 [options] perl[=label] ...
 224   --action=foo       What action to perform [default: grind].
 225   --average          Only display average, not individual test results.
 226   --benchfile=foo    File containing the benchmarks;
 227                        [default: t/perf/benchmarks].
 228   --bisect=f,min,max run a single test against one perl and exit with a
 229                        zero status if the named field is in the specified
 230                        range; exit 1 otherwise.
 231   --debug            Enable verbose debugging output.
 232   --fields=a,b,c     Display only the specified fields (e.g. Ir,Ir_m,Ir_mm).
 233   --grindargs=foo    Optional command-line args to pass to cachegrind.
 234   --help             Display this help.
 235   -j|--jobs=N        Run N jobs in parallel [default 1].
 236   --norm=perl        Which perl column to treat as 100%; may be a column
 237                        number (0..N-1) or a perl executable name or label;
 238                        [default: 0].
 239   --perlargs=foo     Optional command-line args to pass to each perl to run.
 240   --raw              Display raw data counts rather than percentages.
 241   --sort=field:perl  Sort the tests based on the value of 'field' in the
 242                        column 'perl'. The perl value is as per --norm.
 243   -r|--read=file     Read in previously saved data from the specified file.
 244   --tests=FOO        Select only the specified tests from the benchmarks file;
 245                        FOO may be either of the form 'foo,bar' or '/regex/';
 246                        [default: all tests].
 247   --verbose          Display progress information.
 248   -w|--write=file    Save the raw data to the specified file.
 249
 250 --action is one of:
 251     grind            run the code under cachegrind
 252     selftest         perform a selftest; produce TAP output
 253
 254 The command line ends with one or more specified perl executables,
 255 which will be searched for in the current \$PATH. Each binary name may
 256 have an optional =LABEL appended, which will be used rather than the
 257 executable name in output. E.g.
 258
 259     perl-5.20.1=PRE-BUGFIX  perl-5.20.1-new=POST-BUGFIX
 260 EOF
 261 }
 262
 263 my %OPTS = (
 264     action    => 'grind',
 265     average   => 0,
 266     benchfile => 't/perf/benchmarks',
 267     bisect    => undef,
 268     debug     => 0,
 269     grindargs => '',
 270     fields    => undef,
 271     jobs      => 1,
 272     norm      => 0,
 273     perlargs  => '',
 274     raw       => 0,
 275     read      => undef,
 276     sort      => undef,
 277     tests     => undef,
 278     verbose   => 0,
 279     write     => undef,
 280 );
 281
 282
 283 # process command-line args and call top-level action
 284
 285 {
 286     GetOptions(
 287         'action=s'    => \$OPTS{action},
 288         'average'     => \$OPTS{average},
 289         'benchfile=s' => \$OPTS{benchfile},
 290         'bisect=s'    => \$OPTS{bisect},
 291         'debug'       => \$OPTS{debug},
 292         'grindargs=s' => \$OPTS{grindargs},
 293         'help'        => \$OPTS{help},
 294         'fields=s'    => \$OPTS{fields},
 295         'jobs|j=i'    => \$OPTS{jobs},
 296         'norm=s'      => \$OPTS{norm},
 297         'perlargs=s'  => \$OPTS{perlargs},
 298         'raw'         => \$OPTS{raw},
 299         'read|r=s'    => \$OPTS{read},
 300         'sort=s'      => \$OPTS{sort},
 301         'tests=s'     => \$OPTS{tests},
 302         'verbose'     => \$OPTS{verbose},
 303         'write|w=s'   => \$OPTS{write},
 304     ) or usage;
 305
 306     usage if $OPTS{help};
 307
 308
 309     if (defined $OPTS{read} and defined $OPTS{write}) {
 310         die "Error: can't specify both --read and --write options\n";
 311     }
 312
 313     if (defined $OPTS{read} or defined $OPTS{write}) {
 314         # fail early if it's not present
 315         require JSON::PP;
 316     }
 317
 318     if (defined $OPTS{fields}) {
 319         my @f = split /,/, $OPTS{fields};
 320         for (@f) {
 321             die "Error: --fields: unknown field '$_'\n"
 322                 unless $VALID_FIELDS{$_};
 323         }
 324         my %f = map { $_ => 1 } @f;
 325         $OPTS{fields} = \%f;
 326     }
 327
 328     my %valid_actions = qw(grind 1 selftest 1);
 329     unless ($valid_actions{$OPTS{action}}) {
 330         die "Error: unrecognised action '$OPTS{action}'\n"
 331           . "must be one of: " . join(', ', sort keys %valid_actions)."\n";
 332     }
 333
 334     if (defined $OPTS{sort}) {
 335         my @s = split /:/, $OPTS{sort};
 336         if (@s != 2) {
 337             die "Error: --sort argument should be of the form field:perl: "
 338               . "'$OPTS{sort}'\n";
 339         }
 340         my ($field, $perl) = @s;
 341         die "Error: --sort: unknown field '$field\n"
 342             unless $VALID_FIELDS{$field};
 343         # the 'perl' value will be validated later, after we have processed
 344         # the perls
 345         $OPTS{'sort-field'} = $field;
 346         $OPTS{'sort-perl'}  = $perl;
 347     }
 348
 349     if ($OPTS{action} eq 'selftest') {
 350         if (@ARGV) {
 351             die "Error: no perl executables may be specified with --read\n"
 352         }
 353     }
 354     elsif (defined $OPTS{bisect}) {
 355         die "Error: exactly one perl executable must be specified for bisect\n"
 356                                                 unless @ARGV == 1;
 357         die "Error: Can't specify both --bisect and --read\n"
 358                                                 if defined $OPTS{read};
 359         die "Error: Can't specify both --bisect and --write\n"
 360                                                 if defined $OPTS{write};
 361     }
 362     elsif (defined $OPTS{read}) {
 363         if (@ARGV) {
 364             die "Error: no perl executables may be specified with --read\n"
 365         }
 366     }
 367     elsif ($OPTS{raw}) {
 368         unless (@ARGV) {
 369             die "Error: at least one perl executable must be specified\n";
 370         }
 371     }
 372     else {
 373         unless (@ARGV >= 2) {
 374             die "Error: at least two perl executables must be specified\n";
 375         }
 376     }
 377
 378     if ($OPTS{action} eq 'grind') {
 379         do_grind(\@ARGV);
 380     }
 381     elsif ($OPTS{action} eq 'selftest') {
 382         do_selftest();
 383     }
 384 }
 385 exit 0;
 386
 387
 388 # Given a hash ref keyed by test names, filter it by deleting unwanted
 389 # tests, based on $OPTS{tests}.
 390
 391 sub filter_tests {
 392     my ($tests) = @_;
 393
 394     my $opt = $OPTS{tests};
 395     return unless defined $opt;
 396
 397     my @tests;
 398
 399     if ($opt =~ m{^/}) {
 400         $opt =~ s{^/(.+)/$}{$1}
 401             or die "Error: --tests regex must be of the form /.../\n";
 402         for (keys %$tests) {
 403             delete $tests->{$_} unless /$opt/;
 404         }
 405     }
 406     else {
 407         my %t;
 408         for (split /,/, $opt) {
 409             die "Error: no such test found: '$_'\n" unless exists $tests->{$_};
 410             $t{$_} = 1;
 411         }
 412         for (keys %$tests) {
 413             delete $tests->{$_} unless exists $t{$_};
 414         }
 415     }
 416 }
 417
 418
 419 # Read in the test file, and filter out any tests excluded by $OPTS{tests}
 420
 421 sub read_tests_file {
 422     my ($file) = @_;
 423
 424     my $ta = do $file;
 425     unless ($ta) {
 426         die "Error: can't parse '$file': $@\n" if $@;
 427         die "Error: can't read '$file': $!\n";
 428     }
 429
 430     my $t = { @$ta };
 431     filter_tests($t);
 432     return $t;
 433 }
 434
 435
 436 # Process the perl/column argument of options like --norm and --sort.
 437 # Return the index of the matching perl.
 438
 439 sub select_a_perl {
 440     my ($perl, $perls, $who) = @_;
 441
 442     if ($perl =~ /^[0-9]$/) {
 443         die "Error: $who value $perl outside range 0.." . $#$perls . "\n"
 444                                         unless $perl < @$perls;
 445         return $perl;
 446     }
 447     else {
 448         my @perl = grep    $perls->[$_][0] eq $perl
 449                         || $perls->[$_][1] eq $perl,
 450                         0..$#$perls;
 451         die "Error: $who: unrecognised perl '$perl'\n"
 452                                         unless @perl;
 453         die "Error: $who: ambiguous perl '$perl'\n"
 454                                         if @perl > 1;
 455         return $perl[0];
 456     }
 457 }
 458
 459
 460 # Validate the list of perl=label on the command line.
 461 # Return a list of [ exe, label ] pairs.
 462
 463 sub process_perls {
 464     my @results;
 465     for my $p (@_) {
 466         my ($perl, $label) = split /=/, $p, 2;
 467         $label //= $perl;
 468         my $r = qx($perl -e 'print qq(ok\n)' 2>&1);
 469         die "Error: unable to execute '$perl': $r" if $r ne "ok\n";
 470         push @results, [ $perl, $label ];
 471     }
 472     return @results;
 473 }
 474
 475
 476
 477 # Return a string containing perl test code wrapped in a loop
 478 # that runs $ARGV[0] times
 479
 480 sub make_perl_prog {
 481     my ($test, $desc, $setup, $code) = @_;
 482
 483     return <<EOF;
 484 # $desc
 485 package $test;
 486 BEGIN { srand(0) }
 487 $setup;
 488 for my \$__loop__ (1..\$ARGV[0]) {
 489     $code;
 490 }
 491 EOF
 492 }
 493
 494
 495 # Parse the output from cachegrind. Return a hash ref.
 496 # See do_selftest() for examples of the output format.
 497
 498 sub parse_cachegrind {
 499     my ($output, $id, $perl) = @_;
 500
 501     my %res;
 502
 503     my @lines = split /\n/, $output;
 504     for (@lines) {
 505         unless (s/(==\d+==)|(--\d+--) //) {
 506             die "Error: while executing $id:\n"
 507               . "unexpected code or cachegrind output:\n$_\n";
 508         }
 509         if (/I   refs:\s+([\d,]+)/) {
 510             $res{Ir} = $1;
 511         }
 512         elsif (/I1  misses:\s+([\d,]+)/) {
 513             $res{Ir_m1} = $1;
 514         }
 515         elsif (/LLi misses:\s+([\d,]+)/) {
 516             $res{Ir_mm} = $1;
 517         }
 518         elsif (/D   refs:\s+.*?([\d,]+) rd .*?([\d,]+) wr/) {
 519             @res{qw(Dr Dw)} = ($1,$2);
 520         }
 521         elsif (/D1  misses:\s+.*?([\d,]+) rd .*?([\d,]+) wr/) {
 522             @res{qw(Dr_m1 Dw_m1)} = ($1,$2);
 523         }
 524         elsif (/LLd misses:\s+.*?([\d,]+) rd .*?([\d,]+) wr/) {
 525             @res{qw(Dr_mm Dw_mm)} = ($1,$2);
 526         }
 527         elsif (/Branches:\s+.*?([\d,]+) cond .*?([\d,]+) ind/) {
 528             @res{qw(COND IND)} = ($1,$2);
 529         }
 530         elsif (/Mispredicts:\s+.*?([\d,]+) cond .*?([\d,]+) ind/) {
 531             @res{qw(COND_m IND_m)} = ($1,$2);
 532         }
 533     }
 534
 535     for my $field (keys %VALID_FIELDS) {
 536         die "Error: can't parse '$field' field from cachegrind output:\n$output"
 537             unless exists $res{$field};
 538         $res{$field} =~ s/,//g;
 539     }
 540
 541     return \%res;
 542 }
 543
 544
 545 # Handle the 'grind' action
 546
 547 sub do_grind {
 548     my ($perl_args) = @_; # the residue of @ARGV after option processing
 549
 550     my ($loop_counts, $perls, $results, $tests);
 551     my ($bisect_field, $bisect_min, $bisect_max);
 552
 553     if (defined $OPTS{bisect}) {
 554         ($bisect_field, $bisect_min, $bisect_max) = split /,/, $OPTS{bisect}, 3;
 555         die "Error: --bisect option must be of form 'field,integer,integer'\n"
 556             unless
 557                     defined $bisect_max
 558                 and $bisect_min =~ /^[0-9]+$/
 559                 and $bisect_max =~ /^[0-9]+$/;
 560
 561         die "Error: unrecognised field '$bisect_field' in --bisect option\n"
 562             unless $VALID_FIELDS{$bisect_field};
 563
 564         die "Error: --bisect min ($bisect_min) must be <= max ($bisect_max)\n"
 565             if $bisect_min > $bisect_max;
 566     }
 567
 568     if (defined $OPTS{read}) {
 569         open my $in, '<:encoding(UTF-8)', $OPTS{read}
 570             or die " Error: can't open $OPTS{read} for reading: $!\n";
 571         my $data = do { local $/; <$in> };
 572         close $in;
 573
 574         my $hash = JSON::PP::decode_json($data);
 575         if (int($FORMAT_VERSION) < int($hash->{version})) {
 576             die "Error: unsupported version $hash->{version} in file"
 577               . "'$OPTS{read}' (too new)\n";
 578         }
 579         ($loop_counts, $perls, $results, $tests) =
 580             @$hash{qw(loop_counts perls results tests)};
 581
 582         filter_tests($results);
 583         filter_tests($tests);
 584     }
 585     else {
 586         # How many times to execute the loop for the two trials. The lower
 587         # value is intended to do the loop enough times that branch
 588         # prediction has taken hold; the higher loop allows us to see the
 589         # branch misses after that
 590         $loop_counts = [10, 20];
 591
 592         $tests = read_tests_file($OPTS{benchfile});
 593         die "Error: only a single test may be specified with --bisect\n"
 594             if defined $OPTS{bisect} and keys %$tests != 1;
 595
 596         $perls = [ process_perls(@$perl_args) ];
 597
 598
 599         $results = grind_run($tests, $perls, $loop_counts);
 600     }
 601
 602     # now that we have a list of perls, use it to process the
 603     # 'perl' component of the --norm and --sort args
 604
 605     $OPTS{norm} = select_a_perl($OPTS{norm}, $perls, "--norm");
 606     if (defined $OPTS{'sort-perl'}) {
 607         $OPTS{'sort-perl'} =
 608                 select_a_perl($OPTS{'sort-perl'}, $perls, "--sort");
 609     }
 610
 611     if (defined $OPTS{write}) {
 612         my $json = JSON::PP::encode_json({
 613                     version      => $FORMAT_VERSION,
 614                     loop_counts  => $loop_counts,
 615                     perls        => $perls,
 616                     results      => $results,
 617                     tests        => $tests,
 618                 });
 619
 620         open my $out, '>:encoding(UTF-8)', $OPTS{write}
 621             or die " Error: can't open $OPTS{write} for writing: $!\n";
 622         print $out $json or die "Error: writing to file '$OPTS{write}': $!\n";
 623         close $out       or die "Error: closing file '$OPTS{write}': $!\n";
 624     }
 625     else {
 626         my ($processed, $averages) =
 627                     grind_process($results, $perls, $loop_counts);
 628
 629         if (defined $OPTS{bisect}) {
 630             my @r = values %$results;
 631             die "Panic: expected exactly one test result in bisect\n"
 632                                                             if @r != 1;
 633             @r = values %{$r[0]};
 634             die "Panic: expected exactly one perl result in bisect\n"
 635                                                             if @r != 1;
 636             my $c = $r[0]{$bisect_field};
 637             die "Panic: no result in bisect for field '$bisect_field'\n"
 638                                                             unless defined $c;
 639             exit 0 if $bisect_min <= $c and $c <= $bisect_max;
 640             exit 1;
 641         }
 642         else {
 643             grind_print($processed, $averages, $perls, $tests);
 644         }
 645     }
 646 }
 647
 648
 649 # Run cachegrind for every test/perl combo.
 650 # It may run several processes in parallel when -j is specified.
 651 # Return a hash ref suitable for input to grind_process()
 652
 653 sub grind_run {
 654     my ($tests, $perls, $counts) = @_;
 655
 656     # Build a list of all the jobs to run
 657
 658     my @jobs;
 659
 660     for my $test (sort keys %$tests) {
 661
 662         # Create two test progs: one with an empty loop and one with code.
 663         # Note that the empty loop is actually '{1;}' rather than '{}';
 664         # this causes the loop to have a single nextstate rather than a
 665         # stub op, so more closely matches the active loop; e.g.:
 666         #   {1;}    => nextstate;                       unstack
 667         #   {$x=1;} => nextstate; const; gvsv; sassign; unstack
 668         my @prog = (
 669             make_perl_prog($test, @{$tests->{$test}}{qw(desc setup)}, '1'),
 670             make_perl_prog($test, @{$tests->{$test}}{qw(desc setup code)}),
 671         );
 672
 673         for my $p (@$perls) {
 674             my ($perl, $label) = @$p;
 675
 676             # Run both the empty loop and the active loop
 677             # $counts->[0] and $counts->[1] times.
 678
 679             for my $i (0,1) {
 680                 for my $j (0,1) {
 681                     my $cmd = "PERL_HASH_SEED=0 "
 682                             . "valgrind --tool=cachegrind  --branch-sim=yes "
 683                             . "--cachegrind-out-file=/dev/null "
 684                             . "$OPTS{grindargs} "
 685                             . "$perl $OPTS{perlargs} - $counts->[$j] 2>&1";
 686                     # for debugging and error messages
 687                     my $id = "$test/$perl "
 688                         . ($i ? "active" : "empty") . "/"
 689                         . ($j ? "long"   : "short") . " loop";
 690
 691                     push @jobs, {
 692                         test   => $test,
 693                         perl   => $perl,
 694                         plabel => $label,
 695                         cmd    => $cmd,
 696                         prog   => $prog[$i],
 697                         active => $i,
 698                         loopix => $j,
 699                         id     => $id,
 700                     };
 701                 }
 702             }
 703         }
 704     }
 705
 706     # Execute each cachegrind and store the results in %results.
 707
 708     local $SIG{PIPE} = 'IGNORE';
 709
 710     my $max_jobs = $OPTS{jobs};
 711     my $running  = 0; # count of executing jobs
 712     my %pids;         # map pids to jobs
 713     my %fds;          # map fds  to jobs
 714     my %results;
 715     my $select = IO::Select->new();
 716
 717     while (@jobs or $running) {
 718
 719         if ($OPTS{debug}) {
 720             printf "Main loop: pending=%d running=%d\n",
 721                 scalar(@jobs), $running;
 722         }
 723
 724         # Start new jobs
 725
 726         while (@jobs && $running < $max_jobs) {
 727             my $job = shift @jobs;
 728             my ($id, $cmd) =@$job{qw(id cmd)};
 729
 730             my ($in, $out, $pid);
 731             warn "Starting $id\n" if $OPTS{verbose};
 732             eval { $pid = IPC::Open2::open2($out, $in, $cmd); 1; }
 733                 or die "Error: while starting cachegrind subprocess"
 734                    ." for $id:\n$@";
 735             $running++;
 736             $pids{$pid}    = $job;
 737             $fds{"$out"}   = $job;
 738             $job->{out_fd} = $out;
 739             $job->{output} = '';
 740             $job->{pid}    = $pid;
 741
 742             $out->blocking(0);
 743             $select->add($out);
 744
 745             if ($OPTS{debug}) {
 746                 print "Started pid $pid for $id\n";
 747             }
 748
 749             # Note:
 750             # In principle we should write to $in in the main select loop,
 751             # since it may block. In reality,
 752             #  a) the code we write to the perl process's stdin is likely
 753             #     to be less than the OS's pipe buffer size;
 754             #  b) by the time the perl process has read in all its stdin,
 755             #     the only output it should have generated is a few lines
 756             #     of cachegrind output preamble.
 757             # If these assumptions change, then perform the following print
 758             # in the select loop instead.
 759
 760             print $in $job->{prog};
 761             close $in;
 762         }
 763
 764         # Get output of running jobs
 765
 766         if ($OPTS{debug}) {
 767             printf "Select: waiting on (%s)\n",
 768                 join ', ', sort { $a <=> $b } map $fds{$_}{pid},
 769                             $select->handles;
 770         }
 771
 772         my @ready = $select->can_read;
 773
 774         if ($OPTS{debug}) {
 775             printf "Select: pids (%s) ready\n",
 776                 join ', ', sort { $a <=> $b } map $fds{$_}{pid}, @ready;
 777         }
 778
 779         unless (@ready) {
 780             die "Panic: select returned no file handles\n";
 781         }
 782
 783         for my $fd (@ready) {
 784             my $j = $fds{"$fd"};
 785             my $r = sysread $fd, $j->{output}, 8192, length($j->{output});
 786             unless (defined $r) {
 787                 die "Panic: Read from process running $j->{id} gave:\n$!";
 788             }
 789             next if $r;
 790
 791             # EOF
 792
 793             if ($OPTS{debug}) {
 794                 print "Got eof for pid $fds{$fd}{pid} ($j->{id})\n";
 795             }
 796
 797             $select->remove($j->{out_fd});
 798             close($j->{out_fd})
 799                 or die "Panic: closing output fh on $j->{id} gave:\n$!\n";
 800             $running--;
 801             delete $fds{"$j->{out_fd}"};
 802             my $output = $j->{output};
 803
 804             if ($OPTS{debug}) {
 805                 my $p = $j->{prog};
 806                 $p =~ s/^/    : /mg;
 807                 my $o = $output;
 808                 $o =~ s/^/    : /mg;
 809
 810                 print "\n$j->{id}/\nCommand: $j->{cmd}\n"
 811                     . "Input:\n$p"
 812                     . "Output\n$o";
 813             }
 814
 815             $results{$j->{test}}{$j->{perl}}[$j->{active}][$j->{loopix}]
 816                     = parse_cachegrind($output, $j->{id}, $j->{perl});
 817         }
 818
 819         # Reap finished jobs
 820
 821         while (1) {
 822             my $kid = waitpid(-1, WNOHANG);
 823             my $ret = $?;
 824             last if $kid <= 0;
 825
 826             unless (exists $pids{$kid}) {
 827                 die "Panic: reaped unexpected child $kid";
 828             }
 829             my $j = $pids{$kid};
 830             if ($ret) {
 831                 die sprintf("Error: $j->{id} gave return status 0x%04x\n", $ret)
 832                     . "with the following output\n:$j->{output}\n";
 833             }
 834             delete $pids{$kid};
 835         }
 836     }
 837
 838     return \%results;
 839 }
 840
 841
 842
 843
 844 # grind_process(): process the data that has been extracted from
 845 # cachgegrind's output.
 846 #
 847 # $res is of the form ->{benchmark_name}{perl_name}[active][count]{field_name},
 848 # where active is 0 or 1 indicating an empty or active loop,
 849 # count is 0 or 1 indicating a short or long loop. E.g.
 850 #
 851 #    $res->{'expr::assign::scalar_lex'}{perl-5.21.1}[0][10]{Dw_mm}
 852 #
 853 # The $res data structure is modified in-place by this sub.
 854 #
 855 # $perls is [ [ perl-exe, perl-label], .... ].
 856 #
 857 # $counts is [ N, M ] indicating the counts for the short and long loops.
 858 #
 859 #
 860 # return \%output, \%averages, where
 861 #
 862 # $output{benchmark_name}{perl_name}{field_name} = N
 863 # $averages{perl_name}{field_name} = M
 864 #
 865 # where N is the raw count ($OPTS{raw}), or count_perl0/count_perlI otherwise;
 866 # M is the average raw count over all tests ($OPTS{raw}), or
 867 # 1/(sum(count_perlI/count_perl0)/num_tests) otherwise.
 868
 869 sub grind_process {
 870     my ($res, $perls, $counts) = @_;
 871
 872     # Process the four results for each test/perf combo:
 873     # Convert
 874     #    $res->{benchmark_name}{perl_name}[active][count]{field_name} = n
 875     # to
 876     #    $res->{benchmark_name}{perl_name}{field_name} = averaged_n
 877     #
 878     # $r[0][1] - $r[0][0] is the time to do ($counts->[1]-$counts->[0])
 879     #                     empty loops, eliminating startup time
 880     # $r[1][1] - $r[1][0] is the time to do ($counts->[1]-$counts->[0])
 881     #                     active loops, eliminating startup time
 882     # (the two startup times may be different because different code
 883     # is being compiled); the difference of the two results above
 884     # divided by the count difference is the time to execute the
 885     # active code once, eliminating both startup and loop overhead.
 886
 887     for my $tests (values %$res) {
 888         for my $r (values %$tests) {
 889             my $r2;
 890             for (keys %{$r->[0][0]}) {
 891                 my $n = (  ($r->[1][1]{$_} - $r->[1][0]{$_})
 892                          - ($r->[0][1]{$_} - $r->[0][0]{$_})
 893                         ) / ($counts->[1] - $counts->[0]);
 894                 $r2->{$_} = $n;
 895             }
 896             $r = $r2;
 897         }
 898     }
 899
 900     my %totals;
 901     my %counts;
 902     my %data;
 903
 904     my $perl_norm = $perls->[$OPTS{norm}][0]; # the name of the reference perl
 905
 906     for my $test_name (keys %$res) {
 907         my $res1 = $res->{$test_name};
 908         my $res2_norm = $res1->{$perl_norm};
 909         for my $perl (keys %$res1) {
 910             my $res2 = $res1->{$perl};
 911             for my $field (keys %$res2) {
 912                 my ($p, $q) = ($res2_norm->{$field}, $res2->{$field});
 913
 914                 if ($OPTS{raw}) {
 915                     # Avoid annoying '-0.0' displays. Ideally this number
 916                     # should never be negative, but fluctuations in
 917                     # startup etc can theoretically make this happen
 918                     $q = 0 if ($q <= 0 && $q > -0.1);
 919                     $totals{$perl}{$field} += $q;
 920                     $counts{$perl}{$field}++;
 921                     $data{$test_name}{$perl}{$field} = $q;
 922                     next;
 923                 }
 924
 925                 # $p and $q are notionally integer counts, but
 926                 # due to variations in startup etc, it's possible for a
 927                 # count which is supposedly zero to be calculated as a
 928                 # small positive or negative value.
 929                 # In this case, set it to zero. Further below we
 930                 # special-case zeros to avoid division by zero errors etc.
 931
 932                 $p = 0.0 if $p < 0.01;
 933                 $q = 0.0 if $q < 0.01;
 934
 935                 if ($p == 0.0 && $q == 0.0) {
 936                     # Both perls gave a count of zero, so no change:
 937                     # treat as 100%
 938                     $totals{$perl}{$field} += 1;
 939                     $counts{$perl}{$field}++;
 940                     $data{$test_name}{$perl}{$field} = 1;
 941                 }
 942                 elsif ($p == 0.0 || $q == 0.0) {
 943                     # If either count is zero, there were too few events
 944                     # to give a meaningful ratio (and we will end up with
 945                     # division by zero if we try). Mark the result undef,
 946                     # indicating that it shouldn't be displayed; and skip
 947                     # adding to the average
 948                     $data{$test_name}{$perl}{$field} = undef;
 949                 }
 950                 else {
 951                     # For averages, we record q/p rather than p/q.
 952                     # Consider a test where perl_norm took 1000 cycles
 953                     # and perlN took 800 cycles. For the individual
 954                     # results we display p/q, or 1.25; i.e. a quarter
 955                     # quicker. For the averages, we instead sum all
 956                     # the 0.8's, which gives the total cycles required to
 957                     # execute all tests, with all tests given equal
 958                     # weight. Later we reciprocate the final result,
 959                     # i.e. 1/(sum(qi/pi)/n)
 960
 961                     $totals{$perl}{$field} += $q/$p;
 962                     $counts{$perl}{$field}++;
 963                     $data{$test_name}{$perl}{$field} = $p/$q;
 964                 }
 965             }
 966         }
 967     }
 968
 969     # Calculate averages based on %totals and %counts accumulated earlier.
 970
 971     my %averages;
 972     for my $perl (keys %totals) {
 973         my $t = $totals{$perl};
 974         for my $field (keys %$t) {
 975             $averages{$perl}{$field} = $OPTS{raw}
 976                 ? $t->{$field} / $counts{$perl}{$field}
 977                   # reciprocal - see comments above
 978                 : $counts{$perl}{$field} / $t->{$field};
 979         }
 980     }
 981
 982     return \%data, \%averages;
 983 }
 984
 985
 986 # grind_print(): display the tabulated results of all the cachegrinds.
 987 #
 988 # Arguments are of the form:
 989 #    $results->{benchmark_name}{perl_name}{field_name} = N
 990 #    $averages->{perl_name}{field_name} = M
 991 #    $perls = [ [ perl-exe, perl-label ], ... ]
 992 #    $tests->{test_name}{desc => ..., ...}
 993
 994 sub grind_print {
 995     my ($results, $averages, $perls, $tests) = @_;
 996
 997     my @perl_names = map $_->[0], @$perls;
 998     my %perl_labels;
 999     $perl_labels{$_->[0]} = $_->[1] for @$perls;
1000
1001     my $field_label_width = 6;
1002     # Calculate the width to display for each column.
1003     my $min_width = $OPTS{raw} ? 8 : 6;
1004     my @widths = map { length($_) < $min_width ? $min_width : length($_) }
1005                             @perl_labels{@perl_names};
1006
1007     # Print header.
1008
1009     print <<EOF;
1010 Key:
1011     Ir   Instruction read
1012     Dr   Data read
1013     Dw   Data write
1014     COND conditional branches
1015     IND  indirect branches
1016     _m   branch predict miss
1017     _m1  level 1 cache miss
1018     _mm  last cache (e.g. L3) miss
1019     -    indeterminate percentage (e.g. 1/0)
1020
1021 EOF
1022
1023     if ($OPTS{raw}) {
1024         print "The numbers represent raw counts per loop iteration.\n";
1025     }
1026     else {
1027         print <<EOF;
1028 The numbers represent relative counts per loop iteration, compared to
1029 $perl_labels{$perl_names[0]} at 100.0%.
1030 Higher is better: for example, using half as many instructions gives 200%,
1031 while using twice as many gives 50%.
1032 EOF
1033     }
1034
1035     # Populate @test_names with the tests in sorted order.
1036
1037     my @test_names;
1038     unless ($OPTS{average}) {
1039         if (defined $OPTS{'sort-field'}) {
1040             my ($field, $perlix) = @OPTS{'sort-field', 'sort-perl'};
1041             my $perl = $perls->[$perlix][0];
1042             @test_names = sort
1043                 {
1044                         $results->{$a}{$perl}{$field}
1045                     <=> $results->{$b}{$perl}{$field}
1046                 }
1047                 keys %$results;
1048         }
1049         else {
1050             @test_names = sort(keys %$results);
1051         }
1052     }
1053
1054     # No point in displaying average for only one test.
1055     push @test_names,  'AVERAGE' unless @test_names == 1;
1056
1057     # If only a single field is to be displayed, use a more compact
1058     # format with only a single line of output per test.
1059
1060     my $one_field = defined $OPTS{fields} &&  keys(%{$OPTS{fields}}) == 1;
1061
1062     if ($one_field) {
1063         print "Results for field " . (keys(%{$OPTS{fields}}))[0] . ".\n";
1064
1065         # The first column will now contain test names rather than
1066         # field names; Calculate the max width.
1067
1068         $field_label_width = 0;
1069         for (@test_names) {
1070             $field_label_width = length if length > $field_label_width;
1071         }
1072
1073         # Print the perl executables header.
1074
1075         print "\n";
1076         for my $i (0,1) {
1077             print " " x $field_label_width;
1078             for (0..$#widths) {
1079                 printf " %*s", $widths[$_],
1080                     $i ? ('-' x$widths[$_]) :  $perl_labels{$perl_names[$_]};
1081             }
1082             print "\n";
1083         }
1084     }
1085
1086     # Dump the results for each test.
1087
1088     for my $test_name (@test_names) {
1089         my $doing_ave = ($test_name eq 'AVERAGE');
1090         my $res1 = $doing_ave ? $averages : $results->{$test_name};
1091
1092         unless ($one_field) {
1093             print "\n$test_name";
1094             print "\n$tests->{$test_name}{desc}" unless $doing_ave;
1095             print "\n\n";
1096
1097             # Print the perl executables header.
1098             for my $i (0,1) {
1099                 print " " x $field_label_width;
1100                 for (0..$#widths) {
1101                     printf " %*s", $widths[$_],
1102                         $i ? ('-' x$widths[$_]) :  $perl_labels{$perl_names[$_]};
1103                 }
1104                 print "\n";
1105             }
1106         }
1107
1108         for my $field (qw(Ir Dr Dw COND IND
1109                           N
1110                           COND_m IND_m
1111                           N
1112                           Ir_m1 Dr_m1 Dw_m1
1113                           N
1114                           Ir_mm Dr_mm Dw_mm
1115                       ))
1116         {
1117             next if $OPTS{fields} and ! exists $OPTS{fields}{$field};
1118
1119             if ($field eq 'N') {
1120                 print "\n";
1121                 next;
1122             }
1123
1124             if ($one_field) {
1125                 printf "%-*s", $field_label_width, $test_name;
1126             }
1127             else {
1128                 printf "%*s", $field_label_width, $field;
1129             }
1130
1131             for my $i (0..$#widths) {
1132                 my $res2 = $res1->{$perl_names[$i]};
1133                 my $p = $res2->{$field};
1134                 if (!defined $p) {
1135                     printf " %*s", $widths[$i], '-';
1136                 }
1137                 elsif ($OPTS{raw}) {
1138                     printf " %*.1f", $widths[$i], $p;
1139                 }
1140                 else {
1141                     printf " %*.2f", $widths[$i], $p * 100;
1142                 }
1143             }
1144             print "\n";
1145         }
1146     }
1147 }
1148
1149
1150 # do_selftest(): check that we can parse known cachegrind()
1151 # output formats. If the output of cachegrind changes, add a *new*
1152 # test here; keep the old tests to make sure we continue to parse
1153 # old cachegrinds
1154
1155 sub do_selftest {
1156
1157     my @tests = (
1158         'standard',
1159         <<'EOF',
1160 ==32350== Cachegrind, a cache and branch-prediction profiler
1161 ==32350== Copyright (C) 2002-2013, and GNU GPL'd, by Nicholas Nethercote et al.
1162 ==32350== Using Valgrind-3.9.0 and LibVEX; rerun with -h for copyright info
1163 ==32350== Command: perl5211o /tmp/uiS2gjdqe5 1
1164 ==32350==
1165 --32350-- warning: L3 cache found, using its data for the LL simulation.
1166 ==32350==
1167 ==32350== I   refs:      1,124,055
1168 ==32350== I1  misses:        5,573
1169 ==32350== LLi misses:        3,338
1170 ==32350== I1  miss rate:      0.49%
1171 ==32350== LLi miss rate:      0.29%
1172 ==32350==
1173 ==32350== D   refs:        404,275  (259,191 rd   + 145,084 wr)
1174 ==32350== D1  misses:        9,608  (  6,098 rd   +   3,510 wr)
1175 ==32350== LLd misses:        5,794  (  2,781 rd   +   3,013 wr)
1176 ==32350== D1  miss rate:       2.3% (    2.3%     +     2.4%  )
1177 ==32350== LLd miss rate:       1.4% (    1.0%     +     2.0%  )
1178 ==32350==
1179 ==32350== LL refs:          15,181  ( 11,671 rd   +   3,510 wr)
1180 ==32350== LL misses:         9,132  (  6,119 rd   +   3,013 wr)
1181 ==32350== LL miss rate:        0.5% (    0.4%     +     2.0%  )
1182 ==32350==
1183 ==32350== Branches:        202,372  (197,050 cond +   5,322 ind)
1184 ==32350== Mispredicts:      19,153  ( 17,742 cond +   1,411 ind)
1185 ==32350== Mispred rate:        9.4% (    9.0%     +    26.5%   )
1186 EOF
1187         {
1188             COND    =>  197050,
1189             COND_m  =>   17742,
1190             Dr      =>  259191,
1191             Dr_m1   =>    6098,
1192             Dr_mm   =>    2781,
1193             Dw      =>  145084,
1194             Dw_m1   =>    3510,
1195             Dw_mm   =>    3013,
1196             IND     =>    5322,
1197             IND_m   =>    1411,
1198             Ir      => 1124055,
1199             Ir_m1   =>    5573,
1200             Ir_mm   =>    3338,
1201         },
1202     );
1203
1204     for ('t', '.') {
1205         last if require "$_/test.pl";
1206     }
1207     plan(@tests / 3 * keys %VALID_FIELDS);
1208
1209     while (@tests) {
1210         my $desc     = shift @tests;
1211         my $output   = shift @tests;
1212         my $expected = shift @tests;
1213         my $p = parse_cachegrind($output);
1214         for (sort keys %VALID_FIELDS) {
1215             is($p->{$_}, $expected->{$_}, "$desc, $_");
1216         }
1217     }
1218 }