1 package TAP::Parser::Grammar;
6 use TAP::Parser::ResultFactory ();
7 use TAP::Parser::YAMLish::Reader ();
9 use base 'TAP::Object';
13 TAP::Parser::Grammar - A grammar for the Test Anything Protocol.
21 our $VERSION = '3.42';
25 use TAP::Parser::Grammar;
26 my $grammar = $self->make_grammar({
27 iterator => $tap_parser_iterator,
28 parser => $tap_parser,
32 my $result = $grammar->tokenize;
36 C<TAP::Parser::Grammar> tokenizes lines from a L<TAP::Parser::Iterator> and
37 constructs L<TAP::Parser::Result> subclasses to represent the tokens.
39 Do not attempt to use this class directly. It won't make sense. It's mainly
40 here to ensure that we will be able to have pluggable grammars when TAP is
41 expanded at some future date (plus, this stuff was really cluttering the
50 my $grammar = TAP::Parser::Grammar->new({
51 iterator => $iterator,
56 Returns L<TAP::Parser> grammar object that will parse the TAP stream from the
57 specified iterator. Both C<iterator> and C<parser> are required arguments.
58 If C<version> is not set it defaults to C<12> (see L</set_version> for more
63 # new() implementation supplied by TAP::Object
65 my ( $self, $args ) = @_;
66 $self->{iterator} = $args->{iterator}; # TODO: accessor
67 $self->{iterator} ||= $args->{stream}; # deprecated
68 $self->{parser} = $args->{parser}; # TODO: accessor
69 $self->set_version( $args->{version} || 12 );
77 # XXX the 'not' and 'ok' might be on separate lines in VMS ...
78 my $ok = qr/(?:not )?ok\b/;
83 syntax => qr/^TAP\s+version\s+(\d+)\s*\z/i,
85 my ( $self, $line ) = @_;
87 return $self->_make_version_token( $line, $version, );
91 syntax => qr/^1\.\.(\d+)\s*(.*)\z/,
93 my ( $self, $line ) = @_;
94 my ( $tests_planned, $tail ) = ( $1, $2 );
95 my $explanation = undef;
98 if ( $tail =~ /^todo((?:\s+\d+)+)/ ) {
99 my @todo = split /\s+/, _trim($1);
100 return $self->_make_plan_token(
101 $line, $tests_planned, 'TODO',
105 elsif ( 0 == $tests_planned ) {
108 # If we can't match # SKIP the directive should be undef.
109 ($explanation) = $tail =~ /^#\s*SKIP\S*\s+(.*)/i;
111 elsif ( $tail !~ /^\s*$/ ) {
112 return $self->_make_unknown_token($line);
115 $explanation = '' unless defined $explanation;
117 return $self->_make_plan_token(
118 $line, $tests_planned, $skip,
125 # An optimization to handle the most common test lines without
128 syntax => qr/^($ok) \ ($num) (?:\ ([^#]+))? \z/x,
130 my ( $self, $line ) = @_;
131 my ( $ok, $num, $desc ) = ( $1, $2, $3 );
133 return $self->_make_test_token(
140 syntax => qr/^($ok) \s* ($num)? \s* (.*) \z/x,
142 my ( $self, $line ) = @_;
143 my ( $ok, $num, $desc ) = ( $1, $2, $3 );
144 my ( $dir, $explanation ) = ( '', '' );
145 if ($desc =~ m/^ ( [^\\\#]* (?: \\. [^\\\#]* )* )
146 \# \s* (SKIP|TODO) \b \s* (.*) $/ix
149 ( $desc, $dir, $explanation ) = ( $1, $2, $3 );
151 return $self->_make_test_token(
152 $line, $ok, $num, $desc,
158 syntax => qr/^#(.*)/,
160 my ( $self, $line ) = @_;
162 return $self->_make_comment_token( $line, $comment );
166 syntax => qr/^\s*Bail out!\s*(.*)/,
168 my ( $self, $line ) = @_;
169 my $explanation = $1;
170 return $self->_make_bailout_token(
181 syntax => qr/^1\.\.(\d+)\s*(?:\s*#\s*SKIP\b(.*))?\z/i,
183 my ( $self, $line ) = @_;
184 my ( $tests_planned, $explanation ) = ( $1, $2 );
186 = ( 0 == $tests_planned || defined $explanation )
189 $explanation = '' unless defined $explanation;
190 return $self->_make_plan_token(
191 $line, $tests_planned, $skip,
197 syntax => qr/^ (\s+) (---.*) $/x,
199 my ( $self, $line ) = @_;
200 my ( $pad, $marker ) = ( $1, $2 );
201 return $self->_make_yaml_token( $pad, $marker );
206 qr/^ pragma \s+ ( [-+] \w+ \s* (?: , \s* [-+] \w+ \s* )* ) $/x,
208 my ( $self, $line ) = @_;
210 return $self->_make_pragma_token( $line, $pragmas );
222 shift->{iterator}->handle_unicode;
228 ##############################################################################
230 =head2 Instance Methods
232 =head3 C<set_version>
234 $grammar->set_version(13);
236 Tell the grammar which TAP syntax version to support. The lowest
237 supported version is 12. Although 'TAP version' isn't valid version 12
238 syntax it is accepted so that higher version numbers may be parsed.
246 if ( my $language = $language_for{$version} ) {
247 $self->{version} = $version;
248 $self->{tokens} = $language->{tokens};
250 if ( my $setup = $language->{setup} ) {
254 $self->_order_tokens;
258 Carp::croak("Unsupported syntax version: $version");
262 # Optimization to put the most frequent tokens first.
266 my %copy = %{ $self->{tokens} };
267 my @ordered_tokens = grep {defined}
268 map { delete $copy{$_} } qw( simple_test test comment plan );
269 push @ordered_tokens, values %copy;
271 $self->{ordered_tokens} = \@ordered_tokens;
274 ##############################################################################
278 my $token = $grammar->tokenize;
280 This method will return a L<TAP::Parser::Result> object representing the
288 my $line = $self->{iterator}->next;
289 unless ( defined $line ) {
290 delete $self->{parser}; # break circular ref
296 for my $token_data ( @{ $self->{ordered_tokens} } ) {
297 if ( $line =~ $token_data->{syntax} ) {
298 my $handler = $token_data->{handler};
299 $token = $self->$handler($line);
304 $token = $self->_make_unknown_token($line) unless $token;
306 return $self->{parser}->make_result($token);
309 ##############################################################################
311 =head3 C<token_types>
313 my @types = $grammar->token_types;
315 Returns the different types of tokens which this grammar can parse.
321 return keys %{ $self->{tokens} };
324 ##############################################################################
328 my $syntax = $grammar->syntax_for($token_type);
330 Returns a pre-compiled regular expression which will match a chunk of TAP
331 corresponding to the token type. For example (not that you should really pay
332 attention to this, C<< $grammar->syntax_for('comment') >> will return
338 my ( $self, $type ) = @_;
339 return $self->{tokens}->{$type}->{syntax};
342 ##############################################################################
344 =head3 C<handler_for>
346 my $handler = $grammar->handler_for($token_type);
348 Returns a code reference which, when passed an appropriate line of TAP,
349 returns the lexed token corresponding to that line. As a result, the basic
350 TAP parsing loop looks similar to the following:
353 my $grammar = TAP::Grammar->new;
354 LINE: while ( defined( my $line = $parser->_next_chunk_of_tap ) ) {
355 for my $type ( $grammar->token_types ) {
356 my $syntax = $grammar->syntax_for($type);
357 if ( $line =~ $syntax ) {
358 my $handler = $grammar->handler_for($type);
359 push @tokens => $grammar->$handler($line);
363 push @tokens => $grammar->_make_unknown_token($line);
369 my ( $self, $type ) = @_;
370 return $self->{tokens}->{$type}->{handler};
373 sub _make_version_token {
374 my ( $self, $line, $version ) = @_;
382 sub _make_plan_token {
383 my ( $self, $line, $tests_planned, $directive, $explanation, $todo ) = @_;
385 if ( $directive eq 'SKIP'
386 && 0 != $tests_planned
387 && $self->{version} < 13 )
390 "Specified SKIP directive in plan but more than 0 tests ($line)\n";
396 tests_planned => $tests_planned,
397 directive => $directive,
398 explanation => _trim($explanation),
403 sub _make_test_token {
404 my ( $self, $line, $ok, $num, $desc, $dir, $explanation ) = @_;
408 # forcing this to be an integer (and not a string) reduces memory
409 # consumption. RT #84939
410 test_num => ( defined $num ? 0 + $num : undef ),
411 description => _trim($desc),
412 directive => ( defined $dir ? uc $dir : '' ),
413 explanation => _trim($explanation),
419 sub _make_unknown_token {
420 my ( $self, $line ) = @_;
427 sub _make_comment_token {
428 my ( $self, $line, $comment ) = @_;
432 comment => _trim($comment)
436 sub _make_bailout_token {
437 my ( $self, $line, $explanation ) = @_;
441 bailout => _trim($explanation)
445 sub _make_yaml_token {
446 my ( $self, $pad, $marker ) = @_;
448 my $yaml = TAP::Parser::YAMLish::Reader->new;
450 my $iterator = $self->{iterator};
452 # Construct a reader that reads from our input stripping leading
453 # spaces from each line.
454 my $leader = length($pad);
455 my $strip = qr{ ^ (\s{$leader}) (.*) $ }x;
456 my @extra = ($marker);
458 return shift @extra if @extra;
459 my $line = $iterator->next;
460 return $2 if $line =~ $strip;
464 my $data = $yaml->read($reader);
466 # Reconstitute input. This is convoluted. Maybe we should just
467 # record it on the way in...
468 chomp( my $raw = $yaml->get_raw );
478 sub _make_pragma_token {
479 my ( $self, $line, $pragmas ) = @_;
483 pragmas => [ split /\s*,\s*/, _trim($pragmas) ],
490 return '' unless defined $data;
501 B<NOTE:> This grammar is slightly out of date. There's still some discussion
502 about it and a new one will be provided when we have things better defined.
504 The L<TAP::Parser> does not use a formal grammar because TAP is essentially a
505 stream-based protocol. In fact, it's quite legal to have an infinite stream.
506 For the same reason that we don't apply regexes to streams, we're not using a
507 formal grammar here. Instead, we parse the TAP in lines.
509 For purposes for forward compatibility, any result which does not match the
510 following grammar is currently referred to as
511 L<TAP::Parser::Result::Unknown>. It is I<not> a parse error.
513 A formal grammar would look similar to the following:
516 For the time being, I'm cheating on the EBNF by allowing
517 certain terms to be defined by POSIX character classes by
518 using the following syntax:
522 As far as I am aware, that's not valid EBNF. Sue me. I
523 didn't know how to write "char" otherwise (Unicode issues).
527 tap ::= version? { comment | unknown } leading_plan lines
529 lines trailing_plan {comment}
531 version ::= 'TAP version ' positiveInteger {positiveInteger} "\n"
533 leading_plan ::= plan skip_directive? "\n"
535 trailing_plan ::= plan "\n"
537 plan ::= '1..' nonNegativeInteger
539 lines ::= line {line}
541 line ::= (comment | test | unknown | bailout ) "\n"
543 test ::= status positiveInteger? description? directive?
545 status ::= 'not '? 'ok '
547 description ::= (character - (digit | '#')) {character - '#'}
549 directive ::= todo_directive | skip_directive
551 todo_directive ::= hash_mark 'TODO' ' ' {character}
553 skip_directive ::= hash_mark 'SKIP' ' ' {character}
555 comment ::= hash_mark {character}
557 hash_mark ::= '#' {' '}
559 bailout ::= 'Bail out!' {character}
561 unknown ::= { (character - "\n") }
563 (* POSIX character classes and other terminals *)
566 character ::= ([:print:] - "\n")
567 positiveInteger ::= ( digit - '0' ) {digit}
568 nonNegativeInteger ::= digit {digit}
572 Please see L<TAP::Parser/SUBCLASSING> for a subclassing overview.
574 If you I<really> want to subclass L<TAP::Parser>'s grammar the best thing to
575 do is read through the code. There's no easy way of summarizing it here.
581 L<TAP::Parser::Iterator>,
582 L<TAP::Parser::Result>,