This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to NEXT 0.50.
[perl5.git] / lib / Text / Balanced.pm
CommitLineData
3270c621
JH
1# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2# FOR FULL DOCUMENTATION SEE Balanced.pod
3
4use 5.005;
5use strict;
6
7package Text::Balanced;
8
9use Exporter;
10use SelfLoader;
11use vars qw { $VERSION @ISA %EXPORT_TAGS };
12
55a1c97c 13$VERSION = '1.86';
3270c621
JH
14@ISA = qw ( Exporter );
15
16%EXPORT_TAGS = ( ALL => [ qw(
17 &extract_delimited
18 &extract_bracketed
19 &extract_quotelike
20 &extract_codeblock
21 &extract_variable
22 &extract_tagged
23 &extract_multiple
24
25 &gen_delimited_pat
26 &gen_extract_tagged
27
28 &delimited_pat
29 ) ] );
30
31Exporter::export_ok_tags('ALL');
32
33# PROTOTYPES
34
35sub _match_bracketed($$$$$$);
36sub _match_variable($$);
37sub _match_codeblock($$$$$$$);
38sub _match_quotelike($$$$);
39
40# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
41
# Record a failure in $@ as a Text::Balanced::ErrorMsg object.
#   $_[0] - description of what went wrong
#   $_[1] - offset in the text at which the problem was detected
# Returns the newly created error object.
sub _failmsg
{
    my ($what, $where) = @_;
    my %failure = ( error => $what, pos => $where );
    $@ = bless \%failure, "Text::Balanced::ErrorMsg";
}
46
# Build the failure return value for an extractor.
#   $_[0] - the caller's wantarray value
#   $_[1] - reference to the text being processed
#   $_[2] - optional error message (recorded in $@ when true)
#   $_[3] - optional offset that goes with the message
# In list mode returns ("", <whole text>, ""); otherwise returns undef.
sub _fail
{
    my ($list_wanted, $src, $why, $offset) = @_;
    _failmsg($why, $offset) if $why;
    if ($list_wanted)
    {
        return ("", $$src, "");
    }
    return undef;
}
54
# Build the success return value for an extractor and reset pos().
#   $_[0]   - the caller's wantarray value
#   $_[1]   - reference to the text being processed
#   rest    - (position, length) pairs: MATCH, REMAINDER, PREFIX, then any
#             extra fields.  When more than 18 values remain, the final
#             pair describes a here-doc "fillet" (the rest of the line
#             after a <<LABEL) that has to be moved out of the match.
# List mode: returns the substring for every pair and leaves the text
# alone (apart from the fillet rearrangement).  Otherwise: returns the
# match and cuts prefix + match out of the text when it is writable.
sub _succeed
{
    $@ = undef;
    my $list_wanted = shift;
    my $src         = shift;

    my ($fillet_pos, $fillet_len) = (0, 0);
    ($fillet_pos, $fillet_len) = splice(@_, -2, 2) if @_ > 18;

    my ($match_pos, $match_len, $rest_pos, $rest_len, $pre_pos, $pre_len) = @_;

    if ($list_wanted)
    {
        my @pieces;
        for (my $i = 0; $i < @_; $i += 2)
        {
            push @pieces, substr($$src, $_[$i], $_[$i+1]);
        }

        if ($fillet_len)
        {
            # Replace the fillet inside the match by a newline and move it
            # to the front of the remainder.
            my $fillet = substr($pieces[0], $fillet_pos-$pre_len, $fillet_len, "\n");
            $pieces[1] = "$fillet$pieces[1]";
            # The text may be read-only, hence the eval.
            eval {
                substr($$src, $rest_pos, 0) = $fillet;
                substr($$src, $fillet_pos, $fillet_len, "\n");
            };
            pos($$src) = $rest_pos-$fillet_len+1;       # RESET \G
        }
        else
        {
            pos($$src) = $rest_pos;                     # RESET \G
        }
        return @pieces;
    }

    my $matched = substr($$src, $match_pos, $match_len);
    substr($matched, $fillet_pos-$match_pos-$pre_len, $fillet_len, "") if $fillet_len;

    my $fillet = "";
    $fillet = substr($$src, $fillet_pos, $fillet_len)."\n" if $fillet_len;

    # Chop out prefix and match if the text is writable.
    eval { substr($$src, $pre_pos, $match_len+$pre_len) = $fillet };
    pos($$src) = $pre_pos;                              # RESET \G
    return $matched;
}
93
94# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
95
# Return a pattern string matching a substring delimited by any one of
# the characters in $delimiters.
#   $_[0] - string of delimiter characters
#   $_[1] - optional string of escape characters, one per delimiter
#           (defaults to backslash; the last one is repeated as needed)
# Returns "" when the delimiter string holds no non-space character.
sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
{
    my ($dels, $escs) = @_;
    return "" unless $dels =~ /\S/;

    $escs = '\\' unless $escs;
    $escs .= substr($escs,-1) x (length($dels)-length($escs));

    my @alternatives = map {
        my $del = quotemeta substr($dels,$_,1);
        my $esc = quotemeta substr($escs,$_,1);
        # A delimiter that escapes itself is written doubled inside the text.
        $del eq $esc
            ? "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"
            : "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
    } 0 .. length($dels)-1;

    return '(?:' . join('|', @alternatives) . ')';
}
120
121*delimited_pat = \&gen_delimited_pat;
122
123
124# THE EXTRACTION FUNCTIONS
125
# Extract a delimited substring from the current pos() of the text.
#   $_[0] - text (defaults to $_)
#   $_[1] - delimiter characters (defaults to the three quote characters)
#   $_[2] - prefix pattern to skip (defaults to optional whitespace)
#   $_[3] - escape characters (defaults to backslash)
# Returns via _succeed()/_fail(): (match, remainder, prefix) in list
# context, the match alone otherwise.
sub extract_delimited (;$$$$)
{
    my $list_wanted = wantarray;
    my $src     = defined $_[0] ? \$_[0] : \$_;
    my $delims  = defined $_[1] ? $_[1] : qq{\'\"\`};
    my $prefix  = defined $_[2] ? $_[2] : '\s*';
    my $escapes = defined $_[3] ? $_[3] : qq{\\};

    my $delimited = gen_delimited_pat($delims, $escapes);
    my $from = pos($$src) || 0;

    if ($$src =~ m/\G($prefix)($delimited)/gc)
    {
        my $skipped = length($1);
        my $begin   = $from + $skipped;
        my $end     = pos($$src);
        return _succeed($list_wanted, $src,
                        $begin, $end-$begin,            # MATCH
                        $end,   length($$src)-$end,     # REMAINDER
                        $from,  $skipped);              # PREFIX
    }

    return _fail($list_wanted, $src, "Not a delimited pattern", 0);
}
145
# Extract a balanced, bracket-delimited substring.
#   $_[0] - text (defaults to $_)
#   $_[1] - bracket specification (defaults to '{([<').  Quote characters
#           in it ask for embedded quotes to be honoured, and a 'q' asks
#           for quotelike operators to be honoured.
#   $_[2] - prefix pattern to skip (defaults to optional whitespace)
# Returns via _succeed()/_fail().
sub extract_bracketed (;$$$)
{
    my $list_wanted = wantarray;
    my $src    = defined $_[0] ? \$_[0] : \$_;
    my $open   = defined $_[1] ? $_[1] : '{([<';
    my $prefix = defined $_[2] ? $_[2] : '\s*';

    # Pull the quote characters and the 'q' flag out of the specification.
    my $quotes = "";
    for my $q (q{'}, q{"}, q{`})
    {
        $quotes .= $q if $open =~ s/\Q$q\E//g;
    }
    my $quotelike;
    $quotelike = 1 if $open =~ s/q//g;

    # Reduce what is left to a set of distinct opening brackets.
    $open =~ tr/[](){}<>\0-\377/[[(({{<</ds;
    my $close = $open;
    unless ($close =~ tr/[({</])}>/)
    {
        return _fail($list_wanted, $src,
                     "Did not find a suitable bracket in delimiter: \"$_[1]\"",
                     0);
    }

    # Save and restore pos($_) around the split/map/join below.
    my $saved_pos = pos;
    $open  = join('|', map { quotemeta $_ } split('', $open));
    $close = join('|', map { quotemeta $_ } split('', $close));
    pos = $saved_pos;

    my @match = _match_bracketed($src, $prefix, $open, $quotes, $quotelike, $close);
    return _fail($list_wanted, $src) unless @match;

    return _succeed($list_wanted, $src,
                    $match[2], $match[5]+2,     # MATCH
                    @match[8,9],                # REMAINDER
                    @match[0,1],                # PREFIX
                   );
}
182
# Match a prefix followed by a balanced bracketed sequence at pos().
#   $textref   - reference to the text
#   $pre       - prefix pattern
#   $ldel      - alternation of (quoted) opening brackets
#   $qdel      - quote characters whose contents are skipped as a unit
#   $quotelike - true if quotelike operators are skipped as a unit
#   $rdel      - alternation of (quoted) closing brackets
# On success returns (position, length) pairs for PREFIX, OPENING BRACKET,
# CONTENTS, CLOSING BRACKET and REMAINDER.  On failure records the reason
# with _failmsg(), restores pos() and returns nothing.
sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
{
    my ($src, $prefix, $opener, $quotes, $quotelike, $closer) = @_;
    my $origin = pos($$src) = pos($$src) || 0;

    unless ($$src =~ m/\G$prefix/gc)
    {
        _failmsg("Did not find prefix: /$prefix/", $origin);
        return;
    }

    my $open_at = pos($$src);

    unless ($$src =~ m/\G($opener)/gc)
    {
        _failmsg("Did not find opening bracket after prefix: \"$prefix\"",
                 pos($$src));
        pos($$src) = $origin;
        return;
    }

    my @stack = ( $1 );                 # brackets still waiting to be closed
    my $limit = length $$src;
    while (pos($$src) < $limit)
    {
        # A backslash hides the character that follows it.
        next if $$src =~ m/\G\\./gcs;

        if ($$src =~ m/\G($opener)/gc)
        {
            push @stack, $1;
        }
        elsif ($$src =~ m/\G($closer)/gc)
        {
            my $found = $1;
            unless (@stack)
            {
                _failmsg("Unmatched closing bracket: \"$found\"",
                         pos($$src));
                pos($$src) = $origin;
                return;
            }
            (my $expected = pop @stack) =~ tr/({[</)}]>/;
            if ($expected ne $found)
            {
                _failmsg(qq{Mismatched closing bracket: expected "$expected" but found "$found"},
                         pos($$src));
                pos($$src) = $origin;
                return;
            }
            last unless @stack;
        }
        elsif ($quotes && $$src =~ m/\G([$quotes])/gc)
        {
            my $quote = $1;
            next if $$src =~ m/\G[^\\$quote]*(?:\\.[^\\$quote]*)*(\Q$quote\E)/gsc;
            _failmsg("Unmatched embedded quote ($quote)",
                     pos($$src));
            pos($$src) = $origin;
            return;
        }
        elsif ($quotelike && _match_quotelike($src,"",1,0))
        {
            next;
        }
        else
        {
            # Nothing special here: step over a word or a single character.
            $$src =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs;
        }
    }

    if (@stack)
    {
        _failmsg("Unmatched opening bracket(s): "
                     . join("..",@stack)."..",
                 pos($$src));
        pos($$src) = $origin;
        return;
    }

    my $end = pos($$src);

    return (
        $origin,    $open_at-$origin,       # PREFIX
        $open_at,   1,                      # OPENING BRACKET
        $open_at+1, $end-$open_at-2,        # CONTENTS
        $end-1,     1,                      # CLOSING BRACKET
        $end,       length($$src)-$end,     # REMAINDER
    );
}
268
# Return the closing counterpart of a run of opening brackets: the string
# is reversed and every opening bracket is mapped to its closing form.
sub revbracket($)
{
    (my $closers = reverse $_[0]) =~ tr/[({</])}>/;
    return $closers;
}
275
276my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
277
# Extract a substring enclosed by an opening and a closing tag.
#   $_[0] - text (defaults to $_)
#   $_[1] - opening tag pattern (defaults to any <...> tag)
#   $_[2] - closing tag pattern (derived from the opening tag if undefined)
#   $_[3] - prefix pattern to skip (defaults to optional whitespace)
#   $_[4] - options hash: 'reject' and 'ignore' (a pattern or an array of
#           patterns each) and 'fail' (failure mode, e.g. 'MAX' or 'PARA')
# Returns via _succeed()/_fail().
sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
{
    my $src = defined $_[0] ? \$_[0] : \$_;
    my ($ldel, $rdel, $prefix, $opts) = @_[1..4];
    $prefix = '\s*' unless defined $prefix;

    my %options = defined $opts ? %$opts : ();
    my $omode   = defined $options{fail} ? $options{fail} : '';

    # 'reject' and 'ignore' each accept one pattern or a list of them.
    my ($bad, $ignore) = map {
        my $spec = $options{$_};
          ref($spec) eq 'ARRAY' ? join('|', @$spec)
        : defined($spec)        ? $spec
        :                         '';
    } qw(reject ignore);

    unless (defined $ldel)
    {
        $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>';
    }
    $@ = undef;

    my @match = _match_tagged($src, $prefix, $ldel, $rdel, $omode, $bad, $ignore);
    return _fail(wantarray, $src) unless @match;

    return _succeed(wantarray, $src,
                    $match[2], $match[3]+$match[5]+$match[7],   # MATCH
                    @match[8..9,0..1,2..7]);                    # REM, PRE, BITS
}
305
# Match a prefix, an opening tag, arbitrary (possibly nested) text and a
# closing tag at pos().
#   $textref - reference to the text
#   $pre     - prefix pattern
#   $ldel    - opening tag pattern
#   $rdel    - closing tag pattern; when undefined it is derived from the
#              text of the opening tag that was actually matched
#   $omode   - failure mode: 'MAX' accepts everything up to the point of
#              failure, 'PARA' accepts up to the first blank line, anything
#              else makes the match fail
#   $bad     - pattern that must not occur inside the tagged text
#   $ignore  - pattern that is skipped inside the tagged text
# On success returns (position, length) pairs for PREFIX, OPENING TAG,
# TEXT, CLOSING TAG and REMAINDER.  On failure records the reason with
# _failmsg(), restores pos() and returns nothing.
sub _match_tagged       # ($$$$$$$)
{
    my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
    my $rdelspec;

    my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos)
        = ( pos($$textref) = pos($$textref)||0 );

    unless ($$textref =~ m/\G($pre)/gc)
    {
        _failmsg("Did not find prefix: /$pre/", pos $$textref);
        goto failed;
    }

    $opentagpos = pos($$textref);

    unless ($$textref =~ m/\G$ldel/gc)
    {
        _failmsg("Did not find opening tag: /$ldel/", pos $$textref);
        goto failed;
    }

    $textpos = pos($$textref);

    if (!defined $rdel)
    {
        # Start from the text of the opening tag (the same text $& would
        # give, since the match was anchored at $opentagpos) and turn it
        # into the matching closing tag, e.g. <B ...> becomes </B>.
        $rdelspec = substr($$textref, $opentagpos, $textpos-$opentagpos);
        unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
        {
            # $rdel is undefined in this branch, so report the opening tag.
            _failmsg("Unable to construct closing tag to match: $rdelspec",
                     pos $$textref);
            goto failed;
        }
    }
    else
    {
        # NOTE(review): string eval of a caller-supplied pattern; do not
        # pass a closing tag that comes from untrusted input.
        $rdelspec = eval "qq{$rdel}";
    }

    while (pos($$textref) < length($$textref))
    {
        next if $$textref =~ m/\G\\./gc;

        if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
        {
            # Remember where the first paragraph break is, for 'PARA' mode.
            $parapos = pos($$textref) - length($1)
                unless defined $parapos;
        }
        elsif ($$textref =~ m/\G($rdelspec)/gc )
        {
            $closetagpos = pos($$textref)-length($1);
            goto matched;
        }
        elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
        {
            next;
        }
        elsif ($bad && $$textref =~ m/\G($bad)/gcs)
        {
            pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
            goto short if ($omode eq 'PARA' || $omode eq 'MAX');
            _failmsg("Found invalid nested tag: $1", pos $$textref);
            goto failed;
        }
        elsif ($$textref =~ m/\G($ldel)/gc)
        {
            my $tag = $1;
            pos($$textref) -= length($tag);     # REWIND TO NESTED TAG
            # Test with defined(): in scalar context a successful match
            # yields the remainder length, which is 0 (false) when the
            # nested tag ends exactly at the end of the text.
            unless (defined(_match_tagged(@_)))     # MATCH NESTED TAG
            {
                goto short if $omode eq 'PARA' || $omode eq 'MAX';
                _failmsg("Found unbalanced nested tag: $tag",
                         pos $$textref);
                goto failed;
            }
        }
        else { $$textref =~ m/./gcs }
    }

short:
    $closetagpos = pos($$textref);
    goto matched if $omode eq 'MAX';
    goto failed unless $omode eq 'PARA';

    if (defined $parapos) { pos($$textref) = $parapos }
    else                  { $parapos = pos($$textref) }

    return (
        $startpos,   $opentagpos-$startpos,         # PREFIX
        $opentagpos, $textpos-$opentagpos,          # OPENING TAG
        $textpos,    $parapos-$textpos,             # TEXT
        $parapos,    0,                             # NO CLOSING TAG
        $parapos,    length($$textref)-$parapos,    # REMAINDER
    );

matched:
    $endpos = pos($$textref);
    return (
        $startpos,    $opentagpos-$startpos,        # PREFIX
        $opentagpos,  $textpos-$opentagpos,         # OPENING TAG
        $textpos,     $closetagpos-$textpos,        # TEXT
        $closetagpos, $endpos-$closetagpos,         # CLOSING TAG
        $endpos,      length($$textref)-$endpos,    # REMAINDER
    );

failed:
    _failmsg("Did not find closing tag", pos $$textref) unless $@;
    pos($$textref) = $startpos;
    return;
}
415
# Extract a Perl variable (with any trailing subscripts, dereferences or
# method calls) from the current pos() of the text.
#   $_[0] - text (defaults to $_)
#   $_[1] - prefix pattern to skip (defaults to optional whitespace)
# Returns via _succeed()/_fail(); returns three empty strings when the
# text itself is undefined.
sub extract_variable (;$$)
{
    my $src = defined $_[0] ? \$_[0] : \$_;
    return ("","","") unless defined $$src;

    my $prefix = defined $_[1] ? $_[1] : '\s*';
    my @span = _match_variable($src, $prefix);
    return _fail(wantarray, $src) unless @span;

    my ($pre_pos, $pre_len, $var_pos, $var_len, $rest_pos, $rest_len) = @span;
    return _succeed(wantarray, $src,
                    $var_pos,  $var_len,        # MATCH
                    $rest_pos, $rest_len,       # REMAINDER
                    $pre_pos,  $pre_len);       # PREFIX
}
429
# Match a prefix followed by a Perl variable at pos().
#   $textref - reference to the text
#   $pre     - prefix pattern
# On success returns (position, length) pairs for PREFIX, VARIABLE and
# REMAINDER.  On failure records the reason with _failmsg(), restores
# pos() and returns nothing.
#
# The helper matchers return a list whose last element (the remainder
# length) is 0 when the match ends exactly at the end of the text, so
# their results are tested with defined() rather than for truth.
sub _match_variable($$)
{
    my ($textref, $pre) = @_;
    my $startpos = pos($$textref) = pos($$textref)||0;
    unless ($$textref =~ m/\G($pre)/gc)
    {
        _failmsg("Did not find prefix: /$pre/", pos $$textref);
        return;
    }
    my $varpos = pos($$textref);
    unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc)
    {
        _failmsg("Did not find leading dereferencer", pos $$textref);
        pos $$textref = $startpos;
        return;
    }

    # Either a (possibly package-qualified) identifier or a {...} block.
    unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
            or defined(_match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)))
    {
        _failmsg("Bad identifier after dereferencer", pos $$textref);
        pos $$textref = $startpos;
        return;
    }

    # Swallow any number of trailing subscripts, dereferences and calls.
    while (1)
    {
        # ->[...], ->{...}, ->(...) or ->method(...); the method name may
        # be a single character.
        next if defined(_match_codeblock($textref,
                                qr/\s*->\s*(?:[_a-zA-Z]\w*\s*)?/,
                                qr/[({[]/, qr/[)}\]]/,
                                qr/[({[]/, qr/[)}\]]/, 0));
        # [...] or {...} directly after the variable
        next if defined(_match_codeblock($textref,
                                qr/\s*/, qr/[{[]/, qr/[}\]]/,
                                qr/[{[]/, qr/[}\]]/, 0));
        # ->$var style dereference
        next if defined(_match_variable($textref,'\s*->\s*'));
        # ->method without an argument list
        next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
        last;
    }

    my $endpos = pos($$textref);
    return ($startpos, $varpos-$startpos,
            $varpos,   $endpos-$varpos,
            $endpos,   length($$textref)-$endpos
           );
}
475
# Extract a bracket-delimited block of Perl-like code.
#   $_[0] - text (defaults to $_)
#   $_[1] - brackets allowed for nested blocks (defaults to '{')
#   $_[2] - prefix pattern to skip (defaults to optional whitespace)
#   $_[3] - brackets allowed for the outermost block (defaults to $_[1])
#   $_[4] - flag passed through unchanged to _match_codeblock()
# Returns via _succeed()/_fail().
sub extract_codeblock (;$$$$$)
{
    my $list_wanted = wantarray;
    my $src        = defined $_[0] ? \$_[0] : \$_;
    my $open_inner = defined $_[1] ? $_[1] : '{';
    my $prefix     = defined $_[2] ? $_[2] : '\s*';
    my $open_outer = defined $_[3] ? $_[3] : $open_inner;
    my $rd         = $_[4];
    my ($close_inner, $close_outer) = ($open_inner, $open_outer);

    # The loops below alias $_, so save and restore its pos().
    my $saved_pos = pos;

    # Reduce each specification to its opening (resp. closing) brackets ...
    for ($open_inner, $open_outer)   { tr/[]()<>{}\0-\377/[[((<<{{/ds }
    for ($close_inner, $close_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }

    # ... and turn each into a capturing alternation of quoted characters.
    for ($open_inner, $open_outer, $close_inner, $close_outer)
    {
        $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
    }
    pos = $saved_pos;

    my @match = _match_codeblock($src, $prefix,
                                 $open_outer, $close_outer,
                                 $open_inner, $close_inner,
                                 $rd);
    return _fail($list_wanted, $src) unless @match;

    return _succeed($list_wanted, $src,
                    @match[2..3,4..5,0..1]      # MATCH, REMAINDER, PREFIX
                   );
}
505
# Match a prefix followed by a bracketed block of Perl-like code at pos().
#   $textref     - reference to the text
#   $pre         - prefix pattern
#   $ldel_outer  - pattern for the opening bracket of the outermost block
#   $rdel_outer  - pattern for the closing bracket of the outermost block
#   $ldel_inner  - pattern for opening brackets of nested blocks
#   $rdel_inner  - pattern for closing brackets of nested blocks
#   $rd          - when true, the directives (?), (s?) and (s) are skipped
#                  as units
# On success returns (position, length) pairs for PREFIX, CODE BLOCK and
# REMAINDER.  On failure records the reason with _failmsg() and returns
# nothing.
sub _match_codeblock($$$$$$$)
{
    my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
    my $startpos = pos($$textref) = pos($$textref) || 0;
    unless ($$textref =~ m/\G($pre)/gc)
    {
        _failmsg(qq{Did not match prefix /$pre/ at "} .
                     substr($$textref,pos($$textref),20) .
                     q{..."},
                 pos $$textref);
        return;
    }
    my $codepos = pos($$textref);
    unless ($$textref =~ m/\G($ldel_outer)/gc)  # OUTERMOST DELIMITER
    {
        _failmsg(qq{Did not find expected opening bracket at "} .
                     substr($$textref,pos($$textref),20) .
                     q{..."},
                 pos $$textref);
        pos $$textref = $startpos;
        return;
    }
    my $closing = $1;
    $closing =~ tr/([<{/)]>}/;
    my $matched;
    # True while a following / or ? may start a pattern rather than be an
    # operator.
    my $patvalid = 1;
    while (pos($$textref) < length($$textref))
    {
        $matched = '';
        if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
        {
            $patvalid = 0;
            next;
        }

        # Skip comments.
        if ($$textref =~ m/\G\s*#.*/gc)
        {
            next;
        }

        if ($$textref =~ m/\G\s*($rdel_outer)/gc)
        {
            unless ($matched = ($closing && $1 eq $closing) )
            {
                next if $1 eq '>';      # MIGHT BE A "LESS THAN"
                _failmsg(q{Mismatched closing bracket at "} .
                             substr($$textref,pos($$textref),20) .
                             qq{...". Expected '$closing'},
                         pos $$textref);
            }
            last;
        }

        if (_match_variable($textref,'\s*') ||
            _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
        {
            $patvalid = 0;
            next;
        }


        # NEED TO COVER MANY MORE CASES HERE!!!
        if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
                                | [!=]~
                                | =(?!>)
                                | (\*\*|&&|\|\||<<|>>)=?
                                | split|grep|map|return
                                )#gcx)
        {
            $patvalid = 1;
            next;
        }

        if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
        {
            $patvalid = 1;
            next;
        }

        if ($$textref =~ m/\G\s*$ldel_outer/gc)
        {
            _failmsg(q{Improperly nested codeblock at "} .
                         substr($$textref,pos($$textref),20) .
                         q{..."},
                     pos $$textref);
            last;
        }

        # Anything else: step over one word, arrow or character.
        $patvalid = 0;
        $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
    }
    continue { $@ = undef }     # errors from sub-matchers are not fatal here

    unless ($matched)
    {
        _failmsg('No match found for opening bracket', pos $$textref)
            unless $@;
        return;
    }

    my $endpos = pos($$textref);
    return ( $startpos, $codepos-$startpos,
             $codepos,  $endpos-$codepos,
             $endpos,   length($$textref)-$endpos,
           );
}
612
613
614my %mods = (
615 'none' => '[cgimsox]*',
616 'm' => '[cgimsox]*',
617 's' => '[cegimsox]*',
618 'tr' => '[cds]*',
619 'y' => '[cds]*',
620 'qq' => '',
621 'qx' => '',
622 'qw' => '',
623 'qr' => '[imsx]*',
624 'q' => '',
625 );
626
# Extract a Perl quote or quotelike operator (quoted strings, /pattern/,
# q, qq, qw, qx, qr, m, s, tr, y and here-docs).
#   $_[0] - text (defaults to $_)
#   $_[1] - prefix pattern to skip (defaults to optional whitespace)
# Returns via _succeed()/_fail().  In list context the usual
# (match, remainder, prefix) triple is followed by the individual
# components reported by _match_quotelike().
sub extract_quotelike (;$$)
{
    # Test definedness, not truth, so that the texts "0" and "" are used
    # rather than silently replaced by $_.
    my $textref = defined $_[0] ? \$_[0] : \$_;
    my $wantarray = wantarray;
    my $pre  = defined $_[1] ? $_[1] : '\s*';

    my @match = _match_quotelike($textref,$pre,1,0);
    return _fail($wantarray, $textref) unless @match;
    return _succeed($wantarray, $textref,
                    $match[2], $match[18]-$match[2],    # MATCH
                    @match[18,19],                      # REMAINDER
                    @match[0,1],                        # PREFIX
                    @match[2..17],                      # THE BITS
                    @match[20,21],                      # ANY FILLET?
                   );
}
643
# Match a prefix followed by a quote or quotelike operator at pos().
#   $textref  - reference to the text
#   $pre      - prefix pattern
#   $rawmatch - true if a bare /.../ counts as a pattern match
#   $qmark    - true if a bare ?...? counts as a pattern match
# On success returns (position, length) pairs for PREFIX, OPERATOR,
# LEFT DELIMITER, STRING/PATTERN, RIGHT DELIMITER, SECOND LEFT DELIMITER,
# SECOND STRING, SECOND RIGHT DELIMITER, MODIFIERS and REMAINDER; a
# here-doc adds one more pair for the "filleted" rest of the line that
# follows the <<LABEL.  On failure records the reason with _failmsg()
# (except when a delimited body is simply unterminated), restores pos()
# and returns nothing.
sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match, $allow_qmark)
{
    my ($textref, $pre, $rawmatch, $qmark) = @_;

    my ($textlen,$startpos,
        $oppos,
        $ld1pos,$str1pos,$rd1pos,
        $ld2pos,$str2pos,$rd2pos,
        $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );

    unless ($$textref =~ m/\G($pre)/gc)
    {
        _failmsg(qq{Did not find prefix /$pre/ at "} .
                     substr($$textref, pos($$textref), 20) .
                     q{..."},
                 pos $$textref);
        return;
    }
    $oppos = pos($$textref);

    my $initial = substr($$textref,$oppos,1);

    # Plain quotes, and raw /.../ or ?...? matches where allowed.
    if ($initial && $initial =~ m|^[\"\'\`]|
                 || $rawmatch && $initial =~ m|^/|
                 || $qmark && $initial =~ m|^\?|)
    {
        unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
        {
            _failmsg(qq{Did not find closing delimiter to match '$initial' at "} .
                         substr($$textref, $oppos, 20) .
                         q{..."},
                     pos $$textref);
            pos $$textref = $startpos;
            return;
        }
        $modpos= pos($$textref);
        $rd1pos = $modpos-1;

        if ($initial eq '/' || $initial eq '?')
        {
            $$textref =~ m/\G$mods{none}/gc
        }

        my $endpos = pos($$textref);
        return (
            $startpos,  $oppos-$startpos,       # PREFIX
            $oppos,     0,                      # NO OPERATOR
            $oppos,     1,                      # LEFT DEL
            $oppos+1,   $rd1pos-$oppos-1,       # STR/PAT
            $rd1pos,    1,                      # RIGHT DEL
            $modpos,    0,                      # NO 2ND LDEL
            $modpos,    0,                      # NO 2ND STR
            $modpos,    0,                      # NO 2ND RDEL
            $modpos,    $endpos-$modpos,        # MODIFIERS
            $endpos,    $textlen-$endpos,       # REMAINDER
        );
    }

    unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
    {
        _failmsg(q{No quotelike operator found after prefix at "} .
                     substr($$textref, pos($$textref), 20) .
                     q{..."},
                 pos $$textref);
        pos $$textref = $startpos;
        return;
    }

    my $op = $1;

    # Here-document.
    if ($op eq '<<') {
        $ld1pos = pos($$textref);
        my $label;
        if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
            $label = $1;
        }
        elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
                             | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
                             | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
                             }gcsx) {
            $label = $+;
        }
        else {
            $label = "";
        }
        my $extrapos = pos($$textref);
        $$textref =~ m{.*\n}gc;         # rest of the line: the "fillet"
        $str1pos = pos($$textref);
        # The label is literal text, not a pattern, so quote it; a quoted
        # label may contain regex metacharacters.
        unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
            _failmsg(qq{Missing here doc terminator ('$label') after "} .
                         substr($$textref, $startpos, 20) .
                         q{..."},
                     pos $$textref);
            pos $$textref = $startpos;
            return;
        }
        $rd1pos = pos($$textref);
        $$textref =~ m{\Q$label\E\n}gc;
        $ld2pos = pos($$textref);
        return (
            $startpos,  $oppos-$startpos,       # PREFIX
            $oppos,     length($op),            # OPERATOR
            $ld1pos,    $extrapos-$ld1pos,      # LEFT DEL
            $str1pos,   $rd1pos-$str1pos,       # STR/PAT
            $rd1pos,    $ld2pos-$rd1pos,        # RIGHT DEL
            $ld2pos,    0,                      # NO 2ND LDEL
            $ld2pos,    0,                      # NO 2ND STR
            $ld2pos,    0,                      # NO 2ND RDEL
            $ld2pos,    0,                      # NO MODIFIERS
            $ld2pos,    $textlen-$ld2pos,       # REMAINDER
            $extrapos,  $str1pos-$extrapos,     # FILLETED BIT
        );
    }

    $$textref =~ m/\G\s*/gc;
    $ld1pos = pos($$textref);
    $str1pos = $ld1pos+1;

    unless ($$textref =~ m/\G(\S)/gc)   # SHOULD USE LOOKAHEAD
    {
        _failmsg("No block delimiter found after quotelike $op",
                 pos $$textref);
        pos $$textref = $startpos;
        return;
    }
    pos($$textref) = $ld1pos;   # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
    my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
    if ($ldel1 =~ /[[(<{]/)
    {
        $rdel1 =~ tr/[({</])}>/;
        # Test with defined(): in scalar context a successful match yields
        # the remainder length, which is 0 (false) when the block ends
        # exactly at the end of the text.
        defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
            || do { pos $$textref = $startpos; return };
    }
    else
    {
        $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
            || do { pos $$textref = $startpos; return };
    }
    $ld2pos = $rd1pos = pos($$textref)-1;

    # s, tr and y take a second (replacement) block.
    my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
    if ($second_arg)
    {
        my ($ldel2, $rdel2);
        if ($ldel1 =~ /[[(<{]/)
        {
            unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
            {
                _failmsg("Missing second block for quotelike $op",
                         pos $$textref);
                pos $$textref = $startpos;
                return;
            }
            $ldel2 = $rdel2 = "\Q$1";
            $rdel2 =~ tr/[({</])}>/;
        }
        else
        {
            $ldel2 = $rdel2 = $ldel1;
        }
        $str2pos = $ld2pos+1;

        if ($ldel2 =~ /[[(<{]/)
        {
            pos($$textref)--;   # OVERCOME BROKEN LOOKAHEAD
            defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
                || do { pos $$textref = $startpos; return };
        }
        else
        {
            $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
                || do { pos $$textref = $startpos; return };
        }
        $rd2pos = pos($$textref)-1;
    }
    else
    {
        $ld2pos = $str2pos = $rd2pos = $rd1pos;
    }

    $modpos = pos $$textref;

    $$textref =~ m/\G($mods{$op})/gc;
    my $endpos = pos $$textref;

    return (
        $startpos,  $oppos-$startpos,       # PREFIX
        $oppos,     length($op),            # OPERATOR
        $ld1pos,    1,                      # LEFT DEL
        $str1pos,   $rd1pos-$str1pos,       # STR/PAT
        $rd1pos,    1,                      # RIGHT DEL
        $ld2pos,    $second_arg,            # 2ND LDEL (MAYBE)
        $str2pos,   $rd2pos-$str2pos,       # 2ND STR (MAYBE)
        $rd2pos,    $second_arg,            # 2ND RDEL (MAYBE)
        $modpos,    $endpos-$modpos,        # MODIFIERS
        $endpos,    $textlen-$endpos,       # REMAINDER
    );
}
842
843my $def_func =
844[
845 sub { extract_variable($_[0], '') },
846 sub { extract_quotelike($_[0],'') },
847 sub { extract_codeblock($_[0],'{}','') },
848];
849
# Split the text into fields by trying a list of extractors in turn at
# each position.
#   $_[0] - text (defaults to $_)
#   $_[1] - array of extractors: code refs, Text::Balanced::Extractor
#           objects, patterns, or single-pair hashes { class => extractor }
#           whose fields are blessed into that class (defaults to
#           $def_func)
#   $_[2] - maximum number of fields (forced to 1 outside list context)
#   $_[3] - when true, text that no extractor recognises is dropped rather
#           than returned as a field of its own
# List context: returns the fields and leaves pos() after the last one.
# Otherwise: returns the first field and cuts it out of the text when the
# text is writable.
sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
{
    my $textref = defined($_[0]) ? \$_[0] : \$_;
    my ($lastpos, $firstpos);
    my @fields = ();

    my @extractors   = defined $_[1] ? @{$_[1]} : @{$def_func};
    my $limit        = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
    my $skip_unknown = $_[3];

    unless (wantarray)
    {
        use Carp;
        carp "extract_multiple reset maximal count to 1 in scalar context"
            if $^W && defined($_[2]) && $limit > 1;
        $limit = 1
    }

    # Split { class => extractor } entries into two parallel lists.
    my @classes;
    foreach my $extractor (@extractors)
    {
        if (ref($extractor) eq 'HASH')
        {
            push @classes, (keys %$extractor)[0];
            $extractor = (values %$extractor)[0];
        }
        else
        {
            push @classes, undef;
        }
    }

    for ($$textref)     # alias the text to $_ for the matching below
    {
        pos ||= 0;

        my $unknown_from;       # start of the current unrecognised stretch

        FIELD: while (pos() < length())
        {
            my $field;
            foreach my $i ( 0..$#extractors )
            {
                my $extractor = $extractors[$i];
                my $class     = $classes[$i];
                $lastpos = pos;

                if (ref($extractor) eq 'CODE')
                    { ($field) = $extractor->($_) }
                elsif (ref($extractor) eq 'Text::Balanced::Extractor')
                    { $field = $extractor->extract($_) }
                elsif( m/\G$extractor/gc )
                    { $field = defined($1) ? $1 : $& }

                next unless defined($field) && length($field);

                # Flush any unrecognised text that precedes this field.
                if (defined($unknown_from) && !$skip_unknown)
                {
                    push @fields, substr($_, $unknown_from, $lastpos-$unknown_from);
                    $firstpos = $unknown_from unless defined $firstpos;
                    undef $unknown_from;
                    last FIELD if @fields == $limit;
                }

                push @fields, $class
                                ? bless(\$field, $class)
                                : $field;
                $firstpos = $lastpos unless defined $firstpos;
                $lastpos = pos;
                last FIELD if @fields == $limit;
                next FIELD;
            }

            # No extractor matched here: step over one character.
            if (/\G(.)/gcs)
            {
                $unknown_from = pos()-1
                    unless $skip_unknown || defined $unknown_from;
            }
        }

        if (defined $unknown_from)
        {
            push @fields, substr($_, $unknown_from);
            $firstpos = $unknown_from unless defined $firstpos;
            $lastpos = length;
        }
    }

    pos($$textref) = $lastpos;
    return @fields if wantarray;

    $firstpos ||= 0;
    # The text may be read-only, hence the eval.
    eval {
        substr($$textref,$firstpos,$lastpos-$firstpos)="";
        pos($$textref) = $firstpos;
    };
    return $fields[0];
}
948
949
# Build a reusable tag extractor with its patterns compiled once.
#   $_[0] - opening tag pattern (defaults to any <...> tag)
#   $_[1] - closing tag pattern (derived from the opening tag if undefined)
#   $_[2] - prefix pattern to skip (defaults to optional whitespace)
#   $_[3] - options hash: 'reject', 'ignore' and 'fail', as for
#           extract_tagged()
# Returns a code reference blessed into Text::Balanced::Extractor; it is
# called with the text (defaulting to $_) and returns via
# _succeed()/_fail().
sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
{
    my ($ldel, $rdel, $prefix, $opts) = @_[0..3];
    $prefix = '\s*' unless defined $prefix;

    my %options = defined $opts ? %$opts : ();
    my $omode   = defined $options{fail} ? $options{fail} : '';

    # 'reject' and 'ignore' each accept one pattern or a list of them.
    my ($bad, $ignore) = map {
        my $spec = $options{$_};
          ref($spec) eq 'ARRAY' ? join('|', @$spec)
        : defined($spec)        ? $spec
        :                         '';
    } qw(reject ignore);

    unless (defined $ldel)
    {
        $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>';
    }

    # Precompile the non-empty patterns; the loop aliases $_, so save and
    # restore its pos().
    my $saved_pos = pos;
    for ($ldel, $prefix, $bad, $ignore) { $_ = qr/$_/ if $_ }
    pos = $saved_pos;

    my $extractor = sub
    {
        my $src = defined $_[0] ? \$_[0] : \$_;
        my @match = Text::Balanced::_match_tagged($src, $prefix, $ldel, $rdel, $omode, $bad, $ignore);

        return _fail(wantarray, $src) unless @match;
        return _succeed(wantarray, $src,
                        $match[2], $match[3]+$match[5]+$match[7],   # MATCH
                        @match[8..9,0..1,2..7]);                    # REM, PRE, BITS
    };

    return bless $extractor, 'Text::Balanced::Extractor';
}
985
986package Text::Balanced::Extractor;
987
# Invoke the extractor (a blessed code reference) on the given text,
# in the caller's context.
sub extract($$) # ($self, $text)
{
    return $_[0]->($_[1]);
}
992
993package Text::Balanced::ErrorMsg;
994
995use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
996
9971;
55a1c97c
JH
998
999__END__
1000
1001=head1 NAME
1002
1003Text::Balanced - Extract delimited text sequences from strings.
1004
1005
1006=head1 SYNOPSIS
1007
1008 use Text::Balanced qw (
1009 extract_delimited
1010 extract_bracketed
1011 extract_quotelike
1012 extract_codeblock
1013 extract_variable
1014 extract_tagged
1015 extract_multiple
1016
1017 gen_delimited_pat
1018 gen_extract_tagged
1019 );
1020
1021 # Extract the initial substring of $text that is delimited by
1022 # two (unescaped) instances of the first character in $delim.
1023
1024 ($extracted, $remainder) = extract_delimited($text,$delim);
1025
1026
1027 # Extract the initial substring of $text that is bracketed
1028 # with a delimiter(s) specified by $delim (where the string
1029 # in $delim contains one or more of '(){}[]<>').
1030
1031 ($extracted, $remainder) = extract_bracketed($text,$delim);
1032
1033
1034 # Extract the initial substring of $text that is bounded by
1035 # an HTML/XML tag.
1036
1037 ($extracted, $remainder) = extract_tagged($text);
1038
1039
1040 # Extract the initial substring of $text that is bounded by
1041 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1042
1043 ($extracted, $remainder) =
1044 extract_tagged($text,"BEGIN","END",undef,{reject=>["BEGIN"]});
1045
1046
1047 # Extract the initial substring of $text that represents a
1048 # Perl "quote or quote-like operation"
1049
1050 ($extracted, $remainder) = extract_quotelike($text);
1051
1052
1053 # Extract the initial substring of $text that represents a block
1054 # of Perl code, bracketed by any of character(s) specified by $delim
1055 # (where the string $delim contains one or more of '(){}[]<>').
1056
1057 ($extracted, $remainder) = extract_codeblock($text,$delim);
1058
1059
1060 # Extract the initial substrings of $text that would be extracted by
1061 # one or more sequential applications of the specified functions
1062 # or regular expressions
1063
1064 @extracted = extract_multiple($text,
1065 [ \&extract_bracketed,
1066 \&extract_quotelike,
1067 \&some_other_extractor_sub,
1068 qr/[xyz]*/,
1069 'literal',
1070 ]);
1071
1072 # Create a string representing an optimized pattern (a la Friedl)
1073 # that matches a substring delimited by any of the specified characters
1074 # (in this case: any type of quote or a slash)
1075
1076 $patstring = gen_delimited_pat(q{'"`/});
1077
1078
1079 # Generate a reference to an anonymous sub that is just like extract_tagged
1080 # but pre-compiled and optimized for a specific pair of tags, and consequently
1081 # much faster (i.e. 3 times faster). It uses qr// for better performance on
1082 # repeated calls, so it only works under Perl 5.005 or later.
1083
1084 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1085
1086 ($extracted, $remainder) = $extract_head->($text);
1087
1088
1089=head1 DESCRIPTION
1090
1091The various C<extract_...> subroutines may be used to extract a
1092delimited string (possibly after skipping a specified prefix string).
1093The search for the string always begins at the current C<pos>
1094location of the string's variable (or at index zero, if no C<pos>
1095position is defined).
1096
1097=head2 General behaviour in list contexts
1098
1099In a list context, all the subroutines return a list, the first three
1100elements of which are always:
1101
1102=over 4
1103
1104=item [0]
1105
1106The extracted string, including the specified delimiters.
1107If the extraction fails an empty string is returned.
1108
1109=item [1]
1110
1111The remainder of the input string (i.e. the characters after the
1112extracted string). On failure, the entire string is returned.
1113
1114=item [2]
1115
1116The skipped prefix (i.e. the characters before the extracted string).
1117On failure, the empty string is returned.
1118
1119=back
1120
1121Note that in a list context, the contents of the original input text (the first
1122argument) are not modified in any way.
1123
1124However, if the input text was passed in a variable, that variable's
1125C<pos> value is updated to point at the first character after the
1126extracted text. That means that in a list context the various
1127subroutines can be used much like regular expressions. For example:
1128
1129 while ( $next = (extract_quotelike($text))[0] )
1130 {
1131 # process next quote-like (in $next)
1132 }
1133
1134
1135=head2 General behaviour in scalar and void contexts
1136
1137In a scalar context, the extracted string is returned, having first been
1138removed from the input text. Thus, the following code also processes
1139each quote-like operation, but actually removes them from $text:
1140
1141 while ( $next = extract_quotelike($text) )
1142 {
1143 # process next quote-like (in $next)
1144 }
1145
1146Note that if the input text is a read-only string (i.e. a literal),
1147no attempt is made to remove the extracted text.
1148
1149In a void context the behaviour of the extraction subroutines is
1150exactly the same as in a scalar context, except (of course) that the
1151extracted substring is not returned.
1152
1153=head2 A note about prefixes
1154
1155Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.).
1156This can bite you if you're expecting a prefix specification like
1157'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1158pattern will only succeed if the <H1> tag is on the current line, since
1159C<.> normally doesn't match newlines.
1160
1161To overcome this limitation, you need to turn on /s matching within
1162the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1163
1164
1165=head2 C<extract_delimited>
1166
1167The C<extract_delimited> function formalizes the common idiom
1168of extracting a single-character-delimited substring from the start of
1169a string. For example, to extract a single-quote delimited string, the
1170following code is typically used:
1171
1172 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1173 $extracted = $1;
1174
1175but with C<extract_delimited> it can be simplified to:
1176
1177 ($extracted,$remainder) = extract_delimited($text, "'");
1178
1179C<extract_delimited> takes up to four scalars (the input text, the
1180delimiters, a prefix pattern to be skipped, and any escape characters)
1181and extracts the initial substring of the text that
1182is appropriately delimited. If the delimiter string has multiple
1183characters, the first one encountered in the text is taken to delimit
1184the substring.
1185The third argument specifies a prefix pattern that is to be skipped
1186(but must be present!) before the substring is extracted.
1187The final argument specifies the escape character to be used for each
1188delimiter.
1189
1190All arguments are optional. If the escape characters are not specified,
1191every delimiter is escaped with a backslash (C<\>).
1192If the prefix is not specified, the
1193pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1194is also not specified, the set C</["'`]/> is used. If the text to be processed
1195is not specified either, C<$_> is used.
1196
d1be9408 1197In list context, C<extract_delimited> returns an array of three
55a1c97c
JH
1198elements, the extracted substring (I<including the surrounding
1199delimiters>), the remainder of the text, and the skipped prefix (if
1200any). If a suitable delimited substring is not found, the first
1201element of the array is the empty string, the second is the complete
1202original text, and the prefix returned in the third element is an
1203empty string.
1204
1205In a scalar context, just the extracted substring is returned. In
1206a void context, the extracted substring (and any prefix) are simply
1207removed from the beginning of the first argument.
1208
1209Examples:
1210
1211 # Remove a single-quoted substring from the very beginning of $text:
1212
1213 $substring = extract_delimited($text, "'", '');
1214
1215 # Remove a single-quoted Pascalish substring (i.e. one in which
1216 # doubling the quote character escapes it) from the very
1217 # beginning of $text:
1218
1219 $substring = extract_delimited($text, "'", '', "'");
1220
1221 # Extract a single- or double- quoted substring from the
1222 # beginning of $text, optionally after some whitespace
1223 # (note the list context to protect $text from modification):
1224
1225 ($substring) = extract_delimited $text, q{"'};
1226
1227
1228 # Delete the substring delimited by the first '/' in $text:
1229
1230 $text = join '', (extract_delimited($text,'/','[^/]*'))[2,1];
1231
1232Note that this last example is I<not> the same as deleting the first
1233quote-like pattern. For instance, if C<$text> contained the string:
1234
1235 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1236
1237then after the deletion it would contain:
1238
1239 "if ('.$UNIXCMD/s) { $cmd = $1; }"
1240
1241not:
1242
1243 "if ('./cmd' =~ ms) { $cmd = $1; }"
1244
1245
1246See L<"extract_quotelike"> for a (partial) solution to this problem.
1247
1248
1249=head2 C<extract_bracketed>
1250
1251Like C<"extract_delimited">, the C<extract_bracketed> function takes
1252up to three optional scalar arguments: a string to extract from, a delimiter
1253specifier, and a prefix pattern. As before, a missing prefix defaults to
1254optional whitespace and a missing text defaults to C<$_>. However, a missing
1255delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1256
1257C<extract_bracketed> extracts a balanced-bracket-delimited
1258substring (using any one (or more) of the user-specified delimiter
1259brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1260respect quoted unbalanced brackets (see below).
1261
1262A "delimiter bracket" is a bracket in the list of delimiters passed as
1263C<extract_bracketed>'s second argument. Delimiter brackets are
1264specified by giving either the left or right (or both!) versions
1265of the required bracket(s). Note that the order in which
1266two or more delimiter brackets are specified is not significant.
1267
1268A "balanced-bracket-delimited substring" is a substring bounded by
1269matched brackets, such that any other (left or right) delimiter
1270bracket I<within> the substring is also matched by an opposite
1271(right or left) delimiter bracket I<at the same level of nesting>. Any
1272type of bracket not in the delimiter list is treated as an ordinary
1273character.
1274
1275In other words, each type of bracket specified as a delimiter must be
1276balanced and correctly nested within the substring, and any other kind of
1277("non-delimiter") bracket in the substring is ignored.
1278
1279For example, given the string:
1280
1281 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1282
1283then a call to C<extract_bracketed> in a list context:
1284
1285 @result = extract_bracketed( $text, '{}' );
1286
1287would return:
1288
1289 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1290
1291since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1292(In a scalar context just the first element of the array would be returned. In
1293a void context, C<$text> would be replaced by an empty string.)
1294
1295Likewise the call in:
1296
1297 @result = extract_bracketed( $text, '{[' );
1298
1299would return the same result, since all sets of both types of specified
1300delimiter brackets are correctly nested and balanced.
1301
1302However, the call in:
1303
1304 @result = extract_bracketed( $text, '{([<' );
1305
1306would fail, returning:
1307
1308 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
1309
1310because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1311the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1312return an empty string. In a void context, C<$text> would be unchanged.)
1313
1314Note that the embedded single-quotes in the string don't help in this
1315case, since they have not been specified as acceptable delimiters and are
1316therefore treated as non-delimiter characters (and ignored).
1317
1318However, if a particular species of quote character is included in the
1319delimiter specification, then that type of quote will be correctly handled.
1320For example, if C<$text> is:
1321
1322 $text = '<A HREF=">>>>">link</A>';
1323
1324then
1325
1326 @result = extract_bracketed( $text, '<">' );
1327
1328returns:
1329
1330 ( '<A HREF=">>>>">', 'link</A>', "" )
1331
1332as expected. Without the specification of C<"> as an embedded quoter:
1333
1334 @result = extract_bracketed( $text, '<>' );
1335
1336the result would be:
1337
1338 ( '<A HREF=">', '>>>">link</A>', "" )
1339
1340In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1341quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1342letter 'q' as a delimiter. Hence:
1343
1344 @result = extract_bracketed( $text, '<q>' );
1345
1346would correctly match something like this:
1347
1348 $text = '<leftop: conj /and/ conj>';
1349
1350See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1351
1352
1353=head2 C<extract_tagged>
1354
1355C<extract_tagged> extracts and segments text between (balanced)
1356specified tags.
1357
1358The subroutine takes up to five optional arguments:
1359
1360=over 4
1361
1362=item 1.
1363
1364A string to be processed (C<$_> if the string is omitted or C<undef>)
1365
1366=item 2.
1367
1368A string specifying a pattern to be matched as the opening tag.
1369If the pattern string is omitted (or C<undef>) then a pattern
1370that matches any standard HTML/XML tag is used.
1371
1372=item 3.
1373
1374A string specifying a pattern to be matched at the closing tag.
1375If the pattern string is omitted (or C<undef>) then the closing
1376tag is constructed by inserting a C</> after any leading bracket
1377characters in the actual opening tag that was matched (I<not> the pattern
1378that matched the tag). For example, if the opening tag pattern
1379is specified as C<'{{\w+}}'> and actually matched the opening tag
1380C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1381
1382=item 4.
1383
1384A string specifying a pattern to be matched as a prefix (which is to be
1385skipped). If omitted, optional whitespace is skipped.
1386
1387=item 5.
1388
1389A hash reference containing various parsing options (see below)
1390
1391=back
1392
1393The various options that can be specified are:
1394
1395=over 4
1396
1397=item C<reject =E<gt> $listref>
1398
1399The list reference contains one or more strings specifying patterns
1400that must I<not> appear within the tagged text.
1401
1402For example, to extract
1403an HTML link (which should not contain nested links) use:
1404
1405 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1406
1407=item C<ignore =E<gt> $listref>
1408
1409The list reference contains one or more strings specifying patterns
1410that are I<not> to be treated as nested tags within the tagged text
1411(even if they would match the start tag pattern).
1412
1413For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1414
1415 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1416
1417(also see L<"gen_delimited_pat"> below).
1418
1419
1420=item C<fail =E<gt> $str>
1421
1422The C<fail> option indicates the action to be taken if a matching end
1423tag is not encountered (i.e. before the end of the string or some
1424C<reject> pattern matches). By default, a failure to match a closing
1425tag causes C<extract_tagged> to immediately fail.
1426
1427However, if the string value associated with C<fail> is "MAX", then
1428C<extract_tagged> returns the complete text up to the point of failure.
1429If the string is "PARA", C<extract_tagged> returns only the first paragraph
1430after the tag (up to the first line that is either empty or contains
1431only whitespace characters).
d1be9408 1432If the string is "", the default behaviour (i.e. failure) is reinstated.
55a1c97c
JH
1433
1434For example, suppose the start tag "/para" introduces a paragraph, which then
1435continues until the next "/endpara" tag or until another "/para" tag is
1436encountered:
1437
1438 $text = "/para line 1\n\nline 3\n/para line 4";
1439
1440 extract_tagged($text, '/para', '/endpara', undef,
1441 {reject => '/para', fail => 'MAX'} );
1442
1443 # EXTRACTED: "/para line 1\n\nline 3\n"
1444
1445Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1446tag refers only to the immediately following paragraph:
1447
1448 $text = "/para line 1\n\nline 3\n/para line 4";
1449
1450 extract_tagged($text, '/para', '/endpara', undef,
1451 {reject => '/para', fail => 'PARA'} );
1452
1453 # EXTRACTED: "/para line 1\n"
1454
1455Note that the specified C<fail> behaviour applies to nested tags as well.
1456
1457=back
1458
1459On success in a list context, an array of 6 elements is returned. The elements are:
1460
1461=over 4
1462
1463=item [0]
1464
1465the extracted tagged substring (including the outermost tags),
1466
1467=item [1]
1468
1469the remainder of the input text,
1470
1471=item [2]
1472
1473the prefix substring (if any),
1474
1475=item [3]
1476
1477the opening tag
1478
1479=item [4]
1480
1481the text between the opening and closing tags
1482
1483=item [5]
1484
1485the closing tag (or "" if no closing tag was found)
1486
1487=back
1488
1489On failure, all of these values (except the remaining text) are C<undef>.
1490
1491In a scalar context, C<extract_tagged> returns just the complete
1492substring that matched a tagged text (including the start and end
1493tags). C<undef> is returned on failure. In addition, the original input
1494text has the returned substring (and any prefix) removed from it.
1495
1496In a void context, the input text just has the matched substring (and
1497any specified prefix) removed.
1498
1499
1500=head2 C<gen_extract_tagged>
1501
1502(Note: This subroutine is only available under Perl5.005)
1503
1504C<gen_extract_tagged> generates a new anonymous subroutine which
1505extracts text between (balanced) specified tags. In other words,
1506it generates a function identical in function to C<extract_tagged>.
1507
1508The difference between C<extract_tagged> and the anonymous
1509subroutines generated by
1510C<gen_extract_tagged>, is that those generated subroutines:
1511
1512=over 4
1513
1514=item *
1515
1516do not have to reparse tag specification or parsing options every time
1517they are called (whereas C<extract_tagged> has to effectively rebuild
1518its tag parser on every call);
1519
1520=item *
1521
1522make use of the new qr// construct to pre-compile the regexes they use
1523(whereas C<extract_tagged> uses standard string variable interpolation
1524to create tag-matching patterns).
1525
1526=back
1527
1528The subroutine takes up to four optional arguments (the same set as
1529C<extract_tagged> except for the string to be processed). It returns
1530a reference to a subroutine which in turn takes a single argument (the text to
1531be extracted from).
1532
1533In other words, the implementation of C<extract_tagged> is exactly
1534equivalent to:
1535
1536 sub extract_tagged
1537 {
1538 my $text = shift;
1539 $extractor = gen_extract_tagged(@_);
1540 return $extractor->($text);
1541 }
1542
1543(although C<extract_tagged> is not currently implemented that way, in order
1544to preserve pre-5.005 compatibility).
1545
1546Using C<gen_extract_tagged> to create extraction functions for specific tags
1547is a good idea if those functions are going to be called more than once, since
1548their performance is typically twice as good as the more general-purpose
1549C<extract_tagged>.
1550
1551
1552=head2 C<extract_quotelike>
1553
1554C<extract_quotelike> attempts to recognize, extract, and segment any
1555one of the various Perl quotes and quotelike operators (see
1556L<perlop(3)>). Nested backslashed delimiters, embedded balanced bracket
1557delimiters (for the quotelike operators), and trailing modifiers are
1558all caught. For example, in:
1559
1560 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1561
1562 extract_quotelike ' "You said, \"Use sed\"." '
1563
1564 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1565
1566 extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1567
1568the full Perl quotelike operations are all extracted correctly.
1569
1570Note too that, when using the /x modifier on a regex, any comment
1571containing the current pattern delimiter will cause the regex to be
1572immediately terminated. In other words:
1573
1574 'm /
1575 (?i) # CASE INSENSITIVE
1576 [a-z_] # LEADING ALPHABETIC/UNDERSCORE
1577 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1578 /x'
1579
1580will be extracted as if it were:
1581
1582 'm /
1583 (?i) # CASE INSENSITIVE
1584 [a-z_] # LEADING ALPHABETIC/'
1585
1586This behaviour is identical to that of the actual compiler.
1587
1588C<extract_quotelike> takes two arguments: the text to be processed and
1589a prefix to be matched at the very beginning of the text. If no prefix
1590is specified, optional whitespace is the default. If no text is given,
1591C<$_> is used.
1592
1593In a list context, an array of 11 elements is returned. The elements are:
1594
1595=over 4
1596
1597=item [0]
1598
1599the extracted quotelike substring (including trailing modifiers),
1600
1601=item [1]
1602
1603the remainder of the input text,
1604
1605=item [2]
1606
1607the prefix substring (if any),
1608
1609=item [3]
1610
1611the name of the quotelike operator (if any),
1612
1613=item [4]
1614
1615the left delimiter of the first block of the operation,
1616
1617=item [5]
1618
1619the text of the first block of the operation
1620(that is, the contents of
1621a quote, the regex of a match or substitution or the target list of a
1622translation),
1623
1624=item [6]
1625
1626the right delimiter of the first block of the operation,
1627
1628=item [7]
1629
1630the left delimiter of the second block of the operation
d1be9408 1631(that is, if it is an C<s>, C<tr>, or C<y>),
55a1c97c
JH
1632
1633=item [8]
1634
1635the text of the second block of the operation
1636(that is, the replacement of a substitution or the translation list
1637of a translation),
1638
1639=item [9]
1640
1641the right delimiter of the second block of the operation (if any),
1642
1643=item [10]
1644
1645the trailing modifiers on the operation (if any).
1646
1647=back
1648
1649For each of the fields marked "(if any)" the default value on success is
1650an empty string.
1651On failure, all of these values (except the remaining text) are C<undef>.
1652
1653
1654In a scalar context, C<extract_quotelike> returns just the complete substring
1655that matched a quotelike operation (or C<undef> on failure). In a scalar or
1656void context, the input text has the same substring (and any specified
1657prefix) removed.
1658
1659Examples:
1660
1661 # Remove the first quotelike literal that appears in text
1662
1663 $quotelike = extract_quotelike($text,'.*?');
1664
1665 # Replace one or more leading whitespace-separated quotelike
1666 # literals in $_ with "<QLL>"
1667
1668 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1669
1670
1671 # Isolate the search pattern in a quotelike operation from $text
1672
1673 ($op,$pat) = (extract_quotelike $text)[3,5];
1674 if ($op =~ /[ms]/)
1675 {
1676 print "search pattern: $pat\n";
1677 }
1678 else
1679 {
1680 print "$op is not a pattern matching operation\n";
1681 }
1682
1683
1684=head2 C<extract_quotelike> and "here documents"
1685
1686C<extract_quotelike> can successfully extract "here documents" from an input
1687string, but with an important caveat in list contexts.
1688
1689Unlike other types of quote-like literals, a here document is rarely
1690a contiguous substring. For example, a typical piece of code using
1691a here document might look like this:
1692
1693 <<'EOMSG' || die;
1694 This is the message.
1695 EOMSG
1696 exit;
1697
1698Given this as an input string in a scalar context, C<extract_quotelike>
1699would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1700leaving the string " || die;\nexit;" in the original variable. In other words,
1701the two separate pieces of the here document are successfully extracted and
1702concatenated.
1703
1704In a list context, C<extract_quotelike> would return the list
1705
1706=over 4
1707
1708=item [0]
1709
1710"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1711including fore and aft delimiters),
1712
1713=item [1]
1714
1715" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1716
1717=item [2]
1718
1719"" (i.e. the prefix substring -- trivial in this case),
1720
1721=item [3]
1722
1723"<<" (i.e. the "name" of the quotelike operator)
1724
1725=item [4]
1726
1727"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1728
1729=item [5]
1730
1731"This is the message.\n" (i.e. the text of the here document),
1732
1733=item [6]
1734
1735"EOMSG" (i.e. the right delimiter of the here document),
1736
1737=item [7..10]
1738
1739"" (a here document has no second left delimiter, second text, second right
1740delimiter, or trailing modifiers).
1741
1742=back
1743
1744However, the matching position of the input variable would be set to
1745"exit;" (i.e. I<after> the closing delimiter of the here document),
1746which would cause the earlier " || die;\nexit;" to be skipped in any
1747sequence of code fragment extractions.
1748
d1be9408 1749To avoid this problem, when it encounters a here document while
55a1c97c
JH
1750extracting from a modifiable string, C<extract_quotelike> silently
1751rearranges the string to an equivalent piece of Perl:
1752
1753 <<'EOMSG'
1754 This is the message.
1755 EOMSG
1756 || die;
1757 exit;
1758
1759in which the here document I<is> contiguous. It still leaves the
1760matching position after the here document, but now the rest of the line
1761on which the here document starts is not skipped.
1762
1763To prevent C<extract_quotelike> from mucking about with the input in this way
1764(this is the only case where a list-context C<extract_quotelike> does so),
1765you can pass the input variable as an interpolated literal:
1766
1767 $quotelike = extract_quotelike("$var");
1768
1769
1770=head2 C<extract_codeblock>
1771
1772C<extract_codeblock> attempts to recognize and extract a balanced
1773bracket delimited substring that may contain unbalanced brackets
1774inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1775is like a combination of C<"extract_bracketed"> and
1776C<"extract_quotelike">.
1777
1778C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1779a text to process, a set of delimiter brackets to look for, and a prefix to
1780match first. It also takes an optional fourth parameter, which allows the
1781outermost delimiter brackets to be specified separately (see below).
1782
1783Omitting the first argument (input text) means process C<$_> instead.
1784Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1785Omitting the third argument (prefix argument) implies optional whitespace at the start.
1786Omitting the fourth argument (outermost delimiter brackets) indicates that the
1787value of the second argument is to be used for the outermost delimiters.
1788
d1be9408 1789Once the prefix and the outermost opening delimiter bracket have been
55a1c97c
JH
1790recognized, code blocks are extracted by stepping through the input text and
1791trying the following alternatives in sequence:
1792
1793=over 4
1794
1795=item 1.
1796
1797Try and match a closing delimiter bracket. If the bracket was the same
1798species as the last opening bracket, return the substring to that
1799point. If the bracket was mismatched, return an error.
1800
1801=item 2.
1802
1803Try to match a quote or quotelike operator. If found, call
1804C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1805the error it returned. Otherwise go back to step 1.
1806
1807=item 3.
1808
1809Try to match an opening delimiter bracket. If found, call
1810C<extract_codeblock> recursively to eat the embedded block. If the
1811recursive call fails, return an error. Otherwise, go back to step 1.
1812
1813=item 4.
1814
1815Unconditionally match a bareword or any other single character, and
1816then go back to step 1.
1817
1818=back
1819
1820
1821Examples:
1822
1823 # Find a while loop in the text
1824
1825 if ($text =~ s/.*?while\s*\{/{/)
1826 {
1827 $loop = "while " . extract_codeblock($text);
1828 }
1829
1830 # Remove the first round-bracketed list (which may include
1831 # round- or curly-bracketed code blocks or quotelike operators)
1832
1833 extract_codeblock $text, "(){}", '[^(]*';
1834
1835
1836The ability to specify a different outermost delimiter bracket is useful
1837in some circumstances. For example, in the Parse::RecDescent module,
1838parser actions which are to be performed only on a successful parse
1839are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1840
1841 sentence: subject verb object
1842 <defer: {$::theVerb = $item{verb}} >
1843
1844Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1845within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1846
1847A deferred action like this:
1848
1849 <defer: {if ($count>10) {$count--}} >
1850
1851will be incorrectly parsed as:
1852
1853 <defer: {if ($count>
1854
1855because the "greater than" operator is interpreted as a closing delimiter.
1856
1857But, by extracting the directive using
1858S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1859the '>' character is only treated as a delimiter at the outermost
1860level of the code block, so the directive is parsed correctly.
1861
1862=head2 C<extract_multiple>
1863
1864The C<extract_multiple> subroutine takes a string to be processed and a
1865list of extractors (subroutines or regular expressions) to apply to that string.
1866
1867In a list context C<extract_multiple> returns a list of substrings
1868of the original string, as extracted by the specified extractors.
1869In a scalar context, C<extract_multiple> returns the first
1870substring successfully extracted from the original string. In both
1871scalar and void contexts the original string has the first successfully
1872extracted substring removed from it. In all contexts
1873C<extract_multiple> starts at the current C<pos> of the string, and
1874sets that C<pos> appropriately after it matches.
1875
d1be9408 1876Hence, the aim of a call to C<extract_multiple> in a list context
55a1c97c
JH
1877is to split the processed string into as many non-overlapping fields as
1878possible, by repeatedly applying each of the specified extractors
1879to the remainder of the string. Thus C<extract_multiple> is
1880a generalized form of Perl's C<split> subroutine.
1881
1882The subroutine takes up to four optional arguments:
1883
1884=over 4
1885
1886=item 1.
1887
1888A string to be processed (C<$_> if the string is omitted or C<undef>)
1889
1890=item 2.
1891
1892A reference to a list of subroutine references and/or qr// objects and/or
1893literal strings and/or hash references, specifying the extractors
1894to be used to split the string. If this argument is omitted (or
1895C<undef>) the list:
1896
1897 [
1898 sub { extract_variable($_[0], '') },
1899 sub { extract_quotelike($_[0],'') },
1900 sub { extract_codeblock($_[0],'{}','') },
1901 ]
1902
1903is used.
1904
1905
1906=item 3.
1907
d1be9408 1908A number specifying the maximum number of fields to return. If this
55a1c97c
JH
1909argument is omitted (or C<undef>), split continues as long as possible.
1910
1911If the third argument is I<N>, then extraction continues until I<N> fields
1912have been successfully extracted, or until the string has been completely
1913processed.
1914
1915Note that in scalar and void contexts the value of this argument is
1916automatically reset to 1 (under C<-w>, a warning is issued if the argument
1917has to be reset).
1918
1919=item 4.
1920
1921A value indicating whether unmatched substrings (see below) within the
1922text should be skipped or returned as fields. If the value is true,
1923such substrings are skipped. Otherwise, they are returned.
1924
1925=back
1926
1927The extraction process works by applying each extractor in
1928sequence to the text string. If the extractor is a subroutine it
1929is called in a list
1930context and is expected to return a list of a single element, namely
1931the extracted text.
1932Note that the value returned by an extractor subroutine need not bear any
1933relationship to the corresponding substring of the original text (see
1934examples below).
1935
1936If the extractor is a precompiled regular expression or a string,
1937it is matched against the text in a scalar context with a leading
1938'\G' and the gc modifiers enabled. The extracted value is either
1939$1 if that variable is defined after the match, or else the
1940complete match (i.e. $&).
1941
1942If the extractor is a hash reference, it must contain exactly one element.
1943The value of that element is one of the
1944above extractor types (subroutine reference, regular expression, or string).
1945The key of that element is the name of a class into which the successful
1946return value of the extractor will be blessed.
1947
1948If an extractor returns a defined value, that value is immediately
1949treated as the next extracted field and pushed onto the list of fields.
1950If the extractor was specified in a hash reference, the field is also
1951blessed into the appropriate class.
1952
1953If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
1954assumed to have failed to extract.
1955If none of the extractor subroutines succeeds, then one
1956character is extracted from the start of the text and the extraction
1957subroutines reapplied. Characters which are thus removed are accumulated and
1958eventually become the next field (unless the fourth argument is true, in which
d1be9408 1959case they are discarded).
55a1c97c
JH
1960
1961For example, the following extracts substrings that are valid Perl variables:
1962
1963 @fields = extract_multiple($text,
1964 [ sub { extract_variable($_[0]) } ],
1965 undef, 1);
1966
1967This example separates a text into fields which are quote delimited,
1968curly bracketed, and anything else. The delimited and bracketed
1969parts are also blessed to identify them (the "anything else" is unblessed):
1970
1971 @fields = extract_multiple($text,
1972 [
1973 { Delim => sub { extract_delimited($_[0],q{'"}) } },
1974 { Brack => sub { extract_bracketed($_[0],'{}') } },
1975 ]);
1976
1977This call extracts the next single substring that is a valid Perl quotelike
1978operator (and removes it from $text):
1979
1980 $quotelike = extract_multiple($text,
1981 [
1982 sub { extract_quotelike($_[0]) },
1983 ], undef, 1);
1984
1985Finally, here is yet another way to do comma-separated value parsing:
1986
1987 @fields = extract_multiple($csv_text,
1988 [
1989 sub { extract_delimited($_[0],q{'"}) },
1990 qr/([^,]+)(.*)/,
1991 ],
1992 undef,1);
1993
1994The list in the second argument means:
1995I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
1996The undef third argument means:
1997I<"...as many times as possible...">,
1998and the true value in the fourth argument means
1999I<"...discarding anything else that appears (i.e. the commas)">.
2000
2001If you wanted the commas preserved as separate fields (i.e. like split
2002does if your split pattern has capturing parentheses), you would
2003just make the last parameter undefined (or remove it).
2004
2005
2006=head2 C<gen_delimited_pat>
2007
2008The C<gen_delimited_pat> subroutine takes a single (string) argument and
2009builds a Friedl-style optimized regex that matches a string delimited
2010by any one of the characters in the single argument. For example:
2011
2012 gen_delimited_pat(q{'"})
2013
2014returns the regex:
2015
2016 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2017
2018Note that the specified delimiters are automatically quotemeta'd.
2019
2020A typical use of C<gen_delimited_pat> would be to build special purpose tags
2021for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2022(which might contain quoted strings):
2023
2024 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2025
2026 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2027
2028
2029C<gen_delimited_pat> may also be called with an optional second argument,
2030which specifies the "escape" character(s) to be used for each delimiter.
2031For example to match a Pascal-style string (where ' is the delimiter
2032and '' is a literal ' within the string):
2033
2034 gen_delimited_pat(q{'},q{'});
2035
2036Different escape characters can be specified for different delimiters.
2037For example, to specify that '/' is the escape for single quotes
2038and '%' is the escape for double quotes:
2039
2040 gen_delimited_pat(q{'"},q{/%});
2041
2042If more delimiters than escape chars are specified, the last escape char
2043is used for the remaining delimiters.
2044If no escape char is specified for a given specified delimiter, '\' is used.
2045
2046Note that
2047C<gen_delimited_pat> was previously called
2048C<delimited_pat>. That name may still be used, but is now deprecated.
2049
2050
2051=head1 DIAGNOSTICS
2052
2053In a list context, all the functions return C<(undef,$original_text)>
2054on failure. In a scalar context, failure is indicated by returning C<undef>
2055(in this case the input text is not modified in any way).
2056
2057In addition, on failure in I<any> context, the C<$@> variable is set.
2058Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2059below.
2060Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2061which the error was detected (although not necessarily where it occurred!).
2062Printing C<$@> directly produces the error message, with the offset appended.
2063On success, the C<$@> variable is guaranteed to be C<undef>.
2064
2065The available diagnostics are:
2066
2067=over 4
2068
2069=item C<Did not find a suitable bracket: "%s">
2070
2071The delimiter provided to C<extract_bracketed> was not one of
2072C<'()[]E<lt>E<gt>{}'>.
2073
2074=item C<Did not find prefix: /%s/>
2075
2076A non-optional prefix was specified but wasn't found at the start of the text.
2077
2078=item C<Did not find opening bracket after prefix: "%s">
2079
2080C<extract_bracketed> or C<extract_codeblock> was expecting a
2081particular kind of bracket at the start of the text, and didn't find it.
2082
2083=item C<No quotelike operator found after prefix: "%s">
2084
2085C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2086C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2087it was extracting.
2088
2089=item C<Unmatched closing bracket: "%c">
2090
2091C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2092a closing bracket where none was expected.
2093
2094=item C<Unmatched opening bracket(s): "%s">
2095
2096C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2097out of characters in the text before closing one or more levels of nested
2098brackets.
2099
2100=item C<Unmatched embedded quote (%s)>
2101
2102C<extract_bracketed> attempted to match an embedded quoted substring, but
2103failed to find a closing quote to match it.
2104
2105=item C<Did not find closing delimiter to match '%s'>
2106
2107C<extract_quotelike> was unable to find a closing delimiter to match the
2108one that opened the quote-like operation.
2109
2110=item C<Mismatched closing bracket: expected "%c" but found "%s">
2111
2112C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2113a valid bracket delimiter, but it was the wrong species. This usually
2114indicates a nesting error, but may indicate incorrect quoting or escaping.
2115
2116=item C<No block delimiter found after quotelike "%s">
2117
2118C<extract_quotelike> or C<extract_codeblock> found one of the
2119quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2120without a suitable block after it.
2121
2122=item C<Did not find leading dereferencer>
2123
2124C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2125a variable, but didn't find any of them.
2126
2127=item C<Bad identifier after dereferencer>
2128
2129C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2130character was not followed by a legal Perl identifier.
2131
2132=item C<Did not find expected opening bracket at %s>
2133
2134C<extract_codeblock> failed to find any of the outermost opening brackets
2135that were specified.
2136
2137=item C<Improperly nested codeblock at %s>
2138
2139A nested code block was found that started with a delimiter that was specified
2140as usable only as an outermost bracket.
2141
2142=item C<Missing second block for quotelike "%s">
2143
2144C<extract_codeblock> or C<extract_quotelike> found one of the
2145quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2146
2147=item C<No match found for opening bracket>
2148
2149C<extract_codeblock> failed to find a closing bracket to match the outermost
2150opening bracket.
2151
2152=item C<Did not find opening tag: /%s/>
2153
2154C<extract_tagged> did not find a suitable opening tag (after any specified
2155prefix was removed).
2156
2157=item C<Unable to construct closing tag to match: /%s/>
2158
2159C<extract_tagged> matched the specified opening tag and tried to
2160modify the matched text to produce a matching closing tag (because
2161none was specified). It failed to generate the closing tag, almost
2162certainly because the opening tag did not start with a
2163bracket of some kind.
2164
2165=item C<Found invalid nested tag: %s>
2166
2167C<extract_tagged> found a nested tag that appeared in the "reject" list
2168(and the failure mode was not "MAX" or "PARA").
2169
2170=item C<Found unbalanced nested tag: %s>
2171
2172C<extract_tagged> found a nested opening tag that was not matched by a
2173corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2174
2175=item C<Did not find closing tag>
2176
2177C<extract_tagged> reached the end of the text without finding a closing tag
2178to match the original opening tag (and the failure mode was not
2179"MAX" or "PARA").
2180
2181
2182
2183
2184=back
2185
2186
2187=head1 AUTHOR
2188
2189Damian Conway (damian@conway.org)
2190
2191
2192=head1 BUGS AND IRRITATIONS
2193
2194There are undoubtedly serious bugs lurking somewhere in this code, if
2195only because parts of it give the impression of understanding a great deal
2196more about Perl than they really do.
2197
2198Bug reports and other feedback are most welcome.
2199
2200
2201=head1 COPYRIGHT
2202
2203 Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2204 This module is free software. It may be used, redistributed
2205 and/or modified under the same terms as Perl itself.