This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Version bump and typo fix from change #28125.
[perl5.git] / lib / Text / Balanced.pm
CommitLineData
3270c621
JH
1# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2# FOR FULL DOCUMENTATION SEE Balanced.pod
3
4use 5.005;
5use strict;
6
7package Text::Balanced;
8
9use Exporter;
3270c621
JH
10use vars qw { $VERSION @ISA %EXPORT_TAGS };
11
62417cc9 12$VERSION = '1.98_01';
3270c621
JH
13@ISA = qw ( Exporter );
14
15%EXPORT_TAGS = ( ALL => [ qw(
16 &extract_delimited
17 &extract_bracketed
18 &extract_quotelike
19 &extract_codeblock
20 &extract_variable
21 &extract_tagged
22 &extract_multiple
23
24 &gen_delimited_pat
25 &gen_extract_tagged
26
27 &delimited_pat
28 ) ] );
29
30Exporter::export_ok_tags('ALL');
31
32# PROTOTYPES
33
34sub _match_bracketed($$$$$$);
35sub _match_variable($$);
36sub _match_codeblock($$$$$$$);
37sub _match_quotelike($$$$);
38
75c4c974
T
39sub carp {
40 require Carp; goto &Carp::carp;
41}
42
43sub croak {
44 require Carp; goto &Carp::croak;
45}
46
3270c621
JH
47# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
48
49sub _failmsg {
50 my ($message, $pos) = @_;
51 $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
52}
53
54sub _fail
55{
56 my ($wantarray, $textref, $message, $pos) = @_;
57 _failmsg $message, $pos if $message;
49c03c89 58 return (undef,$$textref,undef) if $wantarray;
3270c621
JH
59 return undef;
60}
61
62sub _succeed
63{
64 $@ = undef;
65 my ($wantarray,$textref) = splice @_, 0, 2;
66 my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
62417cc9 67 my ($startlen) = $_[5];
eb67bf7e 68 my $oppos = $_[6];
3270c621
JH
69 my $remainderpos = $_[2];
70 if ($wantarray)
71 {
72 my @res;
73 while (my ($from, $len) = splice @_, 0, 2)
74 {
75 push @res, substr($$textref,$from,$len);
76 }
77 if ($extralen) { # CORRECT FILLET
ce3ac4b6 78 my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
3270c621
JH
79 $res[1] = "$extra$res[1]";
80 eval { substr($$textref,$remainderpos,0) = $extra;
81 substr($$textref,$extrapos,$extralen,"\n")} ;
82 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
83 pos($$textref) = $remainderpos-$extralen+1; # RESET \G
84 }
85 else {
86 pos($$textref) = $remainderpos; # RESET \G
87 }
88 return @res;
89 }
90 else
91 {
92 my $match = substr($$textref,$_[0],$_[1]);
93 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
94 my $extra = $extralen
95 ? substr($$textref, $extrapos, $extralen)."\n" : "";
96 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
97 pos($$textref) = $_[4]; # RESET \G
98 return $match;
99 }
100}
101
102# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
103
104sub gen_delimited_pat($;$) # ($delimiters;$escapes)
105{
106 my ($dels, $escs) = @_;
107 return "" unless $dels =~ /\S/;
108 $escs = '\\' unless $escs;
109 $escs .= substr($escs,-1) x (length($dels)-length($escs));
110 my @pat = ();
111 my $i;
112 for ($i=0; $i<length $dels; $i++)
113 {
114 my $del = quotemeta substr($dels,$i,1);
115 my $esc = quotemeta substr($escs,$i,1);
116 if ($del eq $esc)
117 {
118 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
119 }
120 else
121 {
122 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
123 }
124 }
125 my $pat = join '|', @pat;
126 return "(?:$pat)";
127}
128
129*delimited_pat = \&gen_delimited_pat;
130
131
132# THE EXTRACTION FUNCTIONS
133
134sub extract_delimited (;$$$$)
135{
136 my $textref = defined $_[0] ? \$_[0] : \$_;
137 my $wantarray = wantarray;
138 my $del = defined $_[1] ? $_[1] : qq{\'\"\`};
139 my $pre = defined $_[2] ? $_[2] : '\s*';
140 my $esc = defined $_[3] ? $_[3] : qq{\\};
141 my $pat = gen_delimited_pat($del, $esc);
142 my $startpos = pos $$textref || 0;
143 return _fail($wantarray, $textref, "Not a delimited pattern", 0)
144 unless $$textref =~ m/\G($pre)($pat)/gc;
145 my $prelen = length($1);
146 my $matchpos = $startpos+$prelen;
147 my $endpos = pos $$textref;
148 return _succeed $wantarray, $textref,
149 $matchpos, $endpos-$matchpos, # MATCH
150 $endpos, length($$textref)-$endpos, # REMAINDER
151 $startpos, $prelen; # PREFIX
152}
153
154sub extract_bracketed (;$$$)
155{
156 my $textref = defined $_[0] ? \$_[0] : \$_;
157 my $ldel = defined $_[1] ? $_[1] : '{([<';
158 my $pre = defined $_[2] ? $_[2] : '\s*';
159 my $wantarray = wantarray;
160 my $qdel = "";
161 my $quotelike;
162 $ldel =~ s/'//g and $qdel .= q{'};
163 $ldel =~ s/"//g and $qdel .= q{"};
164 $ldel =~ s/`//g and $qdel .= q{`};
165 $ldel =~ s/q//g and $quotelike = 1;
166 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
167 my $rdel = $ldel;
168 unless ($rdel =~ tr/[({</])}>/)
169 {
170 return _fail $wantarray, $textref,
171 "Did not find a suitable bracket in delimiter: \"$_[1]\"",
172 0;
173 }
174 my $posbug = pos;
175 $ldel = join('|', map { quotemeta $_ } split('', $ldel));
176 $rdel = join('|', map { quotemeta $_ } split('', $rdel));
177 pos = $posbug;
178
179 my $startpos = pos $$textref || 0;
180 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
181
182 return _fail ($wantarray, $textref) unless @match;
183
184 return _succeed ( $wantarray, $textref,
185 $match[2], $match[5]+2, # MATCH
186 @match[8,9], # REMAINDER
187 @match[0,1], # PREFIX
188 );
189}
190
191sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
192{
193 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
194 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
195 unless ($$textref =~ m/\G$pre/gc)
196 {
197 _failmsg "Did not find prefix: /$pre/", $startpos;
198 return;
199 }
200
201 $ldelpos = pos $$textref;
202
203 unless ($$textref =~ m/\G($ldel)/gc)
204 {
205 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
206 pos $$textref;
207 pos $$textref = $startpos;
208 return;
209 }
210
211 my @nesting = ( $1 );
212 my $textlen = length $$textref;
213 while (pos $$textref < $textlen)
214 {
215 next if $$textref =~ m/\G\\./gcs;
216
217 if ($$textref =~ m/\G($ldel)/gc)
218 {
219 push @nesting, $1;
220 }
221 elsif ($$textref =~ m/\G($rdel)/gc)
222 {
223 my ($found, $brackettype) = ($1, $1);
224 if ($#nesting < 0)
225 {
226 _failmsg "Unmatched closing bracket: \"$found\"",
227 pos $$textref;
228 pos $$textref = $startpos;
229 return;
230 }
231 my $expected = pop(@nesting);
232 $expected =~ tr/({[</)}]>/;
233 if ($expected ne $brackettype)
234 {
235 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
236 pos $$textref;
237 pos $$textref = $startpos;
238 return;
239 }
240 last if $#nesting < 0;
241 }
242 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
243 {
9686a75b 244 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
3270c621
JH
245 _failmsg "Unmatched embedded quote ($1)",
246 pos $$textref;
247 pos $$textref = $startpos;
248 return;
249 }
250 elsif ($quotelike && _match_quotelike($textref,"",1,0))
251 {
252 next;
253 }
254
255 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
256 }
257 if ($#nesting>=0)
258 {
259 _failmsg "Unmatched opening bracket(s): "
260 . join("..",@nesting)."..",
261 pos $$textref;
262 pos $$textref = $startpos;
263 return;
264 }
265
266 $endpos = pos $$textref;
267
268 return (
269 $startpos, $ldelpos-$startpos, # PREFIX
270 $ldelpos, 1, # OPENING BRACKET
271 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
272 $endpos-1, 1, # CLOSING BRACKET
273 $endpos, length($$textref)-$endpos, # REMAINDER
274 );
275}
276
49c03c89 277sub _revbracket($)
3270c621
JH
278{
279 my $brack = reverse $_[0];
280 $brack =~ tr/[({</])}>/;
281 return $brack;
282}
283
284my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
285
286sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
287{
288 my $textref = defined $_[0] ? \$_[0] : \$_;
289 my $ldel = $_[1];
290 my $rdel = $_[2];
291 my $pre = defined $_[3] ? $_[3] : '\s*';
292 my %options = defined $_[4] ? %{$_[4]} : ();
293 my $omode = defined $options{fail} ? $options{fail} : '';
294 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
295 : defined($options{reject}) ? $options{reject}
296 : ''
297 ;
298 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
299 : defined($options{ignore}) ? $options{ignore}
300 : ''
301 ;
302
303 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
304 $@ = undef;
305
306 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
307
308 return _fail(wantarray, $textref) unless @match;
309 return _succeed wantarray, $textref,
310 $match[2], $match[3]+$match[5]+$match[7], # MATCH
311 @match[8..9,0..1,2..7]; # REM, PRE, BITS
312}
313
314sub _match_tagged # ($$$$$$$)
315{
316 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
317 my $rdelspec;
318
319 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
320
321 unless ($$textref =~ m/\G($pre)/gc)
322 {
323 _failmsg "Did not find prefix: /$pre/", pos $$textref;
324 goto failed;
325 }
326
327 $opentagpos = pos($$textref);
328
329 unless ($$textref =~ m/\G$ldel/gc)
330 {
331 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
332 goto failed;
333 }
334
335 $textpos = pos($$textref);
336
337 if (!defined $rdel)
338 {
48f821bf 339 $rdelspec = $&;
49c03c89 340 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
3270c621
JH
341 {
342 _failmsg "Unable to construct closing tag to match: $rdel",
343 pos $$textref;
344 goto failed;
345 }
346 }
347 else
348 {
48f821bf
JH
349 $rdelspec = eval "qq{$rdel}" || do {
350 my $del;
351 for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
352 { next if $rdel =~ /\Q$_/; $del = $_; last }
353 unless ($del) {
75c4c974 354 croak ("Can't interpolate right delimiter $rdel")
48f821bf
JH
355 }
356 eval "qq$del$rdel$del";
357 };
3270c621
JH
358 }
359
360 while (pos($$textref) < length($$textref))
361 {
362 next if $$textref =~ m/\G\\./gc;
363
364 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
365 {
366 $parapos = pos($$textref) - length($1)
367 unless defined $parapos;
368 }
369 elsif ($$textref =~ m/\G($rdelspec)/gc )
370 {
371 $closetagpos = pos($$textref)-length($1);
372 goto matched;
373 }
374 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
375 {
376 next;
377 }
378 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
379 {
380 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
381 goto short if ($omode eq 'PARA' || $omode eq 'MAX');
382 _failmsg "Found invalid nested tag: $1", pos $$textref;
383 goto failed;
384 }
385 elsif ($$textref =~ m/\G($ldel)/gc)
386 {
387 my $tag = $1;
388 pos($$textref) -= length($tag); # REWIND TO NESTED TAG
389 unless (_match_tagged(@_)) # MATCH NESTED TAG
390 {
391 goto short if $omode eq 'PARA' || $omode eq 'MAX';
392 _failmsg "Found unbalanced nested tag: $tag",
393 pos $$textref;
394 goto failed;
395 }
396 }
397 else { $$textref =~ m/./gcs }
398 }
399
400short:
401 $closetagpos = pos($$textref);
402 goto matched if $omode eq 'MAX';
403 goto failed unless $omode eq 'PARA';
404
405 if (defined $parapos) { pos($$textref) = $parapos }
406 else { $parapos = pos($$textref) }
407
408 return (
409 $startpos, $opentagpos-$startpos, # PREFIX
410 $opentagpos, $textpos-$opentagpos, # OPENING TAG
411 $textpos, $parapos-$textpos, # TEXT
412 $parapos, 0, # NO CLOSING TAG
413 $parapos, length($$textref)-$parapos, # REMAINDER
414 );
415
416matched:
417 $endpos = pos($$textref);
418 return (
419 $startpos, $opentagpos-$startpos, # PREFIX
420 $opentagpos, $textpos-$opentagpos, # OPENING TAG
421 $textpos, $closetagpos-$textpos, # TEXT
422 $closetagpos, $endpos-$closetagpos, # CLOSING TAG
423 $endpos, length($$textref)-$endpos, # REMAINDER
424 );
425
426failed:
427 _failmsg "Did not find closing tag", pos $$textref unless $@;
428 pos($$textref) = $startpos;
429 return;
430}
431
432sub extract_variable (;$$)
433{
434 my $textref = defined $_[0] ? \$_[0] : \$_;
435 return ("","","") unless defined $$textref;
436 my $pre = defined $_[1] ? $_[1] : '\s*';
437
438 my @match = _match_variable($textref,$pre);
439
440 return _fail wantarray, $textref unless @match;
441
442 return _succeed wantarray, $textref,
443 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
444}
445
446sub _match_variable($$)
447{
a7602084
JH
448# $#
449# $^
450# $$
3270c621
JH
451 my ($textref, $pre) = @_;
452 my $startpos = pos($$textref) = pos($$textref)||0;
453 unless ($$textref =~ m/\G($pre)/gc)
454 {
455 _failmsg "Did not find prefix: /$pre/", pos $$textref;
456 return;
457 }
458 my $varpos = pos($$textref);
48f821bf 459 unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
3270c621 460 {
a7602084
JH
461 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
462 {
3270c621
JH
463 _failmsg "Did not find leading dereferencer", pos $$textref;
464 pos $$textref = $startpos;
465 return;
a7602084
JH
466 }
467 my $deref = $1;
3270c621 468
a7602084
JH
469 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
470 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
471 or $deref eq '$#' or $deref eq '$$' )
472 {
3270c621
JH
473 _failmsg "Bad identifier after dereferencer", pos $$textref;
474 pos $$textref = $startpos;
475 return;
a7602084 476 }
3270c621
JH
477 }
478
479 while (1)
480 {
48f821bf 481 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
3270c621 482 next if _match_codeblock($textref,
2f250b7c 483 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
3270c621
JH
484 qr/[({[]/, qr/[)}\]]/,
485 qr/[({[]/, qr/[)}\]]/, 0);
486 next if _match_codeblock($textref,
487 qr/\s*/, qr/[{[]/, qr/[}\]]/,
488 qr/[{[]/, qr/[}\]]/, 0);
489 next if _match_variable($textref,'\s*->\s*');
490 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
491 last;
492 }
493
494 my $endpos = pos($$textref);
495 return ($startpos, $varpos-$startpos,
496 $varpos, $endpos-$varpos,
497 $endpos, length($$textref)-$endpos
498 );
499}
500
501sub extract_codeblock (;$$$$$)
502{
503 my $textref = defined $_[0] ? \$_[0] : \$_;
504 my $wantarray = wantarray;
505 my $ldel_inner = defined $_[1] ? $_[1] : '{';
506 my $pre = defined $_[2] ? $_[2] : '\s*';
507 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
508 my $rd = $_[4];
509 my $rdel_inner = $ldel_inner;
510 my $rdel_outer = $ldel_outer;
511 my $posbug = pos;
512 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
513 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
514 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
515 {
516 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
517 }
518 pos = $posbug;
519
520 my @match = _match_codeblock($textref, $pre,
521 $ldel_outer, $rdel_outer,
522 $ldel_inner, $rdel_inner,
523 $rd);
524 return _fail($wantarray, $textref) unless @match;
525 return _succeed($wantarray, $textref,
526 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
527 );
528
529}
530
531sub _match_codeblock($$$$$$$)
532{
533 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
534 my $startpos = pos($$textref) = pos($$textref) || 0;
535 unless ($$textref =~ m/\G($pre)/gc)
536 {
537 _failmsg qq{Did not match prefix /$pre/ at"} .
538 substr($$textref,pos($$textref),20) .
539 q{..."},
540 pos $$textref;
541 return;
542 }
543 my $codepos = pos($$textref);
544 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
545 {
546 _failmsg qq{Did not find expected opening bracket at "} .
547 substr($$textref,pos($$textref),20) .
548 q{..."},
549 pos $$textref;
550 pos $$textref = $startpos;
551 return;
552 }
553 my $closing = $1;
554 $closing =~ tr/([<{/)]>}/;
555 my $matched;
556 my $patvalid = 1;
557 while (pos($$textref) < length($$textref))
558 {
559 $matched = '';
560 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
561 {
562 $patvalid = 0;
563 next;
564 }
565
566 if ($$textref =~ m/\G\s*#.*/gc)
567 {
568 next;
569 }
570
571 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
572 {
573 unless ($matched = ($closing && $1 eq $closing) )
574 {
575 next if $1 eq '>'; # MIGHT BE A "LESS THAN"
576 _failmsg q{Mismatched closing bracket at "} .
577 substr($$textref,pos($$textref),20) .
578 qq{...". Expected '$closing'},
579 pos $$textref;
580 }
581 last;
582 }
583
584 if (_match_variable($textref,'\s*') ||
585 _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
586 {
587 $patvalid = 0;
588 next;
589 }
590
591
592 # NEED TO COVER MANY MORE CASES HERE!!!
ce940696
DM
593 # NB 'case' is included here, because in Switch.pm,
594 # it's followed by a term, not an op
595
48f821bf
JH
596 if ($$textref =~ m#\G\s*(?!$ldel_inner)
597 ( [-+*x/%^&|.]=?
55a1c97c 598 | [!=]~
3270c621
JH
599 | =(?!>)
600 | (\*\*|&&|\|\||<<|>>)=?
ce940696 601 | case|split|grep|map|return
48f821bf 602 | [([]
3270c621
JH
603 )#gcx)
604 {
605 $patvalid = 1;
606 next;
607 }
608
609 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
610 {
611 $patvalid = 1;
612 next;
613 }
614
615 if ($$textref =~ m/\G\s*$ldel_outer/gc)
616 {
617 _failmsg q{Improperly nested codeblock at "} .
618 substr($$textref,pos($$textref),20) .
619 q{..."},
620 pos $$textref;
621 last;
622 }
623
624 $patvalid = 0;
625 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
626 }
627 continue { $@ = undef }
628
629 unless ($matched)
630 {
631 _failmsg 'No match found for opening bracket', pos $$textref
632 unless $@;
633 return;
634 }
635
636 my $endpos = pos($$textref);
637 return ( $startpos, $codepos-$startpos,
638 $codepos, $endpos-$codepos,
639 $endpos, length($$textref)-$endpos,
640 );
641}
642
643
644my %mods = (
645 'none' => '[cgimsox]*',
646 'm' => '[cgimsox]*',
647 's' => '[cegimsox]*',
648 'tr' => '[cds]*',
649 'y' => '[cds]*',
650 'qq' => '',
651 'qx' => '',
652 'qw' => '',
653 'qr' => '[imsx]*',
654 'q' => '',
655 );
656
657sub extract_quotelike (;$$)
658{
659 my $textref = $_[0] ? \$_[0] : \$_;
660 my $wantarray = wantarray;
661 my $pre = defined $_[1] ? $_[1] : '\s*';
662
663 my @match = _match_quotelike($textref,$pre,1,0);
664 return _fail($wantarray, $textref) unless @match;
665 return _succeed($wantarray, $textref,
666 $match[2], $match[18]-$match[2], # MATCH
667 @match[18,19], # REMAINDER
668 @match[0,1], # PREFIX
669 @match[2..17], # THE BITS
670 @match[20,21], # ANY FILLET?
671 );
672};
673
674sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match)
675{
676 my ($textref, $pre, $rawmatch, $qmark) = @_;
677
678 my ($textlen,$startpos,
679 $oppos,
680 $preld1pos,$ld1pos,$str1pos,$rd1pos,
681 $preld2pos,$ld2pos,$str2pos,$rd2pos,
682 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
683
684 unless ($$textref =~ m/\G($pre)/gc)
685 {
686 _failmsg qq{Did not find prefix /$pre/ at "} .
687 substr($$textref, pos($$textref), 20) .
688 q{..."},
689 pos $$textref;
690 return;
691 }
692 $oppos = pos($$textref);
693
694 my $initial = substr($$textref,$oppos,1);
695
696 if ($initial && $initial =~ m|^[\"\'\`]|
697 || $rawmatch && $initial =~ m|^/|
698 || $qmark && $initial =~ m|^\?|)
699 {
9686a75b 700 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
3270c621
JH
701 {
702 _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
703 substr($$textref, $oppos, 20) .
704 q{..."},
705 pos $$textref;
706 pos $$textref = $startpos;
707 return;
708 }
709 $modpos= pos($$textref);
710 $rd1pos = $modpos-1;
711
712 if ($initial eq '/' || $initial eq '?')
713 {
714 $$textref =~ m/\G$mods{none}/gc
715 }
716
717 my $endpos = pos($$textref);
718 return (
719 $startpos, $oppos-$startpos, # PREFIX
720 $oppos, 0, # NO OPERATOR
721 $oppos, 1, # LEFT DEL
722 $oppos+1, $rd1pos-$oppos-1, # STR/PAT
723 $rd1pos, 1, # RIGHT DEL
724 $modpos, 0, # NO 2ND LDEL
725 $modpos, 0, # NO 2ND STR
726 $modpos, 0, # NO 2ND RDEL
727 $modpos, $endpos-$modpos, # MODIFIERS
728 $endpos, $textlen-$endpos, # REMAINDER
729 );
730 }
731
eb67bf7e 732 unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
3270c621
JH
733 {
734 _failmsg q{No quotelike operator found after prefix at "} .
735 substr($$textref, pos($$textref), 20) .
736 q{..."},
737 pos $$textref;
738 pos $$textref = $startpos;
739 return;
740 }
741
742 my $op = $1;
743 $preld1pos = pos($$textref);
744 if ($op eq '<<') {
745 $ld1pos = pos($$textref);
746 my $label;
747 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
748 $label = $1;
749 }
750 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
751 | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
752 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
9686a75b 753 }gcsx) {
3270c621
JH
754 $label = $+;
755 }
756 else {
757 $label = "";
758 }
759 my $extrapos = pos($$textref);
760 $$textref =~ m{.*\n}gc;
ce3ac4b6
DM
761 $str1pos = pos($$textref)--;
762 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
3270c621
JH
763 _failmsg qq{Missing here doc terminator ('$label') after "} .
764 substr($$textref, $startpos, 20) .
765 q{..."},
766 pos $$textref;
767 pos $$textref = $startpos;
768 return;
769 }
770 $rd1pos = pos($$textref);
ce3ac4b6 771 $$textref =~ m{\Q$label\E\n}gc;
3270c621
JH
772 $ld2pos = pos($$textref);
773 return (
774 $startpos, $oppos-$startpos, # PREFIX
775 $oppos, length($op), # OPERATOR
776 $ld1pos, $extrapos-$ld1pos, # LEFT DEL
777 $str1pos, $rd1pos-$str1pos, # STR/PAT
778 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL
779 $ld2pos, 0, # NO 2ND LDEL
780 $ld2pos, 0, # NO 2ND STR
781 $ld2pos, 0, # NO 2ND RDEL
782 $ld2pos, 0, # NO MODIFIERS
783 $ld2pos, $textlen-$ld2pos, # REMAINDER
784 $extrapos, $str1pos-$extrapos, # FILLETED BIT
785 );
786 }
787
788 $$textref =~ m/\G\s*/gc;
789 $ld1pos = pos($$textref);
790 $str1pos = $ld1pos+1;
791
792 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
793 {
794 _failmsg "No block delimiter found after quotelike $op",
795 pos $$textref;
796 pos $$textref = $startpos;
797 return;
798 }
799 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
800 my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
801 if ($ldel1 =~ /[[(<{]/)
802 {
803 $rdel1 =~ tr/[({</])}>/;
ce3ac4b6 804 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
3270c621
JH
805 || do { pos $$textref = $startpos; return };
806 }
807 else
808 {
9686a75b 809 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
3270c621
JH
810 || do { pos $$textref = $startpos; return };
811 }
812 $ld2pos = $rd1pos = pos($$textref)-1;
813
814 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
815 if ($second_arg)
816 {
817 my ($ldel2, $rdel2);
818 if ($ldel1 =~ /[[(<{]/)
819 {
820 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
821 {
822 _failmsg "Missing second block for quotelike $op",
823 pos $$textref;
824 pos $$textref = $startpos;
825 return;
826 }
827 $ldel2 = $rdel2 = "\Q$1";
828 $rdel2 =~ tr/[({</])}>/;
829 }
830 else
831 {
832 $ldel2 = $rdel2 = $ldel1;
833 }
834 $str2pos = $ld2pos+1;
835
836 if ($ldel2 =~ /[[(<{]/)
837 {
838 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
ce3ac4b6 839 defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
3270c621
JH
840 || do { pos $$textref = $startpos; return };
841 }
842 else
843 {
9686a75b 844 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
3270c621
JH
845 || do { pos $$textref = $startpos; return };
846 }
847 $rd2pos = pos($$textref)-1;
848 }
849 else
850 {
851 $ld2pos = $str2pos = $rd2pos = $rd1pos;
852 }
853
854 $modpos = pos $$textref;
855
856 $$textref =~ m/\G($mods{$op})/gc;
857 my $endpos = pos $$textref;
858
859 return (
860 $startpos, $oppos-$startpos, # PREFIX
861 $oppos, length($op), # OPERATOR
862 $ld1pos, 1, # LEFT DEL
863 $str1pos, $rd1pos-$str1pos, # STR/PAT
864 $rd1pos, 1, # RIGHT DEL
865 $ld2pos, $second_arg, # 2ND LDEL (MAYBE)
866 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
867 $rd2pos, $second_arg, # 2ND RDEL (MAYBE)
868 $modpos, $endpos-$modpos, # MODIFIERS
869 $endpos, $textlen-$endpos, # REMAINDER
870 );
871}
872
873my $def_func =
874[
875 sub { extract_variable($_[0], '') },
876 sub { extract_quotelike($_[0],'') },
877 sub { extract_codeblock($_[0],'{}','') },
878];
879
880sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
881{
882 my $textref = defined($_[0]) ? \$_[0] : \$_;
883 my $posbug = pos;
884 my ($lastpos, $firstpos);
885 my @fields = ();
886
a7602084 887 #for ($$textref)
3270c621
JH
888 {
889 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
890 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
891 my $igunk = $_[3];
892
a7602084 893 pos $$textref ||= 0;
3270c621
JH
894
895 unless (wantarray)
896 {
75c4c974 897 carp ("extract_multiple reset maximal count to 1 in scalar context")
3270c621
JH
898 if $^W && defined($_[2]) && $max > 1;
899 $max = 1
900 }
901
902 my $unkpos;
903 my $func;
904 my $class;
905
906 my @class;
907 foreach $func ( @func )
908 {
909 if (ref($func) eq 'HASH')
910 {
911 push @class, (keys %$func)[0];
912 $func = (values %$func)[0];
913 }
914 else
915 {
916 push @class, undef;
917 }
918 }
919
a7602084 920 FIELD: while (pos($$textref) < length($$textref))
3270c621 921 {
48f821bf 922 my ($field, $rem);
a7602084 923 my @bits;
3270c621
JH
924 foreach my $i ( 0..$#func )
925 {
a7602084 926 my $pref;
3270c621
JH
927 $func = $func[$i];
928 $class = $class[$i];
a7602084 929 $lastpos = pos $$textref;
3270c621 930 if (ref($func) eq 'CODE')
49c03c89 931 { ($field,$rem,$pref) = @bits = $func->($$textref) }
3270c621 932 elsif (ref($func) eq 'Text::Balanced::Extractor')
a7602084
JH
933 { @bits = $field = $func->extract($$textref) }
934 elsif( $$textref =~ m/\G$func/gc )
48f821bf 935 { @bits = $field = defined($1) ? $1 : $& }
a7602084 936 $pref ||= "";
3270c621
JH
937 if (defined($field) && length($field))
938 {
a7602084 939 if (!$igunk) {
ce3ac4b6 940 $unkpos = $lastpos
a7602084
JH
941 if length($pref) && !defined($unkpos);
942 if (defined $unkpos)
943 {
944 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
945 $firstpos = $unkpos unless defined $firstpos;
946 undef $unkpos;
947 last FIELD if @fields == $max;
948 }
3270c621 949 }
a7602084
JH
950 push @fields, $class
951 ? bless (\$field, $class)
3270c621
JH
952 : $field;
953 $firstpos = $lastpos unless defined $firstpos;
a7602084 954 $lastpos = pos $$textref;
3270c621
JH
955 last FIELD if @fields == $max;
956 next FIELD;
957 }
958 }
a7602084 959 if ($$textref =~ /\G(.)/gcs)
3270c621 960 {
a7602084 961 $unkpos = pos($$textref)-1
3270c621
JH
962 unless $igunk || defined $unkpos;
963 }
964 }
965
966 if (defined $unkpos)
967 {
a7602084 968 push @fields, substr($$textref, $unkpos);
3270c621 969 $firstpos = $unkpos unless defined $firstpos;
a7602084 970 $lastpos = length $$textref;
3270c621
JH
971 }
972 last;
973 }
974
975 pos $$textref = $lastpos;
976 return @fields if wantarray;
977
978 $firstpos ||= 0;
979 eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
980 pos $$textref = $firstpos };
981 return $fields[0];
982}
983
984
985sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
986{
987 my $ldel = $_[0];
988 my $rdel = $_[1];
989 my $pre = defined $_[2] ? $_[2] : '\s*';
990 my %options = defined $_[3] ? %{$_[3]} : ();
991 my $omode = defined $options{fail} ? $options{fail} : '';
992 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
993 : defined($options{reject}) ? $options{reject}
994 : ''
995 ;
996 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
997 : defined($options{ignore}) ? $options{ignore}
998 : ''
999 ;
1000
1001 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
1002
1003 my $posbug = pos;
1004 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1005 pos = $posbug;
1006
1007 my $closure = sub
1008 {
1009 my $textref = defined $_[0] ? \$_[0] : \$_;
1010 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1011
1012 return _fail(wantarray, $textref) unless @match;
1013 return _succeed wantarray, $textref,
1014 $match[2], $match[3]+$match[5]+$match[7], # MATCH
1015 @match[8..9,0..1,2..7]; # REM, PRE, BITS
1016 };
1017
1018 bless $closure, 'Text::Balanced::Extractor';
1019}
1020
1021package Text::Balanced::Extractor;
1022
1023sub extract($$) # ($self, $text)
1024{
1025 &{$_[0]}($_[1]);
1026}
1027
1028package Text::Balanced::ErrorMsg;
1029
1030use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1031
10321;
55a1c97c
JH
1033
1034__END__
1035
1036=head1 NAME
1037
1038Text::Balanced - Extract delimited text sequences from strings.
1039
1040
1041=head1 SYNOPSIS
1042
1043 use Text::Balanced qw (
1044 extract_delimited
1045 extract_bracketed
1046 extract_quotelike
1047 extract_codeblock
1048 extract_variable
1049 extract_tagged
1050 extract_multiple
1051
1052 gen_delimited_pat
1053 gen_extract_tagged
1054 );
1055
1056 # Extract the initial substring of $text that is delimited by
1057 # two (unescaped) instances of the first character in $delim.
1058
1059 ($extracted, $remainder) = extract_delimited($text,$delim);
1060
1061
1062 # Extract the initial substring of $text that is bracketed
1063 # with a delimiter(s) specified by $delim (where the string
1064 # in $delim contains one or more of '(){}[]<>').
1065
1066 ($extracted, $remainder) = extract_bracketed($text,$delim);
1067
1068
1069 # Extract the initial substring of $text that is bounded by
48f821bf 1070 # an XML tag.
55a1c97c
JH
1071
1072 ($extracted, $remainder) = extract_tagged($text);
1073
1074
1075 # Extract the initial substring of $text that is bounded by
1076 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1077
1078 ($extracted, $remainder) =
1079 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1080
1081
1082 # Extract the initial substring of $text that represents a
1083 # Perl "quote or quote-like operation"
1084
1085 ($extracted, $remainder) = extract_quotelike($text);
1086
1087
1088 # Extract the initial substring of $text that represents a block
1089 # of Perl code, bracketed by any of character(s) specified by $delim
1090 # (where the string $delim contains one or more of '(){}[]<>').
1091
1092 ($extracted, $remainder) = extract_codeblock($text,$delim);
1093
1094
1095 # Extract the initial substrings of $text that would be extracted by
1096 # one or more sequential applications of the specified functions
1097 # or regular expressions
1098
1099 @extracted = extract_multiple($text,
1100 [ \&extract_bracketed,
1101 \&extract_quotelike,
1102 \&some_other_extractor_sub,
1103 qr/[xyz]*/,
1104 'literal',
1105 ]);
1106
1107# Create a string representing an optimized pattern (a la Friedl)
1108# that matches a substring delimited by any of the specified characters
1109# (in this case: any type of quote or a slash)
1110
1111 $patstring = gen_delimited_pat(q{'"`/});
1112
1113
1114# Generate a reference to an anonymous sub that is just like extract_tagged
1115# but pre-compiled and optimized for a specific pair of tags, and consequently
1116# much faster (i.e. 3 times faster). It uses qr// for better performance on
1117# repeated calls, so it only works under Perl 5.005 or later.
1118
1119 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1120
1121 ($extracted, $remainder) = $extract_head->($text);
1122
1123
1124=head1 DESCRIPTION
1125
48f821bf
JH
1126The various C<extract_...> subroutines may be used to
1127extract a delimited substring, possibly after skipping a
1128specified prefix string. By default, that prefix is
1129optional whitespace (C</\s*/>), but you can change it to whatever
1130you wish (see below).
1131
1132The substring to be extracted must appear at the
1133current C<pos> location of the string's variable
1134(or at index zero, if no C<pos> position is defined).
1135In other words, the C<extract_...> subroutines I<don't>
3c4b39be 1136extract the first occurrence of a substring anywhere
48f821bf 1137in a string (like an unanchored regex would). Rather,
3c4b39be 1138they extract an occurrence of the substring appearing
48f821bf
JH
1139immediately at the current matching position in the
1140string (like a C<\G>-anchored regex would).
1141
1142
55a1c97c
JH
1143
1144=head2 General behaviour in list contexts
1145
1146In a list context, all the subroutines return a list, the first three
1147elements of which are always:
1148
1149=over 4
1150
1151=item [0]
1152
1153The extracted string, including the specified delimiters.
49c03c89 1154If the extraction fails C<undef> is returned.
55a1c97c
JH
1155
1156=item [1]
1157
1158The remainder of the input string (i.e. the characters after the
1159extracted string). On failure, the entire string is returned.
1160
1161=item [2]
1162
1163The skipped prefix (i.e. the characters before the extracted string).
49c03c89 1164On failure, C<undef> is returned.
55a1c97c
JH
1165
1166=back
1167
1168Note that in a list context, the contents of the original input text (the first
1169argument) are not modified in any way.
1170
1171However, if the input text was passed in a variable, that variable's
1172C<pos> value is updated to point at the first character after the
1173extracted text. That means that in a list context the various
1174subroutines can be used much like regular expressions. For example:
1175
1176 while ( $next = (extract_quotelike($text))[0] )
1177 {
1178 # process next quote-like (in $next)
1179 }
1180
1181
1182=head2 General behaviour in scalar and void contexts
1183
1184In a scalar context, the extracted string is returned, having first been
1185removed from the input text. Thus, the following code also processes
1186each quote-like operation, but actually removes them from $text:
1187
1188 while ( $next = extract_quotelike($text) )
1189 {
1190 # process next quote-like (in $next)
1191 }
1192
1193Note that if the input text is a read-only string (i.e. a literal),
1194no attempt is made to remove the extracted text.
1195
1196In a void context the behaviour of the extraction subroutines is
1197exactly the same as in a scalar context, except (of course) that the
1198extracted substring is not returned.
1199
1200=head2 A note about prefixes
1201
1202Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1203This can bite you if you're expecting a prefix specification like
1204'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1205pattern will only succeed if the <H1> tag is on the current line, since
1206. normally doesn't match newlines.
1207
1208To overcome this limitation, you need to turn on /s matching within
1209the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1210
1211
1212=head2 C<extract_delimited>
1213
1214The C<extract_delimited> function formalizes the common idiom
1215of extracting a single-character-delimited substring from the start of
1216a string. For example, to extract a single-quote delimited string, the
1217following code is typically used:
1218
1219 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1220 $extracted = $1;
1221
1222but with C<extract_delimited> it can be simplified to:
1223
1224 ($extracted,$remainder) = extract_delimited($text, "'");
1225
1226C<extract_delimited> takes up to four scalars (the input text, the
1227delimiters, a prefix pattern to be skipped, and any escape characters)
1228and extracts the initial substring of the text that
1229is appropriately delimited. If the delimiter string has multiple
1230characters, the first one encountered in the text is taken to delimit
1231the substring.
1232The third argument specifies a prefix pattern that is to be skipped
1233(but must be present!) before the substring is extracted.
1234The final argument specifies the escape character to be used for each
1235delimiter.
1236
1237All arguments are optional. If the escape characters are not specified,
1238every delimiter is escaped with a backslash (C<\>).
1239If the prefix is not specified, the
1240pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1241is also not specified, the set C</["'`]/> is used. If the text to be processed
1242is not specified either, C<$_> is used.
1243
48f821bf 1244In list context, C<extract_delimited> returns a array of three
55a1c97c
JH
1245elements, the extracted substring (I<including the surrounding
1246delimiters>), the remainder of the text, and the skipped prefix (if
1247any). If a suitable delimited substring is not found, the first
1248element of the array is the empty string, the second is the complete
1249original text, and the prefix returned in the third element is an
1250empty string.
1251
1252In a scalar context, just the extracted substring is returned. In
1253a void context, the extracted substring (and any prefix) are simply
1254removed from the beginning of the first argument.
1255
1256Examples:
1257
1258 # Remove a single-quoted substring from the very beginning of $text:
1259
1260 $substring = extract_delimited($text, "'", '');
1261
1262 # Remove a single-quoted Pascalish substring (i.e. one in which
1263 # doubling the quote character escapes it) from the very
1264 # beginning of $text:
1265
1266 $substring = extract_delimited($text, "'", '', "'");
1267
1268 # Extract a single- or double- quoted substring from the
1269 # beginning of $text, optionally after some whitespace
1270 # (note the list context to protect $text from modification):
1271
1272 ($substring) = extract_delimited $text, q{"'};
1273
1274
1275 # Delete the substring delimited by the first '/' in $text:
1276
1277 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1278
1279Note that this last example is I<not> the same as deleting the first
1280quote-like pattern. For instance, if C<$text> contained the string:
1281
1282 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1283
1284then after the deletion it would contain:
1285
1286 "if ('.$UNIXCMD/s) { $cmd = $1; }"
1287
1288not:
1289
1290 "if ('./cmd' =~ ms) { $cmd = $1; }"
1291
1292
1293See L<"extract_quotelike"> for a (partial) solution to this problem.
1294
1295
1296=head2 C<extract_bracketed>
1297
1298Like C<"extract_delimited">, the C<extract_bracketed> function takes
1299up to three optional scalar arguments: a string to extract from, a delimiter
1300specifier, and a prefix pattern. As before, a missing prefix defaults to
1301optional whitespace and a missing text defaults to C<$_>. However, a missing
1302delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1303
1304C<extract_bracketed> extracts a balanced-bracket-delimited
1305substring (using any one (or more) of the user-specified delimiter
1306brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1307respect quoted unbalanced brackets (see below).
1308
1309A "delimiter bracket" is a bracket in list of delimiters passed as
1310C<extract_bracketed>'s second argument. Delimiter brackets are
1311specified by giving either the left or right (or both!) versions
1312of the required bracket(s). Note that the order in which
1313two or more delimiter brackets are specified is not significant.
1314
1315A "balanced-bracket-delimited substring" is a substring bounded by
1316matched brackets, such that any other (left or right) delimiter
1317bracket I<within> the substring is also matched by an opposite
1318(right or left) delimiter bracket I<at the same level of nesting>. Any
1319type of bracket not in the delimiter list is treated as an ordinary
1320character.
1321
1322In other words, each type of bracket specified as a delimiter must be
1323balanced and correctly nested within the substring, and any other kind of
1324("non-delimiter") bracket in the substring is ignored.
1325
1326For example, given the string:
1327
1328 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1329
1330then a call to C<extract_bracketed> in a list context:
1331
1332 @result = extract_bracketed( $text, '{}' );
1333
1334would return:
1335
1336 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1337
1338since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1339(In a scalar context just the first element of the array would be returned. In
1340a void context, C<$text> would be replaced by an empty string.)
1341
1342Likewise the call in:
1343
1344 @result = extract_bracketed( $text, '{[' );
1345
1346would return the same result, since all sets of both types of specified
1347delimiter brackets are correctly nested and balanced.
1348
1349However, the call in:
1350
1351 @result = extract_bracketed( $text, '{([<' );
1352
1353would fail, returning:
1354
1355 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
1356
1357because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1358the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1359return an empty string. In a void context, C<$text> would be unchanged.)
1360
1361Note that the embedded single-quotes in the string don't help in this
1362case, since they have not been specified as acceptable delimiters and are
1363therefore treated as non-delimiter characters (and ignored).
1364
1365However, if a particular species of quote character is included in the
1366delimiter specification, then that type of quote will be correctly handled.
1367for example, if C<$text> is:
1368
1369 $text = '<A HREF=">>>>">link</A>';
1370
1371then
1372
1373 @result = extract_bracketed( $text, '<">' );
1374
1375returns:
1376
1377 ( '<A HREF=">>>>">', 'link</A>', "" )
1378
1379as expected. Without the specification of C<"> as an embedded quoter:
1380
1381 @result = extract_bracketed( $text, '<>' );
1382
1383the result would be:
1384
1385 ( '<A HREF=">', '>>>">link</A>', "" )
1386
1387In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1388quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1389letter 'q' as a delimiter. Hence:
1390
1391 @result = extract_bracketed( $text, '<q>' );
1392
1393would correctly match something like this:
1394
1395 $text = '<leftop: conj /and/ conj>';
1396
1397See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1398
1399
48f821bf
JH
1400=head2 C<extract_variable>
1401
1402C<extract_variable> extracts any valid Perl variable or
1403variable-involved expression, including scalars, arrays, hashes, array
3c4b39be 1404accesses, hash look-ups, method calls through objects, subroutine calls
48f821bf
JH
1405through subroutine references, etc.
1406
1407The subroutine takes up to two optional arguments:
1408
1409=over 4
1410
1411=item 1.
1412
1413A string to be processed (C<$_> if the string is omitted or C<undef>)
1414
1415=item 2.
1416
1417A string specifying a pattern to be matched as a prefix (which is to be
1418skipped). If omitted, optional whitespace is skipped.
1419
1420=back
1421
1422On success in a list context, an array of 3 elements is returned. The
1423elements are:
1424
1425=over 4
1426
1427=item [0]
1428
1429the extracted variable, or variablish expression
1430
1431=item [1]
1432
1433the remainder of the input text,
1434
1435=item [2]
1436
1437the prefix substring (if any),
1438
1439=back
1440
1441On failure, all of these values (except the remaining text) are C<undef>.
1442
1443In a scalar context, C<extract_variable> returns just the complete
1444substring that matched a variablish expression. C<undef> is returned on
1445failure. In addition, the original input text has the returned substring
1446(and any prefix) removed from it.
1447
1448In a void context, the input text just has the matched substring (and
1449any specified prefix) removed.
1450
1451
55a1c97c
JH
1452=head2 C<extract_tagged>
1453
1454C<extract_tagged> extracts and segments text between (balanced)
1455specified tags.
1456
1457The subroutine takes up to five optional arguments:
1458
1459=over 4
1460
1461=item 1.
1462
1463A string to be processed (C<$_> if the string is omitted or C<undef>)
1464
1465=item 2.
1466
1467A string specifying a pattern to be matched as the opening tag.
1468If the pattern string is omitted (or C<undef>) then a pattern
48f821bf 1469that matches any standard XML tag is used.
55a1c97c
JH
1470
1471=item 3.
1472
1473A string specifying a pattern to be matched at the closing tag.
1474If the pattern string is omitted (or C<undef>) then the closing
1475tag is constructed by inserting a C</> after any leading bracket
1476characters in the actual opening tag that was matched (I<not> the pattern
1477that matched the tag). For example, if the opening tag pattern
1478is specified as C<'{{\w+}}'> and actually matched the opening tag
1479C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1480
1481=item 4.
1482
1483A string specifying a pattern to be matched as a prefix (which is to be
1484skipped). If omitted, optional whitespace is skipped.
1485
1486=item 5.
1487
1488A hash reference containing various parsing options (see below)
1489
1490=back
1491
1492The various options that can be specified are:
1493
1494=over 4
1495
1496=item C<reject =E<gt> $listref>
1497
1498The list reference contains one or more strings specifying patterns
1499that must I<not> appear within the tagged text.
1500
1501For example, to extract
1502an HTML link (which should not contain nested links) use:
1503
48f821bf 1504 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
55a1c97c
JH
1505
1506=item C<ignore =E<gt> $listref>
1507
1508The list reference contains one or more strings specifying patterns
1509that are I<not> be be treated as nested tags within the tagged text
1510(even if they would match the start tag pattern).
1511
1512For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1513
48f821bf 1514 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
55a1c97c
JH
1515
1516(also see L<"gen_delimited_pat"> below).
1517
1518
1519=item C<fail =E<gt> $str>
1520
1521The C<fail> option indicates the action to be taken if a matching end
1522tag is not encountered (i.e. before the end of the string or some
1523C<reject> pattern matches). By default, a failure to match a closing
1524tag causes C<extract_tagged> to immediately fail.
1525
1526However, if the string value associated with <reject> is "MAX", then
1527C<extract_tagged> returns the complete text up to the point of failure.
1528If the string is "PARA", C<extract_tagged> returns only the first paragraph
1529after the tag (up to the first line that is either empty or contains
1530only whitespace characters).
48f821bf 1531If the string is "", the the default behaviour (i.e. failure) is reinstated.
55a1c97c
JH
1532
1533For example, suppose the start tag "/para" introduces a paragraph, which then
1534continues until the next "/endpara" tag or until another "/para" tag is
1535encountered:
1536
48f821bf 1537 $text = "/para line 1\n\nline 3\n/para line 4";
55a1c97c 1538
48f821bf
JH
1539 extract_tagged($text, '/para', '/endpara', undef,
1540 {reject => '/para', fail => MAX );
55a1c97c 1541
48f821bf 1542 # EXTRACTED: "/para line 1\n\nline 3\n"
55a1c97c
JH
1543
1544Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1545tag refers only to the immediately following paragraph:
1546
48f821bf 1547 $text = "/para line 1\n\nline 3\n/para line 4";
55a1c97c 1548
48f821bf
JH
1549 extract_tagged($text, '/para', '/endpara', undef,
1550 {reject => '/para', fail => MAX );
55a1c97c 1551
48f821bf 1552 # EXTRACTED: "/para line 1\n"
55a1c97c
JH
1553
1554Note that the specified C<fail> behaviour applies to nested tags as well.
1555
1556=back
1557
1558On success in a list context, an array of 6 elements is returned. The elements are:
1559
1560=over 4
1561
1562=item [0]
1563
1564the extracted tagged substring (including the outermost tags),
1565
1566=item [1]
1567
1568the remainder of the input text,
1569
1570=item [2]
1571
1572the prefix substring (if any),
1573
1574=item [3]
1575
1576the opening tag
1577
1578=item [4]
1579
1580the text between the opening and closing tags
1581
1582=item [5]
1583
1584the closing tag (or "" if no closing tag was found)
1585
1586=back
1587
1588On failure, all of these values (except the remaining text) are C<undef>.
1589
1590In a scalar context, C<extract_tagged> returns just the complete
1591substring that matched a tagged text (including the start and end
1592tags). C<undef> is returned on failure. In addition, the original input
1593text has the returned substring (and any prefix) removed from it.
1594
1595In a void context, the input text just has the matched substring (and
1596any specified prefix) removed.
1597
1598
1599=head2 C<gen_extract_tagged>
1600
1601(Note: This subroutine is only available under Perl5.005)
1602
1603C<gen_extract_tagged> generates a new anonymous subroutine which
1604extracts text between (balanced) specified tags. In other words,
1605it generates a function identical in function to C<extract_tagged>.
1606
1607The difference between C<extract_tagged> and the anonymous
1608subroutines generated by
1609C<gen_extract_tagged>, is that those generated subroutines:
1610
1611=over 4
1612
1613=item *
1614
1615do not have to reparse tag specification or parsing options every time
1616they are called (whereas C<extract_tagged> has to effectively rebuild
1617its tag parser on every call);
1618
1619=item *
1620
1621make use of the new qr// construct to pre-compile the regexes they use
1622(whereas C<extract_tagged> uses standard string variable interpolation
1623to create tag-matching patterns).
1624
1625=back
1626
1627The subroutine takes up to four optional arguments (the same set as
1628C<extract_tagged> except for the string to be processed). It returns
1629a reference to a subroutine which in turn takes a single argument (the text to
1630be extracted from).
1631
1632In other words, the implementation of C<extract_tagged> is exactly
1633equivalent to:
1634
48f821bf
JH
1635 sub extract_tagged
1636 {
1637 my $text = shift;
1638 $extractor = gen_extract_tagged(@_);
1639 return $extractor->($text);
1640 }
55a1c97c
JH
1641
1642(although C<extract_tagged> is not currently implemented that way, in order
1643to preserve pre-5.005 compatibility).
1644
1645Using C<gen_extract_tagged> to create extraction functions for specific tags
1646is a good idea if those functions are going to be called more than once, since
1647their performance is typically twice as good as the more general-purpose
1648C<extract_tagged>.
1649
1650
1651=head2 C<extract_quotelike>
1652
1653C<extract_quotelike> attempts to recognize, extract, and segment any
1654one of the various Perl quotes and quotelike operators (see
1655L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1656delimiters (for the quotelike operators), and trailing modifiers are
1657all caught. For example, in:
1658
48f821bf
JH
1659 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1660
1661 extract_quotelike ' "You said, \"Use sed\"." '
55a1c97c 1662
48f821bf 1663 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
55a1c97c 1664
48f821bf 1665 extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
55a1c97c
JH
1666
1667the full Perl quotelike operations are all extracted correctly.
1668
1669Note too that, when using the /x modifier on a regex, any comment
1670containing the current pattern delimiter will cause the regex to be
1671immediately terminated. In other words:
1672
48f821bf
JH
1673 'm /
1674 (?i) # CASE INSENSITIVE
1675 [a-z_] # LEADING ALPHABETIC/UNDERSCORE
1676 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1677 /x'
55a1c97c
JH
1678
1679will be extracted as if it were:
1680
48f821bf
JH
1681 'm /
1682 (?i) # CASE INSENSITIVE
1683 [a-z_] # LEADING ALPHABETIC/'
55a1c97c
JH
1684
1685This behaviour is identical to that of the actual compiler.
1686
1687C<extract_quotelike> takes two arguments: the text to be processed and
1688a prefix to be matched at the very beginning of the text. If no prefix
1689is specified, optional whitespace is the default. If no text is given,
1690C<$_> is used.
1691
1692In a list context, an array of 11 elements is returned. The elements are:
1693
1694=over 4
1695
1696=item [0]
1697
1698the extracted quotelike substring (including trailing modifiers),
1699
1700=item [1]
1701
1702the remainder of the input text,
1703
1704=item [2]
1705
1706the prefix substring (if any),
1707
1708=item [3]
1709
1710the name of the quotelike operator (if any),
1711
1712=item [4]
1713
1714the left delimiter of the first block of the operation,
1715
1716=item [5]
1717
1718the text of the first block of the operation
1719(that is, the contents of
1720a quote, the regex of a match or substitution or the target list of a
1721translation),
1722
1723=item [6]
1724
1725the right delimiter of the first block of the operation,
1726
1727=item [7]
1728
1729the left delimiter of the second block of the operation
48f821bf 1730(that is, if it is a C<s>, C<tr>, or C<y>),
55a1c97c
JH
1731
1732=item [8]
1733
1734the text of the second block of the operation
1735(that is, the replacement of a substitution or the translation list
1736of a translation),
1737
1738=item [9]
1739
1740the right delimiter of the second block of the operation (if any),
1741
1742=item [10]
1743
1744the trailing modifiers on the operation (if any).
1745
1746=back
1747
1748For each of the fields marked "(if any)" the default value on success is
1749an empty string.
1750On failure, all of these values (except the remaining text) are C<undef>.
1751
1752
1753In a scalar context, C<extract_quotelike> returns just the complete substring
1754that matched a quotelike operation (or C<undef> on failure). In a scalar or
1755void context, the input text has the same substring (and any specified
1756prefix) removed.
1757
1758Examples:
1759
48f821bf 1760 # Remove the first quotelike literal that appears in text
55a1c97c 1761
48f821bf 1762 $quotelike = extract_quotelike($text,'.*?');
55a1c97c 1763
48f821bf
JH
1764 # Replace one or more leading whitespace-separated quotelike
1765 # literals in $_ with "<QLL>"
55a1c97c 1766
48f821bf 1767 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
55a1c97c
JH
1768
1769
48f821bf 1770 # Isolate the search pattern in a quotelike operation from $text
55a1c97c 1771
48f821bf
JH
1772 ($op,$pat) = (extract_quotelike $text)[3,5];
1773 if ($op =~ /[ms]/)
1774 {
1775 print "search pattern: $pat\n";
1776 }
1777 else
1778 {
1779 print "$op is not a pattern matching operation\n";
1780 }
55a1c97c
JH
1781
1782
1783=head2 C<extract_quotelike> and "here documents"
1784
1785C<extract_quotelike> can successfully extract "here documents" from an input
1786string, but with an important caveat in list contexts.
1787
1788Unlike other types of quote-like literals, a here document is rarely
1789a contiguous substring. For example, a typical piece of code using
1790here document might look like this:
1791
1792 <<'EOMSG' || die;
1793 This is the message.
1794 EOMSG
48f821bf 1795 exit;
55a1c97c
JH
1796
1797Given this as an input string in a scalar context, C<extract_quotelike>
1798would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1799leaving the string " || die;\nexit;" in the original variable. In other words,
1800the two separate pieces of the here document are successfully extracted and
1801concatenated.
1802
1803In a list context, C<extract_quotelike> would return the list
1804
1805=over 4
1806
1807=item [0]
1808
1809"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1810including fore and aft delimiters),
1811
1812=item [1]
1813
1814" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1815
1816=item [2]
1817
1818"" (i.e. the prefix substring -- trivial in this case),
1819
1820=item [3]
1821
1822"<<" (i.e. the "name" of the quotelike operator)
1823
1824=item [4]
1825
1826"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1827
1828=item [5]
1829
1830"This is the message.\n" (i.e. the text of the here document),
1831
1832=item [6]
1833
1834"EOMSG" (i.e. the right delimiter of the here document),
1835
1836=item [7..10]
1837
1838"" (a here document has no second left delimiter, second text, second right
1839delimiter, or trailing modifiers).
1840
1841=back
1842
1843However, the matching position of the input variable would be set to
1844"exit;" (i.e. I<after> the closing delimiter of the here document),
1845which would cause the earlier " || die;\nexit;" to be skipped in any
1846sequence of code fragment extractions.
1847
48f821bf 1848To avoid this problem, when it encounters a here document whilst
55a1c97c
JH
1849extracting from a modifiable string, C<extract_quotelike> silently
1850rearranges the string to an equivalent piece of Perl:
1851
1852 <<'EOMSG'
1853 This is the message.
1854 EOMSG
1855 || die;
48f821bf 1856 exit;
55a1c97c
JH
1857
1858in which the here document I<is> contiguous. It still leaves the
1859matching position after the here document, but now the rest of the line
1860on which the here document starts is not skipped.
1861
1862To prevent <extract_quotelike> from mucking about with the input in this way
1863(this is the only case where a list-context C<extract_quotelike> does so),
1864you can pass the input variable as an interpolated literal:
1865
1866 $quotelike = extract_quotelike("$var");
1867
1868
1869=head2 C<extract_codeblock>
1870
1871C<extract_codeblock> attempts to recognize and extract a balanced
1872bracket delimited substring that may contain unbalanced brackets
1873inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1874is like a combination of C<"extract_bracketed"> and
1875C<"extract_quotelike">.
1876
1877C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1878a text to process, a set of delimiter brackets to look for, and a prefix to
1879match first. It also takes an optional fourth parameter, which allows the
1880outermost delimiter brackets to be specified separately (see below).
1881
1882Omitting the first argument (input text) means process C<$_> instead.
1883Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1884Omitting the third argument (prefix argument) implies optional whitespace at the start.
1885Omitting the fourth argument (outermost delimiter brackets) indicates that the
1886value of the second argument is to be used for the outermost delimiters.
1887
48f821bf 1888Once the prefix an dthe outermost opening delimiter bracket have been
55a1c97c
JH
1889recognized, code blocks are extracted by stepping through the input text and
1890trying the following alternatives in sequence:
1891
1892=over 4
1893
1894=item 1.
1895
1896Try and match a closing delimiter bracket. If the bracket was the same
1897species as the last opening bracket, return the substring to that
1898point. If the bracket was mismatched, return an error.
1899
1900=item 2.
1901
1902Try to match a quote or quotelike operator. If found, call
1903C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1904the error it returned. Otherwise go back to step 1.
1905
1906=item 3.
1907
1908Try to match an opening delimiter bracket. If found, call
1909C<extract_codeblock> recursively to eat the embedded block. If the
1910recursive call fails, return an error. Otherwise, go back to step 1.
1911
1912=item 4.
1913
1914Unconditionally match a bareword or any other single character, and
1915then go back to step 1.
1916
1917=back
1918
1919
1920Examples:
1921
1922 # Find a while loop in the text
1923
1924 if ($text =~ s/.*?while\s*\{/{/)
1925 {
1926 $loop = "while " . extract_codeblock($text);
1927 }
1928
1929 # Remove the first round-bracketed list (which may include
1930 # round- or curly-bracketed code blocks or quotelike operators)
1931
1932 extract_codeblock $text, "(){}", '[^(]*';
1933
1934
1935The ability to specify a different outermost delimiter bracket is useful
1936in some circumstances. For example, in the Parse::RecDescent module,
1937parser actions which are to be performed only on a successful parse
1938are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1939
1940 sentence: subject verb object
1941 <defer: {$::theVerb = $item{verb}} >
1942
1943Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1944within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1945
1946A deferred action like this:
1947
1948 <defer: {if ($count>10) {$count--}} >
1949
1950will be incorrectly parsed as:
1951
1952 <defer: {if ($count>
1953
1954because the "less than" operator is interpreted as a closing delimiter.
1955
1956But, by extracting the directive using
1957S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1958the '>' character is only treated as a delimited at the outermost
1959level of the code block, so the directive is parsed correctly.
1960
1961=head2 C<extract_multiple>
1962
1963The C<extract_multiple> subroutine takes a string to be processed and a
1964list of extractors (subroutines or regular expressions) to apply to that string.
1965
1966In an array context C<extract_multiple> returns an array of substrings
1967of the original string, as extracted by the specified extractors.
1968In a scalar context, C<extract_multiple> returns the first
1969substring successfully extracted from the original string. In both
1970scalar and void contexts the original string has the first successfully
1971extracted substring removed from it. In all contexts
1972C<extract_multiple> starts at the current C<pos> of the string, and
1973sets that C<pos> appropriately after it matches.
1974
48f821bf 1975Hence, the aim of of a call to C<extract_multiple> in a list context
55a1c97c
JH
1976is to split the processed string into as many non-overlapping fields as
1977possible, by repeatedly applying each of the specified extractors
1978to the remainder of the string. Thus C<extract_multiple> is
1979a generalized form of Perl's C<split> subroutine.
1980
1981The subroutine takes up to four optional arguments:
1982
1983=over 4
1984
1985=item 1.
1986
1987A string to be processed (C<$_> if the string is omitted or C<undef>)
1988
1989=item 2.
1990
1991A reference to a list of subroutine references and/or qr// objects and/or
1992literal strings and/or hash references, specifying the extractors
1993to be used to split the string. If this argument is omitted (or
1994C<undef>) the list:
1995
1996 [
1997 sub { extract_variable($_[0], '') },
1998 sub { extract_quotelike($_[0],'') },
1999 sub { extract_codeblock($_[0],'{}','') },
2000 ]
2001
2002is used.
2003
2004
2005=item 3.
2006
48f821bf 2007An number specifying the maximum number of fields to return. If this
55a1c97c
JH
2008argument is omitted (or C<undef>), split continues as long as possible.
2009
2010If the third argument is I<N>, then extraction continues until I<N> fields
2011have been successfully extracted, or until the string has been completely
2012processed.
2013
2014Note that in scalar and void contexts the value of this argument is
2015automatically reset to 1 (under C<-w>, a warning is issued if the argument
2016has to be reset).
2017
2018=item 4.
2019
2020A value indicating whether unmatched substrings (see below) within the
2021text should be skipped or returned as fields. If the value is true,
2022such substrings are skipped. Otherwise, they are returned.
2023
2024=back
2025
2026The extraction process works by applying each extractor in
a7602084
JH
2027sequence to the text string.
2028
2029If the extractor is a subroutine it is called in a list context and is
2030expected to return a list of a single element, namely the extracted
2031text. It may optionally also return two further arguments: a string
2032representing the text left after extraction (like $' for a pattern
2033match), and a string representing any prefix skipped before the
2034extraction (like $` in a pattern match). Note that this is designed
2035to facilitate the use of other Text::Balanced subroutines with
2036C<extract_multiple>. Note too that the value returned by an extractor
2037subroutine need not bear any relationship to the corresponding substring
2038of the original text (see examples below).
55a1c97c
JH
2039
2040If the extractor is a precompiled regular expression or a string,
2041it is matched against the text in a scalar context with a leading
2042'\G' and the gc modifiers enabled. The extracted value is either
2043$1 if that variable is defined after the match, or else the
2044complete match (i.e. $&).
2045
2046If the extractor is a hash reference, it must contain exactly one element.
2047The value of that element is one of the
2048above extractor types (subroutine reference, regular expression, or string).
2049The key of that element is the name of a class into which the successful
2050return value of the extractor will be blessed.
2051
2052If an extractor returns a defined value, that value is immediately
2053treated as the next extracted field and pushed onto the list of fields.
2054If the extractor was specified in a hash reference, the field is also
2055blessed into the appropriate class,
2056
2057If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2058assumed to have failed to extract.
2059If none of the extractor subroutines succeeds, then one
2060character is extracted from the start of the text and the extraction
2061subroutines reapplied. Characters which are thus removed are accumulated and
2062eventually become the next field (unless the fourth argument is true, in which
3c4b39be 2063case they are discarded).
55a1c97c
JH
2064
2065For example, the following extracts substrings that are valid Perl variables:
2066
2067 @fields = extract_multiple($text,
2068 [ sub { extract_variable($_[0]) } ],
2069 undef, 1);
2070
2071This example separates a text into fields which are quote delimited,
2072curly bracketed, and anything else. The delimited and bracketed
2073parts are also blessed to identify them (the "anything else" is unblessed):
2074
2075 @fields = extract_multiple($text,
2076 [
2077 { Delim => sub { extract_delimited($_[0],q{'"}) } },
2078 { Brack => sub { extract_bracketed($_[0],'{}') } },
2079 ]);
2080
2081This call extracts the next single substring that is a valid Perl quotelike
2082operator (and removes it from $text):
2083
2084 $quotelike = extract_multiple($text,
2085 [
2086 sub { extract_quotelike($_[0]) },
2087 ], undef, 1);
2088
2089Finally, here is yet another way to do comma-separated value parsing:
2090
2091 @fields = extract_multiple($csv_text,
2092 [
2093 sub { extract_delimited($_[0],q{'"}) },
2094 qr/([^,]+)(.*)/,
2095 ],
2096 undef,1);
2097
2098The list in the second argument means:
2099I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2100The undef third argument means:
2101I<"...as many times as possible...">,
2102and the true value in the fourth argument means
2103I<"...discarding anything else that appears (i.e. the commas)">.
2104
2105If you wanted the commas preserved as separate fields (i.e. like split
2106does if your split pattern has capturing parentheses), you would
2107just make the last parameter undefined (or remove it).
2108
2109
2110=head2 C<gen_delimited_pat>
2111
2112The C<gen_delimited_pat> subroutine takes a single (string) argument and
2113 > builds a Friedl-style optimized regex that matches a string delimited
2114by any one of the characters in the single argument. For example:
2115
2116 gen_delimited_pat(q{'"})
2117
2118returns the regex:
2119
2120 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2121
2122Note that the specified delimiters are automatically quotemeta'd.
2123
2124A typical use of C<gen_delimited_pat> would be to build special purpose tags
2125for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2126(which might contain quoted strings):
2127
2128 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2129
2130 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2131
2132
2133C<gen_delimited_pat> may also be called with an optional second argument,
2134which specifies the "escape" character(s) to be used for each delimiter.
2135For example to match a Pascal-style string (where ' is the delimiter
2136and '' is a literal ' within the string):
2137
2138 gen_delimited_pat(q{'},q{'});
2139
2140Different escape characters can be specified for different delimiters.
2141For example, to specify that '/' is the escape for single quotes
2142and '%' is the escape for double quotes:
2143
2144 gen_delimited_pat(q{'"},q{/%});
2145
2146If more delimiters than escape chars are specified, the last escape char
2147is used for the remaining delimiters.
2148If no escape char is specified for a given specified delimiter, '\' is used.
2149
49c03c89
SP
2150=head2 C<delimited_pat>
2151
2152Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2153That name may still be used, but is now deprecated.
55a1c97c
JH
2154
2155
2156=head1 DIAGNOSTICS
2157
2158In a list context, all the functions return C<(undef,$original_text)>
2159on failure. In a scalar context, failure is indicated by returning C<undef>
2160(in this case the input text is not modified in any way).
2161
2162In addition, on failure in I<any> context, the C<$@> variable is set.
2163Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2164below.
2165Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2166which the error was detected (although not necessarily where it occurred!)
2167Printing C<$@> directly produces the error message, with the offset appended.
2168On success, the C<$@> variable is guaranteed to be C<undef>.
2169
2170The available diagnostics are:
2171
2172=over 4
2173
2174=item C<Did not find a suitable bracket: "%s">
2175
2176The delimiter provided to C<extract_bracketed> was not one of
2177C<'()[]E<lt>E<gt>{}'>.
2178
2179=item C<Did not find prefix: /%s/>
2180
2181A non-optional prefix was specified but wasn't found at the start of the text.
2182
2183=item C<Did not find opening bracket after prefix: "%s">
2184
2185C<extract_bracketed> or C<extract_codeblock> was expecting a
2186particular kind of bracket at the start of the text, and didn't find it.
2187
2188=item C<No quotelike operator found after prefix: "%s">
2189
2190C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2191C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2192it was extracting.
2193
2194=item C<Unmatched closing bracket: "%c">
2195
2196C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2197a closing bracket where none was expected.
2198
2199=item C<Unmatched opening bracket(s): "%s">
2200
2201C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2202out of characters in the text before closing one or more levels of nested
2203brackets.
2204
2205=item C<Unmatched embedded quote (%s)>
2206
2207C<extract_bracketed> attempted to match an embedded quoted substring, but
2208failed to find a closing quote to match it.
2209
2210=item C<Did not find closing delimiter to match '%s'>
2211
2212C<extract_quotelike> was unable to find a closing delimiter to match the
2213one that opened the quote-like operation.
2214
2215=item C<Mismatched closing bracket: expected "%c" but found "%s">
2216
2217C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2218a valid bracket delimiter, but it was the wrong species. This usually
2219indicates a nesting error, but may indicate incorrect quoting or escaping.
2220
2221=item C<No block delimiter found after quotelike "%s">
2222
2223C<extract_quotelike> or C<extract_codeblock> found one of the
2224quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2225without a suitable block after it.
2226
2227=item C<Did not find leading dereferencer>
2228
2229C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2230a variable, but didn't find any of them.
2231
2232=item C<Bad identifier after dereferencer>
2233
2234C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2235character was not followed by a legal Perl identifier.
2236
2237=item C<Did not find expected opening bracket at %s>
2238
2239C<extract_codeblock> failed to find any of the outermost opening brackets
2240that were specified.
2241
2242=item C<Improperly nested codeblock at %s>
2243
2244A nested code block was found that started with a delimiter that was specified
2245as being only to be used as an outermost bracket.
2246
2247=item C<Missing second block for quotelike "%s">
2248
2249C<extract_codeblock> or C<extract_quotelike> found one of the
2250quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2251
2252=item C<No match found for opening bracket>
2253
2254C<extract_codeblock> failed to find a closing bracket to match the outermost
2255opening bracket.
2256
2257=item C<Did not find opening tag: /%s/>
2258
2259C<extract_tagged> did not find a suitable opening tag (after any specified
2260prefix was removed).
2261
2262=item C<Unable to construct closing tag to match: /%s/>
2263
2264C<extract_tagged> matched the specified opening tag and tried to
2265modify the matched text to produce a matching closing tag (because
2266none was specified). It failed to generate the closing tag, almost
2267certainly because the opening tag did not start with a
2268bracket of some kind.
2269
2270=item C<Found invalid nested tag: %s>
2271
2272C<extract_tagged> found a nested tag that appeared in the "reject" list
2273(and the failure mode was not "MAX" or "PARA").
2274
2275=item C<Found unbalanced nested tag: %s>
2276
2277C<extract_tagged> found a nested opening tag that was not matched by a
2278corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2279
2280=item C<Did not find closing tag>
2281
2282C<extract_tagged> reached the end of the text without finding a closing tag
2283to match the original opening tag (and the failure mode was not
2284"MAX" or "PARA").
2285
2286
2287
2288
2289=back
2290
2291
2292=head1 AUTHOR
2293
2294Damian Conway (damian@conway.org)
2295
2296
2297=head1 BUGS AND IRRITATIONS
2298
2299There are undoubtedly serious bugs lurking somewhere in this code, if
2300only because parts of it give the impression of understanding a great deal
2301more about Perl than they really do.
2302
2303Bug reports and other feedback are most welcome.
2304
2305
2306=head1 COPYRIGHT
2307
2308 Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2309 This module is free software. It may be used, redistributed
2310 and/or modified under the same terms as Perl itself.