1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2 # FOR FULL DOCUMENTATION SEE Balanced.pod
7 package Text::Balanced;
11 use vars qw { $VERSION @ISA %EXPORT_TAGS };
13 # I really don't want to bring the XS version module into maint. So for now,
14 # I'm commiting the sin of Bowdlerising Damian's module:
15 # use version; $VERSION = qv('2.0.0');
17 @ISA = qw ( Exporter );
19 %EXPORT_TAGS = ( ALL => [ qw(
34 Exporter::export_ok_tags('ALL');
38 sub _match_bracketed($$$$$$);
39 sub _match_variable($$);
40 sub _match_codeblock($$$$$$$);
41 sub _match_quotelike($$$$);
43 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
46 my ($message, $pos) = @_;
47 $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
52 my ($wantarray, $textref, $message, $pos) = @_;
53 _failmsg $message, $pos if $message;
54 return (undef,$$textref,undef) if $wantarray;
61 my ($wantarray,$textref) = splice @_, 0, 2;
62 my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
63 my ($startlen, $oppos) = @_[5,6];
64 my $remainderpos = $_[2];
68 while (my ($from, $len) = splice @_, 0, 2)
70 push @res, substr($$textref,$from,$len);
72 if ($extralen) { # CORRECT FILLET
73 my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
74 $res[1] = "$extra$res[1]";
75 eval { substr($$textref,$remainderpos,0) = $extra;
76 substr($$textref,$extrapos,$extralen,"\n")} ;
77 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
78 pos($$textref) = $remainderpos-$extralen+1; # RESET \G
81 pos($$textref) = $remainderpos; # RESET \G
87 my $match = substr($$textref,$_[0],$_[1]);
88 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
90 ? substr($$textref, $extrapos, $extralen)."\n" : "";
91 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
92 pos($$textref) = $_[4]; # RESET \G
97 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
99 sub gen_delimited_pat($;$) # ($delimiters;$escapes)
101 my ($dels, $escs) = @_;
102 return "" unless $dels =~ /\S/;
103 $escs = '\\' unless $escs;
104 $escs .= substr($escs,-1) x (length($dels)-length($escs));
107 for ($i=0; $i<length $dels; $i++)
109 my $del = quotemeta substr($dels,$i,1);
110 my $esc = quotemeta substr($escs,$i,1);
113 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
117 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
120 my $pat = join '|', @pat;
124 *delimited_pat = \&gen_delimited_pat;
127 # THE EXTRACTION FUNCTIONS
129 sub extract_delimited (;$$$$)
131 my $textref = defined $_[0] ? \$_[0] : \$_;
132 my $wantarray = wantarray;
133 my $del = defined $_[1] ? $_[1] : qq{\'\"\`};
134 my $pre = defined $_[2] ? $_[2] : '\s*';
135 my $esc = defined $_[3] ? $_[3] : qq{\\};
136 my $pat = gen_delimited_pat($del, $esc);
137 my $startpos = pos $$textref || 0;
138 return _fail($wantarray, $textref, "Not a delimited pattern", 0)
139 unless $$textref =~ m/\G($pre)($pat)/gc;
140 my $prelen = length($1);
141 my $matchpos = $startpos+$prelen;
142 my $endpos = pos $$textref;
143 return _succeed $wantarray, $textref,
144 $matchpos, $endpos-$matchpos, # MATCH
145 $endpos, length($$textref)-$endpos, # REMAINDER
146 $startpos, $prelen; # PREFIX
149 sub extract_bracketed (;$$$)
151 my $textref = defined $_[0] ? \$_[0] : \$_;
152 my $ldel = defined $_[1] ? $_[1] : '{([<';
153 my $pre = defined $_[2] ? $_[2] : '\s*';
154 my $wantarray = wantarray;
157 $ldel =~ s/'//g and $qdel .= q{'};
158 $ldel =~ s/"//g and $qdel .= q{"};
159 $ldel =~ s/`//g and $qdel .= q{`};
160 $ldel =~ s/q//g and $quotelike = 1;
161 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
163 unless ($rdel =~ tr/[({</])}>/)
165 return _fail $wantarray, $textref,
166 "Did not find a suitable bracket in delimiter: \"$_[1]\"",
170 $ldel = join('|', map { quotemeta $_ } split('', $ldel));
171 $rdel = join('|', map { quotemeta $_ } split('', $rdel));
174 my $startpos = pos $$textref || 0;
175 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
177 return _fail ($wantarray, $textref) unless @match;
179 return _succeed ( $wantarray, $textref,
180 $match[2], $match[5]+2, # MATCH
181 @match[8,9], # REMAINDER
182 @match[0,1], # PREFIX
186 sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
188 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
189 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
190 unless ($$textref =~ m/\G$pre/gc)
192 _failmsg "Did not find prefix: /$pre/", $startpos;
196 $ldelpos = pos $$textref;
198 unless ($$textref =~ m/\G($ldel)/gc)
200 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
202 pos $$textref = $startpos;
206 my @nesting = ( $1 );
207 my $textlen = length $$textref;
208 while (pos $$textref < $textlen)
210 next if $$textref =~ m/\G\\./gcs;
212 if ($$textref =~ m/\G($ldel)/gc)
216 elsif ($$textref =~ m/\G($rdel)/gc)
218 my ($found, $brackettype) = ($1, $1);
221 _failmsg "Unmatched closing bracket: \"$found\"",
223 pos $$textref = $startpos;
226 my $expected = pop(@nesting);
227 $expected =~ tr/({[</)}]>/;
228 if ($expected ne $brackettype)
230 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
232 pos $$textref = $startpos;
235 last if $#nesting < 0;
237 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
239 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
240 _failmsg "Unmatched embedded quote ($1)",
242 pos $$textref = $startpos;
245 elsif ($quotelike && _match_quotelike($textref,"",1,0))
250 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
254 _failmsg "Unmatched opening bracket(s): "
255 . join("..",@nesting)."..",
257 pos $$textref = $startpos;
261 $endpos = pos $$textref;
264 $startpos, $ldelpos-$startpos, # PREFIX
265 $ldelpos, 1, # OPENING BRACKET
266 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
267 $endpos-1, 1, # CLOSING BRACKET
268 $endpos, length($$textref)-$endpos, # REMAINDER
274 my $brack = reverse $_[0];
275 $brack =~ tr/[({</])}>/;
279 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
281 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
283 my $textref = defined $_[0] ? \$_[0] : \$_;
286 my $pre = defined $_[3] ? $_[3] : '\s*';
287 my %options = defined $_[4] ? %{$_[4]} : ();
288 my $omode = defined $options{fail} ? $options{fail} : '';
289 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
290 : defined($options{reject}) ? $options{reject}
293 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
294 : defined($options{ignore}) ? $options{ignore}
298 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
301 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
303 return _fail(wantarray, $textref) unless @match;
304 return _succeed wantarray, $textref,
305 $match[2], $match[3]+$match[5]+$match[7], # MATCH
306 @match[8..9,0..1,2..7]; # REM, PRE, BITS
309 sub _match_tagged # ($$$$$$$)
311 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
314 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
316 unless ($$textref =~ m/\G($pre)/gc)
318 _failmsg "Did not find prefix: /$pre/", pos $$textref;
322 $opentagpos = pos($$textref);
324 unless ($$textref =~ m/\G$ldel/gc)
326 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
330 $textpos = pos($$textref);
334 $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
335 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
337 _failmsg "Unable to construct closing tag to match: $rdel",
344 $rdelspec = eval "qq{$rdel}" || do {
346 for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
347 { next if $rdel =~ /\Q$_/; $del = $_; last }
350 croak "Can't interpolate right delimiter $rdel"
352 eval "qq$del$rdel$del";
356 while (pos($$textref) < length($$textref))
358 next if $$textref =~ m/\G\\./gc;
360 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
362 $parapos = pos($$textref) - length($1)
363 unless defined $parapos;
365 elsif ($$textref =~ m/\G($rdelspec)/gc )
367 $closetagpos = pos($$textref)-length($1);
370 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
374 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
376 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
377 goto short if ($omode eq 'PARA' || $omode eq 'MAX');
378 _failmsg "Found invalid nested tag: $1", pos $$textref;
381 elsif ($$textref =~ m/\G($ldel)/gc)
384 pos($$textref) -= length($tag); # REWIND TO NESTED TAG
385 unless (_match_tagged(@_)) # MATCH NESTED TAG
387 goto short if $omode eq 'PARA' || $omode eq 'MAX';
388 _failmsg "Found unbalanced nested tag: $tag",
393 else { $$textref =~ m/./gcs }
397 $closetagpos = pos($$textref);
398 goto matched if $omode eq 'MAX';
399 goto failed unless $omode eq 'PARA';
401 if (defined $parapos) { pos($$textref) = $parapos }
402 else { $parapos = pos($$textref) }
405 $startpos, $opentagpos-$startpos, # PREFIX
406 $opentagpos, $textpos-$opentagpos, # OPENING TAG
407 $textpos, $parapos-$textpos, # TEXT
408 $parapos, 0, # NO CLOSING TAG
409 $parapos, length($$textref)-$parapos, # REMAINDER
413 $endpos = pos($$textref);
415 $startpos, $opentagpos-$startpos, # PREFIX
416 $opentagpos, $textpos-$opentagpos, # OPENING TAG
417 $textpos, $closetagpos-$textpos, # TEXT
418 $closetagpos, $endpos-$closetagpos, # CLOSING TAG
419 $endpos, length($$textref)-$endpos, # REMAINDER
423 _failmsg "Did not find closing tag", pos $$textref unless $@;
424 pos($$textref) = $startpos;
428 sub extract_variable (;$$)
430 my $textref = defined $_[0] ? \$_[0] : \$_;
431 return ("","","") unless defined $$textref;
432 my $pre = defined $_[1] ? $_[1] : '\s*';
434 my @match = _match_variable($textref,$pre);
436 return _fail wantarray, $textref unless @match;
438 return _succeed wantarray, $textref,
439 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
442 sub _match_variable($$)
447 my ($textref, $pre) = @_;
448 my $startpos = pos($$textref) = pos($$textref)||0;
449 unless ($$textref =~ m/\G($pre)/gc)
451 _failmsg "Did not find prefix: /$pre/", pos $$textref;
454 my $varpos = pos($$textref);
455 unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
457 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
459 _failmsg "Did not find leading dereferencer", pos $$textref;
460 pos $$textref = $startpos;
465 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
466 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
467 or $deref eq '$#' or $deref eq '$$' )
469 _failmsg "Bad identifier after dereferencer", pos $$textref;
470 pos $$textref = $startpos;
477 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
478 next if _match_codeblock($textref,
479 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
480 qr/[({[]/, qr/[)}\]]/,
481 qr/[({[]/, qr/[)}\]]/, 0);
482 next if _match_codeblock($textref,
483 qr/\s*/, qr/[{[]/, qr/[}\]]/,
484 qr/[{[]/, qr/[}\]]/, 0);
485 next if _match_variable($textref,'\s*->\s*');
486 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
490 my $endpos = pos($$textref);
491 return ($startpos, $varpos-$startpos,
492 $varpos, $endpos-$varpos,
493 $endpos, length($$textref)-$endpos
497 sub extract_codeblock (;$$$$$)
499 my $textref = defined $_[0] ? \$_[0] : \$_;
500 my $wantarray = wantarray;
501 my $ldel_inner = defined $_[1] ? $_[1] : '{';
502 my $pre = defined $_[2] ? $_[2] : '\s*';
503 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
505 my $rdel_inner = $ldel_inner;
506 my $rdel_outer = $ldel_outer;
508 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
509 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
510 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
512 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
516 my @match = _match_codeblock($textref, $pre,
517 $ldel_outer, $rdel_outer,
518 $ldel_inner, $rdel_inner,
520 return _fail($wantarray, $textref) unless @match;
521 return _succeed($wantarray, $textref,
522 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
527 sub _match_codeblock($$$$$$$)
529 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
530 my $startpos = pos($$textref) = pos($$textref) || 0;
531 unless ($$textref =~ m/\G($pre)/gc)
533 _failmsg qq{Did not match prefix /$pre/ at"} .
534 substr($$textref,pos($$textref),20) .
539 my $codepos = pos($$textref);
540 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
542 _failmsg qq{Did not find expected opening bracket at "} .
543 substr($$textref,pos($$textref),20) .
546 pos $$textref = $startpos;
550 $closing =~ tr/([<{/)]>}/;
553 while (pos($$textref) < length($$textref))
556 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
562 if ($$textref =~ m/\G\s*#.*/gc)
567 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
569 unless ($matched = ($closing && $1 eq $closing) )
571 next if $1 eq '>'; # MIGHT BE A "LESS THAN"
572 _failmsg q{Mismatched closing bracket at "} .
573 substr($$textref,pos($$textref),20) .
574 qq{...". Expected '$closing'},
580 if (_match_variable($textref,'\s*') ||
581 _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
588 # NEED TO COVER MANY MORE CASES HERE!!!
589 if ($$textref =~ m#\G\s*(?!$ldel_inner)
593 | (\*\*|&&|\|\||<<|>>)=?
594 | split|grep|map|return
602 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
608 if ($$textref =~ m/\G\s*$ldel_outer/gc)
610 _failmsg q{Improperly nested codeblock at "} .
611 substr($$textref,pos($$textref),20) .
618 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
620 continue { $@ = undef }
624 _failmsg 'No match found for opening bracket', pos $$textref
629 my $endpos = pos($$textref);
630 return ( $startpos, $codepos-$startpos,
631 $codepos, $endpos-$codepos,
632 $endpos, length($$textref)-$endpos,
638 'none' => '[cgimsox]*',
640 's' => '[cegimsox]*',
650 sub extract_quotelike (;$$)
652 my $textref = $_[0] ? \$_[0] : \$_;
653 my $wantarray = wantarray;
654 my $pre = defined $_[1] ? $_[1] : '\s*';
656 my @match = _match_quotelike($textref,$pre,1,0);
657 return _fail($wantarray, $textref) unless @match;
658 return _succeed($wantarray, $textref,
659 $match[2], $match[18]-$match[2], # MATCH
660 @match[18,19], # REMAINDER
661 @match[0,1], # PREFIX
662 @match[2..17], # THE BITS
663 @match[20,21], # ANY FILLET?
667 sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match)
669 my ($textref, $pre, $rawmatch, $qmark) = @_;
671 my ($textlen,$startpos,
673 $preld1pos,$ld1pos,$str1pos,$rd1pos,
674 $preld2pos,$ld2pos,$str2pos,$rd2pos,
675 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
677 unless ($$textref =~ m/\G($pre)/gc)
679 _failmsg qq{Did not find prefix /$pre/ at "} .
680 substr($$textref, pos($$textref), 20) .
685 $oppos = pos($$textref);
687 my $initial = substr($$textref,$oppos,1);
689 if ($initial && $initial =~ m|^[\"\'\`]|
690 || $rawmatch && $initial =~ m|^/|
691 || $qmark && $initial =~ m|^\?|)
693 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
695 _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
696 substr($$textref, $oppos, 20) .
699 pos $$textref = $startpos;
702 $modpos= pos($$textref);
705 if ($initial eq '/' || $initial eq '?')
707 $$textref =~ m/\G$mods{none}/gc
710 my $endpos = pos($$textref);
712 $startpos, $oppos-$startpos, # PREFIX
713 $oppos, 0, # NO OPERATOR
714 $oppos, 1, # LEFT DEL
715 $oppos+1, $rd1pos-$oppos-1, # STR/PAT
716 $rd1pos, 1, # RIGHT DEL
717 $modpos, 0, # NO 2ND LDEL
718 $modpos, 0, # NO 2ND STR
719 $modpos, 0, # NO 2ND RDEL
720 $modpos, $endpos-$modpos, # MODIFIERS
721 $endpos, $textlen-$endpos, # REMAINDER
725 unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
727 _failmsg q{No quotelike operator found after prefix at "} .
728 substr($$textref, pos($$textref), 20) .
731 pos $$textref = $startpos;
736 $preld1pos = pos($$textref);
738 $ld1pos = pos($$textref);
740 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
743 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
744 | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
745 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
752 my $extrapos = pos($$textref);
753 $$textref =~ m{.*\n}gc;
754 $str1pos = pos($$textref)--;
755 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
756 _failmsg qq{Missing here doc terminator ('$label') after "} .
757 substr($$textref, $startpos, 20) .
760 pos $$textref = $startpos;
763 $rd1pos = pos($$textref);
764 $$textref =~ m{\Q$label\E\n}gc;
765 $ld2pos = pos($$textref);
767 $startpos, $oppos-$startpos, # PREFIX
768 $oppos, length($op), # OPERATOR
769 $ld1pos, $extrapos-$ld1pos, # LEFT DEL
770 $str1pos, $rd1pos-$str1pos, # STR/PAT
771 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL
772 $ld2pos, 0, # NO 2ND LDEL
773 $ld2pos, 0, # NO 2ND STR
774 $ld2pos, 0, # NO 2ND RDEL
775 $ld2pos, 0, # NO MODIFIERS
776 $ld2pos, $textlen-$ld2pos, # REMAINDER
777 $extrapos, $str1pos-$extrapos, # FILLETED BIT
781 $$textref =~ m/\G\s*/gc;
782 $ld1pos = pos($$textref);
783 $str1pos = $ld1pos+1;
785 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
787 _failmsg "No block delimiter found after quotelike $op",
789 pos $$textref = $startpos;
792 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
793 my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
794 if ($ldel1 =~ /[[(<{]/)
796 $rdel1 =~ tr/[({</])}>/;
797 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
798 || do { pos $$textref = $startpos; return };
799 $ld2pos = pos($$textref);
804 $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
805 || do { pos $$textref = $startpos; return };
806 $ld2pos = $rd1pos = pos($$textref)-1;
809 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
813 if ($ldel1 =~ /[[(<{]/)
815 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
817 _failmsg "Missing second block for quotelike $op",
819 pos $$textref = $startpos;
822 $ldel2 = $rdel2 = "\Q$1";
823 $rdel2 =~ tr/[({</])}>/;
827 $ldel2 = $rdel2 = $ldel1;
829 $str2pos = $ld2pos+1;
831 if ($ldel2 =~ /[[(<{]/)
833 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
834 defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
835 || do { pos $$textref = $startpos; return };
839 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
840 || do { pos $$textref = $startpos; return };
842 $rd2pos = pos($$textref)-1;
846 $ld2pos = $str2pos = $rd2pos = $rd1pos;
849 $modpos = pos $$textref;
851 $$textref =~ m/\G($mods{$op})/gc;
852 my $endpos = pos $$textref;
855 $startpos, $oppos-$startpos, # PREFIX
856 $oppos, length($op), # OPERATOR
857 $ld1pos, 1, # LEFT DEL
858 $str1pos, $rd1pos-$str1pos, # STR/PAT
859 $rd1pos, 1, # RIGHT DEL
860 $ld2pos, $second_arg, # 2ND LDEL (MAYBE)
861 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
862 $rd2pos, $second_arg, # 2ND RDEL (MAYBE)
863 $modpos, $endpos-$modpos, # MODIFIERS
864 $endpos, $textlen-$endpos, # REMAINDER
870 sub { extract_variable($_[0], '') },
871 sub { extract_quotelike($_[0],'') },
872 sub { extract_codeblock($_[0],'{}','') },
875 sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
877 my $textref = defined($_[0]) ? \$_[0] : \$_;
879 my ($lastpos, $firstpos);
884 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
885 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
893 carp "extract_multiple reset maximal count to 1 in scalar context"
894 if $^W && defined($_[2]) && $max > 1;
903 foreach $func ( @func )
905 if (ref($func) eq 'HASH')
907 push @class, (keys %$func)[0];
908 $func = (values %$func)[0];
916 FIELD: while (pos($$textref) < length($$textref))
920 foreach my $i ( 0..$#func )
925 $lastpos = pos $$textref;
926 if (ref($func) eq 'CODE')
927 { ($field,$rem,$pref) = @bits = $func->($$textref) }
928 elsif (ref($func) eq 'Text::Balanced::Extractor')
929 { @bits = $field = $func->extract($$textref) }
930 elsif( $$textref =~ m/\G$func/gc )
931 { @bits = $field = defined($1)
933 : substr($$textref, $-[0], $+[0] - $-[0])
936 if (defined($field) && length($field))
940 if length($pref) && !defined($unkpos);
943 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
944 $firstpos = $unkpos unless defined $firstpos;
946 last FIELD if @fields == $max;
950 ? bless (\$field, $class)
952 $firstpos = $lastpos unless defined $firstpos;
953 $lastpos = pos $$textref;
954 last FIELD if @fields == $max;
958 if ($$textref =~ /\G(.)/gcs)
960 $unkpos = pos($$textref)-1
961 unless $igunk || defined $unkpos;
967 push @fields, substr($$textref, $unkpos);
968 $firstpos = $unkpos unless defined $firstpos;
969 $lastpos = length $$textref;
974 pos $$textref = $lastpos;
975 return @fields if wantarray;
978 eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
979 pos $$textref = $firstpos };
984 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
988 my $pre = defined $_[2] ? $_[2] : '\s*';
989 my %options = defined $_[3] ? %{$_[3]} : ();
990 my $omode = defined $options{fail} ? $options{fail} : '';
991 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
992 : defined($options{reject}) ? $options{reject}
995 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
996 : defined($options{ignore}) ? $options{ignore}
1000 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
1003 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1008 my $textref = defined $_[0] ? \$_[0] : \$_;
1009 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1011 return _fail(wantarray, $textref) unless @match;
1012 return _succeed wantarray, $textref,
1013 $match[2], $match[3]+$match[5]+$match[7], # MATCH
1014 @match[8..9,0..1,2..7]; # REM, PRE, BITS
1017 bless $closure, 'Text::Balanced::Extractor';
1020 package Text::Balanced::Extractor;
1022 sub extract($$) # ($self, $text)
1027 package Text::Balanced::ErrorMsg;
1029 use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1037 Text::Balanced - Extract delimited text sequences from strings.
1042 use Text::Balanced qw (
1055 # Extract the initial substring of $text that is delimited by
1056 # two (unescaped) instances of the first character in $delim.
1058 ($extracted, $remainder) = extract_delimited($text,$delim);
1061 # Extract the initial substring of $text that is bracketed
1062 # with a delimiter(s) specified by $delim (where the string
1063 # in $delim contains one or more of '(){}[]<>').
1065 ($extracted, $remainder) = extract_bracketed($text,$delim);
1068 # Extract the initial substring of $text that is bounded by
1071 ($extracted, $remainder) = extract_tagged($text);
1074 # Extract the initial substring of $text that is bounded by
1075 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1077 ($extracted, $remainder) =
1078 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1081 # Extract the initial substring of $text that represents a
1082 # Perl "quote or quote-like operation"
1084 ($extracted, $remainder) = extract_quotelike($text);
1087 # Extract the initial substring of $text that represents a block
1088 # of Perl code, bracketed by any of character(s) specified by $delim
1089 # (where the string $delim contains one or more of '(){}[]<>').
1091 ($extracted, $remainder) = extract_codeblock($text,$delim);
1094 # Extract the initial substrings of $text that would be extracted by
1095 # one or more sequential applications of the specified functions
1096 # or regular expressions
1098 @extracted = extract_multiple($text,
1099 [ \&extract_bracketed,
1100 \&extract_quotelike,
1101 \&some_other_extractor_sub,
1106 # Create a string representing an optimized pattern (a la Friedl)
1107 # that matches a substring delimited by any of the specified characters
1108 # (in this case: any type of quote or a slash)
1110 $patstring = gen_delimited_pat(q{'"`/});
1113 # Generate a reference to an anonymous sub that is just like extract_tagged
1114 # but pre-compiled and optimized for a specific pair of tags, and consequently
1115 # much faster (i.e. 3 times faster). It uses qr// for better performance on
1116 # repeated calls, so it only works under Perl 5.005 or later.
1118 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1120 ($extracted, $remainder) = $extract_head->($text);
1125 The various C<extract_...> subroutines may be used to
1126 extract a delimited substring, possibly after skipping a
1127 specified prefix string. By default, that prefix is
1128 optional whitespace (C</\s*/>), but you can change it to whatever
1129 you wish (see below).
1131 The substring to be extracted must appear at the
1132 current C<pos> location of the string's variable
1133 (or at index zero, if no C<pos> position is defined).
1134 In other words, the C<extract_...> subroutines I<don't>
1135 extract the first occurrence of a substring anywhere
1136 in a string (like an unanchored regex would). Rather,
1137 they extract an occurrence of the substring appearing
1138 immediately at the current matching position in the
1139 string (like a C<\G>-anchored regex would).
1143 =head2 General behaviour in list contexts
1145 In a list context, all the subroutines return a list, the first three
1146 elements of which are always:
1152 The extracted string, including the specified delimiters.
1153 If the extraction fails C<undef> is returned.
1157 The remainder of the input string (i.e. the characters after the
1158 extracted string). On failure, the entire string is returned.
1162 The skipped prefix (i.e. the characters before the extracted string).
1163 On failure, C<undef> is returned.
1167 Note that in a list context, the contents of the original input text (the first
1168 argument) are not modified in any way.
1170 However, if the input text was passed in a variable, that variable's
1171 C<pos> value is updated to point at the first character after the
1172 extracted text. That means that in a list context the various
1173 subroutines can be used much like regular expressions. For example:
1175 while ( $next = (extract_quotelike($text))[0] )
1177 # process next quote-like (in $next)
1181 =head2 General behaviour in scalar and void contexts
1183 In a scalar context, the extracted string is returned, having first been
1184 removed from the input text. Thus, the following code also processes
1185 each quote-like operation, but actually removes them from $text:
1187 while ( $next = extract_quotelike($text) )
1189 # process next quote-like (in $next)
1192 Note that if the input text is a read-only string (i.e. a literal),
1193 no attempt is made to remove the extracted text.
1195 In a void context the behaviour of the extraction subroutines is
1196 exactly the same as in a scalar context, except (of course) that the
1197 extracted substring is not returned.
1199 =head2 A note about prefixes
1201 Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1202 This can bite you if you're expecting a prefix specification like
1203 '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1204 pattern will only succeed if the <H1> tag is on the current line, since
1205 . normally doesn't match newlines.
1207 To overcome this limitation, you need to turn on /s matching within
1208 the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1211 =head2 C<extract_delimited>
1213 The C<extract_delimited> function formalizes the common idiom
1214 of extracting a single-character-delimited substring from the start of
1215 a string. For example, to extract a single-quote delimited string, the
1216 following code is typically used:
1218 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1221 but with C<extract_delimited> it can be simplified to:
1223 ($extracted,$remainder) = extract_delimited($text, "'");
1225 C<extract_delimited> takes up to four scalars (the input text, the
1226 delimiters, a prefix pattern to be skipped, and any escape characters)
1227 and extracts the initial substring of the text that
1228 is appropriately delimited. If the delimiter string has multiple
1229 characters, the first one encountered in the text is taken to delimit
1231 The third argument specifies a prefix pattern that is to be skipped
1232 (but must be present!) before the substring is extracted.
1233 The final argument specifies the escape character to be used for each
1236 All arguments are optional. If the escape characters are not specified,
1237 every delimiter is escaped with a backslash (C<\>).
1238 If the prefix is not specified, the
1239 pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1240 is also not specified, the set C</["'`]/> is used. If the text to be processed
1241 is not specified either, C<$_> is used.
1243 In list context, C<extract_delimited> returns a array of three
1244 elements, the extracted substring (I<including the surrounding
1245 delimiters>), the remainder of the text, and the skipped prefix (if
1246 any). If a suitable delimited substring is not found, the first
1247 element of the array is the empty string, the second is the complete
1248 original text, and the prefix returned in the third element is an
1251 In a scalar context, just the extracted substring is returned. In
1252 a void context, the extracted substring (and any prefix) are simply
1253 removed from the beginning of the first argument.
1257 # Remove a single-quoted substring from the very beginning of $text:
1259 $substring = extract_delimited($text, "'", '');
1261 # Remove a single-quoted Pascalish substring (i.e. one in which
1262 # doubling the quote character escapes it) from the very
1263 # beginning of $text:
1265 $substring = extract_delimited($text, "'", '', "'");
1267 # Extract a single- or double- quoted substring from the
1268 # beginning of $text, optionally after some whitespace
1269 # (note the list context to protect $text from modification):
1271 ($substring) = extract_delimited $text, q{"'};
1274 # Delete the substring delimited by the first '/' in $text:
1276 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1278 Note that this last example is I<not> the same as deleting the first
1279 quote-like pattern. For instance, if C<$text> contained the string:
1281 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1283 then after the deletion it would contain:
1285 "if ('.$UNIXCMD/s) { $cmd = $1; }"
1289 "if ('./cmd' =~ ms) { $cmd = $1; }"
1292 See L<"extract_quotelike"> for a (partial) solution to this problem.
1295 =head2 C<extract_bracketed>
1297 Like C<"extract_delimited">, the C<extract_bracketed> function takes
1298 up to three optional scalar arguments: a string to extract from, a delimiter
1299 specifier, and a prefix pattern. As before, a missing prefix defaults to
1300 optional whitespace and a missing text defaults to C<$_>. However, a missing
1301 delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1303 C<extract_bracketed> extracts a balanced-bracket-delimited
1304 substring (using any one (or more) of the user-specified delimiter
1305 brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1306 respect quoted unbalanced brackets (see below).
1308 A "delimiter bracket" is a bracket in list of delimiters passed as
1309 C<extract_bracketed>'s second argument. Delimiter brackets are
1310 specified by giving either the left or right (or both!) versions
1311 of the required bracket(s). Note that the order in which
1312 two or more delimiter brackets are specified is not significant.
1314 A "balanced-bracket-delimited substring" is a substring bounded by
1315 matched brackets, such that any other (left or right) delimiter
1316 bracket I<within> the substring is also matched by an opposite
1317 (right or left) delimiter bracket I<at the same level of nesting>. Any
1318 type of bracket not in the delimiter list is treated as an ordinary
1321 In other words, each type of bracket specified as a delimiter must be
1322 balanced and correctly nested within the substring, and any other kind of
1323 ("non-delimiter") bracket in the substring is ignored.
1325 For example, given the string:
1327 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1329 then a call to C<extract_bracketed> in a list context:
1331 @result = extract_bracketed( $text, '{}' );
1335 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1337 since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1338 (In a scalar context just the first element of the array would be returned. In
1339 a void context, C<$text> would be replaced by an empty string.)
1341 Likewise the call in:
1343 @result = extract_bracketed( $text, '{[' );
1345 would return the same result, since all sets of both types of specified
1346 delimiter brackets are correctly nested and balanced.
1348 However, the call in:
1350 @result = extract_bracketed( $text, '{([<' );
1352 would fail, returning:
1354 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
1356 because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1357 the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1358 return an empty string. In a void context, C<$text> would be unchanged.)
1360 Note that the embedded single-quotes in the string don't help in this
1361 case, since they have not been specified as acceptable delimiters and are
1362 therefore treated as non-delimiter characters (and ignored).
1364 However, if a particular species of quote character is included in the
1365 delimiter specification, then that type of quote will be correctly handled.
1366 for example, if C<$text> is:
1368 $text = '<A HREF=">>>>">link</A>';
1372 @result = extract_bracketed( $text, '<">' );
1376 ( '<A HREF=">>>>">', 'link</A>', "" )
1378 as expected. Without the specification of C<"> as an embedded quoter:
1380 @result = extract_bracketed( $text, '<>' );
1382 the result would be:
1384 ( '<A HREF=">', '>>>">link</A>', "" )
1386 In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1387 quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1388 letter 'q' as a delimiter. Hence:
1390 @result = extract_bracketed( $text, '<q>' );
1392 would correctly match something like this:
1394 $text = '<leftop: conj /and/ conj>';
1396 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1399 =head2 C<extract_variable>
1401 C<extract_variable> extracts any valid Perl variable or
1402 variable-involved expression, including scalars, arrays, hashes, array
1403 accesses, hash look-ups, method calls through objects, subroutine calls
1404 through subroutine references, etc.
1406 The subroutine takes up to two optional arguments:
1412 A string to be processed (C<$_> if the string is omitted or C<undef>)
1416 A string specifying a pattern to be matched as a prefix (which is to be
1417 skipped). If omitted, optional whitespace is skipped.
1421 On success in a list context, an array of 3 elements is returned. The
1428 the extracted variable, or variablish expression
1432 the remainder of the input text,
1436 the prefix substring (if any),
1440 On failure, all of these values (except the remaining text) are C<undef>.
1442 In a scalar context, C<extract_variable> returns just the complete
1443 substring that matched a variablish expression. C<undef> is returned on
1444 failure. In addition, the original input text has the returned substring
1445 (and any prefix) removed from it.
1447 In a void context, the input text just has the matched substring (and
1448 any specified prefix) removed.
1451 =head2 C<extract_tagged>
1453 C<extract_tagged> extracts and segments text between (balanced)
1456 The subroutine takes up to five optional arguments:
1462 A string to be processed (C<$_> if the string is omitted or C<undef>)
1466 A string specifying a pattern to be matched as the opening tag.
1467 If the pattern string is omitted (or C<undef>) then a pattern
1468 that matches any standard XML tag is used.
1472 A string specifying a pattern to be matched at the closing tag.
1473 If the pattern string is omitted (or C<undef>) then the closing
1474 tag is constructed by inserting a C</> after any leading bracket
1475 characters in the actual opening tag that was matched (I<not> the pattern
1476 that matched the tag). For example, if the opening tag pattern
1477 is specified as C<'{{\w+}}'> and actually matched the opening tag
1478 C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1482 A string specifying a pattern to be matched as a prefix (which is to be
1483 skipped). If omitted, optional whitespace is skipped.
1487 A hash reference containing various parsing options (see below)
1491 The various options that can be specified are:
1495 =item C<reject =E<gt> $listref>
1497 The list reference contains one or more strings specifying patterns
1498 that must I<not> appear within the tagged text.
1500 For example, to extract
1501 an HTML link (which should not contain nested links) use:
1503 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1505 =item C<ignore =E<gt> $listref>
1507 The list reference contains one or more strings specifying patterns
1508 that are I<not> be be treated as nested tags within the tagged text
1509 (even if they would match the start tag pattern).
1511 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1513 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1515 (also see L<"gen_delimited_pat"> below).
1518 =item C<fail =E<gt> $str>
1520 The C<fail> option indicates the action to be taken if a matching end
1521 tag is not encountered (i.e. before the end of the string or some
1522 C<reject> pattern matches). By default, a failure to match a closing
1523 tag causes C<extract_tagged> to immediately fail.
1525 However, if the string value associated with <reject> is "MAX", then
1526 C<extract_tagged> returns the complete text up to the point of failure.
1527 If the string is "PARA", C<extract_tagged> returns only the first paragraph
1528 after the tag (up to the first line that is either empty or contains
1529 only whitespace characters).
1530 If the string is "", the the default behaviour (i.e. failure) is reinstated.
1532 For example, suppose the start tag "/para" introduces a paragraph, which then
1533 continues until the next "/endpara" tag or until another "/para" tag is
1536 $text = "/para line 1\n\nline 3\n/para line 4";
1538 extract_tagged($text, '/para', '/endpara', undef,
1539 {reject => '/para', fail => MAX );
1541 # EXTRACTED: "/para line 1\n\nline 3\n"
1543 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1544 tag refers only to the immediately following paragraph:
1546 $text = "/para line 1\n\nline 3\n/para line 4";
1548 extract_tagged($text, '/para', '/endpara', undef,
1549 {reject => '/para', fail => MAX );
1551 # EXTRACTED: "/para line 1\n"
1553 Note that the specified C<fail> behaviour applies to nested tags as well.
1557 On success in a list context, an array of 6 elements is returned. The elements are:
1563 the extracted tagged substring (including the outermost tags),
1567 the remainder of the input text,
1571 the prefix substring (if any),
1579 the text between the opening and closing tags
1583 the closing tag (or "" if no closing tag was found)
1587 On failure, all of these values (except the remaining text) are C<undef>.
1589 In a scalar context, C<extract_tagged> returns just the complete
1590 substring that matched a tagged text (including the start and end
1591 tags). C<undef> is returned on failure. In addition, the original input
1592 text has the returned substring (and any prefix) removed from it.
1594 In a void context, the input text just has the matched substring (and
1595 any specified prefix) removed.
1598 =head2 C<gen_extract_tagged>
1600 (Note: This subroutine is only available under Perl5.005)
1602 C<gen_extract_tagged> generates a new anonymous subroutine which
1603 extracts text between (balanced) specified tags. In other words,
1604 it generates a function identical in function to C<extract_tagged>.
1606 The difference between C<extract_tagged> and the anonymous
1607 subroutines generated by
1608 C<gen_extract_tagged>, is that those generated subroutines:
1614 do not have to reparse tag specification or parsing options every time
1615 they are called (whereas C<extract_tagged> has to effectively rebuild
1616 its tag parser on every call);
1620 make use of the new qr// construct to pre-compile the regexes they use
1621 (whereas C<extract_tagged> uses standard string variable interpolation
1622 to create tag-matching patterns).
1626 The subroutine takes up to four optional arguments (the same set as
1627 C<extract_tagged> except for the string to be processed). It returns
1628 a reference to a subroutine which in turn takes a single argument (the text to
1631 In other words, the implementation of C<extract_tagged> is exactly
1637 $extractor = gen_extract_tagged(@_);
1638 return $extractor->($text);
1641 (although C<extract_tagged> is not currently implemented that way, in order
1642 to preserve pre-5.005 compatibility).
1644 Using C<gen_extract_tagged> to create extraction functions for specific tags
1645 is a good idea if those functions are going to be called more than once, since
1646 their performance is typically twice as good as the more general-purpose
1650 =head2 C<extract_quotelike>
1652 C<extract_quotelike> attempts to recognize, extract, and segment any
1653 one of the various Perl quotes and quotelike operators (see
1654 L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1655 delimiters (for the quotelike operators), and trailing modifiers are
1656 all caught. For example, in:
1658 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1660 extract_quotelike ' "You said, \"Use sed\"." '
1662 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1664 extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1666 the full Perl quotelike operations are all extracted correctly.
1668 Note too that, when using the /x modifier on a regex, any comment
1669 containing the current pattern delimiter will cause the regex to be
1670 immediately terminated. In other words:
1673 (?i) # CASE INSENSITIVE
1674 [a-z_] # LEADING ALPHABETIC/UNDERSCORE
1675 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1678 will be extracted as if it were:
1681 (?i) # CASE INSENSITIVE
1682 [a-z_] # LEADING ALPHABETIC/'
1684 This behaviour is identical to that of the actual compiler.
1686 C<extract_quotelike> takes two arguments: the text to be processed and
1687 a prefix to be matched at the very beginning of the text. If no prefix
1688 is specified, optional whitespace is the default. If no text is given,
1691 In a list context, an array of 11 elements is returned. The elements are:
1697 the extracted quotelike substring (including trailing modifiers),
1701 the remainder of the input text,
1705 the prefix substring (if any),
1709 the name of the quotelike operator (if any),
1713 the left delimiter of the first block of the operation,
1717 the text of the first block of the operation
1718 (that is, the contents of
1719 a quote, the regex of a match or substitution or the target list of a
1724 the right delimiter of the first block of the operation,
1728 the left delimiter of the second block of the operation
1729 (that is, if it is a C<s>, C<tr>, or C<y>),
1733 the text of the second block of the operation
1734 (that is, the replacement of a substitution or the translation list
1739 the right delimiter of the second block of the operation (if any),
1743 the trailing modifiers on the operation (if any).
1747 For each of the fields marked "(if any)" the default value on success is
1749 On failure, all of these values (except the remaining text) are C<undef>.
1752 In a scalar context, C<extract_quotelike> returns just the complete substring
1753 that matched a quotelike operation (or C<undef> on failure). In a scalar or
1754 void context, the input text has the same substring (and any specified
1759 # Remove the first quotelike literal that appears in text
1761 $quotelike = extract_quotelike($text,'.*?');
1763 # Replace one or more leading whitespace-separated quotelike
1764 # literals in $_ with "<QLL>"
1766 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1769 # Isolate the search pattern in a quotelike operation from $text
1771 ($op,$pat) = (extract_quotelike $text)[3,5];
1774 print "search pattern: $pat\n";
1778 print "$op is not a pattern matching operation\n";
1782 =head2 C<extract_quotelike> and "here documents"
1784 C<extract_quotelike> can successfully extract "here documents" from an input
1785 string, but with an important caveat in list contexts.
1787 Unlike other types of quote-like literals, a here document is rarely
1788 a contiguous substring. For example, a typical piece of code using
1789 here document might look like this:
1792 This is the message.
1796 Given this as an input string in a scalar context, C<extract_quotelike>
1797 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1798 leaving the string " || die;\nexit;" in the original variable. In other words,
1799 the two separate pieces of the here document are successfully extracted and
1802 In a list context, C<extract_quotelike> would return the list
1808 "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1809 including fore and aft delimiters),
1813 " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1817 "" (i.e. the prefix substring -- trivial in this case),
1821 "<<" (i.e. the "name" of the quotelike operator)
1825 "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1829 "This is the message.\n" (i.e. the text of the here document),
1833 "EOMSG" (i.e. the right delimiter of the here document),
1837 "" (a here document has no second left delimiter, second text, second right
1838 delimiter, or trailing modifiers).
1842 However, the matching position of the input variable would be set to
1843 "exit;" (i.e. I<after> the closing delimiter of the here document),
1844 which would cause the earlier " || die;\nexit;" to be skipped in any
1845 sequence of code fragment extractions.
1847 To avoid this problem, when it encounters a here document whilst
1848 extracting from a modifiable string, C<extract_quotelike> silently
1849 rearranges the string to an equivalent piece of Perl:
1852 This is the message.
1857 in which the here document I<is> contiguous. It still leaves the
1858 matching position after the here document, but now the rest of the line
1859 on which the here document starts is not skipped.
1861 To prevent <extract_quotelike> from mucking about with the input in this way
1862 (this is the only case where a list-context C<extract_quotelike> does so),
1863 you can pass the input variable as an interpolated literal:
1865 $quotelike = extract_quotelike("$var");
1868 =head2 C<extract_codeblock>
1870 C<extract_codeblock> attempts to recognize and extract a balanced
1871 bracket delimited substring that may contain unbalanced brackets
1872 inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1873 is like a combination of C<"extract_bracketed"> and
1874 C<"extract_quotelike">.
1876 C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1877 a text to process, a set of delimiter brackets to look for, and a prefix to
1878 match first. It also takes an optional fourth parameter, which allows the
1879 outermost delimiter brackets to be specified separately (see below).
1881 Omitting the first argument (input text) means process C<$_> instead.
1882 Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1883 Omitting the third argument (prefix argument) implies optional whitespace at the start.
1884 Omitting the fourth argument (outermost delimiter brackets) indicates that the
1885 value of the second argument is to be used for the outermost delimiters.
1887 Once the prefix an dthe outermost opening delimiter bracket have been
1888 recognized, code blocks are extracted by stepping through the input text and
1889 trying the following alternatives in sequence:
1895 Try and match a closing delimiter bracket. If the bracket was the same
1896 species as the last opening bracket, return the substring to that
1897 point. If the bracket was mismatched, return an error.
1901 Try to match a quote or quotelike operator. If found, call
1902 C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1903 the error it returned. Otherwise go back to step 1.
1907 Try to match an opening delimiter bracket. If found, call
1908 C<extract_codeblock> recursively to eat the embedded block. If the
1909 recursive call fails, return an error. Otherwise, go back to step 1.
1913 Unconditionally match a bareword or any other single character, and
1914 then go back to step 1.
1921 # Find a while loop in the text
1923 if ($text =~ s/.*?while\s*\{/{/)
1925 $loop = "while " . extract_codeblock($text);
1928 # Remove the first round-bracketed list (which may include
1929 # round- or curly-bracketed code blocks or quotelike operators)
1931 extract_codeblock $text, "(){}", '[^(]*';
1934 The ability to specify a different outermost delimiter bracket is useful
1935 in some circumstances. For example, in the Parse::RecDescent module,
1936 parser actions which are to be performed only on a successful parse
1937 are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1939 sentence: subject verb object
1940 <defer: {$::theVerb = $item{verb}} >
1942 Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1943 within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1945 A deferred action like this:
1947 <defer: {if ($count>10) {$count--}} >
1949 will be incorrectly parsed as:
1951 <defer: {if ($count>
1953 because the "less than" operator is interpreted as a closing delimiter.
1955 But, by extracting the directive using
1956 S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1957 the '>' character is only treated as a delimited at the outermost
1958 level of the code block, so the directive is parsed correctly.
1960 =head2 C<extract_multiple>
1962 The C<extract_multiple> subroutine takes a string to be processed and a
1963 list of extractors (subroutines or regular expressions) to apply to that string.
1965 In an array context C<extract_multiple> returns an array of substrings
1966 of the original string, as extracted by the specified extractors.
1967 In a scalar context, C<extract_multiple> returns the first
1968 substring successfully extracted from the original string. In both
1969 scalar and void contexts the original string has the first successfully
1970 extracted substring removed from it. In all contexts
1971 C<extract_multiple> starts at the current C<pos> of the string, and
1972 sets that C<pos> appropriately after it matches.
1974 Hence, the aim of of a call to C<extract_multiple> in a list context
1975 is to split the processed string into as many non-overlapping fields as
1976 possible, by repeatedly applying each of the specified extractors
1977 to the remainder of the string. Thus C<extract_multiple> is
1978 a generalized form of Perl's C<split> subroutine.
1980 The subroutine takes up to four optional arguments:
1986 A string to be processed (C<$_> if the string is omitted or C<undef>)
1990 A reference to a list of subroutine references and/or qr// objects and/or
1991 literal strings and/or hash references, specifying the extractors
1992 to be used to split the string. If this argument is omitted (or
1996 sub { extract_variable($_[0], '') },
1997 sub { extract_quotelike($_[0],'') },
1998 sub { extract_codeblock($_[0],'{}','') },
2006 An number specifying the maximum number of fields to return. If this
2007 argument is omitted (or C<undef>), split continues as long as possible.
2009 If the third argument is I<N>, then extraction continues until I<N> fields
2010 have been successfully extracted, or until the string has been completely
2013 Note that in scalar and void contexts the value of this argument is
2014 automatically reset to 1 (under C<-w>, a warning is issued if the argument
2019 A value indicating whether unmatched substrings (see below) within the
2020 text should be skipped or returned as fields. If the value is true,
2021 such substrings are skipped. Otherwise, they are returned.
2025 The extraction process works by applying each extractor in
2026 sequence to the text string.
2028 If the extractor is a subroutine it is called in a list context and is
2029 expected to return a list of a single element, namely the extracted
2030 text. It may optionally also return two further arguments: a string
2031 representing the text left after extraction (like $' for a pattern
2032 match), and a string representing any prefix skipped before the
2033 extraction (like $` in a pattern match). Note that this is designed
2034 to facilitate the use of other Text::Balanced subroutines with
2035 C<extract_multiple>. Note too that the value returned by an extractor
2036 subroutine need not bear any relationship to the corresponding substring
2037 of the original text (see examples below).
2039 If the extractor is a precompiled regular expression or a string,
2040 it is matched against the text in a scalar context with a leading
2041 '\G' and the gc modifiers enabled. The extracted value is either
2042 $1 if that variable is defined after the match, or else the
2043 complete match (i.e. $&).
2045 If the extractor is a hash reference, it must contain exactly one element.
2046 The value of that element is one of the
2047 above extractor types (subroutine reference, regular expression, or string).
2048 The key of that element is the name of a class into which the successful
2049 return value of the extractor will be blessed.
2051 If an extractor returns a defined value, that value is immediately
2052 treated as the next extracted field and pushed onto the list of fields.
2053 If the extractor was specified in a hash reference, the field is also
2054 blessed into the appropriate class,
2056 If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2057 assumed to have failed to extract.
2058 If none of the extractor subroutines succeeds, then one
2059 character is extracted from the start of the text and the extraction
2060 subroutines reapplied. Characters which are thus removed are accumulated and
2061 eventually become the next field (unless the fourth argument is true, in which
2062 case they are discarded).
2064 For example, the following extracts substrings that are valid Perl variables:
2066 @fields = extract_multiple($text,
2067 [ sub { extract_variable($_[0]) } ],
2070 This example separates a text into fields which are quote delimited,
2071 curly bracketed, and anything else. The delimited and bracketed
2072 parts are also blessed to identify them (the "anything else" is unblessed):
2074 @fields = extract_multiple($text,
2076 { Delim => sub { extract_delimited($_[0],q{'"}) } },
2077 { Brack => sub { extract_bracketed($_[0],'{}') } },
2080 This call extracts the next single substring that is a valid Perl quotelike
2081 operator (and removes it from $text):
2083 $quotelike = extract_multiple($text,
2085 sub { extract_quotelike($_[0]) },
2088 Finally, here is yet another way to do comma-separated value parsing:
2090 @fields = extract_multiple($csv_text,
2092 sub { extract_delimited($_[0],q{'"}) },
2097 The list in the second argument means:
2098 I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2099 The undef third argument means:
2100 I<"...as many times as possible...">,
2101 and the true value in the fourth argument means
2102 I<"...discarding anything else that appears (i.e. the commas)">.
2104 If you wanted the commas preserved as separate fields (i.e. like split
2105 does if your split pattern has capturing parentheses), you would
2106 just make the last parameter undefined (or remove it).
2109 =head2 C<gen_delimited_pat>
2111 The C<gen_delimited_pat> subroutine takes a single (string) argument and
2112 > builds a Friedl-style optimized regex that matches a string delimited
2113 by any one of the characters in the single argument. For example:
2115 gen_delimited_pat(q{'"})
2119 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2121 Note that the specified delimiters are automatically quotemeta'd.
2123 A typical use of C<gen_delimited_pat> would be to build special purpose tags
2124 for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2125 (which might contain quoted strings):
2127 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2129 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2132 C<gen_delimited_pat> may also be called with an optional second argument,
2133 which specifies the "escape" character(s) to be used for each delimiter.
2134 For example to match a Pascal-style string (where ' is the delimiter
2135 and '' is a literal ' within the string):
2137 gen_delimited_pat(q{'},q{'});
2139 Different escape characters can be specified for different delimiters.
2140 For example, to specify that '/' is the escape for single quotes
2141 and '%' is the escape for double quotes:
2143 gen_delimited_pat(q{'"},q{/%});
2145 If more delimiters than escape chars are specified, the last escape char
2146 is used for the remaining delimiters.
2147 If no escape char is specified for a given specified delimiter, '\' is used.
2149 =head2 C<delimited_pat>
2151 Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2152 That name may still be used, but is now deprecated.
2157 In a list context, all the functions return C<(undef,$original_text)>
2158 on failure. In a scalar context, failure is indicated by returning C<undef>
2159 (in this case the input text is not modified in any way).
2161 In addition, on failure in I<any> context, the C<$@> variable is set.
2162 Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2164 Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2165 which the error was detected (although not necessarily where it occurred!)
2166 Printing C<$@> directly produces the error message, with the offset appended.
2167 On success, the C<$@> variable is guaranteed to be C<undef>.
2169 The available diagnostics are:
2173 =item C<Did not find a suitable bracket: "%s">
2175 The delimiter provided to C<extract_bracketed> was not one of
2176 C<'()[]E<lt>E<gt>{}'>.
2178 =item C<Did not find prefix: /%s/>
2180 A non-optional prefix was specified but wasn't found at the start of the text.
2182 =item C<Did not find opening bracket after prefix: "%s">
2184 C<extract_bracketed> or C<extract_codeblock> was expecting a
2185 particular kind of bracket at the start of the text, and didn't find it.
2187 =item C<No quotelike operator found after prefix: "%s">
2189 C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2190 C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2193 =item C<Unmatched closing bracket: "%c">
2195 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2196 a closing bracket where none was expected.
2198 =item C<Unmatched opening bracket(s): "%s">
2200 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2201 out of characters in the text before closing one or more levels of nested
2204 =item C<Unmatched embedded quote (%s)>
2206 C<extract_bracketed> attempted to match an embedded quoted substring, but
2207 failed to find a closing quote to match it.
2209 =item C<Did not find closing delimiter to match '%s'>
2211 C<extract_quotelike> was unable to find a closing delimiter to match the
2212 one that opened the quote-like operation.
2214 =item C<Mismatched closing bracket: expected "%c" but found "%s">
2216 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2217 a valid bracket delimiter, but it was the wrong species. This usually
2218 indicates a nesting error, but may indicate incorrect quoting or escaping.
2220 =item C<No block delimiter found after quotelike "%s">
2222 C<extract_quotelike> or C<extract_codeblock> found one of the
2223 quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2224 without a suitable block after it.
2226 =item C<Did not find leading dereferencer>
2228 C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2229 a variable, but didn't find any of them.
2231 =item C<Bad identifier after dereferencer>
2233 C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2234 character was not followed by a legal Perl identifier.
2236 =item C<Did not find expected opening bracket at %s>
2238 C<extract_codeblock> failed to find any of the outermost opening brackets
2239 that were specified.
2241 =item C<Improperly nested codeblock at %s>
2243 A nested code block was found that started with a delimiter that was specified
2244 as being only to be used as an outermost bracket.
2246 =item C<Missing second block for quotelike "%s">
2248 C<extract_codeblock> or C<extract_quotelike> found one of the
2249 quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2251 =item C<No match found for opening bracket>
2253 C<extract_codeblock> failed to find a closing bracket to match the outermost
2256 =item C<Did not find opening tag: /%s/>
2258 C<extract_tagged> did not find a suitable opening tag (after any specified
2259 prefix was removed).
2261 =item C<Unable to construct closing tag to match: /%s/>
2263 C<extract_tagged> matched the specified opening tag and tried to
2264 modify the matched text to produce a matching closing tag (because
2265 none was specified). It failed to generate the closing tag, almost
2266 certainly because the opening tag did not start with a
2267 bracket of some kind.
2269 =item C<Found invalid nested tag: %s>
2271 C<extract_tagged> found a nested tag that appeared in the "reject" list
2272 (and the failure mode was not "MAX" or "PARA").
2274 =item C<Found unbalanced nested tag: %s>
2276 C<extract_tagged> found a nested opening tag that was not matched by a
2277 corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2279 =item C<Did not find closing tag>
2281 C<extract_tagged> reached the end of the text without finding a closing tag
2282 to match the original opening tag (and the failure mode was not
2293 Damian Conway (damian@conway.org)
2296 =head1 BUGS AND IRRITATIONS
2298 There are undoubtedly serious bugs lurking somewhere in this code, if
2299 only because parts of it give the impression of understanding a great deal
2300 more about Perl than they really do.
2302 Bug reports and other feedback are most welcome.
2307 Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2308 This module is free software. It may be used, redistributed
2309 and/or modified under the same terms as Perl itself.