lib/Text/Balanced.pm

   1 # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
   2 # FOR FULL DOCUMENTATION SEE Balanced.pod
   3
   4 use 5.005;
   5 use strict;
   6
   7 package Text::Balanced;
   8
   9 use Exporter;
  10 use SelfLoader;
  11 use vars qw { $VERSION @ISA %EXPORT_TAGS };
  12
  13 # I really don't want to bring the XS version module into maint. So for now,
  14 # I'm commiting the sin of Bowdlerising Damian's module:
  15 # use version; $VERSION = qv('2.0.0');
  16 $VERSION = 2.000000;
  17 @ISA            = qw ( Exporter );
  18
  19 %EXPORT_TAGS    = ( ALL => [ qw(
  20                                 &extract_delimited
  21                                 &extract_bracketed
  22                                 &extract_quotelike
  23                                 &extract_codeblock
  24                                 &extract_variable
  25                                 &extract_tagged
  26                                 &extract_multiple
  27
  28                                 &gen_delimited_pat
  29                                 &gen_extract_tagged
  30
  31                                 &delimited_pat
  32                                ) ] );
  33
  34 Exporter::export_ok_tags('ALL');
  35
  36 # PROTOTYPES
  37
  38 sub _match_bracketed($$$$$$);
  39 sub _match_variable($$);
  40 sub _match_codeblock($$$$$$$);
  41 sub _match_quotelike($$$$);
  42
  43 # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
  44
  45 sub _failmsg {
  46         my ($message, $pos) = @_;
  47         $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
  48 }
  49
  50 sub _fail
  51 {
  52         my ($wantarray, $textref, $message, $pos) = @_;
  53         _failmsg $message, $pos if $message;
  54         return (undef,$$textref,undef) if $wantarray;
  55         return undef;
  56 }
  57
  58 sub _succeed
  59 {
  60         $@ = undef;
  61         my ($wantarray,$textref) = splice @_, 0, 2;
  62         my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
  63         my ($startlen, $oppos) = @_[5,6];
  64         my $remainderpos = $_[2];
  65         if ($wantarray)
  66         {
  67                 my @res;
  68                 while (my ($from, $len) = splice @_, 0, 2)
  69                 {
  70                         push @res, substr($$textref,$from,$len);
  71                 }
  72                 if ($extralen) {        # CORRECT FILLET
  73                         my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
  74                         $res[1] = "$extra$res[1]";
  75                         eval { substr($$textref,$remainderpos,0) = $extra;
  76                                substr($$textref,$extrapos,$extralen,"\n")} ;
  77                                 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
  78                         pos($$textref) = $remainderpos-$extralen+1; # RESET \G
  79                 }
  80                 else {
  81                         pos($$textref) = $remainderpos;             # RESET \G
  82                 }
  83                 return @res;
  84         }
  85         else
  86         {
  87                 my $match = substr($$textref,$_[0],$_[1]);
  88                 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
  89                 my $extra = $extralen
  90                         ? substr($$textref, $extrapos, $extralen)."\n" : "";
  91                 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
  92                 pos($$textref) = $_[4];                         # RESET \G
  93                 return $match;
  94         }
  95 }
  96
  97 # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
  98
  99 sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
 100 {
 101         my ($dels, $escs) = @_;
 102         return "" unless $dels =~ /\S/;
 103         $escs = '\\' unless $escs;
 104         $escs .= substr($escs,-1) x (length($dels)-length($escs));
 105         my @pat = ();
 106         my $i;
 107         for ($i=0; $i<length $dels; $i++)
 108         {
 109                 my $del = quotemeta substr($dels,$i,1);
 110                 my $esc = quotemeta substr($escs,$i,1);
 111                 if ($del eq $esc)
 112                 {
 113                         push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
 114                 }
 115                 else
 116                 {
 117                         push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
 118                 }
 119         }
 120         my $pat = join '|', @pat;
 121         return "(?:$pat)";
 122 }
 123
 124 *delimited_pat = \&gen_delimited_pat;
 125
 126
 127 # THE EXTRACTION FUNCTIONS
 128
 129 sub extract_delimited (;$$$$)
 130 {
 131         my $textref = defined $_[0] ? \$_[0] : \$_;
 132         my $wantarray = wantarray;
 133         my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
 134         my $pre  = defined $_[2] ? $_[2] : '\s*';
 135         my $esc  = defined $_[3] ? $_[3] : qq{\\};
 136         my $pat = gen_delimited_pat($del, $esc);
 137         my $startpos = pos $$textref || 0;
 138         return _fail($wantarray, $textref, "Not a delimited pattern", 0)
 139                 unless $$textref =~ m/\G($pre)($pat)/gc;
 140         my $prelen = length($1);
 141         my $matchpos = $startpos+$prelen;
 142         my $endpos = pos $$textref;
 143         return _succeed $wantarray, $textref,
 144                         $matchpos, $endpos-$matchpos,           # MATCH
 145                         $endpos,   length($$textref)-$endpos,   # REMAINDER
 146                         $startpos, $prelen;                     # PREFIX
 147 }
 148
 149 sub extract_bracketed (;$$$)
 150 {
 151         my $textref = defined $_[0] ? \$_[0] : \$_;
 152         my $ldel = defined $_[1] ? $_[1] : '{([<';
 153         my $pre  = defined $_[2] ? $_[2] : '\s*';
 154         my $wantarray = wantarray;
 155         my $qdel = "";
 156         my $quotelike;
 157         $ldel =~ s/'//g and $qdel .= q{'};
 158         $ldel =~ s/"//g and $qdel .= q{"};
 159         $ldel =~ s/`//g and $qdel .= q{`};
 160         $ldel =~ s/q//g and $quotelike = 1;
 161         $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
 162         my $rdel = $ldel;
 163         unless ($rdel =~ tr/[({</])}>/)
 164         {
 165                 return _fail $wantarray, $textref,
 166                              "Did not find a suitable bracket in delimiter: \"$_[1]\"",
 167                              0;
 168         }
 169         my $posbug = pos;
 170         $ldel = join('|', map { quotemeta $_ } split('', $ldel));
 171         $rdel = join('|', map { quotemeta $_ } split('', $rdel));
 172         pos = $posbug;
 173
 174         my $startpos = pos $$textref || 0;
 175         my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
 176
 177         return _fail ($wantarray, $textref) unless @match;
 178
 179         return _succeed ( $wantarray, $textref,
 180                           $match[2], $match[5]+2,       # MATCH
 181                           @match[8,9],                  # REMAINDER
 182                           @match[0,1],                  # PREFIX
 183                         );
 184 }
 185
 186 sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
 187 {
 188         my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
 189         my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
 190         unless ($$textref =~ m/\G$pre/gc)
 191         {
 192                 _failmsg "Did not find prefix: /$pre/", $startpos;
 193                 return;
 194         }
 195
 196         $ldelpos = pos $$textref;
 197
 198         unless ($$textref =~ m/\G($ldel)/gc)
 199         {
 200                 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
 201                          pos $$textref;
 202                 pos $$textref = $startpos;
 203                 return;
 204         }
 205
 206         my @nesting = ( $1 );
 207         my $textlen = length $$textref;
 208         while (pos $$textref < $textlen)
 209         {
 210                 next if $$textref =~ m/\G\\./gcs;
 211
 212                 if ($$textref =~ m/\G($ldel)/gc)
 213                 {
 214                         push @nesting, $1;
 215                 }
 216                 elsif ($$textref =~ m/\G($rdel)/gc)
 217                 {
 218                         my ($found, $brackettype) = ($1, $1);
 219                         if ($#nesting < 0)
 220                         {
 221                                 _failmsg "Unmatched closing bracket: \"$found\"",
 222                                          pos $$textref;
 223                                 pos $$textref = $startpos;
 224                                 return;
 225                         }
 226                         my $expected = pop(@nesting);
 227                         $expected =~ tr/({[</)}]>/;
 228                         if ($expected ne $brackettype)
 229                         {
 230                                 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
 231                                          pos $$textref;
 232                                 pos $$textref = $startpos;
 233                                 return;
 234                         }
 235                         last if $#nesting < 0;
 236                 }
 237                 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
 238                 {
 239                         $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
 240                         _failmsg "Unmatched embedded quote ($1)",
 241                                  pos $$textref;
 242                         pos $$textref = $startpos;
 243                         return;
 244                 }
 245                 elsif ($quotelike && _match_quotelike($textref,"",1,0))
 246                 {
 247                         next;
 248                 }
 249
 250                 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
 251         }
 252         if ($#nesting>=0)
 253         {
 254                 _failmsg "Unmatched opening bracket(s): "
 255                                 . join("..",@nesting)."..",
 256                          pos $$textref;
 257                 pos $$textref = $startpos;
 258                 return;
 259         }
 260
 261         $endpos = pos $$textref;
 262
 263         return (
 264                 $startpos,  $ldelpos-$startpos,         # PREFIX
 265                 $ldelpos,   1,                          # OPENING BRACKET
 266                 $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
 267                 $endpos-1,  1,                          # CLOSING BRACKET
 268                 $endpos,    length($$textref)-$endpos,  # REMAINDER
 269                );
 270 }
 271
 272 sub _revbracket($)
 273 {
 274         my $brack = reverse $_[0];
 275         $brack =~ tr/[({</])}>/;
 276         return $brack;
 277 }
 278
 279 my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
 280
 281 sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
 282 {
 283         my $textref = defined $_[0] ? \$_[0] : \$_;
 284         my $ldel    = $_[1];
 285         my $rdel    = $_[2];
 286         my $pre     = defined $_[3] ? $_[3] : '\s*';
 287         my %options = defined $_[4] ? %{$_[4]} : ();
 288         my $omode   = defined $options{fail} ? $options{fail} : '';
 289         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 290                     : defined($options{reject})        ? $options{reject}
 291                     :                                    ''
 292                     ;
 293         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 294                     : defined($options{ignore})        ? $options{ignore}
 295                     :                                    ''
 296                     ;
 297
 298         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 299         $@ = undef;
 300
 301         my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 302
 303         return _fail(wantarray, $textref) unless @match;
 304         return _succeed wantarray, $textref,
 305                         $match[2], $match[3]+$match[5]+$match[7],       # MATCH
 306                         @match[8..9,0..1,2..7];                         # REM, PRE, BITS
 307 }
 308
 309 sub _match_tagged       # ($$$$$$$)
 310 {
 311         my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
 312         my $rdelspec;
 313
 314         my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
 315
 316         unless ($$textref =~ m/\G($pre)/gc)
 317         {
 318                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 319                 goto failed;
 320         }
 321
 322         $opentagpos = pos($$textref);
 323
 324         unless ($$textref =~ m/\G$ldel/gc)
 325         {
 326                 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
 327                 goto failed;
 328         }
 329
 330         $textpos = pos($$textref);
 331
 332         if (!defined $rdel)
 333         {
 334                 $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
 335                 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
 336                 {
 337                         _failmsg "Unable to construct closing tag to match: $rdel",
 338                                  pos $$textref;
 339                         goto failed;
 340                 }
 341         }
 342         else
 343         {
 344                 $rdelspec = eval "qq{$rdel}" || do {
 345                         my $del;
 346                         for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
 347                                 { next if $rdel =~ /\Q$_/; $del = $_; last }
 348                         unless ($del) {
 349                                 use Carp;
 350                                 croak "Can't interpolate right delimiter $rdel"
 351                         }
 352                         eval "qq$del$rdel$del";
 353                 };
 354         }
 355
 356         while (pos($$textref) < length($$textref))
 357         {
 358                 next if $$textref =~ m/\G\\./gc;
 359
 360                 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
 361                 {
 362                         $parapos = pos($$textref) - length($1)
 363                                 unless defined $parapos;
 364                 }
 365                 elsif ($$textref =~ m/\G($rdelspec)/gc )
 366                 {
 367                         $closetagpos = pos($$textref)-length($1);
 368                         goto matched;
 369                 }
 370                 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
 371                 {
 372                         next;
 373                 }
 374                 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
 375                 {
 376                         pos($$textref) -= length($1);   # CUT OFF WHATEVER CAUSED THE SHORTNESS
 377                         goto short if ($omode eq 'PARA' || $omode eq 'MAX');
 378                         _failmsg "Found invalid nested tag: $1", pos $$textref;
 379                         goto failed;
 380                 }
 381                 elsif ($$textref =~ m/\G($ldel)/gc)
 382                 {
 383                         my $tag = $1;
 384                         pos($$textref) -= length($tag); # REWIND TO NESTED TAG
 385                         unless (_match_tagged(@_))      # MATCH NESTED TAG
 386                         {
 387                                 goto short if $omode eq 'PARA' || $omode eq 'MAX';
 388                                 _failmsg "Found unbalanced nested tag: $tag",
 389                                          pos $$textref;
 390                                 goto failed;
 391                         }
 392                 }
 393                 else { $$textref =~ m/./gcs }
 394         }
 395
 396 short:
 397         $closetagpos = pos($$textref);
 398         goto matched if $omode eq 'MAX';
 399         goto failed unless $omode eq 'PARA';
 400
 401         if (defined $parapos) { pos($$textref) = $parapos }
 402         else                  { $parapos = pos($$textref) }
 403
 404         return (
 405                 $startpos,    $opentagpos-$startpos,            # PREFIX
 406                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 407                 $textpos,     $parapos-$textpos,                # TEXT
 408                 $parapos,     0,                                # NO CLOSING TAG
 409                 $parapos,     length($$textref)-$parapos,       # REMAINDER
 410                );
 411
 412 matched:
 413         $endpos = pos($$textref);
 414         return (
 415                 $startpos,    $opentagpos-$startpos,            # PREFIX
 416                 $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
 417                 $textpos,     $closetagpos-$textpos,            # TEXT
 418                 $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
 419                 $endpos,      length($$textref)-$endpos,        # REMAINDER
 420                );
 421
 422 failed:
 423         _failmsg "Did not find closing tag", pos $$textref unless $@;
 424         pos($$textref) = $startpos;
 425         return;
 426 }
 427
 428 sub extract_variable (;$$)
 429 {
 430         my $textref = defined $_[0] ? \$_[0] : \$_;
 431         return ("","","") unless defined $$textref;
 432         my $pre  = defined $_[1] ? $_[1] : '\s*';
 433
 434         my @match = _match_variable($textref,$pre);
 435
 436         return _fail wantarray, $textref unless @match;
 437
 438         return _succeed wantarray, $textref,
 439                         @match[2..3,4..5,0..1];         # MATCH, REMAINDER, PREFIX
 440 }
 441
 442 sub _match_variable($$)
 443 {
 444 #  $#
 445 #  $^
 446 #  $$
 447         my ($textref, $pre) = @_;
 448         my $startpos = pos($$textref) = pos($$textref)||0;
 449         unless ($$textref =~ m/\G($pre)/gc)
 450         {
 451                 _failmsg "Did not find prefix: /$pre/", pos $$textref;
 452                 return;
 453         }
 454         my $varpos = pos($$textref);
 455         unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
 456         {
 457             unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
 458             {
 459                 _failmsg "Did not find leading dereferencer", pos $$textref;
 460                 pos $$textref = $startpos;
 461                 return;
 462             }
 463             my $deref = $1;
 464
 465             unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
 466                 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
 467                 or $deref eq '$#' or $deref eq '$$' )
 468             {
 469                 _failmsg "Bad identifier after dereferencer", pos $$textref;
 470                 pos $$textref = $startpos;
 471                 return;
 472             }
 473         }
 474
 475         while (1)
 476         {
 477                 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
 478                 next if _match_codeblock($textref,
 479                                          qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
 480                                          qr/[({[]/, qr/[)}\]]/,
 481                                          qr/[({[]/, qr/[)}\]]/, 0);
 482                 next if _match_codeblock($textref,
 483                                          qr/\s*/, qr/[{[]/, qr/[}\]]/,
 484                                          qr/[{[]/, qr/[}\]]/, 0);
 485                 next if _match_variable($textref,'\s*->\s*');
 486                 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
 487                 last;
 488         }
 489
 490         my $endpos = pos($$textref);
 491         return ($startpos, $varpos-$startpos,
 492                 $varpos,   $endpos-$varpos,
 493                 $endpos,   length($$textref)-$endpos
 494                 );
 495 }
 496
 497 sub extract_codeblock (;$$$$$)
 498 {
 499         my $textref = defined $_[0] ? \$_[0] : \$_;
 500         my $wantarray = wantarray;
 501         my $ldel_inner = defined $_[1] ? $_[1] : '{';
 502         my $pre        = defined $_[2] ? $_[2] : '\s*';
 503         my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
 504         my $rd         = $_[4];
 505         my $rdel_inner = $ldel_inner;
 506         my $rdel_outer = $ldel_outer;
 507         my $posbug = pos;
 508         for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
 509         for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
 510         for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
 511         {
 512                 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
 513         }
 514         pos = $posbug;
 515
 516         my @match = _match_codeblock($textref, $pre,
 517                                      $ldel_outer, $rdel_outer,
 518                                      $ldel_inner, $rdel_inner,
 519                                      $rd);
 520         return _fail($wantarray, $textref) unless @match;
 521         return _succeed($wantarray, $textref,
 522                         @match[2..3,4..5,0..1]  # MATCH, REMAINDER, PREFIX
 523                        );
 524
 525 }
 526
 527 sub _match_codeblock($$$$$$$)
 528 {
 529         my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
 530         my $startpos = pos($$textref) = pos($$textref) || 0;
 531         unless ($$textref =~ m/\G($pre)/gc)
 532         {
 533                 _failmsg qq{Did not match prefix /$pre/ at"} .
 534                             substr($$textref,pos($$textref),20) .
 535                             q{..."},
 536                          pos $$textref;
 537                 return;
 538         }
 539         my $codepos = pos($$textref);
 540         unless ($$textref =~ m/\G($ldel_outer)/gc)      # OUTERMOST DELIMITER
 541         {
 542                 _failmsg qq{Did not find expected opening bracket at "} .
 543                              substr($$textref,pos($$textref),20) .
 544                              q{..."},
 545                          pos $$textref;
 546                 pos $$textref = $startpos;
 547                 return;
 548         }
 549         my $closing = $1;
 550            $closing =~ tr/([<{/)]>}/;
 551         my $matched;
 552         my $patvalid = 1;
 553         while (pos($$textref) < length($$textref))
 554         {
 555                 $matched = '';
 556                 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
 557                 {
 558                         $patvalid = 0;
 559                         next;
 560                 }
 561
 562                 if ($$textref =~ m/\G\s*#.*/gc)
 563                 {
 564                         next;
 565                 }
 566
 567                 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
 568                 {
 569                         unless ($matched = ($closing && $1 eq $closing) )
 570                         {
 571                                 next if $1 eq '>';      # MIGHT BE A "LESS THAN"
 572                                 _failmsg q{Mismatched closing bracket at "} .
 573                                              substr($$textref,pos($$textref),20) .
 574                                              qq{...". Expected '$closing'},
 575                                          pos $$textref;
 576                         }
 577                         last;
 578                 }
 579
 580                 if (_match_variable($textref,'\s*') ||
 581                     _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
 582                 {
 583                         $patvalid = 0;
 584                         next;
 585                 }
 586
 587
 588                 # NEED TO COVER MANY MORE CASES HERE!!!
 589                 if ($$textref =~ m#\G\s*(?!$ldel_inner)
 590                                         ( [-+*x/%^&|.]=?
 591                                         | [!=]~
 592                                         | =(?!>)
 593                                         | (\*\*|&&|\|\||<<|>>)=?
 594                                         | split|grep|map|return
 595                                         | [([]
 596                                         )#gcx)
 597                 {
 598                         $patvalid = 1;
 599                         next;
 600                 }
 601
 602                 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
 603                 {
 604                         $patvalid = 1;
 605                         next;
 606                 }
 607
 608                 if ($$textref =~ m/\G\s*$ldel_outer/gc)
 609                 {
 610                         _failmsg q{Improperly nested codeblock at "} .
 611                                      substr($$textref,pos($$textref),20) .
 612                                      q{..."},
 613                                  pos $$textref;
 614                         last;
 615                 }
 616
 617                 $patvalid = 0;
 618                 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
 619         }
 620         continue { $@ = undef }
 621
 622         unless ($matched)
 623         {
 624                 _failmsg 'No match found for opening bracket', pos $$textref
 625                         unless $@;
 626                 return;
 627         }
 628
 629         my $endpos = pos($$textref);
 630         return ( $startpos, $codepos-$startpos,
 631                  $codepos, $endpos-$codepos,
 632                  $endpos,  length($$textref)-$endpos,
 633                );
 634 }
 635
 636
 637 my %mods   = (
 638                 'none'  => '[cgimsox]*',
 639                 'm'     => '[cgimsox]*',
 640                 's'     => '[cegimsox]*',
 641                 'tr'    => '[cds]*',
 642                 'y'     => '[cds]*',
 643                 'qq'    => '',
 644                 'qx'    => '',
 645                 'qw'    => '',
 646                 'qr'    => '[imsx]*',
 647                 'q'     => '',
 648              );
 649
 650 sub extract_quotelike (;$$)
 651 {
 652         my $textref = $_[0] ? \$_[0] : \$_;
 653         my $wantarray = wantarray;
 654         my $pre  = defined $_[1] ? $_[1] : '\s*';
 655
 656         my @match = _match_quotelike($textref,$pre,1,0);
 657         return _fail($wantarray, $textref) unless @match;
 658         return _succeed($wantarray, $textref,
 659                         $match[2], $match[18]-$match[2],        # MATCH
 660                         @match[18,19],                          # REMAINDER
 661                         @match[0,1],                            # PREFIX
 662                         @match[2..17],                          # THE BITS
 663                         @match[20,21],                          # ANY FILLET?
 664                        );
 665 };
 666
 667 sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
 668 {
 669         my ($textref, $pre, $rawmatch, $qmark) = @_;
 670
 671         my ($textlen,$startpos,
 672             $oppos,
 673             $preld1pos,$ld1pos,$str1pos,$rd1pos,
 674             $preld2pos,$ld2pos,$str2pos,$rd2pos,
 675             $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
 676
 677         unless ($$textref =~ m/\G($pre)/gc)
 678         {
 679                 _failmsg qq{Did not find prefix /$pre/ at "} .
 680                              substr($$textref, pos($$textref), 20) .
 681                              q{..."},
 682                          pos $$textref;
 683                 return;
 684         }
 685         $oppos = pos($$textref);
 686
 687         my $initial = substr($$textref,$oppos,1);
 688
 689         if ($initial && $initial =~ m|^[\"\'\`]|
 690                      || $rawmatch && $initial =~ m|^/|
 691                      || $qmark && $initial =~ m|^\?|)
 692         {
 693                 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
 694                 {
 695                         _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
 696                                      substr($$textref, $oppos, 20) .
 697                                      q{..."},
 698                                  pos $$textref;
 699                         pos $$textref = $startpos;
 700                         return;
 701                 }
 702                 $modpos= pos($$textref);
 703                 $rd1pos = $modpos-1;
 704
 705                 if ($initial eq '/' || $initial eq '?')
 706                 {
 707                         $$textref =~ m/\G$mods{none}/gc
 708                 }
 709
 710                 my $endpos = pos($$textref);
 711                 return (
 712                         $startpos,      $oppos-$startpos,       # PREFIX
 713                         $oppos,         0,                      # NO OPERATOR
 714                         $oppos,         1,                      # LEFT DEL
 715                         $oppos+1,       $rd1pos-$oppos-1,       # STR/PAT
 716                         $rd1pos,        1,                      # RIGHT DEL
 717                         $modpos,        0,                      # NO 2ND LDEL
 718                         $modpos,        0,                      # NO 2ND STR
 719                         $modpos,        0,                      # NO 2ND RDEL
 720                         $modpos,        $endpos-$modpos,        # MODIFIERS
 721                         $endpos,        $textlen-$endpos,       # REMAINDER
 722                        );
 723         }
 724
 725         unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
 726         {
 727                 _failmsg q{No quotelike operator found after prefix at "} .
 728                              substr($$textref, pos($$textref), 20) .
 729                              q{..."},
 730                          pos $$textref;
 731                 pos $$textref = $startpos;
 732                 return;
 733         }
 734
 735         my $op = $1;
 736         $preld1pos = pos($$textref);
 737         if ($op eq '<<') {
 738                 $ld1pos = pos($$textref);
 739                 my $label;
 740                 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
 741                         $label = $1;
 742                 }
 743                 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
 744                                      | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
 745                                      | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
 746                                      }gcsx) {
 747                         $label = $+;
 748                 }
 749                 else {
 750                         $label = "";
 751                 }
 752                 my $extrapos = pos($$textref);
 753                 $$textref =~ m{.*\n}gc;
 754                 $str1pos = pos($$textref)--;
 755                 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
 756                         _failmsg qq{Missing here doc terminator ('$label') after "} .
 757                                      substr($$textref, $startpos, 20) .
 758                                      q{..."},
 759                                  pos $$textref;
 760                         pos $$textref = $startpos;
 761                         return;
 762                 }
 763                 $rd1pos = pos($$textref);
 764         $$textref =~ m{\Q$label\E\n}gc;
 765                 $ld2pos = pos($$textref);
 766                 return (
 767                         $startpos,      $oppos-$startpos,       # PREFIX
 768                         $oppos,         length($op),            # OPERATOR
 769                         $ld1pos,        $extrapos-$ld1pos,      # LEFT DEL
 770                         $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 771                         $rd1pos,        $ld2pos-$rd1pos,        # RIGHT DEL
 772                         $ld2pos,        0,                      # NO 2ND LDEL
 773                         $ld2pos,        0,                      # NO 2ND STR
 774                         $ld2pos,        0,                      # NO 2ND RDEL
 775                         $ld2pos,        0,                      # NO MODIFIERS
 776                         $ld2pos,        $textlen-$ld2pos,       # REMAINDER
 777                         $extrapos,      $str1pos-$extrapos,     # FILLETED BIT
 778                        );
 779         }
 780
 781         $$textref =~ m/\G\s*/gc;
 782         $ld1pos = pos($$textref);
 783         $str1pos = $ld1pos+1;
 784
 785         unless ($$textref =~ m/\G(\S)/gc)       # SHOULD USE LOOKAHEAD
 786         {
 787                 _failmsg "No block delimiter found after quotelike $op",
 788                          pos $$textref;
 789                 pos $$textref = $startpos;
 790                 return;
 791         }
 792         pos($$textref) = $ld1pos;       # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
 793         my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
 794         if ($ldel1 =~ /[[(<{]/)
 795         {
 796                 $rdel1 =~ tr/[({</])}>/;
 797                 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
 798                 || do { pos $$textref = $startpos; return };
 799         $ld2pos = pos($$textref);
 800         $rd1pos = $ld2pos-1;
 801         }
 802         else
 803         {
 804                 $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
 805                 || do { pos $$textref = $startpos; return };
 806         $ld2pos = $rd1pos = pos($$textref)-1;
 807         }
 808
 809         my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
 810         if ($second_arg)
 811         {
 812                 my ($ldel2, $rdel2);
 813                 if ($ldel1 =~ /[[(<{]/)
 814                 {
 815                         unless ($$textref =~ /\G\s*(\S)/gc)     # SHOULD USE LOOKAHEAD
 816                         {
 817                                 _failmsg "Missing second block for quotelike $op",
 818                                          pos $$textref;
 819                                 pos $$textref = $startpos;
 820                                 return;
 821                         }
 822                         $ldel2 = $rdel2 = "\Q$1";
 823                         $rdel2 =~ tr/[({</])}>/;
 824                 }
 825                 else
 826                 {
 827                         $ldel2 = $rdel2 = $ldel1;
 828                 }
 829                 $str2pos = $ld2pos+1;
 830
 831                 if ($ldel2 =~ /[[(<{]/)
 832                 {
 833                         pos($$textref)--;       # OVERCOME BROKEN LOOKAHEAD
 834                         defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
 835                         || do { pos $$textref = $startpos; return };
 836                 }
 837                 else
 838                 {
 839                         $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
 840                         || do { pos $$textref = $startpos; return };
 841                 }
 842                 $rd2pos = pos($$textref)-1;
 843         }
 844         else
 845         {
 846                 $ld2pos = $str2pos = $rd2pos = $rd1pos;
 847         }
 848
 849         $modpos = pos $$textref;
 850
 851         $$textref =~ m/\G($mods{$op})/gc;
 852         my $endpos = pos $$textref;
 853
 854         return (
 855                 $startpos,      $oppos-$startpos,       # PREFIX
 856                 $oppos,         length($op),            # OPERATOR
 857                 $ld1pos,        1,                      # LEFT DEL
 858                 $str1pos,       $rd1pos-$str1pos,       # STR/PAT
 859                 $rd1pos,        1,                      # RIGHT DEL
 860                 $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
 861                 $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
 862                 $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
 863                 $modpos,        $endpos-$modpos,        # MODIFIERS
 864                 $endpos,        $textlen-$endpos,       # REMAINDER
 865                );
 866 }
 867
 868 my $def_func =
 869 [
 870         sub { extract_variable($_[0], '') },
 871         sub { extract_quotelike($_[0],'') },
 872         sub { extract_codeblock($_[0],'{}','') },
 873 ];
 874
 875 sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
 876 {
 877         my $textref = defined($_[0]) ? \$_[0] : \$_;
 878         my $posbug = pos;
 879         my ($lastpos, $firstpos);
 880         my @fields = ();
 881
 882         #for ($$textref)
 883         {
 884                 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
 885                 my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
 886                 my $igunk = $_[3];
 887
 888                 pos $$textref ||= 0;
 889
 890                 unless (wantarray)
 891                 {
 892                         use Carp;
 893                         carp "extract_multiple reset maximal count to 1 in scalar context"
 894                                 if $^W && defined($_[2]) && $max > 1;
 895                         $max = 1
 896                 }
 897
 898                 my $unkpos;
 899                 my $func;
 900                 my $class;
 901
 902                 my @class;
 903                 foreach $func ( @func )
 904                 {
 905                         if (ref($func) eq 'HASH')
 906                         {
 907                                 push @class, (keys %$func)[0];
 908                                 $func = (values %$func)[0];
 909                         }
 910                         else
 911                         {
 912                                 push @class, undef;
 913                         }
 914                 }
 915
 916                 FIELD: while (pos($$textref) < length($$textref))
 917                 {
 918                         my ($field, $rem);
 919                         my @bits;
 920                         foreach my $i ( 0..$#func )
 921                         {
 922                                 my $pref;
 923                                 $func = $func[$i];
 924                                 $class = $class[$i];
 925                                 $lastpos = pos $$textref;
 926                                 if (ref($func) eq 'CODE')
 927                                         { ($field,$rem,$pref) = @bits = $func->($$textref) }
 928                                 elsif (ref($func) eq 'Text::Balanced::Extractor')
 929                                         { @bits = $field = $func->extract($$textref) }
 930                                 elsif( $$textref =~ m/\G$func/gc )
 931                                         { @bits = $field = defined($1)
 932                                 ? $1
 933                                 : substr($$textref, $-[0], $+[0] - $-[0])
 934                     }
 935                                 $pref ||= "";
 936                                 if (defined($field) && length($field))
 937                                 {
 938                                         if (!$igunk) {
 939                                                 $unkpos = $lastpos
 940                                                         if length($pref) && !defined($unkpos);
 941                                                 if (defined $unkpos)
 942                                                 {
 943                                                         push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
 944                                                         $firstpos = $unkpos unless defined $firstpos;
 945                                                         undef $unkpos;
 946                                                         last FIELD if @fields == $max;
 947                                                 }
 948                                         }
 949                                         push @fields, $class
 950                                                 ? bless (\$field, $class)
 951                                                 : $field;
 952                                         $firstpos = $lastpos unless defined $firstpos;
 953                                         $lastpos = pos $$textref;
 954                                         last FIELD if @fields == $max;
 955                                         next FIELD;
 956                                 }
 957                         }
 958                         if ($$textref =~ /\G(.)/gcs)
 959                         {
 960                                 $unkpos = pos($$textref)-1
 961                                         unless $igunk || defined $unkpos;
 962                         }
 963                 }
 964
 965                 if (defined $unkpos)
 966                 {
 967                         push @fields, substr($$textref, $unkpos);
 968                         $firstpos = $unkpos unless defined $firstpos;
 969                         $lastpos = length $$textref;
 970                 }
 971                 last;
 972         }
 973
 974         pos $$textref = $lastpos;
 975         return @fields if wantarray;
 976
 977         $firstpos ||= 0;
 978         eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
 979                pos $$textref = $firstpos };
 980         return $fields[0];
 981 }
 982
 983
 984 sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
 985 {
 986         my $ldel    = $_[0];
 987         my $rdel    = $_[1];
 988         my $pre     = defined $_[2] ? $_[2] : '\s*';
 989         my %options = defined $_[3] ? %{$_[3]} : ();
 990         my $omode   = defined $options{fail} ? $options{fail} : '';
 991         my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 992                     : defined($options{reject})        ? $options{reject}
 993                     :                                    ''
 994                     ;
 995         my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 996                     : defined($options{ignore})        ? $options{ignore}
 997                     :                                    ''
 998                     ;
 999
1000         if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
1001
1002         my $posbug = pos;
1003         for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1004         pos = $posbug;
1005
1006         my $closure = sub
1007         {
1008                 my $textref = defined $_[0] ? \$_[0] : \$_;
1009                 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1010
1011                 return _fail(wantarray, $textref) unless @match;
1012                 return _succeed wantarray, $textref,
1013                                 $match[2], $match[3]+$match[5]+$match[7],       # MATCH
1014                                 @match[8..9,0..1,2..7];                         # REM, PRE, BITS
1015         };
1016
1017         bless $closure, 'Text::Balanced::Extractor';
1018 }
1019
1020 package Text::Balanced::Extractor;
1021
1022 sub extract($$) # ($self, $text)
1023 {
1024         &{$_[0]}($_[1]);
1025 }
1026
1027 package Text::Balanced::ErrorMsg;
1028
1029 use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1030
1031 1;
1032
1033 __END__
1034
1035 =head1 NAME
1036
1037 Text::Balanced - Extract delimited text sequences from strings.
1038
1039
1040 =head1 SYNOPSIS
1041
1042  use Text::Balanced qw (
1043                         extract_delimited
1044                         extract_bracketed
1045                         extract_quotelike
1046                         extract_codeblock
1047                         extract_variable
1048                         extract_tagged
1049                         extract_multiple
1050
1051                         gen_delimited_pat
1052                         gen_extract_tagged
1053                        );
1054
1055  # Extract the initial substring of $text that is delimited by
1056  # two (unescaped) instances of the first character in $delim.
1057
1058         ($extracted, $remainder) = extract_delimited($text,$delim);
1059
1060
1061  # Extract the initial substring of $text that is bracketed
1062  # with a delimiter(s) specified by $delim (where the string
1063  # in $delim contains one or more of '(){}[]<>').
1064
1065         ($extracted, $remainder) = extract_bracketed($text,$delim);
1066
1067
1068  # Extract the initial substring of $text that is bounded by
1069  # an XML tag.
1070
1071         ($extracted, $remainder) = extract_tagged($text);
1072
1073
1074  # Extract the initial substring of $text that is bounded by
1075  # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1076
1077         ($extracted, $remainder) =
1078                 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1079
1080
1081  # Extract the initial substring of $text that represents a
1082  # Perl "quote or quote-like operation"
1083
1084         ($extracted, $remainder) = extract_quotelike($text);
1085
1086
1087  # Extract the initial substring of $text that represents a block
1088  # of Perl code, bracketed by any of character(s) specified by $delim
1089  # (where the string $delim contains one or more of '(){}[]<>').
1090
1091         ($extracted, $remainder) = extract_codeblock($text,$delim);
1092
1093
1094  # Extract the initial substrings of $text that would be extracted by
1095  # one or more sequential applications of the specified functions
1096  # or regular expressions
1097
1098         @extracted = extract_multiple($text,
1099                                       [ \&extract_bracketed,
1100                                         \&extract_quotelike,
1101                                         \&some_other_extractor_sub,
1102                                         qr/[xyz]*/,
1103                                         'literal',
1104                                       ]);
1105
1106 # Create a string representing an optimized pattern (a la Friedl)
1107 # that matches a substring delimited by any of the specified characters
1108 # (in this case: any type of quote or a slash)
1109
1110         $patstring = gen_delimited_pat(q{'"`/});
1111
1112
1113 # Generate a reference to an anonymous sub that is just like extract_tagged
1114 # but pre-compiled and optimized for a specific pair of tags, and consequently
1115 # much faster (i.e. 3 times faster). It uses qr// for better performance on
1116 # repeated calls, so it only works under Perl 5.005 or later.
1117
1118         $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1119
1120         ($extracted, $remainder) = $extract_head->($text);
1121
1122
1123 =head1 DESCRIPTION
1124
1125 The various C<extract_...> subroutines may be used to
1126 extract a delimited substring, possibly after skipping a
1127 specified prefix string. By default, that prefix is
1128 optional whitespace (C</\s*/>), but you can change it to whatever
1129 you wish (see below).
1130
1131 The substring to be extracted must appear at the
1132 current C<pos> location of the string's variable
1133 (or at index zero, if no C<pos> position is defined).
1134 In other words, the C<extract_...> subroutines I<don't>
1135 extract the first occurrence of a substring anywhere
1136 in a string (like an unanchored regex would). Rather,
1137 they extract an occurrence of the substring appearing
1138 immediately at the current matching position in the
1139 string (like a C<\G>-anchored regex would).
1140
1141
1142
1143 =head2 General behaviour in list contexts
1144
1145 In a list context, all the subroutines return a list, the first three
1146 elements of which are always:
1147
1148 =over 4
1149
1150 =item [0]
1151
1152 The extracted string, including the specified delimiters.
1153 If the extraction fails C<undef> is returned.
1154
1155 =item [1]
1156
1157 The remainder of the input string (i.e. the characters after the
1158 extracted string). On failure, the entire string is returned.
1159
1160 =item [2]
1161
1162 The skipped prefix (i.e. the characters before the extracted string).
1163 On failure, C<undef> is returned.
1164
1165 =back
1166
1167 Note that in a list context, the contents of the original input text (the first
1168 argument) are not modified in any way.
1169
1170 However, if the input text was passed in a variable, that variable's
1171 C<pos> value is updated to point at the first character after the
1172 extracted text. That means that in a list context the various
1173 subroutines can be used much like regular expressions. For example:
1174
1175         while ( $next = (extract_quotelike($text))[0] )
1176         {
1177                 # process next quote-like (in $next)
1178         }
1179
1180
1181 =head2 General behaviour in scalar and void contexts
1182
1183 In a scalar context, the extracted string is returned, having first been
1184 removed from the input text. Thus, the following code also processes
1185 each quote-like operation, but actually removes them from $text:
1186
1187         while ( $next = extract_quotelike($text) )
1188         {
1189                 # process next quote-like (in $next)
1190         }
1191
1192 Note that if the input text is a read-only string (i.e. a literal),
1193 no attempt is made to remove the extracted text.
1194
1195 In a void context the behaviour of the extraction subroutines is
1196 exactly the same as in a scalar context, except (of course) that the
1197 extracted substring is not returned.
1198
1199 =head2 A note about prefixes
1200
1201 Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1202 This can bite you if you're expecting a prefix specification like
1203 '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1204 pattern will only succeed if the <H1> tag is on the current line, since
1205 . normally doesn't match newlines.
1206
1207 To overcome this limitation, you need to turn on /s matching within
1208 the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1209
1210
1211 =head2 C<extract_delimited>
1212
1213 The C<extract_delimited> function formalizes the common idiom
1214 of extracting a single-character-delimited substring from the start of
1215 a string. For example, to extract a single-quote delimited string, the
1216 following code is typically used:
1217
1218         ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1219         $extracted = $1;
1220
1221 but with C<extract_delimited> it can be simplified to:
1222
1223         ($extracted,$remainder) = extract_delimited($text, "'");
1224
1225 C<extract_delimited> takes up to four scalars (the input text, the
1226 delimiters, a prefix pattern to be skipped, and any escape characters)
1227 and extracts the initial substring of the text that
1228 is appropriately delimited. If the delimiter string has multiple
1229 characters, the first one encountered in the text is taken to delimit
1230 the substring.
1231 The third argument specifies a prefix pattern that is to be skipped
1232 (but must be present!) before the substring is extracted.
1233 The final argument specifies the escape character to be used for each
1234 delimiter.
1235
1236 All arguments are optional. If the escape characters are not specified,
1237 every delimiter is escaped with a backslash (C<\>).
1238 If the prefix is not specified, the
1239 pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1240 is also not specified, the set C</["'`]/> is used. If the text to be processed
1241 is not specified either, C<$_> is used.
1242
1243 In list context, C<extract_delimited> returns a array of three
1244 elements, the extracted substring (I<including the surrounding
1245 delimiters>), the remainder of the text, and the skipped prefix (if
1246 any). If a suitable delimited substring is not found, the first
1247 element of the array is the empty string, the second is the complete
1248 original text, and the prefix returned in the third element is an
1249 empty string.
1250
1251 In a scalar context, just the extracted substring is returned. In
1252 a void context, the extracted substring (and any prefix) are simply
1253 removed from the beginning of the first argument.
1254
1255 Examples:
1256
1257         # Remove a single-quoted substring from the very beginning of $text:
1258
1259                 $substring = extract_delimited($text, "'", '');
1260
1261         # Remove a single-quoted Pascalish substring (i.e. one in which
1262         # doubling the quote character escapes it) from the very
1263         # beginning of $text:
1264
1265                 $substring = extract_delimited($text, "'", '', "'");
1266
1267         # Extract a single- or double- quoted substring from the
1268         # beginning of $text, optionally after some whitespace
1269         # (note the list context to protect $text from modification):
1270
1271                 ($substring) = extract_delimited $text, q{"'};
1272
1273
1274         # Delete the substring delimited by the first '/' in $text:
1275
1276                 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1277
1278 Note that this last example is I<not> the same as deleting the first
1279 quote-like pattern. For instance, if C<$text> contained the string:
1280
1281         "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1282
1283 then after the deletion it would contain:
1284
1285         "if ('.$UNIXCMD/s) { $cmd = $1; }"
1286
1287 not:
1288
1289         "if ('./cmd' =~ ms) { $cmd = $1; }"
1290
1291
1292 See L<"extract_quotelike"> for a (partial) solution to this problem.
1293
1294
1295 =head2 C<extract_bracketed>
1296
1297 Like C<"extract_delimited">, the C<extract_bracketed> function takes
1298 up to three optional scalar arguments: a string to extract from, a delimiter
1299 specifier, and a prefix pattern. As before, a missing prefix defaults to
1300 optional whitespace and a missing text defaults to C<$_>. However, a missing
1301 delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1302
1303 C<extract_bracketed> extracts a balanced-bracket-delimited
1304 substring (using any one (or more) of the user-specified delimiter
1305 brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1306 respect quoted unbalanced brackets (see below).
1307
1308 A "delimiter bracket" is a bracket in list of delimiters passed as
1309 C<extract_bracketed>'s second argument. Delimiter brackets are
1310 specified by giving either the left or right (or both!) versions
1311 of the required bracket(s). Note that the order in which
1312 two or more delimiter brackets are specified is not significant.
1313
1314 A "balanced-bracket-delimited substring" is a substring bounded by
1315 matched brackets, such that any other (left or right) delimiter
1316 bracket I<within> the substring is also matched by an opposite
1317 (right or left) delimiter bracket I<at the same level of nesting>. Any
1318 type of bracket not in the delimiter list is treated as an ordinary
1319 character.
1320
1321 In other words, each type of bracket specified as a delimiter must be
1322 balanced and correctly nested within the substring, and any other kind of
1323 ("non-delimiter") bracket in the substring is ignored.
1324
1325 For example, given the string:
1326
1327         $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1328
1329 then a call to C<extract_bracketed> in a list context:
1330
1331         @result = extract_bracketed( $text, '{}' );
1332
1333 would return:
1334
1335         ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1336
1337 since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1338 (In a scalar context just the first element of the array would be returned. In
1339 a void context, C<$text> would be replaced by an empty string.)
1340
1341 Likewise the call in:
1342
1343         @result = extract_bracketed( $text, '{[' );
1344
1345 would return the same result, since all sets of both types of specified
1346 delimiter brackets are correctly nested and balanced.
1347
1348 However, the call in:
1349
1350         @result = extract_bracketed( $text, '{([<' );
1351
1352 would fail, returning:
1353
1354         ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1355
1356 because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1357 the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1358 return an empty string. In a void context, C<$text> would be unchanged.)
1359
1360 Note that the embedded single-quotes in the string don't help in this
1361 case, since they have not been specified as acceptable delimiters and are
1362 therefore treated as non-delimiter characters (and ignored).
1363
1364 However, if a particular species of quote character is included in the
1365 delimiter specification, then that type of quote will be correctly handled.
1366 for example, if C<$text> is:
1367
1368         $text = '<A HREF=">>>>">link</A>';
1369
1370 then
1371
1372         @result = extract_bracketed( $text, '<">' );
1373
1374 returns:
1375
1376         ( '<A HREF=">>>>">', 'link</A>', "" )
1377
1378 as expected. Without the specification of C<"> as an embedded quoter:
1379
1380         @result = extract_bracketed( $text, '<>' );
1381
1382 the result would be:
1383
1384         ( '<A HREF=">', '>>>">link</A>', "" )
1385
1386 In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1387 quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1388 letter 'q' as a delimiter. Hence:
1389
1390         @result = extract_bracketed( $text, '<q>' );
1391
1392 would correctly match something like this:
1393
1394         $text = '<leftop: conj /and/ conj>';
1395
1396 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1397
1398
1399 =head2 C<extract_variable>
1400
1401 C<extract_variable> extracts any valid Perl variable or
1402 variable-involved expression, including scalars, arrays, hashes, array
1403 accesses, hash look-ups, method calls through objects, subroutine calls
1404 through subroutine references, etc.
1405
1406 The subroutine takes up to two optional arguments:
1407
1408 =over 4
1409
1410 =item 1.
1411
1412 A string to be processed (C<$_> if the string is omitted or C<undef>)
1413
1414 =item 2.
1415
1416 A string specifying a pattern to be matched as a prefix (which is to be
1417 skipped). If omitted, optional whitespace is skipped.
1418
1419 =back
1420
1421 On success in a list context, an array of 3 elements is returned. The
1422 elements are:
1423
1424 =over 4
1425
1426 =item [0]
1427
1428 the extracted variable, or variablish expression
1429
1430 =item [1]
1431
1432 the remainder of the input text,
1433
1434 =item [2]
1435
1436 the prefix substring (if any),
1437
1438 =back
1439
1440 On failure, all of these values (except the remaining text) are C<undef>.
1441
1442 In a scalar context, C<extract_variable> returns just the complete
1443 substring that matched a variablish expression. C<undef> is returned on
1444 failure. In addition, the original input text has the returned substring
1445 (and any prefix) removed from it.
1446
1447 In a void context, the input text just has the matched substring (and
1448 any specified prefix) removed.
1449
1450
1451 =head2 C<extract_tagged>
1452
1453 C<extract_tagged> extracts and segments text between (balanced)
1454 specified tags.
1455
1456 The subroutine takes up to five optional arguments:
1457
1458 =over 4
1459
1460 =item 1.
1461
1462 A string to be processed (C<$_> if the string is omitted or C<undef>)
1463
1464 =item 2.
1465
1466 A string specifying a pattern to be matched as the opening tag.
1467 If the pattern string is omitted (or C<undef>) then a pattern
1468 that matches any standard XML tag is used.
1469
1470 =item 3.
1471
1472 A string specifying a pattern to be matched at the closing tag.
1473 If the pattern string is omitted (or C<undef>) then the closing
1474 tag is constructed by inserting a C</> after any leading bracket
1475 characters in the actual opening tag that was matched (I<not> the pattern
1476 that matched the tag). For example, if the opening tag pattern
1477 is specified as C<'{{\w+}}'> and actually matched the opening tag
1478 C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1479
1480 =item 4.
1481
1482 A string specifying a pattern to be matched as a prefix (which is to be
1483 skipped). If omitted, optional whitespace is skipped.
1484
1485 =item 5.
1486
1487 A hash reference containing various parsing options (see below)
1488
1489 =back
1490
1491 The various options that can be specified are:
1492
1493 =over 4
1494
1495 =item C<reject =E<gt> $listref>
1496
1497 The list reference contains one or more strings specifying patterns
1498 that must I<not> appear within the tagged text.
1499
1500 For example, to extract
1501 an HTML link (which should not contain nested links) use:
1502
1503         extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1504
1505 =item C<ignore =E<gt> $listref>
1506
1507 The list reference contains one or more strings specifying patterns
1508 that are I<not> be be treated as nested tags within the tagged text
1509 (even if they would match the start tag pattern).
1510
1511 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1512
1513         extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1514
1515 (also see L<"gen_delimited_pat"> below).
1516
1517
1518 =item C<fail =E<gt> $str>
1519
1520 The C<fail> option indicates the action to be taken if a matching end
1521 tag is not encountered (i.e. before the end of the string or some
1522 C<reject> pattern matches). By default, a failure to match a closing
1523 tag causes C<extract_tagged> to immediately fail.
1524
1525 However, if the string value associated with <reject> is "MAX", then
1526 C<extract_tagged> returns the complete text up to the point of failure.
1527 If the string is "PARA", C<extract_tagged> returns only the first paragraph
1528 after the tag (up to the first line that is either empty or contains
1529 only whitespace characters).
1530 If the string is "", the the default behaviour (i.e. failure) is reinstated.
1531
1532 For example, suppose the start tag "/para" introduces a paragraph, which then
1533 continues until the next "/endpara" tag or until another "/para" tag is
1534 encountered:
1535
1536         $text = "/para line 1\n\nline 3\n/para line 4";
1537
1538         extract_tagged($text, '/para', '/endpara', undef,
1539                                 {reject => '/para', fail => MAX );
1540
1541         # EXTRACTED: "/para line 1\n\nline 3\n"
1542
1543 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1544 tag refers only to the immediately following paragraph:
1545
1546         $text = "/para line 1\n\nline 3\n/para line 4";
1547
1548         extract_tagged($text, '/para', '/endpara', undef,
1549                         {reject => '/para', fail => MAX );
1550
1551         # EXTRACTED: "/para line 1\n"
1552
1553 Note that the specified C<fail> behaviour applies to nested tags as well.
1554
1555 =back
1556
1557 On success in a list context, an array of 6 elements is returned. The elements are:
1558
1559 =over 4
1560
1561 =item [0]
1562
1563 the extracted tagged substring (including the outermost tags),
1564
1565 =item [1]
1566
1567 the remainder of the input text,
1568
1569 =item [2]
1570
1571 the prefix substring (if any),
1572
1573 =item [3]
1574
1575 the opening tag
1576
1577 =item [4]
1578
1579 the text between the opening and closing tags
1580
1581 =item [5]
1582
1583 the closing tag (or "" if no closing tag was found)
1584
1585 =back
1586
1587 On failure, all of these values (except the remaining text) are C<undef>.
1588
1589 In a scalar context, C<extract_tagged> returns just the complete
1590 substring that matched a tagged text (including the start and end
1591 tags). C<undef> is returned on failure. In addition, the original input
1592 text has the returned substring (and any prefix) removed from it.
1593
1594 In a void context, the input text just has the matched substring (and
1595 any specified prefix) removed.
1596
1597
1598 =head2 C<gen_extract_tagged>
1599
1600 (Note: This subroutine is only available under Perl5.005)
1601
1602 C<gen_extract_tagged> generates a new anonymous subroutine which
1603 extracts text between (balanced) specified tags. In other words,
1604 it generates a function identical in function to C<extract_tagged>.
1605
1606 The difference between C<extract_tagged> and the anonymous
1607 subroutines generated by
1608 C<gen_extract_tagged>, is that those generated subroutines:
1609
1610 =over 4
1611
1612 =item *
1613
1614 do not have to reparse tag specification or parsing options every time
1615 they are called (whereas C<extract_tagged> has to effectively rebuild
1616 its tag parser on every call);
1617
1618 =item *
1619
1620 make use of the new qr// construct to pre-compile the regexes they use
1621 (whereas C<extract_tagged> uses standard string variable interpolation
1622 to create tag-matching patterns).
1623
1624 =back
1625
1626 The subroutine takes up to four optional arguments (the same set as
1627 C<extract_tagged> except for the string to be processed). It returns
1628 a reference to a subroutine which in turn takes a single argument (the text to
1629 be extracted from).
1630
1631 In other words, the implementation of C<extract_tagged> is exactly
1632 equivalent to:
1633
1634         sub extract_tagged
1635         {
1636                 my $text = shift;
1637                 $extractor = gen_extract_tagged(@_);
1638                 return $extractor->($text);
1639         }
1640
1641 (although C<extract_tagged> is not currently implemented that way, in order
1642 to preserve pre-5.005 compatibility).
1643
1644 Using C<gen_extract_tagged> to create extraction functions for specific tags
1645 is a good idea if those functions are going to be called more than once, since
1646 their performance is typically twice as good as the more general-purpose
1647 C<extract_tagged>.
1648
1649
1650 =head2 C<extract_quotelike>
1651
1652 C<extract_quotelike> attempts to recognize, extract, and segment any
1653 one of the various Perl quotes and quotelike operators (see
1654 L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1655 delimiters (for the quotelike operators), and trailing modifiers are
1656 all caught. For example, in:
1657
1658         extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1659
1660         extract_quotelike '  "You said, \"Use sed\"."  '
1661
1662         extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1663
1664         extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1665
1666 the full Perl quotelike operations are all extracted correctly.
1667
1668 Note too that, when using the /x modifier on a regex, any comment
1669 containing the current pattern delimiter will cause the regex to be
1670 immediately terminated. In other words:
1671
1672         'm /
1673                 (?i)            # CASE INSENSITIVE
1674                 [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1675                 [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1676            /x'
1677
1678 will be extracted as if it were:
1679
1680         'm /
1681                 (?i)            # CASE INSENSITIVE
1682                 [a-z_]          # LEADING ALPHABETIC/'
1683
1684 This behaviour is identical to that of the actual compiler.
1685
1686 C<extract_quotelike> takes two arguments: the text to be processed and
1687 a prefix to be matched at the very beginning of the text. If no prefix
1688 is specified, optional whitespace is the default. If no text is given,
1689 C<$_> is used.
1690
1691 In a list context, an array of 11 elements is returned. The elements are:
1692
1693 =over 4
1694
1695 =item [0]
1696
1697 the extracted quotelike substring (including trailing modifiers),
1698
1699 =item [1]
1700
1701 the remainder of the input text,
1702
1703 =item [2]
1704
1705 the prefix substring (if any),
1706
1707 =item [3]
1708
1709 the name of the quotelike operator (if any),
1710
1711 =item [4]
1712
1713 the left delimiter of the first block of the operation,
1714
1715 =item [5]
1716
1717 the text of the first block of the operation
1718 (that is, the contents of
1719 a quote, the regex of a match or substitution or the target list of a
1720 translation),
1721
1722 =item [6]
1723
1724 the right delimiter of the first block of the operation,
1725
1726 =item [7]
1727
1728 the left delimiter of the second block of the operation
1729 (that is, if it is a C<s>, C<tr>, or C<y>),
1730
1731 =item [8]
1732
1733 the text of the second block of the operation
1734 (that is, the replacement of a substitution or the translation list
1735 of a translation),
1736
1737 =item [9]
1738
1739 the right delimiter of the second block of the operation (if any),
1740
1741 =item [10]
1742
1743 the trailing modifiers on the operation (if any).
1744
1745 =back
1746
1747 For each of the fields marked "(if any)" the default value on success is
1748 an empty string.
1749 On failure, all of these values (except the remaining text) are C<undef>.
1750
1751
1752 In a scalar context, C<extract_quotelike> returns just the complete substring
1753 that matched a quotelike operation (or C<undef> on failure). In a scalar or
1754 void context, the input text has the same substring (and any specified
1755 prefix) removed.
1756
1757 Examples:
1758
1759         # Remove the first quotelike literal that appears in text
1760
1761                 $quotelike = extract_quotelike($text,'.*?');
1762
1763         # Replace one or more leading whitespace-separated quotelike
1764         # literals in $_ with "<QLL>"
1765
1766                 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1767
1768
1769         # Isolate the search pattern in a quotelike operation from $text
1770
1771                 ($op,$pat) = (extract_quotelike $text)[3,5];
1772                 if ($op =~ /[ms]/)
1773                 {
1774                         print "search pattern: $pat\n";
1775                 }
1776                 else
1777                 {
1778                         print "$op is not a pattern matching operation\n";
1779                 }
1780
1781
1782 =head2 C<extract_quotelike> and "here documents"
1783
1784 C<extract_quotelike> can successfully extract "here documents" from an input
1785 string, but with an important caveat in list contexts.
1786
1787 Unlike other types of quote-like literals, a here document is rarely
1788 a contiguous substring. For example, a typical piece of code using
1789 here document might look like this:
1790
1791         <<'EOMSG' || die;
1792         This is the message.
1793         EOMSG
1794         exit;
1795
1796 Given this as an input string in a scalar context, C<extract_quotelike>
1797 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1798 leaving the string " || die;\nexit;" in the original variable. In other words,
1799 the two separate pieces of the here document are successfully extracted and
1800 concatenated.
1801
1802 In a list context, C<extract_quotelike> would return the list
1803
1804 =over 4
1805
1806 =item [0]
1807
1808 "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1809 including fore and aft delimiters),
1810
1811 =item [1]
1812
1813 " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1814
1815 =item [2]
1816
1817 "" (i.e. the prefix substring -- trivial in this case),
1818
1819 =item [3]
1820
1821 "<<" (i.e. the "name" of the quotelike operator)
1822
1823 =item [4]
1824
1825 "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1826
1827 =item [5]
1828
1829 "This is the message.\n" (i.e. the text of the here document),
1830
1831 =item [6]
1832
1833 "EOMSG" (i.e. the right delimiter of the here document),
1834
1835 =item [7..10]
1836
1837 "" (a here document has no second left delimiter, second text, second right
1838 delimiter, or trailing modifiers).
1839
1840 =back
1841
1842 However, the matching position of the input variable would be set to
1843 "exit;" (i.e. I<after> the closing delimiter of the here document),
1844 which would cause the earlier " || die;\nexit;" to be skipped in any
1845 sequence of code fragment extractions.
1846
1847 To avoid this problem, when it encounters a here document whilst
1848 extracting from a modifiable string, C<extract_quotelike> silently
1849 rearranges the string to an equivalent piece of Perl:
1850
1851         <<'EOMSG'
1852         This is the message.
1853         EOMSG
1854         || die;
1855         exit;
1856
1857 in which the here document I<is> contiguous. It still leaves the
1858 matching position after the here document, but now the rest of the line
1859 on which the here document starts is not skipped.
1860
1861 To prevent <extract_quotelike> from mucking about with the input in this way
1862 (this is the only case where a list-context C<extract_quotelike> does so),
1863 you can pass the input variable as an interpolated literal:
1864
1865         $quotelike = extract_quotelike("$var");
1866
1867
1868 =head2 C<extract_codeblock>
1869
1870 C<extract_codeblock> attempts to recognize and extract a balanced
1871 bracket delimited substring that may contain unbalanced brackets
1872 inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1873 is like a combination of C<"extract_bracketed"> and
1874 C<"extract_quotelike">.
1875
1876 C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1877 a text to process, a set of delimiter brackets to look for, and a prefix to
1878 match first. It also takes an optional fourth parameter, which allows the
1879 outermost delimiter brackets to be specified separately (see below).
1880
1881 Omitting the first argument (input text) means process C<$_> instead.
1882 Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1883 Omitting the third argument (prefix argument) implies optional whitespace at the start.
1884 Omitting the fourth argument (outermost delimiter brackets) indicates that the
1885 value of the second argument is to be used for the outermost delimiters.
1886
1887 Once the prefix an dthe outermost opening delimiter bracket have been
1888 recognized, code blocks are extracted by stepping through the input text and
1889 trying the following alternatives in sequence:
1890
1891 =over 4
1892
1893 =item 1.
1894
1895 Try and match a closing delimiter bracket. If the bracket was the same
1896 species as the last opening bracket, return the substring to that
1897 point. If the bracket was mismatched, return an error.
1898
1899 =item 2.
1900
1901 Try to match a quote or quotelike operator. If found, call
1902 C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1903 the error it returned. Otherwise go back to step 1.
1904
1905 =item 3.
1906
1907 Try to match an opening delimiter bracket. If found, call
1908 C<extract_codeblock> recursively to eat the embedded block. If the
1909 recursive call fails, return an error. Otherwise, go back to step 1.
1910
1911 =item 4.
1912
1913 Unconditionally match a bareword or any other single character, and
1914 then go back to step 1.
1915
1916 =back
1917
1918
1919 Examples:
1920
1921         # Find a while loop in the text
1922
1923                 if ($text =~ s/.*?while\s*\{/{/)
1924                 {
1925                         $loop = "while " . extract_codeblock($text);
1926                 }
1927
1928         # Remove the first round-bracketed list (which may include
1929         # round- or curly-bracketed code blocks or quotelike operators)
1930
1931                 extract_codeblock $text, "(){}", '[^(]*';
1932
1933
1934 The ability to specify a different outermost delimiter bracket is useful
1935 in some circumstances. For example, in the Parse::RecDescent module,
1936 parser actions which are to be performed only on a successful parse
1937 are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1938
1939         sentence: subject verb object
1940                         <defer: {$::theVerb = $item{verb}} >
1941
1942 Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1943 within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1944
1945 A deferred action like this:
1946
1947                         <defer: {if ($count>10) {$count--}} >
1948
1949 will be incorrectly parsed as:
1950
1951                         <defer: {if ($count>
1952
1953 because the "less than" operator is interpreted as a closing delimiter.
1954
1955 But, by extracting the directive using
1956 S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1957 the '>' character is only treated as a delimited at the outermost
1958 level of the code block, so the directive is parsed correctly.
1959
1960 =head2 C<extract_multiple>
1961
1962 The C<extract_multiple> subroutine takes a string to be processed and a
1963 list of extractors (subroutines or regular expressions) to apply to that string.
1964
1965 In an array context C<extract_multiple> returns an array of substrings
1966 of the original string, as extracted by the specified extractors.
1967 In a scalar context, C<extract_multiple> returns the first
1968 substring successfully extracted from the original string. In both
1969 scalar and void contexts the original string has the first successfully
1970 extracted substring removed from it. In all contexts
1971 C<extract_multiple> starts at the current C<pos> of the string, and
1972 sets that C<pos> appropriately after it matches.
1973
1974 Hence, the aim of of a call to C<extract_multiple> in a list context
1975 is to split the processed string into as many non-overlapping fields as
1976 possible, by repeatedly applying each of the specified extractors
1977 to the remainder of the string. Thus C<extract_multiple> is
1978 a generalized form of Perl's C<split> subroutine.
1979
1980 The subroutine takes up to four optional arguments:
1981
1982 =over 4
1983
1984 =item 1.
1985
1986 A string to be processed (C<$_> if the string is omitted or C<undef>)
1987
1988 =item 2.
1989
1990 A reference to a list of subroutine references and/or qr// objects and/or
1991 literal strings and/or hash references, specifying the extractors
1992 to be used to split the string. If this argument is omitted (or
1993 C<undef>) the list:
1994
1995         [
1996                 sub { extract_variable($_[0], '') },
1997                 sub { extract_quotelike($_[0],'') },
1998                 sub { extract_codeblock($_[0],'{}','') },
1999         ]
2000
2001 is used.
2002
2003
2004 =item 3.
2005
2006 An number specifying the maximum number of fields to return. If this
2007 argument is omitted (or C<undef>), split continues as long as possible.
2008
2009 If the third argument is I<N>, then extraction continues until I<N> fields
2010 have been successfully extracted, or until the string has been completely
2011 processed.
2012
2013 Note that in scalar and void contexts the value of this argument is
2014 automatically reset to 1 (under C<-w>, a warning is issued if the argument
2015 has to be reset).
2016
2017 =item 4.
2018
2019 A value indicating whether unmatched substrings (see below) within the
2020 text should be skipped or returned as fields. If the value is true,
2021 such substrings are skipped. Otherwise, they are returned.
2022
2023 =back
2024
2025 The extraction process works by applying each extractor in
2026 sequence to the text string.
2027
2028 If the extractor is a subroutine it is called in a list context and is
2029 expected to return a list of a single element, namely the extracted
2030 text. It may optionally also return two further arguments: a string
2031 representing the text left after extraction (like $' for a pattern
2032 match), and a string representing any prefix skipped before the
2033 extraction (like $` in a pattern match). Note that this is designed
2034 to facilitate the use of other Text::Balanced subroutines with
2035 C<extract_multiple>. Note too that the value returned by an extractor
2036 subroutine need not bear any relationship to the corresponding substring
2037 of the original text (see examples below).
2038
2039 If the extractor is a precompiled regular expression or a string,
2040 it is matched against the text in a scalar context with a leading
2041 '\G' and the gc modifiers enabled. The extracted value is either
2042 $1 if that variable is defined after the match, or else the
2043 complete match (i.e. $&).
2044
2045 If the extractor is a hash reference, it must contain exactly one element.
2046 The value of that element is one of the
2047 above extractor types (subroutine reference, regular expression, or string).
2048 The key of that element is the name of a class into which the successful
2049 return value of the extractor will be blessed.
2050
2051 If an extractor returns a defined value, that value is immediately
2052 treated as the next extracted field and pushed onto the list of fields.
2053 If the extractor was specified in a hash reference, the field is also
2054 blessed into the appropriate class,
2055
2056 If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2057 assumed to have failed to extract.
2058 If none of the extractor subroutines succeeds, then one
2059 character is extracted from the start of the text and the extraction
2060 subroutines reapplied. Characters which are thus removed are accumulated and
2061 eventually become the next field (unless the fourth argument is true, in which
2062 case they are discarded).
2063
2064 For example, the following extracts substrings that are valid Perl variables:
2065
2066         @fields = extract_multiple($text,
2067                                    [ sub { extract_variable($_[0]) } ],
2068                                    undef, 1);
2069
2070 This example separates a text into fields which are quote delimited,
2071 curly bracketed, and anything else. The delimited and bracketed
2072 parts are also blessed to identify them (the "anything else" is unblessed):
2073
2074         @fields = extract_multiple($text,
2075                    [
2076                         { Delim => sub { extract_delimited($_[0],q{'"}) } },
2077                         { Brack => sub { extract_bracketed($_[0],'{}') } },
2078                    ]);
2079
2080 This call extracts the next single substring that is a valid Perl quotelike
2081 operator (and removes it from $text):
2082
2083         $quotelike = extract_multiple($text,
2084                                       [
2085                                         sub { extract_quotelike($_[0]) },
2086                                       ], undef, 1);
2087
2088 Finally, here is yet another way to do comma-separated value parsing:
2089
2090         @fields = extract_multiple($csv_text,
2091                                   [
2092                                         sub { extract_delimited($_[0],q{'"}) },
2093                                         qr/([^,]+)(.*)/,
2094                                   ],
2095                                   undef,1);
2096
2097 The list in the second argument means:
2098 I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2099 The undef third argument means:
2100 I<"...as many times as possible...">,
2101 and the true value in the fourth argument means
2102 I<"...discarding anything else that appears (i.e. the commas)">.
2103
2104 If you wanted the commas preserved as separate fields (i.e. like split
2105 does if your split pattern has capturing parentheses), you would
2106 just make the last parameter undefined (or remove it).
2107
2108
2109 =head2 C<gen_delimited_pat>
2110
2111 The C<gen_delimited_pat> subroutine takes a single (string) argument and
2112    > builds a Friedl-style optimized regex that matches a string delimited
2113 by any one of the characters in the single argument. For example:
2114
2115         gen_delimited_pat(q{'"})
2116
2117 returns the regex:
2118
2119         (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2120
2121 Note that the specified delimiters are automatically quotemeta'd.
2122
2123 A typical use of C<gen_delimited_pat> would be to build special purpose tags
2124 for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2125 (which might contain quoted strings):
2126
2127         my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2128
2129         extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2130
2131
2132 C<gen_delimited_pat> may also be called with an optional second argument,
2133 which specifies the "escape" character(s) to be used for each delimiter.
2134 For example to match a Pascal-style string (where ' is the delimiter
2135 and '' is a literal ' within the string):
2136
2137         gen_delimited_pat(q{'},q{'});
2138
2139 Different escape characters can be specified for different delimiters.
2140 For example, to specify that '/' is the escape for single quotes
2141 and '%' is the escape for double quotes:
2142
2143         gen_delimited_pat(q{'"},q{/%});
2144
2145 If more delimiters than escape chars are specified, the last escape char
2146 is used for the remaining delimiters.
2147 If no escape char is specified for a given specified delimiter, '\' is used.
2148
2149 =head2 C<delimited_pat>
2150
2151 Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2152 That name may still be used, but is now deprecated.
2153
2154
2155 =head1 DIAGNOSTICS
2156
2157 In a list context, all the functions return C<(undef,$original_text)>
2158 on failure. In a scalar context, failure is indicated by returning C<undef>
2159 (in this case the input text is not modified in any way).
2160
2161 In addition, on failure in I<any> context, the C<$@> variable is set.
2162 Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2163 below.
2164 Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2165 which the error was detected (although not necessarily where it occurred!)
2166 Printing C<$@> directly produces the error message, with the offset appended.
2167 On success, the C<$@> variable is guaranteed to be C<undef>.
2168
2169 The available diagnostics are:
2170
2171 =over 4
2172
2173 =item  C<Did not find a suitable bracket: "%s">
2174
2175 The delimiter provided to C<extract_bracketed> was not one of
2176 C<'()[]E<lt>E<gt>{}'>.
2177
2178 =item  C<Did not find prefix: /%s/>
2179
2180 A non-optional prefix was specified but wasn't found at the start of the text.
2181
2182 =item  C<Did not find opening bracket after prefix: "%s">
2183
2184 C<extract_bracketed> or C<extract_codeblock> was expecting a
2185 particular kind of bracket at the start of the text, and didn't find it.
2186
2187 =item  C<No quotelike operator found after prefix: "%s">
2188
2189 C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2190 C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2191 it was extracting.
2192
2193 =item  C<Unmatched closing bracket: "%c">
2194
2195 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2196 a closing bracket where none was expected.
2197
2198 =item  C<Unmatched opening bracket(s): "%s">
2199
2200 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2201 out of characters in the text before closing one or more levels of nested
2202 brackets.
2203
2204 =item C<Unmatched embedded quote (%s)>
2205
2206 C<extract_bracketed> attempted to match an embedded quoted substring, but
2207 failed to find a closing quote to match it.
2208
2209 =item C<Did not find closing delimiter to match '%s'>
2210
2211 C<extract_quotelike> was unable to find a closing delimiter to match the
2212 one that opened the quote-like operation.
2213
2214 =item  C<Mismatched closing bracket: expected "%c" but found "%s">
2215
2216 C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2217 a valid bracket delimiter, but it was the wrong species. This usually
2218 indicates a nesting error, but may indicate incorrect quoting or escaping.
2219
2220 =item  C<No block delimiter found after quotelike "%s">
2221
2222 C<extract_quotelike> or C<extract_codeblock> found one of the
2223 quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2224 without a suitable block after it.
2225
2226 =item C<Did not find leading dereferencer>
2227
2228 C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2229 a variable, but didn't find any of them.
2230
2231 =item C<Bad identifier after dereferencer>
2232
2233 C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2234 character was not followed by a legal Perl identifier.
2235
2236 =item C<Did not find expected opening bracket at %s>
2237
2238 C<extract_codeblock> failed to find any of the outermost opening brackets
2239 that were specified.
2240
2241 =item C<Improperly nested codeblock at %s>
2242
2243 A nested code block was found that started with a delimiter that was specified
2244 as being only to be used as an outermost bracket.
2245
2246 =item  C<Missing second block for quotelike "%s">
2247
2248 C<extract_codeblock> or C<extract_quotelike> found one of the
2249 quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2250
2251 =item C<No match found for opening bracket>
2252
2253 C<extract_codeblock> failed to find a closing bracket to match the outermost
2254 opening bracket.
2255
2256 =item C<Did not find opening tag: /%s/>
2257
2258 C<extract_tagged> did not find a suitable opening tag (after any specified
2259 prefix was removed).
2260
2261 =item C<Unable to construct closing tag to match: /%s/>
2262
2263 C<extract_tagged> matched the specified opening tag and tried to
2264 modify the matched text to produce a matching closing tag (because
2265 none was specified). It failed to generate the closing tag, almost
2266 certainly because the opening tag did not start with a
2267 bracket of some kind.
2268
2269 =item C<Found invalid nested tag: %s>
2270
2271 C<extract_tagged> found a nested tag that appeared in the "reject" list
2272 (and the failure mode was not "MAX" or "PARA").
2273
2274 =item C<Found unbalanced nested tag: %s>
2275
2276 C<extract_tagged> found a nested opening tag that was not matched by a
2277 corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2278
2279 =item C<Did not find closing tag>
2280
2281 C<extract_tagged> reached the end of the text without finding a closing tag
2282 to match the original opening tag (and the failure mode was not
2283 "MAX" or "PARA").
2284
2285
2286
2287
2288 =back
2289
2290
2291 =head1 AUTHOR
2292
2293 Damian Conway (damian@conway.org)
2294
2295
2296 =head1 BUGS AND IRRITATIONS
2297
2298 There are undoubtedly serious bugs lurking somewhere in this code, if
2299 only because parts of it give the impression of understanding a great deal
2300 more about Perl than they really do.
2301
2302 Bug reports and other feedback are most welcome.
2303
2304
2305 =head1 COPYRIGHT
2306
2307  Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2308  This module is free software. It may be used, redistributed
2309      and/or modified under the same terms as Perl itself.