#!/usr/bin/perl # Copyright (C) 2005, Larry Wall # This software may be copied under the same terms as Perl. package P5re; use strict; use warnings; our @EXPORT_OK = qw(re re2xml qr2xml); my $indent = 0; my $in = ""; my $delim = 1; my $debug = 0; my $maxbrack; our $extended; our $insensitive; our $singleline; our $multiline; my %xmlish = ( chr(0x00) => "STUPIDXML(#x00)", chr(0x01) => "STUPIDXML(#x01)", chr(0x02) => "STUPIDXML(#x02)", chr(0x03) => "STUPIDXML(#x03)", chr(0x04) => "STUPIDXML(#x04)", chr(0x05) => "STUPIDXML(#x05)", chr(0x06) => "STUPIDXML(#x06)", chr(0x07) => "STUPIDXML(#x07)", chr(0x08) => "STUPIDXML(#x08)", chr(0x09) => " ", chr(0x0a) => " ", chr(0x0b) => "STUPIDXML(#x0b)", chr(0x0c) => "STUPIDXML(#x0c)", chr(0x0d) => " ", chr(0x0e) => "STUPIDXML(#x0e)", chr(0x0f) => "STUPIDXML(#x0f)", chr(0x10) => "STUPIDXML(#x10)", chr(0x11) => "STUPIDXML(#x11)", chr(0x12) => "STUPIDXML(#x12)", chr(0x13) => "STUPIDXML(#x13)", chr(0x14) => "STUPIDXML(#x14)", chr(0x15) => "STUPIDXML(#x15)", chr(0x16) => "STUPIDXML(#x16)", chr(0x17) => "STUPIDXML(#x17)", chr(0x18) => "STUPIDXML(#x18)", chr(0x19) => "STUPIDXML(#x19)", chr(0x1a) => "STUPIDXML(#x1a)", chr(0x1b) => "STUPIDXML(#x1b)", chr(0x1c) => "STUPIDXML(#x1c)", chr(0x1d) => "STUPIDXML(#x1d)", chr(0x1e) => "STUPIDXML(#x1e)", chr(0x1f) => "STUPIDXML(#x1f)", chr(0x7f) => "STUPIDXML(#x7f)", chr(0x80) => "STUPIDXML(#x80)", chr(0x81) => "STUPIDXML(#x81)", chr(0x82) => "STUPIDXML(#x82)", chr(0x83) => "STUPIDXML(#x83)", chr(0x84) => "STUPIDXML(#x84)", chr(0x86) => "STUPIDXML(#x86)", chr(0x87) => "STUPIDXML(#x87)", chr(0x88) => "STUPIDXML(#x88)", chr(0x89) => "STUPIDXML(#x89)", chr(0x90) => "STUPIDXML(#x90)", chr(0x91) => "STUPIDXML(#x91)", chr(0x92) => "STUPIDXML(#x92)", chr(0x93) => "STUPIDXML(#x93)", chr(0x94) => "STUPIDXML(#x94)", chr(0x95) => "STUPIDXML(#x95)", chr(0x96) => "STUPIDXML(#x96)", chr(0x97) => "STUPIDXML(#x97)", chr(0x98) => "STUPIDXML(#x98)", chr(0x99) => "STUPIDXML(#x99)", chr(0x9a) => "STUPIDXML(#x9a)", chr(0x9b) => "STUPIDXML(#x9b)", chr(0x9c) => "STUPIDXML(#x9c)", chr(0x9d) => "STUPIDXML(#x9d)", chr(0x9e) => "STUPIDXML(#x9e)", chr(0x9f) => "STUPIDXML(#x9f)", '<' => "<", '>' => ">", '&' => "&", '"' => """, # XML idiocy ); sub xmlquote { my $text = shift; $text =~ s/(.)/$xmlish{$1} || $1/seg; return $text; } sub text { my $self = shift; return xmlquote($self->{text}); } sub rep { my $self = shift; return xmlquote($self->{rep}); } sub xmlkids { my $self = shift; my $array = $self->{Kids}; my $ret = ""; $indent += 2; $in = ' ' x $indent; foreach my $chunk (@$array) { if (ref $chunk eq "ARRAY") { die; } elsif (ref $chunk) { $ret .= $chunk->xml(); } else { warn $chunk; } } $indent -= 2; $in = ' ' x $indent; return $ret; }; package P5re::RE; our @ISA = 'P5re'; sub xml { my $self = shift; my %flags = @_; if ($flags{indent}) { $indent = delete $flags{indent} || 0; $in = ' ' x $indent; } my $kind = $self->{kind}; my $first = $self->{Kids}[0]; if ($first and ref $first eq 'P5re::Mod') { for my $c (qw(i m s x)) { next unless defined $first->{$c}; $self->{$c} = $first->{$c}; delete $first->{$c}; } } my $modifiers = ""; foreach my $k (sort keys %$self) { next if $k eq 'kind' or $k eq "Kids"; my $v = $self->{$k}; $k =~ s/^[A-Z]//; $modifiers .= " $k=\"$v\""; } my $text = "$in<$kind$modifiers>\n"; $text .= $self->xmlkids(); $text .= "$in\n"; return $text; } package P5re::Alt; our @ISA = 'P5re'; sub xml { my $self = shift; my $text = "$in\n"; $text .= $self->xmlkids(); $text .= "$in\n"; return $text; } #package P5re::Atom; our @ISA = 'P5re'; # #sub xml { # my $self = shift; # my $text = "$in\n"; # $text .= $self->xmlkids(); # $text .= "$in\n"; # return $text; #} package P5re::Quant; our @ISA = 'P5re'; sub xml { my $self = shift; my $q = $self->{rep}; my $min = $self->{min}; my $max = $self->{max}; my $greedy = $self->{greedy}; my $text = "$in\n"; $text .= $self->xmlkids(); $text .= "$in\n"; return $text; } package P5re::White; our @ISA = 'P5re'; sub xml { my $self = shift; return "$intext() . "\" />\n"; } package P5re::Char; our @ISA = 'P5re'; sub xml { my $self = shift; return "$intext() . "\" />\n"; } package P5re::Comment; our @ISA = 'P5re'; sub xml { my $self = shift; return "$inrep() . "\" />\n"; } package P5re::Mod; our @ISA = 'P5re'; sub xml { my $self = shift; my $modifiers = ""; foreach my $k (sort keys %$self) { next if $k eq 'kind' or $k eq "Kids"; my $v = $self->{$k}; $k =~ s/^[A-Z]//; $modifiers .= " $k=\"$v\""; } return "$in\n"; } package P5re::Meta; our @ISA = 'P5re'; sub xml { my $self = shift; my $sem = ""; if ($self->{sem}) { $sem = 'sem="' . $self->{sem} . '" ' } return "$inrep() . "\" $sem/>\n"; } package P5re::Back; our @ISA = 'P5re'; sub xml { my $self = shift; return "$in{to}) . "\"/>\n"; } package P5re::Var; our @ISA = 'P5re'; sub xml { my $self = shift; return "$in{name} . "\" />\n"; } package P5re::Closure; our @ISA = 'P5re'; sub xml { my $self = shift; return "$in{rep}) . "\" />\n"; } package P5re::CClass; our @ISA = 'P5re'; sub xml { my $self = shift; my $neg = $self->{neg} ? "negated" : "normal"; my $text = "$in\n"; $text .= $self->xmlkids(); $text .= "$in\n"; return $text; } package P5re::Range; our @ISA = 'P5re'; sub xml { my $self = shift; my $text = "$in\n"; $text .= $self->xmlkids(); $text .= "$in\n"; return $text; } package P5re; unless (caller) { while (<>) { chomp; print qr2xml($_); print "#######################################\n"; } } sub qrparse { my $qr = shift; my $mod; if ($qr =~ /^s/) { $qr =~ s/^(?:\w*)(\W)((?:\\.|.)*?)\1(.*)\1(\w*)$/$2/; $mod = $4; } else { $qr =~ s/^(?:\w*)(\W)(.*)\1(\w*)$/$2/; $mod = $3; } substr($qr,0,0) = "(?$mod)" if defined $mod and $mod ne ""; return parse($qr,@_); } sub qr2xml { return qrparse(@_)->xml(); } sub re2xml { my $re = shift; return parse($re,@_)->xml(); } sub parse { local($_) = shift; my %flags = @_; $maxbrack = 0; $indent = delete $flags{indent} || 0; $in = ' ' x $indent; warn "$_\n" if $debug; my $re = re('re'); @$re{keys %flags} = values %flags; return $re; } sub re { my $kind = shift; my $oldextended = $extended; my $oldinsensitive = $insensitive; my $oldmultiline = $multiline; my $oldsingleline = $singleline; local $extended = $extended; local $insensitive = $insensitive; local $multiline = $multiline; local $singleline = $singleline; my $first = alt(); my $re; if (not /^\|/) { $first->{kind} = $kind; $re = bless $first, "P5re::RE"; # rebless to remove single alt } else { my @alts = ($first); while (s/^\|//) { push(@alts, alt()); } $re = bless { Kids => [@alts], kind => $kind }, "P5re::RE"; } $re->{x} = $oldextended || 0; $re->{i} = $oldinsensitive || 0; $re->{m} = $oldmultiline || 0; $re->{s} = $oldsingleline || 0; return $re; } sub alt { my @quants; my $quant; while ($quant = quant()) { if (@quants and ref $quant eq ref $quants[-1] and exists $quants[-1]{text} and exists $quant->{text} ) { $quants[-1]{text} .= $quant->{text}; } else { push(@quants, $quant); } } return bless { Kids => [@quants] }, "P5re::Alt"; } sub quant { my $atom = atom(); return 0 unless $atom; # $atom = bless { Kids => [$atom] }, "P5re::Atom"; if (s/^(([*+?])(\??)|\{(\d+)(?:(,)(\d*))?\}(\??))//) { my $min = 0; my $max = "Inf"; my $greed = 1; if ($2) { if ($2 eq '+') { $min = 1; } elsif ($2 eq '?') { $max = 1; } $greed = 0 if $3; } elsif (defined $4) { $min = $4; if ($5) { $max = $6 if $6; } else { $max = $min; } $greed = 0 if $7; } $greed = "na" if $min == $max; return bless { Kids => [$atom], rep => $1, min => $min, max => $max, greedy => $greed }, "P5re::Quant"; } return $atom; } sub atom { my $re; if ($_ eq "") { return 0 } if (/^[)|]/) { return 0 } # whitespace is special because we don't know if /x is in effect if ($extended) { if (s/^(?=\s|#)(\s*(?:#.*)?)//) { return bless { text => $1 }, "P5re::White"; } } # all the parenthesized forms if (s/^\(//) { if (s/^\?://) { $re = re('bracket'); } elsif (s/^(\?#.*?)\)/)/) { $re = bless { rep => "($1)" }, "P5re::Comment"; } elsif (s/^\?=//) { $re = re('lookahead'); } elsif (s/^\?!//) { $re = re('neglookahead'); } elsif (s/^\?<=//) { $re = re('lookbehind'); } elsif (s/^\?//) { $re = re('nobacktrack'); } elsif (s/^(\?\??\{.*?\})\)/)/) { $re = bless { rep => "($1)" }, "P5re::Closure"; } elsif (s/^(\?\(\d+\))//) { my $mods = $1; $re = re('conditional'); $re->{Arep} = "$mods"; } elsif (s/^\?(?=\(\?)//) { my $mods = $1; my $cond = atom(); $re = re('conditional'); unshift(@{$re->{Kids}}, $cond); } elsif (s/^(\?[-\w]+)://) { my $mods = $1; local $extended = $extended; local $insensitive = $insensitive; local $multiline = $multiline; local $singleline = $singleline; setmods($mods); $re = re('bracket'); $re->{Arep} = "($mods)"; $re->{x} = $extended || 0; $re->{i} = $insensitive || 0; $re->{m} = $multiline || 0; $re->{s} = $singleline || 0; } elsif (s/^(\?[-\w]+)//) { my $mods = $1; $re = bless { Arep => "($mods)" }, "P5re::Mod"; setmods($mods); $re->{x} = $extended || 0; $re->{i} = $insensitive || 0; $re->{m} = $multiline || 0; $re->{s} = $singleline || 0; } elsif (s/^\?//) { $re = re('UNRECOGNIZED'); } else { my $brack = ++$maxbrack; $re = re('capture'); $re->{Ato} = $brack; } if (not s/^\)//) { warn "Expected right paren at: '$_'" } return $re; } # special meta if (s/^\.//) { my $s = $singleline ? '.' : '\N'; return bless { rep => '.', sem => $s }, "P5re::Meta"; } if (s/^\^//) { my $s = $multiline ? '^^' : '^'; return bless { rep => '^', sem => $s }, "P5re::Meta"; } if (s/^\$(?:$|(?=[|)]))//) { my $s = $multiline ? '$$' : '$'; return bless { rep => '$', sem => $s }, "P5re::Meta"; } if (s/^([\$\@](\w+|.))//) { # XXX need to handle subscripts here return bless { name => $1 }, "P5re::Var"; } # character classes if (s/^\[//) { my $re = cclass(); if (not s/^\]//) { warn "Expected right bracket at: '$_'" } return $re; } # backwhacks if (/^\\([1-9]\d*)/ and $1 <= $maxbrack) { my $to = $1; onechar(); return bless { to => $to }, "P5re::Back"; } # backwhacks if (/^\\(?=\w)/) { return bless { rep => onechar() }, "P5re::Meta"; } # backwhacks if (s/^\\(.)//) { return bless { text => $1 }, "P5re::Char"; } # optimization, would happen anyway if (s/^(\w+)//) { return bless { text => $1 }, "P5re::Char"; } # random character if (s/^(.)//) { return bless { text => $1 }, "P5re::Char"; } } sub cclass { my @cclass; my $cclass = ""; my $neg = 0; if (s/^\^//) { $neg = 1 } if (s/^([\]\-])//) { $cclass .= $1 } while ($_ ne "" and not /^\]/) { # backwhacks if (/^\\(?=.)|.-/) { my $o1 = onecharobj(); if ($cclass ne "") { push @cclass, bless { text => $cclass }, "P5re::Char"; $cclass = ""; } if (s/^-(?=[^]])//) { my $o2 = onecharobj(); push @cclass, bless { Kids => [$o1, $o2] }, "P5re::Range"; } else { push @cclass, $o1; } } elsif (s/^(\[([:=.])\^?\w*\2\])//) { if ($cclass ne "") { push @cclass, bless { text => $cclass }, "P5re::Char"; $cclass = ""; } push @cclass, bless { rep => $1 }, "P5re::Meta"; } else { $cclass .= onechar(); } } if ($cclass ne "") { push @cclass, bless { text => $cclass }, "P5re::Char"; } return bless { Kids => [@cclass], neg => $neg }, "P5re::CClass"; } sub onecharobj { my $ch = onechar(); if ($ch =~ /^\\/) { $ch = bless { rep => $ch }, "P5re::Meta"; } else { $ch = bless { text => $ch }, "P5re::Char"; } } sub onechar { die "Oops, short cclass" unless s/^(.)//; my $ch = $1; if ($ch eq '\\') { if (s/^([rntf]|[0-7]{1,4})//) { $ch .= $1 } elsif (s/^(x[0-9a-fA-f]{1,2})//) { $ch .= $1 } elsif (s/^(x\{[0-9a-fA-f]+\})//) { $ch .= $1 } elsif (s/^([NpP]\{.*?\})//) { $ch .= $1 } elsif (s/^([cpP].)//) { $ch .= $1 } elsif (s/^(.)//) { $ch .= $1 } else { die "Oops, short backwhack"; } } return $ch; } sub setmods { my $mods = shift; if ($mods =~ /\-.*x/) { $extended = 0; } elsif ($mods =~ /x/) { $extended = 1; } if ($mods =~ /\-.*i/) { $insensitive = 0; } elsif ($mods =~ /i/) { $insensitive = 1; } if ($mods =~ /\-.*m/) { $multiline = 0; } elsif ($mods =~ /m/) { $multiline = 1; } if ($mods =~ /\-.*s/) { $singleline = 0; } elsif ($mods =~ /s/) { $singleline = 1; } } 1;