This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta updates for 6728836, d60d201, 82f9620, 7878705
[perl5.git] / mad / P5re.pm
CommitLineData
6a28abbc
NC
1#!/usr/bin/perl
2
3# Copyright (C) 2005, Larry Wall
4# This software may be copied under the same terms as Perl.
5
6package P5re;
7
8use strict;
9use warnings;
10
11our @EXPORT_OK = qw(re re2xml qr2xml);
12
13my $indent = 0;
14my $in = "";
15my $delim = 1;
16my $debug = 0;
17my $maxbrack;
18
19our $extended;
20our $insensitive;
21our $singleline;
22our $multiline;
23
24my %xmlish = (
25 chr(0x00) => "STUPIDXML(#x00)",
26 chr(0x01) => "STUPIDXML(#x01)",
27 chr(0x02) => "STUPIDXML(#x02)",
28 chr(0x03) => "STUPIDXML(#x03)",
29 chr(0x04) => "STUPIDXML(#x04)",
30 chr(0x05) => "STUPIDXML(#x05)",
31 chr(0x06) => "STUPIDXML(#x06)",
32 chr(0x07) => "STUPIDXML(#x07)",
33 chr(0x08) => "STUPIDXML(#x08)",
34 chr(0x09) => "	",
35 chr(0x0a) => "
",
36 chr(0x0b) => "STUPIDXML(#x0b)",
37 chr(0x0c) => "STUPIDXML(#x0c)",
38 chr(0x0d) => "
",
39 chr(0x0e) => "STUPIDXML(#x0e)",
40 chr(0x0f) => "STUPIDXML(#x0f)",
41 chr(0x10) => "STUPIDXML(#x10)",
42 chr(0x11) => "STUPIDXML(#x11)",
43 chr(0x12) => "STUPIDXML(#x12)",
44 chr(0x13) => "STUPIDXML(#x13)",
45 chr(0x14) => "STUPIDXML(#x14)",
46 chr(0x15) => "STUPIDXML(#x15)",
47 chr(0x16) => "STUPIDXML(#x16)",
48 chr(0x17) => "STUPIDXML(#x17)",
49 chr(0x18) => "STUPIDXML(#x18)",
50 chr(0x19) => "STUPIDXML(#x19)",
51 chr(0x1a) => "STUPIDXML(#x1a)",
52 chr(0x1b) => "STUPIDXML(#x1b)",
53 chr(0x1c) => "STUPIDXML(#x1c)",
54 chr(0x1d) => "STUPIDXML(#x1d)",
55 chr(0x1e) => "STUPIDXML(#x1e)",
56 chr(0x1f) => "STUPIDXML(#x1f)",
57 chr(0x7f) => "STUPIDXML(#x7f)",
58 chr(0x80) => "STUPIDXML(#x80)",
59 chr(0x81) => "STUPIDXML(#x81)",
60 chr(0x82) => "STUPIDXML(#x82)",
61 chr(0x83) => "STUPIDXML(#x83)",
62 chr(0x84) => "STUPIDXML(#x84)",
63 chr(0x86) => "STUPIDXML(#x86)",
64 chr(0x87) => "STUPIDXML(#x87)",
65 chr(0x88) => "STUPIDXML(#x88)",
66 chr(0x89) => "STUPIDXML(#x89)",
67 chr(0x90) => "STUPIDXML(#x90)",
68 chr(0x91) => "STUPIDXML(#x91)",
69 chr(0x92) => "STUPIDXML(#x92)",
70 chr(0x93) => "STUPIDXML(#x93)",
71 chr(0x94) => "STUPIDXML(#x94)",
72 chr(0x95) => "STUPIDXML(#x95)",
73 chr(0x96) => "STUPIDXML(#x96)",
74 chr(0x97) => "STUPIDXML(#x97)",
75 chr(0x98) => "STUPIDXML(#x98)",
76 chr(0x99) => "STUPIDXML(#x99)",
77 chr(0x9a) => "STUPIDXML(#x9a)",
78 chr(0x9b) => "STUPIDXML(#x9b)",
79 chr(0x9c) => "STUPIDXML(#x9c)",
80 chr(0x9d) => "STUPIDXML(#x9d)",
81 chr(0x9e) => "STUPIDXML(#x9e)",
82 chr(0x9f) => "STUPIDXML(#x9f)",
83 '<' => "&lt;",
84 '>' => "&gt;",
85 '&' => "&amp;",
86 '"' => "&#34;", # XML idiocy
87);
88
89sub xmlquote {
90 my $text = shift;
91 $text =~ s/(.)/$xmlish{$1} || $1/seg;
92 return $text;
93}
94
95sub text {
96 my $self = shift;
97 return xmlquote($self->{text});
98}
99
100sub rep {
101 my $self = shift;
102 return xmlquote($self->{rep});
103}
104
105sub xmlkids {
106 my $self = shift;
107 my $array = $self->{Kids};
108 my $ret = "";
109 $indent += 2;
110 $in = ' ' x $indent;
111 foreach my $chunk (@$array) {
112 if (ref $chunk eq "ARRAY") {
113 die;
114 }
115 elsif (ref $chunk) {
116 $ret .= $chunk->xml();
117 }
118 else {
119 warn $chunk;
120 }
121 }
122 $indent -= 2;
123 $in = ' ' x $indent;
124 return $ret;
125};
126
127package P5re::RE; our @ISA = 'P5re';
128
129sub xml {
130 my $self = shift;
131 my %flags = @_;
132 if ($flags{indent}) {
133 $indent = delete $flags{indent} || 0;
134 $in = ' ' x $indent;
135 }
136
137 my $kind = $self->{kind};
138
139 my $first = $self->{Kids}[0];
140 if ($first and ref $first eq 'P5re::Mod') {
141 for my $c (qw(i m s x)) {
142 next unless defined $first->{$c};
143 $self->{$c} = $first->{$c};
144 delete $first->{$c};
145 }
146 }
147
148 my $modifiers = "";
149 foreach my $k (sort keys %$self) {
150 next if $k eq 'kind' or $k eq "Kids";
151 my $v = $self->{$k};
152 $k =~ s/^[A-Z]//;
153 $modifiers .= " $k=\"$v\"";
154 }
155 my $text = "$in<$kind$modifiers>\n";
156 $text .= $self->xmlkids();
157 $text .= "$in</$kind>\n";
158 return $text;
159}
160
161package P5re::Alt; our @ISA = 'P5re';
162
163sub xml {
164 my $self = shift;
165 my $text = "$in<alt>\n";
166 $text .= $self->xmlkids();
167 $text .= "$in</alt>\n";
168 return $text;
169}
170
171#package P5re::Atom; our @ISA = 'P5re';
172#
173#sub xml {
174# my $self = shift;
175# my $text = "$in<atom>\n";
176# $text .= $self->xmlkids();
177# $text .= "$in</atom>\n";
178# return $text;
179#}
180
181package P5re::Quant; our @ISA = 'P5re';
182
183sub xml {
184 my $self = shift;
185 my $q = $self->{rep};
186 my $min = $self->{min};
187 my $max = $self->{max};
188 my $greedy = $self->{greedy};
189 my $text = "$in<quant rep=\"$q\" min=\"$min\" max=\"$max\" greedy=\"$greedy\">\n";
190 $text .= $self->xmlkids();
191 $text .= "$in</quant>\n";
192 return $text;
193}
194
195package P5re::White; our @ISA = 'P5re';
196
197sub xml {
198 my $self = shift;
199 return "$in<white text=\"" . $self->text() . "\" />\n";
200}
201
202package P5re::Char; our @ISA = 'P5re';
203
204sub xml {
205 my $self = shift;
206 return "$in<char text=\"" . $self->text() . "\" />\n";
207}
208
209package P5re::Comment; our @ISA = 'P5re';
210
211sub xml {
212 my $self = shift;
213 return "$in<comment rep=\"" . $self->rep() . "\" />\n";
214}
215
216package P5re::Mod; our @ISA = 'P5re';
217
218sub xml {
219 my $self = shift;
220 my $modifiers = "";
221 foreach my $k (sort keys %$self) {
222 next if $k eq 'kind' or $k eq "Kids";
223 my $v = $self->{$k};
224 $k =~ s/^[A-Z]//;
225 $modifiers .= " $k=\"$v\"";
226 }
227 return "$in<mod$modifiers />\n";
228}
229
230package P5re::Meta; our @ISA = 'P5re';
231
232sub xml {
233 my $self = shift;
234 my $sem = "";
235 if ($self->{sem}) {
236 $sem = 'sem="' . $self->{sem} . '" '
237 }
238 return "$in<meta rep=\"" . $self->rep() . "\" $sem/>\n";
239}
240
241package P5re::Back; our @ISA = 'P5re';
242
243sub xml {
244 my $self = shift;
245 return "$in<backref to=\"" . P5re::xmlquote($self->{to}) . "\"/>\n";
246}
247
248package P5re::Var; our @ISA = 'P5re';
249
250sub xml {
251 my $self = shift;
252 return "$in<var name=\"" . $self->{name} . "\" />\n";
253}
254
255package P5re::Closure; our @ISA = 'P5re';
256
257sub xml {
258 my $self = shift;
259 return "$in<closure rep=\"" . P5re::xmlquote($self->{rep}) . "\" />\n";
260}
261
262package P5re::CClass; our @ISA = 'P5re';
263
264sub xml {
265 my $self = shift;
266 my $neg = $self->{neg} ? "negated" : "normal";
267 my $text = "$in<cclass match=\"$neg\">\n";
268 $text .= $self->xmlkids();
269 $text .= "$in</cclass>\n";
270 return $text;
271}
272
273package P5re::Range; our @ISA = 'P5re';
274
275sub xml {
276 my $self = shift;
277 my $text = "$in<range>\n";
278 $text .= $self->xmlkids();
279 $text .= "$in</range>\n";
280 return $text;
281}
282
283package P5re;
284
285unless (caller) {
286 while (<>) {
287 chomp;
288 print qr2xml($_);
289 print "#######################################\n";
290 }
291}
292
293sub qrparse {
294 my $qr = shift;
295 my $mod;
296 if ($qr =~ /^s/) {
297 $qr =~ s/^(?:\w*)(\W)((?:\\.|.)*?)\1(.*)\1(\w*)$/$2/;
298 $mod = $4;
299 }
300 else {
301 $qr =~ s/^(?:\w*)(\W)(.*)\1(\w*)$/$2/;
302 $mod = $3;
303 }
304 substr($qr,0,0) = "(?$mod)" if defined $mod and $mod ne "";
305 return parse($qr,@_);
306}
307
308sub qr2xml {
309 return qrparse(@_)->xml();
310}
311
312sub re2xml {
313 my $re = shift;
314 return parse($re,@_)->xml();
315}
316
317sub parse {
318 local($_) = shift;
319 my %flags = @_;
320 $maxbrack = 0;
321 $indent = delete $flags{indent} || 0;
322 $in = ' ' x $indent;
323 warn "$_\n" if $debug;
324 my $re = re('re');
325 @$re{keys %flags} = values %flags;
326 return $re;
327}
328
329sub re {
330 my $kind = shift;
331
332 my $oldextended = $extended;
333 my $oldinsensitive = $insensitive;
334 my $oldmultiline = $multiline;
335 my $oldsingleline = $singleline;
336
337 local $extended = $extended;
338 local $insensitive = $insensitive;
339 local $multiline = $multiline;
340 local $singleline = $singleline;
341
342 my $first = alt();
343
344 my $re;
345 if (not /^\|/) {
346 $first->{kind} = $kind;
347 $re = bless $first, "P5re::RE"; # rebless to remove single alt
348 }
349 else {
350 my @alts = ($first);
351
352 while (s/^\|//) {
353 push(@alts, alt());
354 }
355 $re = bless { Kids => [@alts], kind => $kind }, "P5re::RE";
356 }
357
358 $re->{x} = $oldextended || 0;
359 $re->{i} = $oldinsensitive || 0;
360 $re->{m} = $oldmultiline || 0;
361 $re->{s} = $oldsingleline || 0;
362 return $re;
363}
364
365sub alt {
366 my @quants;
367
368 my $quant;
369 while ($quant = quant()) {
370 if (@quants and
371 ref $quant eq ref $quants[-1] and
372 exists $quants[-1]{text} and
373 exists $quant->{text} )
374 {
375 $quants[-1]{text} .= $quant->{text};
376 }
377 else {
378 push(@quants, $quant);
379 }
380 }
381 return bless { Kids => [@quants] }, "P5re::Alt";
382}
383
384sub quant {
385 my $atom = atom();
386 return 0 unless $atom;
387# $atom = bless { Kids => [$atom] }, "P5re::Atom";
388 if (s/^(([*+?])(\??)|\{(\d+)(?:(,)(\d*))?\}(\??))//) {
389 my $min = 0;
390 my $max = "Inf";
391 my $greed = 1;
392 if ($2) {
393 if ($2 eq '+') {
394 $min = 1;
395 }
396 elsif ($2 eq '?') {
397 $max = 1;
398 }
399 $greed = 0 if $3;
400 }
401 elsif (defined $4) {
402 $min = $4;
403 if ($5) {
404 $max = $6 if $6;
405 }
406 else {
407 $max = $min;
408 }
409 $greed = 0 if $7;
410 }
411 $greed = "na" if $min == $max;
412 return bless { Kids => [$atom],
413 rep => $1,
414 min => $min,
415 max => $max,
416 greedy => $greed
417 }, "P5re::Quant";
418 }
419 return $atom;
420}
421
422sub atom {
423 my $re;
424 if ($_ eq "") { return 0 }
425 if (/^[)|]/) { return 0 }
426
427 # whitespace is special because we don't know if /x is in effect
428 if ($extended) {
429 if (s/^(?=\s|#)(\s*(?:#.*)?)//) { return bless { text => $1 }, "P5re::White"; }
430 }
431
432 # all the parenthesized forms
433 if (s/^\(//) {
434 if (s/^\?://) {
435 $re = re('bracket');
436 }
437 elsif (s/^(\?#.*?)\)/)/) {
438 $re = bless { rep => "($1)" }, "P5re::Comment";
439 }
440 elsif (s/^\?=//) {
441 $re = re('lookahead');
442 }
443 elsif (s/^\?!//) {
444 $re = re('neglookahead');
445 }
446 elsif (s/^\?<=//) {
447 $re = re('lookbehind');
448 }
449 elsif (s/^\?<!//) {
450 $re = re('neglookbehind');
451 }
452 elsif (s/^\?>//) {
453 $re = re('nobacktrack');
454 }
455 elsif (s/^(\?\??\{.*?\})\)/)/) {
456 $re = bless { rep => "($1)" }, "P5re::Closure";
457 }
458 elsif (s/^(\?\(\d+\))//) {
459 my $mods = $1;
460 $re = re('conditional');
461 $re->{Arep} = "$mods";
462 }
463 elsif (s/^\?(?=\(\?)//) {
464 my $mods = $1;
465 my $cond = atom();
466 $re = re('conditional');
467 unshift(@{$re->{Kids}}, $cond);
468 }
469 elsif (s/^(\?[-\w]+)://) {
470 my $mods = $1;
471 local $extended = $extended;
472 local $insensitive = $insensitive;
473 local $multiline = $multiline;
474 local $singleline = $singleline;
475 setmods($mods);
476 $re = re('bracket');
477 $re->{Arep} = "($mods)";
478 $re->{x} = $extended || 0;
479 $re->{i} = $insensitive || 0;
480 $re->{m} = $multiline || 0;
481 $re->{s} = $singleline || 0;
482 }
483 elsif (s/^(\?[-\w]+)//) {
484 my $mods = $1;
485 $re = bless { Arep => "($mods)" }, "P5re::Mod";
486 setmods($mods);
487 $re->{x} = $extended || 0;
488 $re->{i} = $insensitive || 0;
489 $re->{m} = $multiline || 0;
490 $re->{s} = $singleline || 0;
491 }
492 elsif (s/^\?//) {
493 $re = re('UNRECOGNIZED');
494 }
495 else {
496 my $brack = ++$maxbrack;
497 $re = re('capture');
498 $re->{Ato} = $brack;
499 }
500
501 if (not s/^\)//) { warn "Expected right paren at: '$_'" }
502 return $re;
503 }
504
505 # special meta
506 if (s/^\.//) {
507 my $s = $singleline ? '.' : '\N';
508 return bless { rep => '.', sem => $s }, "P5re::Meta";
509 }
510 if (s/^\^//) {
511 my $s = $multiline ? '^^' : '^';
512 return bless { rep => '^', sem => $s }, "P5re::Meta";
513 }
514 if (s/^\$(?:$|(?=[|)]))//) {
515 my $s = $multiline ? '$$' : '$';
516 return bless { rep => '$', sem => $s }, "P5re::Meta";
517 }
518 if (s/^([\$\@](\w+|.))//) { # XXX need to handle subscripts here
519 return bless { name => $1 }, "P5re::Var";
520 }
521
522 # character classes
523 if (s/^\[//) {
524 my $re = cclass();
525 if (not s/^\]//) { warn "Expected right bracket at: '$_'" }
526 return $re;
527 }
528
529 # backwhacks
530 if (/^\\([1-9]\d*)/ and $1 <= $maxbrack) {
531 my $to = $1;
532 onechar();
533 return bless { to => $to }, "P5re::Back";
534 }
535
536 # backwhacks
537 if (/^\\(?=\w)/) {
538 return bless { rep => onechar() }, "P5re::Meta";
539 }
540
541 # backwhacks
542 if (s/^\\(.)//) {
543 return bless { text => $1 }, "P5re::Char";
544 }
545
546 # optimization, would happen anyway
547 if (s/^(\w+)//) { return bless { text => $1 }, "P5re::Char"; }
548
549 # random character
550 if (s/^(.)//) { return bless { text => $1 }, "P5re::Char"; }
551}
552
553sub cclass {
554 my @cclass;
555 my $cclass = "";
556 my $neg = 0;
557 if (s/^\^//) { $neg = 1 }
558 if (s/^([\]\-])//) { $cclass .= $1 }
559
560 while ($_ ne "" and not /^\]/) {
561 # backwhacks
562 if (/^\\(?=.)|.-/) {
563 my $o1 = onecharobj();
564 if ($cclass ne "") {
565 push @cclass, bless { text => $cclass }, "P5re::Char";
566 $cclass = "";
567 }
568
569 if (s/^-(?=[^]])//) {
570 my $o2 = onecharobj();
571 push @cclass, bless { Kids => [$o1, $o2] }, "P5re::Range";
572 }
573 else {
574 push @cclass, $o1;
575 }
576 }
577 elsif (s/^(\[([:=.])\^?\w*\2\])//) {
578 if ($cclass ne "") {
579 push @cclass, bless { text => $cclass }, "P5re::Char";
580 $cclass = "";
581 }
582 push @cclass, bless { rep => $1 }, "P5re::Meta";
583 }
584 else {
585 $cclass .= onechar();
586 }
587 }
588
589 if ($cclass ne "") {
590 push @cclass, bless { text => $cclass }, "P5re::Char";
591 }
592 return bless { Kids => [@cclass], neg => $neg }, "P5re::CClass";
593}
594
595sub onecharobj {
596 my $ch = onechar();
597 if ($ch =~ /^\\/) {
598 $ch = bless { rep => $ch }, "P5re::Meta";
599 }
600 else {
601 $ch = bless { text => $ch }, "P5re::Char";
602 }
603}
604
605sub onechar {
606 die "Oops, short cclass" unless s/^(.)//;
607 my $ch = $1;
608 if ($ch eq '\\') {
609 if (s/^([rntf]|[0-7]{1,4})//) { $ch .= $1 }
610 elsif (s/^(x[0-9a-fA-f]{1,2})//) { $ch .= $1 }
611 elsif (s/^(x\{[0-9a-fA-f]+\})//) { $ch .= $1 }
612 elsif (s/^([NpP]\{.*?\})//) { $ch .= $1 }
613 elsif (s/^([cpP].)//) { $ch .= $1 }
614 elsif (s/^(.)//) { $ch .= $1 }
615 else {
616 die "Oops, short backwhack";
617 }
618 }
619 return $ch;
620}
621
622sub setmods {
623 my $mods = shift;
624 if ($mods =~ /\-.*x/) {
625 $extended = 0;
626 }
627 elsif ($mods =~ /x/) {
628 $extended = 1;
629 }
630 if ($mods =~ /\-.*i/) {
631 $insensitive = 0;
632 }
633 elsif ($mods =~ /i/) {
634 $insensitive = 1;
635 }
636 if ($mods =~ /\-.*m/) {
637 $multiline = 0;
638 }
639 elsif ($mods =~ /m/) {
640 $multiline = 1;
641 }
642 if ($mods =~ /\-.*s/) {
643 $singleline = 0;
644 }
645 elsif ($mods =~ /s/) {
646 $singleline = 1;
647 }
648}
649
6501;