1 package CharClass::Matcher;
5 use warnings FATAL => 'all';
6 use Text::Wrap qw(wrap);
9 $Data::Dumper::Useqq= 1;
10 our $hex_fmt= "0x%02X";
12 require 'regen/regen_lib.pl';
16 CharClass::Matcher -- Generate C macros that match character classes efficiently
20 perl Porting/regcharclass.pl
24 Dynamically generates macros for detecting special charclasses
25 in latin-1, utf8, and codepoint forms. Macros can be set to return
26 the length (in bytes) of the matched codepoint, or the codepoint itself.
28 To regenerate regcharclass.h, run this script from perl-root. No arguments
31 Using WHATEVER as an example the following macros will be produced:
35 =item is_WHATEVER(s,is_utf8)
37 =item is_WHATEVER_safe(s,e,is_utf8)
39 Do a lookup as appropriate based on the is_utf8 flag. When possible
40 comparisons involving octect<128 are done before checking the is_utf8
41 flag, hopefully saving time.
43 =item is_WHATEVER_utf8(s)
45 =item is_WHATEVER_utf8_safe(s,e)
47 Do a lookup assuming the string is encoded in (normalized) UTF8.
49 =item is_WHATEVER_latin1(s)
51 =item is_WHATEVER_latin1_safe(s,e)
53 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
55 =item is_WHATEVER_cp(cp)
57 Check to see if the string matches a given codepoint (hypothetically a
58 U32). The condition is constructed as as to "break out" as early as
59 possible if the codepoint is out of range of the condition.
63 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
65 Thus if the character is X+1 only two comparisons will be done. Making
66 matching lookups slower, but non-matching faster.
70 Additionally it is possible to generate C<what_> variants that return
71 the codepoint read instead of the number of octets read, this can be
72 done by suffixing '-cp' to the type description.
76 perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
81 Author: Yves Orton (demerphq) 2007
85 No tests directly here (although the regex engine will fail tests
86 if this code is broken). Insufficient documentation and no Getopts
87 handler for using the module as a script.
91 You may distribute under the terms of either the GNU General Public
92 License or the Artistic License, as specified in the README file.
96 # Sub naming convention:
97 # __func : private subroutine, can not be called as a method
98 # _func : private method, not meant for external use
99 # func : public method.
102 #-------------------------------------------------------------------------------
104 # ($cp,$n,$l,$u)=__uni_latin($str);
106 # Return a list of arrays, each of which when interpreted correctly
107 # represent the string in some given encoding with specific conditions.
109 # $cp - list of codepoints that make up the string.
110 # $n - list of octets that make up the string if all codepoints < 128
111 # $l - list of octets that make up the string in latin1 encoding if all
112 # codepoints < 256, and at least one codepoint is >127.
113 # $u - list of octets that make up the string in utf8 if any codepoint >127
116 #-----------+----------
126 for my $ch ( split //, $str ) {
129 $max= $cp if $max < $cp;
135 $l= [@cp] if $max && $max < 256;
137 my $copy= $str; # must copy string, FB_CROAK makes encode destructive
138 $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) };
139 # $u is utf8 but with the utf8 flag OFF
140 # therefore "C*" gets us the values of the bytes involved.
141 $u= [ unpack "C*", $u ] if defined $u;
143 return ( \@cp, $n, $l, $u );
147 # $clean= __clean($expr);
149 # Cleanup a ternary expression, removing unnecessary parens and apply some
150 # simplifications using regexes.
156 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
158 #print "$parens\n$expr\n";
159 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
160 1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
161 \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
162 \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
167 # $text= __macro(@args);
168 # Join args together by newlines, and then neatly add backslashes to the end
169 # of every line as expected by the C pre-processor for #define's.
173 my $str= join "\n", @_;
175 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
176 my $last= pop @lines;
177 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
178 1 while $str =~ s/^(\t*) {8}/$1\t/gm;
183 # my $op=__incrdepth($op);
185 # take an 'op' hashref and add one to it and all its childrens depths.
190 return unless ref $op;
192 __incrdepth( $op->{yes} );
193 __incrdepth( $op->{no} );
197 # join two branches of an opcode together with a condition, incrementing
198 # the depth on the yes branch when we do so.
199 # returns the new root opcode of the tree.
201 my ( $cond, $yes, $no )= @_;
204 yes => __incrdepth( $yes ),
214 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
216 # Create a new CharClass::Matcher object by parsing the text in
217 # the txt array. Currently applies the following rules:
219 # Element starts with C<0x>, line is evaled the result treated as
220 # a number which is passed to chr().
222 # Element starts with C<">, line is evaled and the result treated
225 # Each string is then stored in the 'strs' subhash as a hash record
226 # made up of the results of __uni_latin1, using the keynames
227 # 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
228 # 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
230 # Size data is tracked per type in the 'size' subhash.
238 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
244 title => $opt{title} || '',
246 foreach my $txt ( @{ $opt{txt} } ) {
248 if ( $str =~ /^[""]/ ) {
250 } elsif ($str =~ / - /x ) { # A range: Replace this element on the
251 # list with its expansion
252 my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
253 die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
254 foreach my $cp (hex $lower .. hex $upper) {
255 push @{$opt{txt}}, sprintf "0x%X", $cp;
258 } elsif ( $str =~ /^0x/ ) {
260 } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
262 use Unicode::UCD qw(prop_invlist);
264 my @invlist = prop_invlist($property, '_perl_core_internal_ok');
267 # An empty return could mean an unknown property, or merely
268 # that it is empty. Call in scalar context to differentiate
269 my $count = prop_invlist($property, '_perl_core_internal_ok');
270 die "$property not found" unless defined $count;
273 # Replace this element on the list with the property's expansion
274 for (my $i = 0; $i < @invlist; $i += 2) {
275 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
276 push @{$opt{txt}}, sprintf "0x%X", $cp;
281 die "Unparsable line: $txt\n";
285 my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
286 my $UTF8= $low || $utf8;
287 my $LATIN1= $low || $latin1;
288 #die Dumper($txt,$cp,$low,$latin1,$utf8)
289 # if $txt=~/NEL/ or $utf8 and @$utf8>3;
291 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
292 ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
293 my $rec= $self->{strs}{$str};
294 foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
295 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
296 if $self->{strs}{$str}{$key};
298 $self->{has_multi} ||= @$cp > 1;
299 $self->{has_ascii} ||= $latin1 && @$latin1;
300 $self->{has_low} ||= $low && @$low;
301 $self->{has_high} ||= !$low && !$latin1;
303 $self->{val_fmt}= $hex_fmt;
304 $self->{count}= 0 + keys %{ $self->{strs} };
308 # my $trie = make_trie($type,$maxlen);
310 # using the data stored in the object build a trie of a specific type,
311 # and with specific maximum depth. The trie is made up the elements of
312 # the given types array for each string in the object (assuming it is
315 # returns the trie, or undef if there was no relevant data in the object.
319 my ( $self, $type, $maxlen )= @_;
321 my $strs= $self->{strs};
323 foreach my $rec ( values %$strs ) {
324 die "panic: unknown type '$type'"
325 if !exists $rec->{$type};
326 my $dat= $rec->{$type};
328 next if $maxlen && @$dat > $maxlen;
330 foreach my $elem ( @$dat ) {
331 $node->{$elem} ||= {};
332 $node= $node->{$elem};
334 $node->{''}= $rec->{str};
336 return 0 + keys( %trie ) ? \%trie : undef;
339 # my $optree= _optree()
341 # recursively convert a trie to an optree where every node represents
347 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
348 return unless defined $trie;
349 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
350 die "Can't do 'cp' optree from multi-codepoint strings";
353 $else= 0 unless defined $else;
354 $depth= 0 unless defined $depth;
356 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
358 if ( $ret_type eq 'cp' ) {
359 $else= $self->{strs}{ $trie->{''} }{cp}[0];
360 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
361 } elsif ( $ret_type eq 'len' ) {
363 } elsif ( $ret_type eq 'both') {
364 $else= $self->{strs}{ $trie->{''} }{cp}[0];
365 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
366 $else= "len=$depth, $else";
369 return $else if !@conds;
372 my ( $yes_res, $as_code, @cond );
373 my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
375 $node->{vals}= [@cond];
376 $node->{test}= $test;
377 $node->{yes}= $yes_res;
378 $node->{depth}= $depth;
382 my $cond= shift @conds;
384 $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
386 my $res_code= Dumper( $res );
387 if ( !$yes_res || $res_code ne $as_code ) {
392 ( $yes_res, $as_code )= ( $res, $res_code );
402 # my $optree= optree(%opts);
404 # Convert a trie to an optree, wrapper for _optree
409 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
410 $opt{ret_type} ||= 'len';
411 my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
412 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
415 # my $optree= generic_optree(%opts);
417 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
418 # sets of strings, including a branch for handling the string type check.
425 $opt{ret_type} ||= 'len';
426 my $test_type= 'depth';
427 my $else= $opt{else} || 0;
429 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
430 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
432 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
436 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
437 } elsif ( $latin1 ) {
438 $else= __cond_join( "!( is_utf8 )", $latin1, $else );
440 my $low= $self->make_trie( 'low', $opt{max_depth} );
442 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
450 # create a string length guarded optree.
456 my $type= $opt{type};
458 die "Can't do a length_optree on type 'cp', makes no sense."
461 my ( @size, $method );
463 if ( $type eq 'generic' ) {
464 $method= 'generic_optree';
466 %{ $self->{size}{low} || {} },
467 %{ $self->{size}{latin1} || {} },
468 %{ $self->{size}{utf8} || {} }
470 @size= sort { $a <=> $b } keys %sizes;
473 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
476 my $else= ( $opt{else} ||= 0 );
477 for my $size ( @size ) {
478 my $optree= $self->$method( %opt, type => $type, max_depth => $size );
479 my $cond= "((e)-(s) > " . ( $size - 1 ).")";
480 $else= __cond_join( $cond, $optree, $else );
486 # turn a list of conditions into a text expression
487 # - merges ranges of conditions, and joins the result with ||
489 my ( $self, $op, $combine )= @_;
490 my $cond= $op->{vals};
491 my $test= $op->{test};
492 return "( $test )" if !defined $cond;
498 if ( $ranges[-1][0] == $ranges[-1][1] ) {
499 $ranges[-1]= $ranges[-1][0];
500 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
501 $ranges[-1]= $ranges[-1][0];
502 push @ranges, $ranges[-1] + 1;
506 for my $cond ( @$cond ) {
507 if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
509 push @ranges, [ $cond, $cond ];
515 return $self->_combine( $test, @ranges )
520 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
522 : sprintf( "$self->{val_fmt} == $test", $_ );
524 return "( " . join( " || ", @ranges ) . " )";
528 # recursively turn a list of conditions into a fast break-out condition
529 # used by _cond_as_str() for 'cp' type macros.
531 my ( $self, $test, @cond )= @_;
533 my $item= shift @cond;
537 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
539 $gtv= sprintf "$self->{val_fmt}", $item->[1];
541 $cstr= sprintf( "$self->{val_fmt} == $test", $item );
542 $gtv= sprintf "$self->{val_fmt}", $item;
545 return "( $cstr || ( $gtv < $test &&\n"
546 . $self->_combine( $test, @cond ) . " ) )";
553 # recursively convert an optree to text with reasonably neat formatting
555 my ( $self, $op, $combine, $brace )= @_;
559 my $cond= $self->_cond_as_str( $op, $combine );
560 my $yes= $self->_render( $op->{yes}, $combine, 1 );
561 my $no= $self->_render( $op->{no}, $combine, 0 );
562 return "( $cond )" if $yes eq '1' and $no eq '0';
563 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
564 return "$lb$cond ? $yes : $no$rb"
565 if !ref( $op->{yes} ) && !ref( $op->{no} );
567 my $ind= "\n" . ( $ind1 x $op->{depth} );
569 if ( ref $op->{yes} ) {
570 $yes= $ind . $ind1 . $yes;
575 return "$lb$cond ?$yes$ind: $no$rb";
578 # $expr=render($op,$combine)
580 # convert an optree to text with reasonably neat formatting. If $combine
581 # is true then the condition is created using "fast breakouts" which
582 # produce uglier expressions that are more efficient for common case,
583 # longer lists such as that resulting from type 'cp' output.
584 # Currently only used for type 'cp' macros.
586 my ( $self, $op, $combine )= @_;
587 my $str= "( " . $self->_render( $op, $combine ) . " )";
588 return __clean( $str );
592 # make a macro of a given type.
593 # calls into make_trie and (generic_|length_)optree as needed
595 # type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
596 # ret_type : 'cp' or 'len'
597 # safe : add length guards to macro
599 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
600 # in which case it defaults to 'cp' as well.
602 # it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
603 # sequences in it, as the generated macro will accept only a single codepoint
612 my $type= $opts{type} || 'generic';
613 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
615 and $self->{has_multi};
616 my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
619 $method= 'length_optree';
620 } elsif ( $type eq 'generic' ) {
621 $method= 'generic_optree';
625 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
626 my $text= $self->render( $optree, $type eq 'cp' );
627 my @args= $type eq 'cp' ? 'cp' : 's';
628 push @args, "e" if $opts{safe};
629 push @args, "is_utf8" if $type eq 'generic';
630 push @args, "len" if $ret_type eq 'both';
631 my $pfx= $ret_type eq 'both' ? 'what_len_' :
632 $ret_type eq 'cp' ? 'what_' : 'is_';
633 my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
634 $ext .= "_safe" if $opts{safe};
635 my $argstr= join ",", @args;
636 return "/*** GENERATED CODE ***/\n"
637 . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
640 # if we arent being used as a module (highly likely) then process
641 # the __DATA__ below and produce macros in regcharclass.h
642 # if an argument is provided to the script then it is assumed to
643 # be the path of the file to output to, if the arg is '-' outputs
647 my $path= shift @ARGV || "regcharclass.h";
649 if ( $path eq '-' ) {
652 $out_fh = open_new( $path );
654 print $out_fh read_only_top( lang => 'C', by => $0,
655 file => 'regcharclass.h', style => '*',
656 copyright => [2007, 2011] );
657 print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
659 my ( $op, $title, @txt, @types, @mods );
662 print $out_fh "/*\n\t$op: $title\n\n";
663 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
664 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
666 #die Dumper(\@types,\@mods);
668 foreach my $type_spec ( @types ) {
669 my ( $type, $ret )= split /-/, $type_spec;
671 foreach my $mod ( @mods ) {
672 next if $mod eq 'safe' and $type eq 'cp';
673 my $macro= $obj->make_macro(
676 safe => $mod eq 'safe'
678 print $out_fh $macro, "\n";
689 ( $op, $title )= split /\s*:\s*/, $_, 2;
691 } elsif ( s/^=>// ) {
692 my ( $type, $modifier )= split /:/, $_;
693 @types= split ' ', $type;
694 @mods= split ' ', $modifier;
701 print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
704 print $out_fh "/* ex: set ro: */\n";
706 read_only_bottom_close_and_rename($out_fh)
711 # Valid types: generic, LATIN1, UTF8, low, latin1, utf8
712 # default return value is octects read.
713 # append -cp to make it codepoint matched.
714 # modifiers come after the colon, valid possibilities
715 # being 'fast' and 'safe'.
717 # Accepts a single code point per line, prefaced by '0x'
718 # or a range of two code points separated by a minus (and optional space)
719 # or a single \p{} per line.
721 # This is no longer used, but retained in case it is needed some day. Put the
722 # lines below under __DATA__
723 # TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t
724 # => generic cp generic-cp generic-both :fast safe
725 # 0x00DF # LATIN SMALL LETTER SHARP S
726 # 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
727 # 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
728 # 0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
729 # 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
730 # 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
732 1; # in the unlikely case we are being used as a module
735 LNBREAK: Line Break: \R
736 => generic UTF8 LATIN1 :fast safe
737 "\x0D\x0A" # CRLF - Network (Windows) line ending
740 HORIZWS: Horizontal Whitespace: \h \H
741 => generic UTF8 LATIN1 cp :fast safe
744 VERTWS: Vertical Whitespace: \v \V
745 => generic UTF8 LATIN1 cp :fast safe