2 package CharClass::Matcher;
6 use warnings FATAL => 'all';
7 use Text::Wrap qw(wrap);
9 $Data::Dumper::Useqq= 1;
10 our $hex_fmt= "0x%02X";
15 sub ASCII_PLATFORM { (ord('A') == 65) }
17 require 'regen/regen_lib.pl';
21 CharClass::Matcher -- Generate C macros that match character classes efficiently
25 perl Porting/regcharclass.pl
29 Dynamically generates macros for detecting special charclasses
30 in latin-1, utf8, and codepoint forms. Macros can be set to return
31 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
33 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
36 Using WHATEVER as an example the following macros can be produced, depending
37 on the input parameters (how to get each is described by internal comments at
38 the C<__DATA__> line):
42 =item C<is_WHATEVER(s,is_utf8)>
44 =item C<is_WHATEVER_safe(s,e,is_utf8)>
46 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
47 comparisons involving octect<128 are done before checking the C<is_utf8>
48 flag, hopefully saving time.
50 The version without the C<_safe> suffix should be used only when the input is
51 known to be well-formed.
53 =item C<is_WHATEVER_utf8(s)>
55 =item C<is_WHATEVER_utf8_safe(s,e)>
57 Do a lookup assuming the string is encoded in (normalized) UTF8.
59 The version without the C<_safe> suffix should be used only when the input is
60 known to be well-formed.
62 =item C<is_WHATEVER_latin1(s)>
64 =item C<is_WHATEVER_latin1_safe(s,e)>
66 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
68 The version without the C<_safe> suffix should be used only when it is known
69 that C<s> contains at least one character.
71 =item C<is_WHATEVER_cp(cp)>
73 Check to see if the string matches a given codepoint (hypothetically a
74 U32). The condition is constructed as as to "break out" as early as
75 possible if the codepoint is out of range of the condition.
79 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
81 Thus if the character is X+1 only two comparisons will be done. Making
82 matching lookups slower, but non-matching faster.
84 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
86 A variant form of each of the macro types described above can be generated, in
87 which the code point is returned by the macro, and an extra parameter (in the
88 final position) is added, which is a pointer for the macro to set the byte
89 length of the returned code point.
91 These forms all have a C<what_len> prefix instead of the C<is_>, for example
92 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
93 C<what_len_WHATEVER_utf8(s,len)>.
95 These forms should not be used I<except> on small sets of mostly widely
96 separated code points; otherwise the code generated is inefficient. For these
97 cases, it is best to use the C<is_> forms, and then find the code point with
98 C<utf8_to_uvchr_buf>(). This program can fail with a "deep recursion"
99 message on the worst of the inappropriate sets. Examine the generated macro
100 to see if it is acceptable.
102 =item C<what_WHATEVER_FOO(arg1, ...)>
104 A variant form of each of the C<is_> macro types described above can be generated, in
105 which the code point and not the length is returned by the macro. These have
106 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
107 not be used where the set contains a NULL, as 0 is returned for two different
108 cases: a) the set doesn't include the input code point; b) the set does
109 include it, and it is a NULL.
115 perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
120 Author: Yves Orton (demerphq) 2007. Maintained by Perl5 Porters.
124 No tests directly here (although the regex engine will fail tests
125 if this code is broken). Insufficient documentation and no Getopts
126 handler for using the module as a script.
130 You may distribute under the terms of either the GNU General Public
131 License or the Artistic License, as specified in the README file.
135 # Sub naming convention:
136 # __func : private subroutine, can not be called as a method
137 # _func : private method, not meant for external use
138 # func : public method.
141 #-------------------------------------------------------------------------------
143 # ($cp,$n,$l,$u)=__uni_latin($str);
145 # Return a list of arrays, each of which when interpreted correctly
146 # represent the string in some given encoding with specific conditions.
148 # $cp - list of codepoints that make up the string.
149 # $n - list of octets that make up the string if all codepoints are invariant
150 # regardless of if the string is in UTF-8 or not.
151 # $l - list of octets that make up the string in latin1 encoding if all
152 # codepoints < 256, and at least one codepoint is UTF-8 variant.
153 # $u - list of octets that make up the string in utf8 if any codepoint is
157 #-----------+----------
158 # 0 - 127 : $n (127/128 are the values for ASCII platforms)
168 my $only_has_invariants = 1;
169 for my $ch ( split //, $str ) {
172 push @cp_high, $cp if $cp > 255;
173 $max= $cp if $max < $cp;
174 if (! ASCII_PLATFORM && $only_has_invariants) {
176 $only_has_invariants = 0;
180 utf8::upgrade($temp);
181 my @utf8 = unpack "U0C*", $temp;
182 $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
187 $only_has_invariants = $max < 128 if ASCII_PLATFORM;
188 if ($only_has_invariants) {
191 $l= [@cp] if $max && $max < 256;
195 $u= [ unpack "U0C*", $u ] if defined $u;
197 return ( \@cp, \@cp_high, $n, $l, $u );
201 # $clean= __clean($expr);
203 # Cleanup a ternary expression, removing unnecessary parens and apply some
204 # simplifications using regexes.
213 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
215 ## remove redundant parens
216 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
219 # repeatedly simplify conditions like
220 # ( (cond1) ? ( (cond2) ? X : Y ) : Y )
222 # ( ( (cond1) && (cond2) ) ? X : Y )
223 # Also similarly handles expressions like:
224 # : (cond1) ? ( (cond2) ? X : Y ) : Y )
225 # Note the inclusion of the close paren in ([:()]) and the open paren in ([()]) is
226 # purely to ensure we have a balanced set of parens in the expression which makes
227 # it easier to understand the pattern in an editor that understands paren's, we do
228 # not expect either of these cases to actually fire. - Yves
234 \? \s* ($parens|[^()?:\s]+?) \s*
235 : \s* ($parens|[^()?:\s]+?) \s*
239 /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
240 #$expr=~s/\(\(U8\*\)s\)\[(\d+)\]/S$1/g if length $expr > 8000;
241 #$expr=~s/\s+//g if length $expr > 8000;
243 die "Expression too long" if length $expr > 8000;
249 # $text= __macro(@args);
250 # Join args together by newlines, and then neatly add backslashes to the end
251 # of every line as expected by the C pre-processor for #define's.
255 my $str= join "\n", @_;
257 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
258 my $last= pop @lines;
259 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
260 1 while $str =~ s/^(\t*) {8}/$1\t/gm;
265 # my $op=__incrdepth($op);
267 # take an 'op' hashref and add one to it and all its childrens depths.
272 return unless ref $op;
274 __incrdepth( $op->{yes} );
275 __incrdepth( $op->{no} );
279 # join two branches of an opcode together with a condition, incrementing
280 # the depth on the yes branch when we do so.
281 # returns the new root opcode of the tree.
283 my ( $cond, $yes, $no )= @_;
286 yes => __incrdepth( $yes ),
296 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
298 # Create a new CharClass::Matcher object by parsing the text in
299 # the txt array. Currently applies the following rules:
301 # Element starts with C<0x>, line is evaled the result treated as
302 # a number which is passed to chr().
304 # Element starts with C<">, line is evaled and the result treated
307 # Each string is then stored in the 'strs' subhash as a hash record
308 # made up of the results of __uni_latin1, using the keynames
309 # 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
310 # 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
312 # Size data is tracked per type in the 'size' subhash.
320 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
326 title => $opt{title} || '',
328 foreach my $txt ( @{ $opt{txt} } ) {
330 if ( $str =~ /^[""]/ ) {
332 } elsif ($str =~ / - /x ) { # A range: Replace this element on the
333 # list with its expansion
334 my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
335 die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
336 foreach my $cp (hex $lower .. hex $upper) {
337 push @{$opt{txt}}, sprintf "0x%X", $cp;
340 } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
341 # Otherwise undocumented, a leading N means is already in the
342 # native character set; don't convert.
344 } elsif ( $str =~ /^0x/ ) {
347 # Convert from Unicode/ASCII to native, if necessary
348 $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
351 } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
353 use Unicode::UCD qw(prop_invlist);
355 my @invlist = prop_invlist($property, '_perl_core_internal_ok');
358 # An empty return could mean an unknown property, or merely
359 # that it is empty. Call in scalar context to differentiate
360 my $count = prop_invlist($property, '_perl_core_internal_ok');
361 die "$property not found" unless defined $count;
364 # Replace this element on the list with the property's expansion
365 for (my $i = 0; $i < @invlist; $i += 2) {
366 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
368 # prop_invlist() returns native values; add leading 'N'
370 push @{$opt{txt}}, sprintf "N0x%X", $cp;
374 } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
375 die "do '$1' failed: $!$@" if ! do $1 or $@;
377 } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
378 my @results = eval "$1";
379 die "eval '$1' failed: $@" if $@;
380 push @{$opt{txt}}, @results;
383 die "Unparsable line: $txt\n";
385 my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $str );
386 my $UTF8= $low || $utf8;
387 my $LATIN1= $low || $latin1;
388 my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
389 #die Dumper($txt,$cp,$low,$latin1,$utf8)
390 # if $txt=~/NEL/ or $utf8 and @$utf8>3;
392 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}=
393 ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 );
394 my $rec= $self->{strs}{$str};
395 foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
396 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
397 if $self->{strs}{$str}{$key};
399 $self->{has_multi} ||= @$cp > 1;
400 $self->{has_ascii} ||= $latin1 && @$latin1;
401 $self->{has_low} ||= $low && @$low;
402 $self->{has_high} ||= !$low && !$latin1;
404 $self->{val_fmt}= $hex_fmt;
405 $self->{count}= 0 + keys %{ $self->{strs} };
409 # my $trie = make_trie($type,$maxlen);
411 # using the data stored in the object build a trie of a specific type,
412 # and with specific maximum depth. The trie is made up the elements of
413 # the given types array for each string in the object (assuming it is
416 # returns the trie, or undef if there was no relevant data in the object.
420 my ( $self, $type, $maxlen )= @_;
422 my $strs= $self->{strs};
424 foreach my $rec ( values %$strs ) {
425 die "panic: unknown type '$type'"
426 if !exists $rec->{$type};
427 my $dat= $rec->{$type};
429 next if $maxlen && @$dat > $maxlen;
431 foreach my $elem ( @$dat ) {
432 $node->{$elem} ||= {};
433 $node= $node->{$elem};
435 $node->{''}= $rec->{str};
437 return 0 + keys( %trie ) ? \%trie : undef;
443 # This returns a list of the positions of the bits in the input word that
449 push @positions, $position if $word & 1;
456 # my $optree= _optree()
458 # recursively convert a trie to an optree where every node represents
464 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
465 return unless defined $trie;
466 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
467 die "Can't do 'cp' optree from multi-codepoint strings";
470 $else= 0 unless defined $else;
471 $depth= 0 unless defined $depth;
473 # if we have an emptry string as a key it means we are in an
474 # accepting state and unless we can match further on should
475 # return the value of the '' key.
476 if (exists $trie->{''} ) {
477 # we can now update the "else" value, anything failing to match
478 # after this point should return the value from this.
479 if ( $ret_type eq 'cp' ) {
480 $else= $self->{strs}{ $trie->{''} }{cp}[0];
481 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
482 } elsif ( $ret_type eq 'len' ) {
484 } elsif ( $ret_type eq 'both') {
485 $else= $self->{strs}{ $trie->{''} }{cp}[0];
486 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
487 $else= "len=$depth, $else";
490 # extract the meaningful keys from the trie, filter out '' as
491 # it means we are an accepting state (end of sequence).
492 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
494 # if we havent any keys there is no further we can match and we
495 # can return the "else" value.
496 return $else if !@conds;
499 my $test= $test_type =~ /^cp/ ? "cp" : "((U8*)s)[$depth]";
500 # first we loop over the possible keys/conditions and find out what they look like
501 # we group conditions with the same optree together.
504 local $Data::Dumper::Sortkeys=1;
505 foreach my $cond ( @conds ) {
507 # get the optree for this child/condition
508 my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else, $depth + 1 );
509 # convert it to a string with Dumper
510 my $res_code= Dumper( $res );
512 push @{$dmp_res{$res_code}{vals}}, $cond;
513 if (!$dmp_res{$res_code}{optree}) {
514 $dmp_res{$res_code}{optree}= $res;
515 push @res_order, $res_code;
519 # now that we have deduped the optrees we construct a new optree containing the merged
523 foreach my $res_code_idx (0 .. $#res_order) {
524 my $res_code= $res_order[$res_code_idx];
525 $node->{vals}= $dmp_res{$res_code}{vals};
526 $node->{test}= $test;
527 $node->{yes}= $dmp_res{$res_code}{optree};
528 $node->{depth}= $depth;
529 if ($res_code_idx < $#res_order) {
530 $node= $node->{no}= {};
540 # my $optree= optree(%opts);
542 # Convert a trie to an optree, wrapper for _optree
547 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
548 $opt{ret_type} ||= 'len';
549 my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
550 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
553 # my $optree= generic_optree(%opts);
555 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
556 # sets of strings, including a branch for handling the string type check.
563 $opt{ret_type} ||= 'len';
564 my $test_type= 'depth';
565 my $else= $opt{else} || 0;
567 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
568 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
570 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
574 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
575 } elsif ( $latin1 ) {
576 $else= __cond_join( "!( is_utf8 )", $latin1, $else );
578 my $low= $self->make_trie( 'low', $opt{max_depth} );
580 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
588 # create a string length guarded optree.
594 my $type= $opt{type};
596 die "Can't do a length_optree on type 'cp', makes no sense."
599 my ( @size, $method );
601 if ( $type eq 'generic' ) {
602 $method= 'generic_optree';
604 %{ $self->{size}{low} || {} },
605 %{ $self->{size}{latin1} || {} },
606 %{ $self->{size}{utf8} || {} }
608 @size= sort { $a <=> $b } keys %sizes;
611 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
614 my $else= ( $opt{else} ||= 0 );
615 for my $size ( @size ) {
616 my $optree= $self->$method( %opt, type => $type, max_depth => $size );
617 my $cond= "((e)-(s) > " . ( $size - 1 ).")";
618 $else= __cond_join( $cond, $optree, $else );
623 sub calculate_mask(@) {
624 # Look at the input list of byte values. This routine returns an array of
625 # mask/base pairs to generate that list.
628 my $list_count = @list;
630 # Consider a set of byte values, A, B, C .... If we want to determine if
631 # <c> is one of them, we can write c==A || c==B || c==C .... If the
632 # values are consecutive, we can shorten that to A<=c && c<=Z, which uses
633 # far fewer branches. If only some of them are consecutive we can still
634 # save some branches by creating range tests for just those that are
635 # consecutive. _cond_as_str() does this work for looking for ranges.
637 # Another approach is to look at the bit patterns for A, B, C .... and see
638 # if they have some commonalities. That's what this function does. For
639 # example, consider a set consisting of the bytes
640 # 0xF0, 0xF1, 0xF2, and 0xF3. We could write:
641 # 0xF0 <= c && c <= 0xF4
642 # But the following mask/compare also works, and has just one test:
644 # The reason it works is that the set consists of exactly those bytes
645 # whose first 4 bits are 1, and the next two are 0. (The value of the
646 # other 2 bits is immaterial in determining if a byte is in the set or
647 # not.) The mask masks out those 2 irrelevant bits, and the comparison
648 # makes sure that the result matches all bytes which match those 6
649 # material bits exactly. In other words, the set of bytes contains
650 # exactly those whose bottom two bit positions are either 0 or 1. The
651 # same principle applies to bit positions that are not necessarily
652 # adjacent. And it can be applied to bytes that differ in 1 through all 8
653 # bit positions. In order to be a candidate for this optimization, the
654 # number of bytes in the set must be a power of 2.
656 # Consider a different example, the set 0x53, 0x54, 0x73, and 0x74. That
657 # requires 4 tests using either ranges or individual values, and even
658 # though the number in the set is a power of 2, it doesn't qualify for the
659 # mask optimization described above because the number of bits that are
660 # different is too large for that. However, the set can be expressed as
661 # two branches with masks thusly:
662 # (c & 0xDF) == 0x53 || (c & 0xDF) == 0x54
663 # a branch savings of 50%. This is done by splitting the set into two
664 # subsets each of which has 2 elements, and within each set the values
667 # This function attempts to find some way to save some branches using the
668 # mask technique. If not, it returns an empty list; if so, it
669 # returns a list consisting of
670 # [ [compare1, mask1], [compare2, mask2], ...
671 # [compare_n, undef], [compare_m, undef], ...
673 # The <mask> is undef in the above for those bytes that must be tested
676 # This function does not attempt to find the optimal set. To do so would
677 # probably require testing all possible combinations, and keeping track of
678 # the current best one.
680 # There are probably much better algorithms, but this is the one I (khw)
681 # came up with. We start with doing a bit-wise compare of every byte in
682 # the set with every other byte. The results are sorted into arrays of
683 # all those that differ by the same bit positions. These are stored in a
684 # hash with the each key being the bits they differ in. Here is the hash
685 # for the 0x53, 0x54, 0x73, 0x74 set:
713 # The set consisting of values which differ in the 4 bit positions 0, 1,
714 # 2, and 5 from some other value in the set consists of all 4 values.
715 # Likewise all 4 values differ from some other value in the 3 bit
716 # positions 0, 1, and 2; and all 4 values differ from some other value in
717 # the single bit position 5. The keys at the uppermost level in the above
718 # hash, 1, 3, and 4, give the number of bit positions that each sub-key
719 # below it has. For example, the 4 key could have as its value an array
720 # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were
721 # such. The best optimization will group the most values into a single
722 # mask. The most values will be the ones that differ in the most
723 # positions, the ones with the largest value for the topmost key. These
724 # keys, are thus just for convenience of sorting by that number, and do
725 # not have any bearing on the core of the algorithm.
727 # We start with an element from largest number of differing bits. The
728 # largest in this case is 4 bits, and there is only one situation in this
729 # set which has 4 differing bits, "0,1,2,5". We look for any subset of
730 # this set which has 16 values that differ in these 4 bits. There aren't
731 # any, because there are only 4 values in the entire set. We then look at
732 # the next possible thing, which is 3 bits differing in positions "0,1,2".
733 # We look for a subset that has 8 values that differ in these 3 bits.
734 # Again there are none. So we go to look for the next possible thing,
735 # which is a subset of 2**1 values that differ only in bit position 5. 83
736 # and 115 do, so we calculate a mask and base for those and remove them
737 # from every set. Since there is only the one set remaining, we remove
738 # them from just this one. We then look to see if there is another set of
739 # 2 values that differ in bit position 5. 84 and 116 do, so we calculate
740 # a mask and base for those and remove them from every set (again only
741 # this set remains in this example). The set is now empty, and there are
742 # no more sets to look at, so we are done.
744 if ($list_count == 256) { # All 256 is trivially masked
750 # Generate bits-differing lists for each element compared against each
752 for my $i (0 .. $list_count - 2) {
753 for my $j ($i + 1 .. $list_count - 1) {
754 my @bits_that_differ = pop_count($list[$i] ^ $list[$j]);
755 my $differ_count = @bits_that_differ;
756 my $key = join ",", @bits_that_differ;
757 push @{$hash{$differ_count}{$key}}, $list[$i] unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}};
758 push @{$hash{$differ_count}{$key}}, $list[$j];
762 print STDERR __LINE__, ": calculate_mask() called: List of values grouped by differing bits: ", Dumper \%hash if DEBUG;
765 foreach my $count (reverse sort { $a <=> $b } keys %hash) {
766 my $need = 2 ** $count; # Need 8 values for 3 differing bits, etc
767 foreach my $bits (sort keys $hash{$count}) {
769 print STDERR __LINE__, ": For $count bit(s) difference ($bits), need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
771 # Look only as long as there are at least as many elements in the
772 # subset as are needed
773 while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) {
775 print STDERR __LINE__, ": Looking at bit positions ($bits): ", Dumper $hash{$count}{$bits} if DEBUG;
777 # Start with the first element in it
778 my $try_base = $hash{$count}{$bits}[0];
779 my @subset = $try_base;
781 # If it succeeds, we return a mask and a base to compare
782 # against the masked value. That base will be the AND of
783 # every element in the subset. Initialize to the one element
785 my $compare = $try_base;
787 # We are trying to find a subset of this that has <need>
788 # elements that differ in the bit positions given by the
789 # string $bits, which is comma separated.
790 my @bits = split ",", $bits;
792 TRY: # Look through the remainder of the list for other
793 # elements that differ only by these bit positions.
795 for (my $i = 1; $i < $cur_count; $i++) {
796 my $try_this = $hash{$count}{$bits}[$i];
797 my @positions = pop_count($try_base ^ $try_this);
799 print STDERR __LINE__, ": $try_base vs $try_this: is (", join(',', @positions), ") a subset of ($bits)?" if DEBUG;;
801 foreach my $pos (@positions) {
802 unless (grep { $pos == $_ } @bits) {
803 print STDERR " No\n" if DEBUG;
804 my $remaining = $cur_count - $i - 1;
805 if ($remaining && @subset + $remaining < $need) {
806 print STDERR __LINE__, ": Can stop trying $try_base, because even if all the remaining $remaining values work, they wouldn't add up to the needed $need when combined with the existing ", scalar @subset, " ones\n" if DEBUG;
813 print STDERR " Yes\n" if DEBUG;
814 push @subset, $try_this;
816 # Add this to the mask base, in case it ultimately
818 $compare &= $try_this;
821 print STDERR __LINE__, ": subset (", join(", ", @subset), ") has ", scalar @subset, " elements; needs $need\n" if DEBUG;
823 if (@subset < $need) {
824 shift @{$hash{$count}{$bits}};
825 next; # Try with next value
830 foreach my $position (@bits) {
831 $mask |= 1 << $position;
833 $mask = ~$mask & 0xFF;
834 push @final_results, [$compare, $mask];
836 printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n", __LINE__, $compare, $compare, $mask if DEBUG;
838 # These values are now spoken for. Remove them from future
840 foreach my $remove_count (sort keys %hash) {
841 foreach my $bits (sort keys %{$hash{$remove_count}}) {
842 foreach my $to_remove (@subset) {
843 @{$hash{$remove_count}{$bits}} = grep { $_ != $to_remove } @{$hash{$remove_count}{$bits}};
851 # Any values that remain in the list are ones that have to be tested for
854 foreach my $count (reverse sort { $a <=> $b } keys %hash) {
855 foreach my $bits (sort keys $hash{$count}) {
856 foreach my $remaining (@{$hash{$count}{$bits}}) {
858 # If we already know about this value, just ignore it.
859 next if grep { $remaining == $_ } @individuals;
861 # Otherwise it needs to be returned as something to match
863 push @final_results, [$remaining, undef];
864 push @individuals, $remaining;
869 # Sort by increasing numeric value
870 @final_results = sort { $a->[0] <=> $b->[0] } @final_results;
872 print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG;
874 return @final_results;
878 # turn a list of conditions into a text expression
879 # - merges ranges of conditions, and joins the result with ||
881 my ( $self, $op, $combine, $opts_ref )= @_;
882 my $cond= $op->{vals};
883 my $test= $op->{test};
884 my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
885 return "( $test )" if !defined $cond;
890 # We skip this if there are optimizations that
891 # we can apply (below) to the individual ranges
892 if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
893 if ( $ranges[-1][0] == $ranges[-1][1] ) {
894 $ranges[-1]= $ranges[-1][0];
895 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
896 $ranges[-1]= $ranges[-1][0];
897 push @ranges, $ranges[-1] + 1;
901 for my $condition ( @$cond ) {
902 if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
904 push @ranges, [ $condition, $condition ];
911 return $self->_combine( $test, @ranges )
918 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
920 : sprintf( "$self->{val_fmt} == $test", $_ );
923 return "( " . join( " || ", @ranges ) . " )";
926 # If the input set has certain characteristics, we can optimize tests
927 # for it. This doesn't apply if returning the code point, as we want
928 # each element of the set individually. The code above is for this
931 return 1 if @$cond == 256; # If all bytes match, is trivially true
936 # See if the entire set shares optimizable characterstics, and if so,
937 # return the optimization. We delay checking for this on sets with
938 # just a single range, as there may be better optimizations available
940 @masks = calculate_mask(@$cond);
942 # Stringify the output of calculate_mask()
945 foreach my $mask_ref (@masks) {
946 if (defined $mask_ref->[1]) {
947 push @return, sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask_ref->[1], $mask_ref->[0];
949 else { # An undefined mask means to use the value as-is
950 push @return, sprintf "$test == $self->{val_fmt}", $mask_ref->[0];
954 # The best possible case below for specifying this set of values via
955 # ranges is 1 branch per range. If our mask method yielded better
956 # results, there is no sense trying something that is bound to be
958 if (@return < @ranges) {
959 return "( " . join( " || ", @return ) . " )";
966 # Here, there was no entire-class optimization that was clearly better
967 # than doing things by ranges. Look at each range.
968 my $range_count_extra = 0;
969 for (my $i = 0; $i < @ranges; $i++) {
970 if (! ref $ranges[$i]) { # Trivial case: no range
971 $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
973 elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
974 $ranges[$i] = # Trivial case: single element range
975 sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
980 # Well-formed UTF-8 continuation bytes on ascii platforms must be
981 # in the range 0x80 .. 0xBF. If we know that the input is
982 # well-formed (indicated by not trying to be 'safe'), we can omit
983 # tests that verify that the input is within either of these
984 # bounds. (No legal UTF-8 character can begin with anything in
985 # this range, so we don't have to worry about this being a
986 # continuation byte or not.)
988 && ! $opts_ref->{safe}
989 && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
991 my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
992 my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
994 # If the range is the entire legal range, it matches any legal
995 # byte, so we can omit both tests. (This should happen only
996 # if the number of ranges is 1.)
997 if ($lower_limit_is_80 && $upper_limit_is_BF) {
1000 elsif ($lower_limit_is_80) { # Just use the upper limit test
1001 $output = sprintf("( $test <= $self->{val_fmt} )",
1004 elsif ($upper_limit_is_BF) { # Just use the lower limit test
1005 $output = sprintf("( $test >= $self->{val_fmt} )",
1010 # If we didn't change to omit a test above, see if the number of
1011 # elements is a power of 2 (only a single bit in the
1012 # representation of its count will be set) and if so, it may be
1013 # that a mask/compare optimization is possible.
1015 && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
1018 push @list, $_ for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
1019 my @this_masks = calculate_mask(@list);
1021 # Use the mask if there is just one for the whole range.
1022 # Otherwise there is no savings over the two branches that can
1024 if (@this_masks == 1 && defined $this_masks[0][1]) {
1025 $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $this_masks[0][1], $this_masks[0][0];
1029 if ($output ne "") { # Prefer any optimization
1030 $ranges[$i] = $output;
1033 # No optimization happened. We need a test that the code
1034 # point is within both bounds. But, if the bounds are
1035 # adjacent code points, it is cleaner to say
1036 # 'first == test || second == test'
1038 # 'first <= test && test <= second'
1040 $range_count_extra++; # This range requires 2 branches to
1042 if ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
1044 . join( " || ", ( map
1045 { sprintf "$self->{val_fmt} == $test", $_ }
1049 else { # Full bounds checking
1050 $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
1056 # We have generated the list of bytes in two ways; one trying to use masks
1057 # to cut the number of branches down, and the other to look at individual
1058 # ranges (some of which could be cut down by using a mask for just it).
1059 # We return whichever method uses the fewest branches.
1061 . join( " || ", (@masks && @masks < @ranges + $range_count_extra)
1068 # recursively turn a list of conditions into a fast break-out condition
1069 # used by _cond_as_str() for 'cp' type macros.
1071 my ( $self, $test, @cond )= @_;
1073 my $item= shift @cond;
1077 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
1079 $gtv= sprintf "$self->{val_fmt}", $item->[1];
1081 $cstr= sprintf( "$self->{val_fmt} == $test", $item );
1082 $gtv= sprintf "$self->{val_fmt}", $item;
1085 my $combine= $self->_combine( $test, @cond );
1087 return "( $cstr || ( $gtv < $test &&\n"
1088 . $combine . " ) )";
1090 return "( $cstr || $combine )";
1098 # recursively convert an optree to text with reasonably neat formatting
1100 my ( $self, $op, $combine, $brace, $opts_ref, $def, $submacros )= @_;
1101 return 0 if ! defined $op; # The set is empty
1105 my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
1106 #no warnings 'recursion'; # This would allow really really inefficient
1107 # code to be generated. See pod
1108 my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref, $def, $submacros );
1109 return $yes if $cond eq '1';
1111 my $no= $self->_render( $op->{no}, $combine, 0, $opts_ref, $def, $submacros );
1112 return "( $cond )" if $yes eq '1' and $no eq '0';
1113 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
1114 return "$lb$cond ? $yes : $no$rb"
1115 if !ref( $op->{yes} ) && !ref( $op->{no} );
1117 my $ind= "\n" . ( $ind1 x $op->{depth} );
1119 if ( ref $op->{yes} ) {
1120 $yes= $ind . $ind1 . $yes;
1125 my $str= "$lb$cond ?$yes$ind: $no$rb";
1126 if (length $str > 6000) {
1127 push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $yes_idx= 0+@$submacros), $yes;
1128 push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $no_idx= 0+@$submacros), $no;
1129 return sprintf "%s%s ? $def : $def%s", $lb, $cond, "_part$yes_idx", "_part$no_idx", $rb;
1134 # $expr=render($op,$combine)
1136 # convert an optree to text with reasonably neat formatting. If $combine
1137 # is true then the condition is created using "fast breakouts" which
1138 # produce uglier expressions that are more efficient for common case,
1139 # longer lists such as that resulting from type 'cp' output.
1140 # Currently only used for type 'cp' macros.
1142 my ( $self, $op, $combine, $opts_ref, $def_fmt )= @_;
1145 my $macro= sprintf "#define $def_fmt\n( %s )", "", $self->_render( $op, $combine, 0, $opts_ref, $def_fmt, \@submacros );
1147 return join "\n\n", map { "/*** GENERATED CODE ***/\n" . __macro( __clean( $_ ) ) } @submacros, $macro;
1151 # make a macro of a given type.
1152 # calls into make_trie and (generic_|length_)optree as needed
1154 # type : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
1155 # ret_type : 'cp' or 'len'
1156 # safe : add length guards to macro
1158 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
1159 # in which case it defaults to 'cp' as well.
1161 # it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
1162 # sequences in it, as the generated macro will accept only a single codepoint
1165 # returns the macro.
1171 my $type= $opts{type} || 'generic';
1172 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
1174 and $self->{has_multi};
1175 my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' );
1177 if ( $opts{safe} ) {
1178 $method= 'length_optree';
1179 } elsif ( $type eq 'generic' ) {
1180 $method= 'generic_optree';
1184 my @args= $type =~ /^cp/ ? 'cp' : 's';
1185 push @args, "e" if $opts{safe};
1186 push @args, "is_utf8" if $type eq 'generic';
1187 push @args, "len" if $ret_type eq 'both';
1188 my $pfx= $ret_type eq 'both' ? 'what_len_' :
1189 $ret_type eq 'cp' ? 'what_' : 'is_';
1190 my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
1191 $ext .= "_safe" if $opts{safe};
1192 my $argstr= join ",", @args;
1193 my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
1194 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
1195 return $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts, $def_fmt );
1198 # if we arent being used as a module (highly likely) then process
1199 # the __DATA__ below and produce macros in regcharclass.h
1200 # if an argument is provided to the script then it is assumed to
1201 # be the path of the file to output to, if the arg is '-' outputs
1205 my $path= shift @ARGV || "regcharclass.h";
1207 if ( $path eq '-' ) {
1210 $out_fh = open_new( $path );
1212 print $out_fh read_only_top( lang => 'C', by => $0,
1213 file => 'regcharclass.h', style => '*',
1214 copyright => [2007, 2011] );
1215 print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
1217 my ( $op, $title, @txt, @types, %mods );
1221 # Skip if to compile on a different platform.
1222 return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
1223 return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
1225 print $out_fh "/*\n\t$op: $title\n\n";
1226 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1227 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
1229 #die Dumper(\@types,\%mods);
1232 push @mods, 'safe' if delete $mods{safe};
1233 unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
1238 die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods;
1241 foreach my $type_spec ( @types ) {
1242 my ( $type, $ret )= split /-/, $type_spec;
1244 foreach my $mod ( @mods ) {
1245 next if $mod eq 'safe' and $type =~ /^cp/;
1247 my $macro= $obj->make_macro(
1250 safe => $mod eq 'safe'
1252 print $out_fh $macro, "\n";
1258 s/^ \s* (?: \# .* ) ? $ //x; # squeeze out comment and blanks
1262 $doit->(); # This starts a new definition; do the previous one
1263 ( $op, $title )= split /\s*:\s*/, $_, 2;
1265 } elsif ( s/^=>// ) {
1266 my ( $type, $modifier )= split /:/, $_;
1267 @types= split ' ', $type;
1269 map { $mods{$_} = 1 } split ' ', $modifier;
1276 print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
1279 print $out_fh "/* ex: set ro: */\n";
1281 read_only_bottom_close_and_rename($out_fh)
1285 # The form of the input is a series of definitions to make macros for.
1286 # The first line gives the base name of the macro, followed by a colon, and
1287 # then text to be used in comments associated with the macro that are its
1288 # title or description. In all cases the first (perhaps only) parameter to
1289 # the macro is a pointer to the first byte of the code point it is to test to
1290 # see if it is in the class determined by the macro. In the case of non-UTF8,
1291 # the code point consists only of a single byte.
1293 # The second line must begin with a '=>' and be followed by the types of
1294 # macro(s) to be generated; these are specified below. A colon follows the
1295 # types, followed by the modifiers, also specified below. At least one
1296 # modifier is required.
1298 # The subsequent lines give what code points go into the class defined by the
1299 # macro. Multiple characters may be specified via a string like "\x0D\x0A",
1300 # enclosed in quotes. Otherwise the lines consist of one of:
1301 # 1) a single Unicode code point, prefaced by 0x
1302 # 2) a single range of Unicode code points separated by a minus (and
1304 # 3) a single Unicode property specified in the standard Perl form
1306 # 4) a line like 'do path'. This will do a 'do' on the file given by
1307 # 'path'. It is assumed that this does nothing but load subroutines
1308 # (See item 5 below). The reason 'require path' is not used instead is
1309 # because 'do' doesn't assume that path is in @INC.
1310 # 5) a subroutine call
1311 # &pkg::foo(arg1, ...)
1312 # where pkg::foo was loaded by a 'do' line (item 4). The subroutine
1313 # returns an array of entries of forms like items 1-3 above. This
1314 # allows more complex inputs than achievable from the other input types.
1316 # A blank line or one whose first non-blank character is '#' is a comment.
1317 # The definition of the macro is terminated by a line unlike those described.
1320 # low generate a macro whose name is 'is_BASE_low' and defines a
1321 # class that includes only ASCII-range chars. (BASE is the
1322 # input macro base name.)
1323 # latin1 generate a macro whose name is 'is_BASE_latin1' and defines a
1324 # class that includes only upper-Latin1-range chars. It is not
1325 # designed to take a UTF-8 input parameter.
1326 # high generate a macro whose name is 'is_BASE_high' and defines a
1327 # class that includes all relevant code points that are above
1328 # the Latin1 range. This is for very specialized uses only.
1329 # It is designed to take only an input UTF-8 parameter.
1330 # utf8 generate a macro whose name is 'is_BASE_utf8' and defines a
1331 # class that includes all relevant characters that aren't ASCII.
1332 # It is designed to take only an input UTF-8 parameter.
1333 # LATIN1 generate a macro whose name is 'is_BASE_latin1' and defines a
1334 # class that includes both ASCII and upper-Latin1-range chars.
1335 # It is not designed to take a UTF-8 input parameter.
1336 # UTF8 generate a macro whose name is 'is_BASE_utf8' and defines a
1337 # class that can include any code point, adding the 'low' ones
1338 # to what 'utf8' works on. It is designed to take only an input
1340 # generic generate a macro whose name is 'is_BASE". It has a 2nd,
1341 # boolean, parameter which indicates if the first one points to
1342 # a UTF-8 string or not. Thus it works in all circumstances.
1343 # cp generate a macro whose name is 'is_BASE_cp' and defines a
1344 # class that returns true if the UV parameter is a member of the
1345 # class; false if not.
1346 # cp_high like cp, but it is assumed that it is known that the UV
1347 # parameter is above Latin1. The name of the generated macro is
1348 # 'is_BASE_cp_high'. This is different from high-cp, derived
1350 # A macro of the given type is generated for each type listed in the input.
1351 # The default return value is the number of octets read to generate the match.
1352 # Append "-cp" to the type to have it instead return the matched codepoint.
1353 # The macro name is changed to 'what_BASE...'. See pod for
1355 # Appending '-both" instead adds an extra parameter to the end of the argument
1356 # list, which is a pointer as to where to store the number of
1357 # bytes matched, while also returning the code point. The macro
1358 # name is changed to 'what_len_BASE...'. See pod for caveats
1361 # safe The input string is not necessarily valid UTF-8. In
1362 # particular an extra parameter (always the 2nd) to the macro is
1363 # required, which points to one beyond the end of the string.
1364 # The macro will make sure not to read off the end of the
1365 # string. In the case of non-UTF8, it makes sure that the
1366 # string has at least one byte in it. The macro name has
1367 # '_safe' appended to it.
1368 # fast The input string is valid UTF-8. No bounds checking is done,
1369 # and the macro can make assumptions that lead to faster
1371 # only_ascii_platform Skip this definition if this program is being run on
1372 # a non-ASCII platform.
1373 # only_ebcdic_platform Skip this definition if this program is being run on
1374 # a non-EBCDIC platform.
1375 # No modifier need be specified; fast is assumed for this case. If both
1376 # 'fast', and 'safe' are specified, two macros will be created for each
1379 # If run on a non-ASCII platform will automatically convert the Unicode input
1380 # to native. The documentation above is slightly wrong in this case. 'low'
1381 # actually refers to code points whose UTF-8 representation is the same as the
1382 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1383 # code points less than 256.
1385 1; # in the unlikely case we are being used as a module
1388 # This is no longer used, but retained in case it is needed some day.
1389 # TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t
1390 # => generic cp generic-cp generic-both :fast safe
1391 # 0x00DF # LATIN SMALL LETTER SHARP S
1392 # 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1393 # 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1394 # 0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1395 # 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1396 # 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1398 LNBREAK: Line Break: \R
1399 => generic UTF8 LATIN1 :fast safe
1400 "\x0D\x0A" # CRLF - Network (Windows) line ending
1403 HORIZWS: Horizontal Whitespace: \h \H
1404 => generic UTF8 LATIN1 high cp cp_high :fast safe
1407 VERTWS: Vertical Whitespace: \v \V
1408 => generic UTF8 high LATIN1 cp cp_high :fast safe
1411 XDIGIT: Hexadecimal digits
1412 => UTF8 high cp_high :fast
1415 XPERLSPACE: \p{XPerlSpace}
1416 => generic UTF8 high cp_high :fast
1419 REPLACEMENT: Unicode REPLACEMENT CHARACTER
1423 NONCHAR: Non character code points
1427 SURROGATE: Surrogate characters
1431 GCB_L: Grapheme_Cluster_Break=L
1435 GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
1439 GCB_Prepend: Grapheme_Cluster_Break=Prepend
1443 GCB_RI: Grapheme_Cluster_Break=RI
1447 GCB_SPECIAL_BEGIN_START: Grapheme_Cluster_Break=special_begin_starts
1449 \p{_X_Special_Begin_Start}
1451 GCB_T: Grapheme_Cluster_Break=T
1455 GCB_V: Grapheme_Cluster_Break=V
1459 # This program was run with this enabled, and the results copied to utf8.h;
1460 # then this was commented out because it takes so long to figure out these 2
1461 # million code points. The results would not change unless utf8.h decides it
1462 # wants a maximum other than 4 bytes, or this program creates better
1464 #UTF8_CHAR: Matches utf8 from 1 to 4 bytes
1465 #=> UTF8 :safe only_ascii_platform
1468 # This hasn't been commented out, because we haven't an EBCDIC platform to run
1469 # it on, and the 3 types of EBCDIC allegedly supported by Perl would have
1471 UTF8_CHAR: Matches utf8 from 1 to 5 bytes
1472 => UTF8 :safe only_ebcdic_platform
1475 QUOTEMETA: Meta-characters that \Q should quote
1479 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1481 do regen/regcharclass_multi_char_folds.pl
1484 ®charclass_multi_char_folds::multi_char_folds(1)
1486 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1489 ®charclass_multi_char_folds::multi_char_folds(0)
1492 PATWS: pattern white space
1493 => generic cp : fast safe