2 package CharClass::Matcher;
6 use warnings FATAL => 'all';
7 use Text::Wrap qw(wrap);
9 $Data::Dumper::Useqq= 1;
10 our $hex_fmt= "0x%02X";
12 sub ASCII_PLATFORM { (ord('A') == 65) }
14 require 'regen/regen_lib.pl';
18 CharClass::Matcher -- Generate C macros that match character classes efficiently
22 perl Porting/regcharclass.pl
26 Dynamically generates macros for detecting special charclasses
27 in latin-1, utf8, and codepoint forms. Macros can be set to return
28 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
30 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
33 Using WHATEVER as an example the following macros can be produced, depending
34 on the input parameters (how to get each is described by internal comments at
35 the C<__DATA__> line):
39 =item C<is_WHATEVER(s,is_utf8)>
41 =item C<is_WHATEVER_safe(s,e,is_utf8)>
43 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
44 comparisons involving octect<128 are done before checking the C<is_utf8>
45 flag, hopefully saving time.
47 The version without the C<_safe> suffix should be used only when the input is
48 known to be well-formed.
50 =item C<is_WHATEVER_utf8(s)>
52 =item C<is_WHATEVER_utf8_safe(s,e)>
54 Do a lookup assuming the string is encoded in (normalized) UTF8.
56 The version without the C<_safe> suffix should be used only when the input is
57 known to be well-formed.
59 =item C<is_WHATEVER_latin1(s)>
61 =item C<is_WHATEVER_latin1_safe(s,e)>
63 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
65 The version without the C<_safe> suffix should be used only when it is known
66 that C<s> contains at least one character.
68 =item C<is_WHATEVER_cp(cp)>
70 Check to see if the string matches a given codepoint (hypothetically a
71 U32). The condition is constructed as as to "break out" as early as
72 possible if the codepoint is out of range of the condition.
76 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
78 Thus if the character is X+1 only two comparisons will be done. Making
79 matching lookups slower, but non-matching faster.
81 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
83 A variant form of each of the macro types described above can be generated, in
84 which the code point is returned by the macro, and an extra parameter (in the
85 final position) is added, which is a pointer for the macro to set the byte
86 length of the returned code point.
88 These forms all have a C<what_len> prefix instead of the C<is_>, for example
89 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
90 C<what_len_WHATEVER_utf8(s,len)>.
92 These forms should not be used I<except> on small sets of mostly widely
93 separated code points; otherwise the code generated is inefficient. For these
94 cases, it is best to use the C<is_> forms, and then find the code point with
95 C<utf8_to_uvchr_buf>(). This program can fail with a "deep recursion"
96 message on the worst of the inappropriate sets. Examine the generated macro
97 to see if it is acceptable.
99 =item C<what_WHATEVER_FOO(arg1, ...)>
101 A variant form of each of the C<is_> macro types described above can be generated, in
102 which the code point and not the length is returned by the macro. These have
103 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
104 not be used where the set contains a NULL, as 0 is returned for two different
105 cases: a) the set doesn't include the input code point; b) the set does
106 include it, and it is a NULL.
112 perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
117 Author: Yves Orton (demerphq) 2007. Maintained by Perl5 Porters.
121 No tests directly here (although the regex engine will fail tests
122 if this code is broken). Insufficient documentation and no Getopts
123 handler for using the module as a script.
127 You may distribute under the terms of either the GNU General Public
128 License or the Artistic License, as specified in the README file.
132 # Sub naming convention:
133 # __func : private subroutine, can not be called as a method
134 # _func : private method, not meant for external use
135 # func : public method.
138 #-------------------------------------------------------------------------------
140 # ($cp,$n,$l,$u)=__uni_latin($str);
142 # Return a list of arrays, each of which when interpreted correctly
143 # represent the string in some given encoding with specific conditions.
145 # $cp - list of codepoints that make up the string.
146 # $n - list of octets that make up the string if all codepoints are invariant
147 # regardless of if the string is in UTF-8 or not.
148 # $l - list of octets that make up the string in latin1 encoding if all
149 # codepoints < 256, and at least one codepoint is UTF-8 variant.
150 # $u - list of octets that make up the string in utf8 if any codepoint is
154 #-----------+----------
155 # 0 - 127 : $n (127/128 are the values for ASCII platforms)
164 my $only_has_invariants = 1;
165 for my $ch ( split //, $str ) {
168 $max= $cp if $max < $cp;
169 if (! ASCII_PLATFORM && $only_has_invariants) {
171 $only_has_invariants = 0;
175 utf8::upgrade($temp);
176 my @utf8 = unpack "U0C*", $temp;
177 $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
182 $only_has_invariants = $max < 128 if ASCII_PLATFORM;
183 if ($only_has_invariants) {
186 $l= [@cp] if $max && $max < 256;
190 $u= [ unpack "U0C*", $u ] if defined $u;
192 return ( \@cp, $n, $l, $u );
196 # $clean= __clean($expr);
198 # Cleanup a ternary expression, removing unnecessary parens and apply some
199 # simplifications using regexes.
205 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
207 #print "$parens\n$expr\n";
208 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
209 1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
210 \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
211 \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
216 # $text= __macro(@args);
217 # Join args together by newlines, and then neatly add backslashes to the end
218 # of every line as expected by the C pre-processor for #define's.
222 my $str= join "\n", @_;
224 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
225 my $last= pop @lines;
226 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
227 1 while $str =~ s/^(\t*) {8}/$1\t/gm;
232 # my $op=__incrdepth($op);
234 # take an 'op' hashref and add one to it and all its childrens depths.
239 return unless ref $op;
241 __incrdepth( $op->{yes} );
242 __incrdepth( $op->{no} );
246 # join two branches of an opcode together with a condition, incrementing
247 # the depth on the yes branch when we do so.
248 # returns the new root opcode of the tree.
250 my ( $cond, $yes, $no )= @_;
253 yes => __incrdepth( $yes ),
263 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
265 # Create a new CharClass::Matcher object by parsing the text in
266 # the txt array. Currently applies the following rules:
268 # Element starts with C<0x>, line is evaled the result treated as
269 # a number which is passed to chr().
271 # Element starts with C<">, line is evaled and the result treated
274 # Each string is then stored in the 'strs' subhash as a hash record
275 # made up of the results of __uni_latin1, using the keynames
276 # 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
277 # 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
279 # Size data is tracked per type in the 'size' subhash.
287 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
293 title => $opt{title} || '',
295 foreach my $txt ( @{ $opt{txt} } ) {
297 if ( $str =~ /^[""]/ ) {
299 } elsif ($str =~ / - /x ) { # A range: Replace this element on the
300 # list with its expansion
301 my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
302 die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
303 foreach my $cp (hex $lower .. hex $upper) {
304 push @{$opt{txt}}, sprintf "0x%X", $cp;
307 } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
308 # Otherwise undocumented, a leading N means is already in the
309 # native character set; don't convert.
311 } elsif ( $str =~ /^0x/ ) {
314 # Convert from Unicode/ASCII to native, if necessary
315 $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
318 } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
320 use Unicode::UCD qw(prop_invlist);
322 my @invlist = prop_invlist($property, '_perl_core_internal_ok');
325 # An empty return could mean an unknown property, or merely
326 # that it is empty. Call in scalar context to differentiate
327 my $count = prop_invlist($property, '_perl_core_internal_ok');
328 die "$property not found" unless defined $count;
331 # Replace this element on the list with the property's expansion
332 for (my $i = 0; $i < @invlist; $i += 2) {
333 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
335 # prop_invlist() returns native values; add leading 'N'
337 push @{$opt{txt}}, sprintf "N0x%X", $cp;
342 die "Unparsable line: $txt\n";
344 my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
345 my $UTF8= $low || $utf8;
346 my $LATIN1= $low || $latin1;
347 my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
348 #die Dumper($txt,$cp,$low,$latin1,$utf8)
349 # if $txt=~/NEL/ or $utf8 and @$utf8>3;
351 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp UTF8 LATIN1 )}=
352 ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $UTF8, $LATIN1 );
353 my $rec= $self->{strs}{$str};
354 foreach my $key ( qw(low utf8 latin1 high cp UTF8 LATIN1) ) {
355 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
356 if $self->{strs}{$str}{$key};
358 $self->{has_multi} ||= @$cp > 1;
359 $self->{has_ascii} ||= $latin1 && @$latin1;
360 $self->{has_low} ||= $low && @$low;
361 $self->{has_high} ||= !$low && !$latin1;
363 $self->{val_fmt}= $hex_fmt;
364 $self->{count}= 0 + keys %{ $self->{strs} };
368 # my $trie = make_trie($type,$maxlen);
370 # using the data stored in the object build a trie of a specific type,
371 # and with specific maximum depth. The trie is made up the elements of
372 # the given types array for each string in the object (assuming it is
375 # returns the trie, or undef if there was no relevant data in the object.
379 my ( $self, $type, $maxlen )= @_;
381 my $strs= $self->{strs};
383 foreach my $rec ( values %$strs ) {
384 die "panic: unknown type '$type'"
385 if !exists $rec->{$type};
386 my $dat= $rec->{$type};
388 next if $maxlen && @$dat > $maxlen;
390 foreach my $elem ( @$dat ) {
391 $node->{$elem} ||= {};
392 $node= $node->{$elem};
394 $node->{''}= $rec->{str};
396 return 0 + keys( %trie ) ? \%trie : undef;
402 # This returns a list of the positions of the bits in the input word that
408 push @positions, $position if $word & 1;
415 # my $optree= _optree()
417 # recursively convert a trie to an optree where every node represents
423 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
424 return unless defined $trie;
425 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
426 die "Can't do 'cp' optree from multi-codepoint strings";
429 $else= 0 unless defined $else;
430 $depth= 0 unless defined $depth;
432 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
433 if (exists $trie->{''} ) {
434 if ( $ret_type eq 'cp' ) {
435 $else= $self->{strs}{ $trie->{''} }{cp}[0];
436 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
437 } elsif ( $ret_type eq 'len' ) {
439 } elsif ( $ret_type eq 'both') {
440 $else= $self->{strs}{ $trie->{''} }{cp}[0];
441 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
442 $else= "len=$depth, $else";
445 return $else if !@conds;
448 my ( $yes_res, $as_code, @cond );
449 my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
451 $node->{vals}= [@cond];
452 $node->{test}= $test;
453 $node->{yes}= $yes_res;
454 $node->{depth}= $depth;
458 my $cond= shift @conds;
460 $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
462 my $res_code= Dumper( $res );
463 if ( !$yes_res || $res_code ne $as_code ) {
468 ( $yes_res, $as_code )= ( $res, $res_code );
478 # my $optree= optree(%opts);
480 # Convert a trie to an optree, wrapper for _optree
485 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
486 $opt{ret_type} ||= 'len';
487 my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
488 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
491 # my $optree= generic_optree(%opts);
493 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
494 # sets of strings, including a branch for handling the string type check.
501 $opt{ret_type} ||= 'len';
502 my $test_type= 'depth';
503 my $else= $opt{else} || 0;
505 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
506 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
508 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
512 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
513 } elsif ( $latin1 ) {
514 $else= __cond_join( "!( is_utf8 )", $latin1, $else );
516 my $low= $self->make_trie( 'low', $opt{max_depth} );
518 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
526 # create a string length guarded optree.
532 my $type= $opt{type};
534 die "Can't do a length_optree on type 'cp', makes no sense."
537 my ( @size, $method );
539 if ( $type eq 'generic' ) {
540 $method= 'generic_optree';
542 %{ $self->{size}{low} || {} },
543 %{ $self->{size}{latin1} || {} },
544 %{ $self->{size}{utf8} || {} }
546 @size= sort { $a <=> $b } keys %sizes;
549 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
552 my $else= ( $opt{else} ||= 0 );
553 for my $size ( @size ) {
554 my $optree= $self->$method( %opt, type => $type, max_depth => $size );
555 my $cond= "((e)-(s) > " . ( $size - 1 ).")";
556 $else= __cond_join( $cond, $optree, $else );
561 sub calculate_mask(@) {
563 my $list_count = @list;
565 # Look at the input list of byte values. This routine sees if the set
566 # consisting of those bytes is exactly determinable by using a
567 # mask/compare operation. If not, it returns an empty list; if so, it
568 # returns a list consisting of (mask, compare). For example, consider a
569 # set consisting of the numbers 0xF0, 0xF1, 0xF2, and 0xF3. If we want to
570 # know if a number 'c' is in the set, we could write:
571 # 0xF0 <= c && c <= 0xF4
572 # But the following mask/compare also works, and has just one test:
574 # The reason it works is that the set consists of exactly those numbers
575 # whose first 4 bits are 1, and the next two are 0. (The value of the
576 # other 2 bits is immaterial in determining if a number is in the set or
577 # not.) The mask masks out those 2 irrelevant bits, and the comparison
578 # makes sure that the result matches all bytes that which match those 6
579 # material bits exactly. In other words, the set of numbers contains
580 # exactly those whose bottom two bit positions are either 0 or 1. The
581 # same principle applies to bit positions that are not necessarily
582 # adjacent. And it can be applied to bytes that differ in 1 through all 8
583 # bit positions. In order to be a candidate for this optimization, the
584 # number of numbers in the test must be a power of 2. Based on this
585 # count, we know the number of bit positions that must differ.
586 my $bit_diff_count = 0;
587 my $compare = $list[0];
588 if ($list_count == 2) {
591 elsif ($list_count == 4) {
594 elsif ($list_count == 8) {
597 elsif ($list_count == 16) {
600 elsif ($list_count == 32) {
603 elsif ($list_count == 64) {
606 elsif ($list_count == 128) {
609 elsif ($list_count == 256) {
613 # If the count wasn't a power of 2, we can't apply this optimization
614 return if ! $bit_diff_count;
618 # For each byte in the list, find the bit positions in it whose value
619 # differs from the first byte in the set.
620 for (my $i = 1; $i < @list; $i++) {
621 my @positions = pop_count($list[0] ^ $list[$i]);
623 # If the number of differing bits is greater than those permitted by
624 # the set size, this optimization doesn't apply.
625 return if @positions > $bit_diff_count;
627 # Save the bit positions that differ.
628 foreach my $bit (@positions) {
632 # If the total so far is greater than those permitted by the set size,
633 # this optimization doesn't apply.
634 return if keys %bit_map > $bit_diff_count;
637 # The value to compare against is the AND of all the members of the
638 # set. The bit positions that are the same in all will be correct in
639 # the AND, and the bit positions that differ will be 0.
640 $compare &= $list[$i];
643 # To get to here, we have gone through all bytes in the set,
644 # and determined that they all differ from each other in at most
645 # the number of bits allowed for the set's quantity. And since we have
646 # tested all 2**N possibilities, we know that the set includes no fewer
647 # elements than we need,, so the optimization applies.
648 die "panic: internal logic error" if keys %bit_map != $bit_diff_count;
650 # The mask is the bit positions where things differ, complemented.
652 foreach my $position (keys %bit_map) {
653 $mask |= 1 << $position;
655 $mask = ~$mask & 0xFF;
657 return ($mask, $compare);
661 # turn a list of conditions into a text expression
662 # - merges ranges of conditions, and joins the result with ||
664 my ( $self, $op, $combine, $opts_ref )= @_;
665 my $cond= $op->{vals};
666 my $test= $op->{test};
667 my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
668 return "( $test )" if !defined $cond;
673 # We skip this if there are optimizations that
674 # we can apply (below) to the individual ranges
675 if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
676 if ( $ranges[-1][0] == $ranges[-1][1] ) {
677 $ranges[-1]= $ranges[-1][0];
678 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
679 $ranges[-1]= $ranges[-1][0];
680 push @ranges, $ranges[-1] + 1;
684 for my $condition ( @$cond ) {
685 if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
687 push @ranges, [ $condition, $condition ];
694 return $self->_combine( $test, @ranges )
701 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
703 : sprintf( "$self->{val_fmt} == $test", $_ );
707 # If the input set has certain characteristics, we can optimize tests
708 # for it. This doesn't apply if returning the code point, as we want
709 # each element of the set individually. The code above is for this
712 return 1 if @$cond == 256; # If all bytes match, is trivially true
715 # See if the entire set shares optimizable characterstics, and if
716 # so, return the optimization. We delay checking for this on sets
717 # with just a single range, as there may be better optimizations
718 # available in that case.
719 my ($mask, $base) = calculate_mask(@$cond);
720 if (defined $mask && defined $base) {
721 return sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask, $base;
725 # Here, there was no entire-class optimization. Look at each range.
726 for (my $i = 0; $i < @ranges; $i++) {
727 if (! ref $ranges[$i]) { # Trivial case: no range
728 $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
730 elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
731 $ranges[$i] = # Trivial case: single element range
732 sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
737 # Well-formed UTF-8 continuation bytes on ascii platforms must
738 # be in the range 0x80 .. 0xBF. If we know that the input is
739 # well-formed (indicated by not trying to be 'safe'), we can
740 # omit tests that verify that the input is within either of
741 # these bounds. (No legal UTF-8 character can begin with
742 # anything in this range, so we don't have to worry about this
743 # being a continuation byte or not.)
745 && ! $opts_ref->{safe}
746 && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
748 my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
749 my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
751 # If the range is the entire legal range, it matches any
752 # legal byte, so we can omit both tests. (This should
753 # happen only if the number of ranges is 1.)
754 if ($lower_limit_is_80 && $upper_limit_is_BF) {
757 elsif ($lower_limit_is_80) { # Just use the upper limit test
758 $output = sprintf("( $test <= $self->{val_fmt} )",
761 elsif ($upper_limit_is_BF) { # Just use the lower limit test
762 $output = sprintf("( $test >= $self->{val_fmt} )",
767 # If we didn't change to omit a test above, see if the number
768 # of elements is a power of 2 (only a single bit in the
769 # representation of its count will be set) and if so, it may
770 # be that a mask/compare optimization is possible.
772 && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
775 push @list, $_ for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
776 my ($mask, $base) = calculate_mask(@list);
777 if (defined $mask && defined $base) {
778 $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $mask, $base;
782 if ($output ne "") { # Prefer any optimization
783 $ranges[$i] = $output;
785 elsif ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
786 # No optimization happened. We need a test that the code
787 # point is within both bounds. But, if the bounds are
788 # adjacent code points, it is cleaner to say
789 # 'first == test || second == test'
791 # 'first <= test && test <= second'
793 . join( " || ", ( map
794 { sprintf "$self->{val_fmt} == $test", $_ }
798 else { # Full bounds checking
799 $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
805 return "( " . join( " || ", @ranges ) . " )";
810 # recursively turn a list of conditions into a fast break-out condition
811 # used by _cond_as_str() for 'cp' type macros.
813 my ( $self, $test, @cond )= @_;
815 my $item= shift @cond;
819 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
821 $gtv= sprintf "$self->{val_fmt}", $item->[1];
823 $cstr= sprintf( "$self->{val_fmt} == $test", $item );
824 $gtv= sprintf "$self->{val_fmt}", $item;
827 return "( $cstr || ( $gtv < $test &&\n"
828 . $self->_combine( $test, @cond ) . " ) )";
835 # recursively convert an optree to text with reasonably neat formatting
837 my ( $self, $op, $combine, $brace, $opts_ref )= @_;
838 return 0 if ! defined $op; # The set is empty
842 my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
843 #no warnings 'recursion'; # This would allow really really inefficient
844 # code to be generated. See pod
845 my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref );
846 return $yes if $cond eq '1';
848 my $no= $self->_render( $op->{no}, $combine, 0, $opts_ref );
849 return "( $cond )" if $yes eq '1' and $no eq '0';
850 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
851 return "$lb$cond ? $yes : $no$rb"
852 if !ref( $op->{yes} ) && !ref( $op->{no} );
854 my $ind= "\n" . ( $ind1 x $op->{depth} );
856 if ( ref $op->{yes} ) {
857 $yes= $ind . $ind1 . $yes;
862 return "$lb$cond ?$yes$ind: $no$rb";
865 # $expr=render($op,$combine)
867 # convert an optree to text with reasonably neat formatting. If $combine
868 # is true then the condition is created using "fast breakouts" which
869 # produce uglier expressions that are more efficient for common case,
870 # longer lists such as that resulting from type 'cp' output.
871 # Currently only used for type 'cp' macros.
873 my ( $self, $op, $combine, $opts_ref )= @_;
874 my $str= "( " . $self->_render( $op, $combine, 0, $opts_ref ) . " )";
875 return __clean( $str );
879 # make a macro of a given type.
880 # calls into make_trie and (generic_|length_)optree as needed
882 # type : 'cp','generic','high','low','latin1','utf8','LATIN1','UTF8'
883 # ret_type : 'cp' or 'len'
884 # safe : add length guards to macro
886 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
887 # in which case it defaults to 'cp' as well.
889 # it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
890 # sequences in it, as the generated macro will accept only a single codepoint
899 my $type= $opts{type} || 'generic';
900 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
902 and $self->{has_multi};
903 my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
906 $method= 'length_optree';
907 } elsif ( $type eq 'generic' ) {
908 $method= 'generic_optree';
912 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
913 my $text= $self->render( $optree, $type eq 'cp', \%opts );
914 my @args= $type eq 'cp' ? 'cp' : 's';
915 push @args, "e" if $opts{safe};
916 push @args, "is_utf8" if $type eq 'generic';
917 push @args, "len" if $ret_type eq 'both';
918 my $pfx= $ret_type eq 'both' ? 'what_len_' :
919 $ret_type eq 'cp' ? 'what_' : 'is_';
920 my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
921 $ext .= "_safe" if $opts{safe};
922 my $argstr= join ",", @args;
923 return "/*** GENERATED CODE ***/\n"
924 . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
927 # if we arent being used as a module (highly likely) then process
928 # the __DATA__ below and produce macros in regcharclass.h
929 # if an argument is provided to the script then it is assumed to
930 # be the path of the file to output to, if the arg is '-' outputs
934 my $path= shift @ARGV || "regcharclass.h";
936 if ( $path eq '-' ) {
939 $out_fh = open_new( $path );
941 print $out_fh read_only_top( lang => 'C', by => $0,
942 file => 'regcharclass.h', style => '*',
943 copyright => [2007, 2011] );
944 print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
946 my ( $op, $title, @txt, @types, %mods );
950 # Skip if to compile on a different platform.
951 return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
952 return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
954 print $out_fh "/*\n\t$op: $title\n\n";
955 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
956 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
958 #die Dumper(\@types,\%mods);
961 push @mods, 'safe' if delete $mods{safe};
962 unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
967 die "Unknown modifiers: ", join ", ", map { "'$_'" } keys %mods;
970 foreach my $type_spec ( @types ) {
971 my ( $type, $ret )= split /-/, $type_spec;
973 foreach my $mod ( @mods ) {
974 next if $mod eq 'safe' and $type eq 'cp';
976 my $macro= $obj->make_macro(
979 safe => $mod eq 'safe'
981 print $out_fh $macro, "\n";
987 s/^ \s* (?: \# .* ) ? $ //x; # squeeze out comment and blanks
991 $doit->(); # This starts a new definition; do the previous one
992 ( $op, $title )= split /\s*:\s*/, $_, 2;
994 } elsif ( s/^=>// ) {
995 my ( $type, $modifier )= split /:/, $_;
996 @types= split ' ', $type;
998 map { $mods{$_} = 1 } split ' ', $modifier;
1005 print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
1008 print $out_fh "/* ex: set ro: */\n";
1010 read_only_bottom_close_and_rename($out_fh)
1014 # The form of the input is a series of definitions to make macros for.
1015 # The first line gives the base name of the macro, followed by a colon, and
1016 # then text to be used in comments associated with the macro that are its
1017 # title or description. In all cases the first (perhaps only) parameter to
1018 # the macro is a pointer to the first byte of the code point it is to test to
1019 # see if it is in the class determined by the macro. In the case of non-UTF8,
1020 # the code point consists only of a single byte.
1022 # The second line must begin with a '=>' and be followed by the types of
1023 # macro(s) to be generated; these are specified below. A colon follows the
1024 # types, followed by the modifiers, also specified below. At least one
1025 # modifier is required.
1027 # The subsequent lines give what code points go into the class defined by the
1028 # macro. Multiple characters may be specified via a string like "\x0D\x0A",
1029 # enclosed in quotes. Otherwise the lines consist of single Unicode code
1030 # point, prefaced by 0x; or a single range of Unicode code points separated by
1031 # a minus (and optional space); or a single Unicode property specified in the
1032 # standard Perl form "\p{...}".
1034 # A blank line or one whose first non-blank character is '#' is a comment.
1035 # The definition of the macro is terminated by a line unlike those described.
1038 # low generate a macro whose name is 'is_BASE_low' and defines a
1039 # class that includes only ASCII-range chars. (BASE is the
1040 # input macro base name.)
1041 # latin1 generate a macro whose name is 'is_BASE_latin1' and defines a
1042 # class that includes only upper-Latin1-range chars. It is not
1043 # designed to take a UTF-8 input parameter.
1044 # high generate a macro whose name is 'is_BASE_high' and defines a
1045 # class that includes all relevant code points that are above
1046 # the Latin1 range. This is for very specialized uses only.
1047 # It is designed to take only an input UTF-8 parameter.
1048 # utf8 generate a macro whose name is 'is_BASE_utf8' and defines a
1049 # class that includes all relevant characters that aren't ASCII.
1050 # It is designed to take only an input UTF-8 parameter.
1051 # LATIN1 generate a macro whose name is 'is_BASE_latin1' and defines a
1052 # class that includes both ASCII and upper-Latin1-range chars.
1053 # It is not designed to take a UTF-8 input parameter.
1054 # UTF8 generate a macro whose name is 'is_BASE_utf8' and defines a
1055 # class that can include any code point, adding the 'low' ones
1056 # to what 'utf8' works on. It is designed to take only an input
1058 # generic generate a macro whose name is 'is_BASE". It has a 2nd,
1059 # boolean, parameter which indicates if the first one points to
1060 # a UTF-8 string or not. Thus it works in all circumstances.
1061 # cp generate a macro whose name is 'is_BASE_cp' and defines a
1062 # class that returns true if the UV parameter is a member of the
1063 # class; false if not.
1064 # A macro of the given type is generated for each type listed in the input.
1065 # The default return value is the number of octets read to generate the match.
1066 # Append "-cp" to the type to have it instead return the matched codepoint.
1067 # The macro name is changed to 'what_BASE...'. See pod for
1069 # Appending '-both" instead adds an extra parameter to the end of the argument
1070 # list, which is a pointer as to where to store the number of
1071 # bytes matched, while also returning the code point. The macro
1072 # name is changed to 'what_len_BASE...'. See pod for caveats
1075 # safe The input string is not necessarily valid UTF-8. In
1076 # particular an extra parameter (always the 2nd) to the macro is
1077 # required, which points to one beyond the end of the string.
1078 # The macro will make sure not to read off the end of the
1079 # string. In the case of non-UTF8, it makes sure that the
1080 # string has at least one byte in it. The macro name has
1081 # '_safe' appended to it.
1082 # fast The input string is valid UTF-8. No bounds checking is done,
1083 # and the macro can make assumptions that lead to faster
1085 # only_ascii_platform Skip this definition if this program is being run on
1086 # a non-ASCII platform.
1087 # only_ebcdic_platform Skip this definition if this program is being run on
1088 # a non-EBCDIC platform.
1089 # No modifier need be specified; fast is assumed for this case. If both
1090 # 'fast', and 'safe' are specified, two macros will be created for each
1093 # If run on a non-ASCII platform will automatically convert the Unicode input
1094 # to native. The documentation above is slightly wrong in this case. 'low'
1095 # actually refers to code points whose UTF-8 representation is the same as the
1096 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1097 # code points less than 256.
1099 1; # in the unlikely case we are being used as a module
1102 # This is no longer used, but retained in case it is needed some day.
1103 # TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t
1104 # => generic cp generic-cp generic-both :fast safe
1105 # 0x00DF # LATIN SMALL LETTER SHARP S
1106 # 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1107 # 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1108 # 0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1109 # 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1110 # 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1112 LNBREAK: Line Break: \R
1113 => generic UTF8 LATIN1 :fast safe
1114 "\x0D\x0A" # CRLF - Network (Windows) line ending
1117 HORIZWS: Horizontal Whitespace: \h \H
1118 => generic UTF8 LATIN1 cp :fast safe
1121 VERTWS: Vertical Whitespace: \v \V
1122 => generic UTF8 LATIN1 cp :fast safe
1125 REPLACEMENT: Unicode REPLACEMENT CHARACTER
1129 NONCHAR: Non character code points
1133 SURROGATE: Surrogate characters
1137 GCB_L: Grapheme_Cluster_Break=L
1141 GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
1145 GCB_Prepend: Grapheme_Cluster_Break=Prepend
1149 GCB_RI: Grapheme_Cluster_Break=RI
1153 GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins
1155 \p{_X_Special_Begin}
1157 GCB_T: Grapheme_Cluster_Break=T
1161 GCB_V: Grapheme_Cluster_Break=V
1165 # This program was run with this enabled, and the results copied to utf8.h;
1166 # then this was commented out because it takes so long to figure out these 2
1167 # million code points. The results would not change unless utf8.h decides it
1168 # wants a maximum other than 4 bytes, or this program creates better
1170 #UTF8_CHAR: Matches utf8 from 1 to 4 bytes
1171 #=> UTF8 :safe only_ascii_platform
1174 # This hasn't been commented out, because we haven't an EBCDIC platform to run
1175 # it on, and the 3 types of EBCDIC allegedly supported by Perl would have
1177 UTF8_CHAR: Matches utf8 from 1 to 5 bytes
1178 => UTF8 :safe only_ebcdic_platform
1181 QUOTEMETA: Meta-characters that \Q should quote
1185 FOR_TESTING_DEMO: This is used to test if we generate incorrect code (currently is bad)
1188 "\x{3B9}\x{308}\x{301}"