regen/regcharclass.pl

   1 #!perl
   2 package CharClass::Matcher;
   3 use strict;
   4 use 5.008;
   5 use warnings;
   6 use warnings FATAL => 'all';
   7 use Text::Wrap qw(wrap);
   8 use Data::Dumper;
   9 $Data::Dumper::Useqq= 1;
  10 our $hex_fmt= "0x%02X";
  11
  12 sub DEBUG () { 0 }
  13 $|=1 if DEBUG;
  14
  15 sub ASCII_PLATFORM { (ord('A') == 65) }
  16
  17 require 'regen/regen_lib.pl';
  18
  19 =head1 NAME
  20
  21 CharClass::Matcher -- Generate C macros that match character classes efficiently
  22
  23 =head1 SYNOPSIS
  24
  25     perl Porting/regcharclass.pl
  26
  27 =head1 DESCRIPTION
  28
  29 Dynamically generates macros for detecting special charclasses
  30 in latin-1, utf8, and codepoint forms. Macros can be set to return
  31 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
  32
  33 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
  34 are necessary.
  35
  36 Using WHATEVER as an example the following macros can be produced, depending
  37 on the input parameters (how to get each is described by internal comments at
  38 the C<__DATA__> line):
  39
  40 =over 4
  41
  42 =item C<is_WHATEVER(s,is_utf8)>
  43
  44 =item C<is_WHATEVER_safe(s,e,is_utf8)>
  45
  46 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
  47 comparisons involving octect<128 are done before checking the C<is_utf8>
  48 flag, hopefully saving time.
  49
  50 The version without the C<_safe> suffix should be used only when the input is
  51 known to be well-formed.
  52
  53 =item C<is_WHATEVER_utf8(s)>
  54
  55 =item C<is_WHATEVER_utf8_safe(s,e)>
  56
  57 Do a lookup assuming the string is encoded in (normalized) UTF8.
  58
  59 The version without the C<_safe> suffix should be used only when the input is
  60 known to be well-formed.
  61
  62 =item C<is_WHATEVER_latin1(s)>
  63
  64 =item C<is_WHATEVER_latin1_safe(s,e)>
  65
  66 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  67
  68 The version without the C<_safe> suffix should be used only when it is known
  69 that C<s> contains at least one character.
  70
  71 =item C<is_WHATEVER_cp(cp)>
  72
  73 Check to see if the string matches a given codepoint (hypothetically a
  74 U32). The condition is constructed as to "break out" as early as
  75 possible if the codepoint is out of range of the condition.
  76
  77 IOW:
  78
  79   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  80
  81 Thus if the character is X+1 only two comparisons will be done. Making
  82 matching lookups slower, but non-matching faster.
  83
  84 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
  85
  86 A variant form of each of the macro types described above can be generated, in
  87 which the code point is returned by the macro, and an extra parameter (in the
  88 final position) is added, which is a pointer for the macro to set the byte
  89 length of the returned code point.
  90
  91 These forms all have a C<what_len> prefix instead of the C<is_>, for example
  92 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
  93 C<what_len_WHATEVER_utf8(s,len)>.
  94
  95 These forms should not be used I<except> on small sets of mostly widely
  96 separated code points; otherwise the code generated is inefficient.  For these
  97 cases, it is best to use the C<is_> forms, and then find the code point with
  98 C<utf8_to_uvchr_buf>().  This program can fail with a "deep recursion"
  99 message on the worst of the inappropriate sets.  Examine the generated macro
 100 to see if it is acceptable.
 101
 102 =item C<what_WHATEVER_FOO(arg1, ...)>
 103
 104 A variant form of each of the C<is_> macro types described above can be generated, in
 105 which the code point and not the length is returned by the macro.  These have
 106 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
 107 not be used where the set contains a NULL, as 0 is returned for two different
 108 cases: a) the set doesn't include the input code point; b) the set does
 109 include it, and it is a NULL.
 110
 111 =back
 112
 113 =head2 CODE FORMAT
 114
 115 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
 116
 117
 118 =head1 AUTHOR
 119
 120 Author: Yves Orton (demerphq) 2007.  Maintained by Perl5 Porters.
 121
 122 =head1 BUGS
 123
 124 No tests directly here (although the regex engine will fail tests
 125 if this code is broken). Insufficient documentation and no Getopts
 126 handler for using the module as a script.
 127
 128 =head1 LICENSE
 129
 130 You may distribute under the terms of either the GNU General Public
 131 License or the Artistic License, as specified in the README file.
 132
 133 =cut
 134
 135 # Sub naming convention:
 136 # __func : private subroutine, can not be called as a method
 137 # _func  : private method, not meant for external use
 138 # func   : public method.
 139
 140 # private subs
 141 #-------------------------------------------------------------------------------
 142 #
 143 # ($cp,$n,$l,$u)=__uni_latin($str);
 144 #
 145 # Return a list of arrays, each of which when interpreted correctly
 146 # represent the string in some given encoding with specific conditions.
 147 #
 148 # $cp - list of codepoints that make up the string.
 149 # $n  - list of octets that make up the string if all codepoints are invariant
 150 #       regardless of if the string is in UTF-8 or not.
 151 # $l  - list of octets that make up the string in latin1 encoding if all
 152 #       codepoints < 256, and at least one codepoint is UTF-8 variant.
 153 # $u  - list of octets that make up the string in utf8 if any codepoint is
 154 #       UTF-8 variant
 155 #
 156 #   High CP | Defined
 157 #-----------+----------
 158 #   0 - 127 : $n            (127/128 are the values for ASCII platforms)
 159 # 128 - 255 : $l, $u
 160 # 256 - ... : $u
 161 #
 162
 163 sub __uni_latin1 {
 164     my $str= shift;
 165     my $max= 0;
 166     my @cp;
 167     my @cp_high;
 168     my $only_has_invariants = 1;
 169     for my $ch ( split //, $str ) {
 170         my $cp= ord $ch;
 171         push @cp, $cp;
 172         push @cp_high, $cp if $cp > 255;
 173         $max= $cp if $max < $cp;
 174         if (! ASCII_PLATFORM && $only_has_invariants) {
 175             if ($cp > 255) {
 176                 $only_has_invariants = 0;
 177             }
 178             else {
 179                 my $temp = chr($cp);
 180                 utf8::upgrade($temp);
 181                 my @utf8 = unpack "U0C*", $temp;
 182                 $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
 183             }
 184         }
 185     }
 186     my ( $n, $l, $u );
 187     $only_has_invariants = $max < 128 if ASCII_PLATFORM;
 188     if ($only_has_invariants) {
 189         $n= [@cp];
 190     } else {
 191         $l= [@cp] if $max && $max < 256;
 192
 193         $u= $str;
 194         utf8::upgrade($u);
 195         $u= [ unpack "U0C*", $u ] if defined $u;
 196     }
 197     return ( \@cp, \@cp_high, $n, $l, $u );
 198 }
 199
 200 #
 201 # $clean= __clean($expr);
 202 #
 203 # Cleanup a ternary expression, removing unnecessary parens and apply some
 204 # simplifications using regexes.
 205 #
 206
 207 sub __clean {
 208     my ( $expr )= @_;
 209
 210     #return $expr;
 211
 212     our $parens;
 213     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 214
 215     ## remove redundant parens
 216     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 217
 218
 219     # repeatedly simplify conditions like
 220     #       ( (cond1) ? ( (cond2) ? X : Y ) : Y )
 221     # into
 222     #       ( ( (cond1) && (cond2) ) ? X : Y )
 223     # Also similarly handles expressions like:
 224     #       : (cond1) ? ( (cond2) ? X : Y ) : Y )
 225     # Note the inclusion of the close paren in ([:()]) and the open paren in ([()]) is
 226     # purely to ensure we have a balanced set of parens in the expression which makes
 227     # it easier to understand the pattern in an editor that understands paren's, we do
 228     # not expect either of these cases to actually fire. - Yves
 229     1 while $expr =~ s/
 230         ([:()])  \s*
 231             ($parens) \s*
 232             \? \s*
 233                 \( \s* ($parens) \s*
 234                     \? \s* ($parens|[^()?:\s]+?) \s*
 235                     :  \s* ($parens|[^()?:\s]+?) \s*
 236                 \) \s*
 237             : \s* \5 \s*
 238         ([()])
 239     /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
 240     #$expr=~s/\(\(U8\*\)s\)\[(\d+)\]/S$1/g if length $expr > 8000;
 241     #$expr=~s/\s+//g if length $expr > 8000;
 242
 243     die "Expression too long" if length $expr > 8000;
 244
 245     return $expr;
 246 }
 247
 248 #
 249 # $text= __macro(@args);
 250 # Join args together by newlines, and then neatly add backslashes to the end
 251 # of every  line as expected by the C pre-processor for #define's.
 252 #
 253
 254 sub __macro {
 255     my $str= join "\n", @_;
 256     $str =~ s/\s*$//;
 257     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 258     my $last= pop @lines;
 259     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 260     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 261     return $str . "\n";
 262 }
 263
 264 #
 265 # my $op=__incrdepth($op);
 266 #
 267 # take an 'op' hashref and add one to it and all its childrens depths.
 268 #
 269
 270 sub __incrdepth {
 271     my $op= shift;
 272     return unless ref $op;
 273     $op->{depth} += 1;
 274     __incrdepth( $op->{yes} );
 275     __incrdepth( $op->{no} );
 276     return $op;
 277 }
 278
 279 # join two branches of an opcode together with a condition, incrementing
 280 # the depth on the yes branch when we do so.
 281 # returns the new root opcode of the tree.
 282 sub __cond_join {
 283     my ( $cond, $yes, $no )= @_;
 284     return {
 285         test  => $cond,
 286         yes   => __incrdepth( $yes ),
 287         no    => $no,
 288         depth => 0,
 289     };
 290 }
 291
 292 # Methods
 293
 294 # constructor
 295 #
 296 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 297 #
 298 # Create a new CharClass::Matcher object by parsing the text in
 299 # the txt array. Currently applies the following rules:
 300 #
 301 # Element starts with C<0x>, line is evaled the result treated as
 302 # a number which is passed to chr().
 303 #
 304 # Element starts with C<">, line is evaled and the result treated
 305 # as a string.
 306 #
 307 # Each string is then stored in the 'strs' subhash as a hash record
 308 # made up of the results of __uni_latin1, using the keynames
 309 # 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
 310 # 'UTF8' which hold a merge of 'low' and their lowercase equivalents.
 311 #
 312 # Size data is tracked per type in the 'size' subhash.
 313 #
 314 # Return an object
 315 #
 316 sub new {
 317     my $class= shift;
 318     my %opt= @_;
 319     for ( qw(op txt) ) {
 320         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 321           if !exists $opt{$_};
 322     }
 323
 324     my $self= bless {
 325         op    => $opt{op},
 326         title => $opt{title} || '',
 327     }, $class;
 328     foreach my $txt ( @{ $opt{txt} } ) {
 329         my $str= $txt;
 330         if ( $str =~ /^[""]/ ) {
 331             $str= eval $str;
 332         } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
 333                                     # list with its expansion
 334             my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
 335             die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
 336             foreach my $cp (hex $lower .. hex $upper) {
 337                 push @{$opt{txt}}, sprintf "0x%X", $cp;
 338             }
 339             next;
 340         } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
 341             # Otherwise undocumented, a leading N means is already in the
 342             # native character set; don't convert.
 343             $str= chr eval $str;
 344         } elsif ( $str =~ /^0x/ ) {
 345             $str= eval $str;
 346
 347             # Convert from Unicode/ASCII to native, if necessary
 348             $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
 349                                                     && $str <= 0xFF;
 350             $str = chr $str;
 351         } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
 352             my $property = $1;
 353             use Unicode::UCD qw(prop_invlist);
 354
 355             my @invlist = prop_invlist($property, '_perl_core_internal_ok');
 356             if (! @invlist) {
 357
 358                 # An empty return could mean an unknown property, or merely
 359                 # that it is empty.  Call in scalar context to differentiate
 360                 my $count = prop_invlist($property, '_perl_core_internal_ok');
 361                 die "$property not found" unless defined $count;
 362             }
 363
 364             # Replace this element on the list with the property's expansion
 365             for (my $i = 0; $i < @invlist; $i += 2) {
 366                 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
 367
 368                     # prop_invlist() returns native values; add leading 'N'
 369                     # to indicate that.
 370                     push @{$opt{txt}}, sprintf "N0x%X", $cp;
 371                 }
 372             }
 373             next;
 374         } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
 375             die "do '$1' failed: $!$@" if ! do $1 or $@;
 376             next;
 377         } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
 378             my @results = eval "$1";
 379             die "eval '$1' failed: $@" if $@;
 380             push @{$opt{txt}}, @results;
 381             next;
 382         } else {
 383             die "Unparsable line: $txt\n";
 384         }
 385         my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $str );
 386         my $UTF8= $low   || $utf8;
 387         my $LATIN1= $low || $latin1;
 388         my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
 389         #die Dumper($txt,$cp,$low,$latin1,$utf8)
 390         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 391
 392         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}=
 393           ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 );
 394         my $rec= $self->{strs}{$str};
 395         foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
 396             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 397               if $self->{strs}{$str}{$key};
 398         }
 399         $self->{has_multi} ||= @$cp > 1;
 400         $self->{has_ascii} ||= $latin1 && @$latin1;
 401         $self->{has_low}   ||= $low && @$low;
 402         $self->{has_high}  ||= !$low && !$latin1;
 403     }
 404     $self->{val_fmt}= $hex_fmt;
 405     $self->{count}= 0 + keys %{ $self->{strs} };
 406     return $self;
 407 }
 408
 409 # my $trie = make_trie($type,$maxlen);
 410 #
 411 # using the data stored in the object build a trie of a specific type,
 412 # and with specific maximum depth. The trie is made up the elements of
 413 # the given types array for each string in the object (assuming it is
 414 # not too long.)
 415 #
 416 # returns the trie, or undef if there was no relevant data in the object.
 417 #
 418
 419 sub make_trie {
 420     my ( $self, $type, $maxlen )= @_;
 421
 422     my $strs= $self->{strs};
 423     my %trie;
 424     foreach my $rec ( values %$strs ) {
 425         die "panic: unknown type '$type'"
 426           if !exists $rec->{$type};
 427         my $dat= $rec->{$type};
 428         next unless $dat;
 429         next if $maxlen && @$dat > $maxlen;
 430         my $node= \%trie;
 431         foreach my $elem ( @$dat ) {
 432             $node->{$elem} ||= {};
 433             $node= $node->{$elem};
 434         }
 435         $node->{''}= $rec->{str};
 436     }
 437     return 0 + keys( %trie ) ? \%trie : undef;
 438 }
 439
 440 sub pop_count ($) {
 441     my $word = shift;
 442
 443     # This returns a list of the positions of the bits in the input word that
 444     # are 1.
 445
 446     my @positions;
 447     my $position = 0;
 448     while ($word) {
 449         push @positions, $position if $word & 1;
 450         $position++;
 451         $word >>= 1;
 452     }
 453     return @positions;
 454 }
 455
 456 # my $optree= _optree()
 457 #
 458 # recursively convert a trie to an optree where every node represents
 459 # an if else branch.
 460 #
 461 #
 462
 463 sub _optree {
 464     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
 465     return unless defined $trie;
 466     if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
 467         die "Can't do 'cp' optree from multi-codepoint strings";
 468     }
 469     $ret_type ||= 'len';
 470     $else= 0  unless defined $else;
 471     $depth= 0 unless defined $depth;
 472
 473     # if we have an empty string as a key it means we are in an
 474     # accepting state and unless we can match further on should
 475     # return the value of the '' key.
 476     if (exists $trie->{''} ) {
 477         # we can now update the "else" value, anything failing to match
 478         # after this point should return the value from this.
 479         if ( $ret_type eq 'cp' ) {
 480             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 481             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 482         } elsif ( $ret_type eq 'len' ) {
 483             $else= $depth;
 484         } elsif ( $ret_type eq 'both') {
 485             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 486             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 487             $else= "len=$depth, $else";
 488         }
 489     }
 490     # extract the meaningful keys from the trie, filter out '' as
 491     # it means we are an accepting state (end of sequence).
 492     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 493
 494     # if we haven't any keys there is no further we can match and we
 495     # can return the "else" value.
 496     return $else if !@conds;
 497
 498     # Assuming Perl is being released from an ASCII platform, the below makes
 499     # it work for non-UTF-8 out-of-the box when porting to non-ASCII, by
 500     # adding a translation back to ASCII.  This is the wrong thing to do for
 501     # UTF-EBCDIC, as that is different from UTF-8.  But the intent here is
 502     # that this regen should be run on the target system, which will omit the
 503     # translation, and generate the correct UTF-EBCDIC.  On ASCII systems, the
 504     # translation macros expand to just their argument, so there is no harm
 505     # done nor performance penalty by including them.
 506     my $test;
 507     if ($test_type =~ /^cp/) {
 508         $test = "cp";
 509         $test = "NATIVE_TO_UNI($test)" if ASCII_PLATFORM;
 510     }
 511     else {
 512         $test = "((U8*)s)[$depth]";
 513         $test = "NATIVE_TO_LATIN1($test)" if ASCII_PLATFORM;
 514     }
 515
 516     # first we loop over the possible keys/conditions and find out what they
 517     # look like we group conditions with the same optree together.
 518     my %dmp_res;
 519     my @res_order;
 520     local $Data::Dumper::Sortkeys=1;
 521     foreach my $cond ( @conds ) {
 522
 523         # get the optree for this child/condition
 524         my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else, $depth + 1 );
 525         # convert it to a string with Dumper
 526         my $res_code= Dumper( $res );
 527
 528         push @{$dmp_res{$res_code}{vals}}, $cond;
 529         if (!$dmp_res{$res_code}{optree}) {
 530             $dmp_res{$res_code}{optree}= $res;
 531             push @res_order, $res_code;
 532         }
 533     }
 534
 535     # now that we have deduped the optrees we construct a new optree containing the merged
 536     # results.
 537     my %root;
 538     my $node= \%root;
 539     foreach my $res_code_idx (0 .. $#res_order) {
 540         my $res_code= $res_order[$res_code_idx];
 541         $node->{vals}= $dmp_res{$res_code}{vals};
 542         $node->{test}= $test;
 543         $node->{yes}= $dmp_res{$res_code}{optree};
 544         $node->{depth}= $depth;
 545         if ($res_code_idx < $#res_order) {
 546             $node= $node->{no}= {};
 547         } else {
 548             $node->{no}= $else;
 549         }
 550     }
 551
 552     # return the optree.
 553     return \%root;
 554 }
 555
 556 # my $optree= optree(%opts);
 557 #
 558 # Convert a trie to an optree, wrapper for _optree
 559
 560 sub optree {
 561     my $self= shift;
 562     my %opt= @_;
 563     my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
 564     $opt{ret_type} ||= 'len';
 565     my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
 566     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
 567 }
 568
 569 # my $optree= generic_optree(%opts);
 570 #
 571 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 572 # sets of strings, including a branch for handling the string type check.
 573 #
 574
 575 sub generic_optree {
 576     my $self= shift;
 577     my %opt= @_;
 578
 579     $opt{ret_type} ||= 'len';
 580     my $test_type= 'depth';
 581     my $else= $opt{else} || 0;
 582
 583     my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
 584     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
 585
 586     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
 587       for $latin1, $utf8;
 588
 589     if ( $utf8 ) {
 590         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 591     } elsif ( $latin1 ) {
 592         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 593     }
 594     if ($opt{type} eq 'generic') {
 595         my $low= $self->make_trie( 'low', $opt{max_depth} );
 596         if ( $low ) {
 597             $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
 598         }
 599     }
 600
 601     return $else;
 602 }
 603
 604 # length_optree()
 605 #
 606 # create a string length guarded optree.
 607 #
 608
 609 sub length_optree {
 610     my $self= shift;
 611     my %opt= @_;
 612     my $type= $opt{type};
 613
 614     die "Can't do a length_optree on type 'cp', makes no sense."
 615       if $type =~ /^cp/;
 616
 617     my ( @size, $method );
 618
 619     if ( $type =~ /generic/ ) {
 620         $method= 'generic_optree';
 621         my %sizes= (
 622             %{ $self->{size}{low}    || {} },
 623             %{ $self->{size}{latin1} || {} },
 624             %{ $self->{size}{utf8}   || {} }
 625         );
 626         @size= sort { $a <=> $b } keys %sizes;
 627     } else {
 628         $method= 'optree';
 629         @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 630     }
 631
 632     my $else= ( $opt{else} ||= 0 );
 633     for my $size ( @size ) {
 634         my $optree= $self->$method( %opt, type => $type, max_depth => $size );
 635         my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 636         $else= __cond_join( $cond, $optree, $else );
 637     }
 638     return $else;
 639 }
 640
 641 sub calculate_mask(@) {
 642     # Look at the input list of byte values.  This routine returns an array of
 643     # mask/base pairs to generate that list.
 644
 645     my @list = @_;
 646     my $list_count = @list;
 647
 648     # Consider a set of byte values, A, B, C ....  If we want to determine if
 649     # <c> is one of them, we can write c==A || c==B || c==C ....  If the
 650     # values are consecutive, we can shorten that to A<=c && c<=Z, which uses
 651     # far fewer branches.  If only some of them are consecutive we can still
 652     # save some branches by creating range tests for just those that are
 653     # consecutive. _cond_as_str() does this work for looking for ranges.
 654     #
 655     # Another approach is to look at the bit patterns for A, B, C .... and see
 656     # if they have some commonalities.  That's what this function does.  For
 657     # example, consider a set consisting of the bytes
 658     # 0xF0, 0xF1, 0xF2, and 0xF3.  We could write:
 659     #   0xF0 <= c && c <= 0xF4
 660     # But the following mask/compare also works, and has just one test:
 661     #   (c & 0xFC) == 0xF0
 662     # The reason it works is that the set consists of exactly those bytes
 663     # whose first 4 bits are 1, and the next two are 0.  (The value of the
 664     # other 2 bits is immaterial in determining if a byte is in the set or
 665     # not.)  The mask masks out those 2 irrelevant bits, and the comparison
 666     # makes sure that the result matches all bytes which match those 6
 667     # material bits exactly.  In other words, the set of bytes contains
 668     # exactly those whose bottom two bit positions are either 0 or 1.  The
 669     # same principle applies to bit positions that are not necessarily
 670     # adjacent.  And it can be applied to bytes that differ in 1 through all 8
 671     # bit positions.  In order to be a candidate for this optimization, the
 672     # number of bytes in the set must be a power of 2.
 673     #
 674     # Consider a different example, the set 0x53, 0x54, 0x73, and 0x74.  That
 675     # requires 4 tests using either ranges or individual values, and even
 676     # though the number in the set is a power of 2, it doesn't qualify for the
 677     # mask optimization described above because the number of bits that are
 678     # different is too large for that.  However, the set can be expressed as
 679     # two branches with masks thusly:
 680     #   (c & 0xDF) == 0x53 || (c & 0xDF) == 0x54
 681     # a branch savings of 50%.  This is done by splitting the set into two
 682     # subsets each of which has 2 elements, and within each set the values
 683     # differ by 1 byte.
 684     #
 685     # This function attempts to find some way to save some branches using the
 686     # mask technique.  If not, it returns an empty list; if so, it
 687     # returns a list consisting of
 688     #   [ [compare1, mask1], [compare2, mask2], ...
 689     #     [compare_n, undef], [compare_m, undef], ...
 690     #   ]
 691     # The <mask> is undef in the above for those bytes that must be tested
 692     # for individually.
 693     #
 694     # This function does not attempt to find the optimal set.  To do so would
 695     # probably require testing all possible combinations, and keeping track of
 696     # the current best one.
 697     #
 698     # There are probably much better algorithms, but this is the one I (khw)
 699     # came up with.  We start with doing a bit-wise compare of every byte in
 700     # the set with every other byte.  The results are sorted into arrays of
 701     # all those that differ by the same bit positions.  These are stored in a
 702     # hash with the each key being the bits they differ in.  Here is the hash
 703     # for the 0x53, 0x54, 0x73, 0x74 set:
 704     # {
 705     #    4 => {
 706     #            "0,1,2,5" => [
 707     #                            83,
 708     #                            116,
 709     #                            84,
 710     #                            115
 711     #                        ]
 712     #        },
 713     #    3 => {
 714     #            "0,1,2" => [
 715     #                        83,
 716     #                        84,
 717     #                        115,
 718     #                        116
 719     #                        ]
 720     #        }
 721     #    1 => {
 722     #            5 => [
 723     #                    83,
 724     #                    115,
 725     #                    84,
 726     #                    116
 727     #                ]
 728     #        },
 729     # }
 730     #
 731     # The set consisting of values which differ in the 4 bit positions 0, 1,
 732     # 2, and 5 from some other value in the set consists of all 4 values.
 733     # Likewise all 4 values differ from some other value in the 3 bit
 734     # positions 0, 1, and 2; and all 4 values differ from some other value in
 735     # the single bit position 5.  The keys at the uppermost level in the above
 736     # hash, 1, 3, and 4, give the number of bit positions that each sub-key
 737     # below it has.  For example, the 4 key could have as its value an array
 738     # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were
 739     # such.  The best optimization will group the most values into a single
 740     # mask.  The most values will be the ones that differ in the most
 741     # positions, the ones with the largest value for the topmost key.  These
 742     # keys, are thus just for convenience of sorting by that number, and do
 743     # not have any bearing on the core of the algorithm.
 744     #
 745     # We start with an element from largest number of differing bits.  The
 746     # largest in this case is 4 bits, and there is only one situation in this
 747     # set which has 4 differing bits, "0,1,2,5".  We look for any subset of
 748     # this set which has 16 values that differ in these 4 bits.  There aren't
 749     # any, because there are only 4 values in the entire set.  We then look at
 750     # the next possible thing, which is 3 bits differing in positions "0,1,2".
 751     # We look for a subset that has 8 values that differ in these 3 bits.
 752     # Again there are none.  So we go to look for the next possible thing,
 753     # which is a subset of 2**1 values that differ only in bit position 5.  83
 754     # and 115 do, so we calculate a mask and base for those and remove them
 755     # from every set.  Since there is only the one set remaining, we remove
 756     # them from just this one.  We then look to see if there is another set of
 757     # 2 values that differ in bit position 5.  84 and 116 do, so we calculate
 758     # a mask and base for those and remove them from every set (again only
 759     # this set remains in this example).  The set is now empty, and there are
 760     # no more sets to look at, so we are done.
 761
 762     if ($list_count == 256) {   # All 256 is trivially masked
 763         return (0, 0);
 764     }
 765
 766     my %hash;
 767
 768     # Generate bits-differing lists for each element compared against each
 769     # other element
 770     for my $i (0 .. $list_count - 2) {
 771         for my $j ($i + 1 .. $list_count - 1) {
 772             my @bits_that_differ = pop_count($list[$i] ^ $list[$j]);
 773             my $differ_count = @bits_that_differ;
 774             my $key = join ",", @bits_that_differ;
 775             push @{$hash{$differ_count}{$key}}, $list[$i] unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}};
 776             push @{$hash{$differ_count}{$key}}, $list[$j];
 777         }
 778     }
 779
 780     print STDERR __LINE__, ": calculate_mask() called:  List of values grouped by differing bits: ", Dumper \%hash if DEBUG;
 781
 782     my @final_results;
 783     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 784         my $need = 2 ** $count;     # Need 8 values for 3 differing bits, etc
 785         foreach my $bits (sort keys $hash{$count}) {
 786
 787             print STDERR __LINE__, ": For $count bit(s) difference ($bits), need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
 788
 789             # Look only as long as there are at least as many elements in the
 790             # subset as are needed
 791             while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) {
 792
 793                 print STDERR __LINE__, ": Looking at bit positions ($bits): ", Dumper $hash{$count}{$bits} if DEBUG;
 794
 795                 # Start with the first element in it
 796                 my $try_base = $hash{$count}{$bits}[0];
 797                 my @subset = $try_base;
 798
 799                 # If it succeeds, we return a mask and a base to compare
 800                 # against the masked value.  That base will be the AND of
 801                 # every element in the subset.  Initialize to the one element
 802                 # we have so far.
 803                 my $compare = $try_base;
 804
 805                 # We are trying to find a subset of this that has <need>
 806                 # elements that differ in the bit positions given by the
 807                 # string $bits, which is comma separated.
 808                 my @bits = split ",", $bits;
 809
 810                 TRY: # Look through the remainder of the list for other
 811                      # elements that differ only by these bit positions.
 812
 813                 for (my $i = 1; $i < $cur_count; $i++) {
 814                     my $try_this = $hash{$count}{$bits}[$i];
 815                     my @positions = pop_count($try_base ^ $try_this);
 816
 817                     print STDERR __LINE__, ": $try_base vs $try_this: is (", join(',', @positions), ") a subset of ($bits)?" if DEBUG;;
 818
 819                     foreach my $pos (@positions) {
 820                         unless (grep { $pos == $_ } @bits) {
 821                             print STDERR "  No\n" if DEBUG;
 822                             my $remaining = $cur_count - $i - 1;
 823                             if ($remaining && @subset + $remaining < $need) {
 824                                 print STDERR __LINE__, ": Can stop trying $try_base, because even if all the remaining $remaining values work, they wouldn't add up to the needed $need when combined with the existing ", scalar @subset, " ones\n" if DEBUG;
 825                                 last TRY;
 826                             }
 827                             next TRY;
 828                         }
 829                     }
 830
 831                     print STDERR "  Yes\n" if DEBUG;
 832                     push @subset, $try_this;
 833
 834                     # Add this to the mask base, in case it ultimately
 835                     # succeeds,
 836                     $compare &= $try_this;
 837                 }
 838
 839                 print STDERR __LINE__, ": subset (", join(", ", @subset), ") has ", scalar @subset, " elements; needs $need\n" if DEBUG;
 840
 841                 if (@subset < $need) {
 842                     shift @{$hash{$count}{$bits}};
 843                     next;   # Try with next value
 844                 }
 845
 846                 # Create the mask
 847                 my $mask = 0;
 848                 foreach my $position (@bits) {
 849                     $mask |= 1 << $position;
 850                 }
 851                 $mask = ~$mask & 0xFF;
 852                 push @final_results, [$compare, $mask];
 853
 854                 printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n", __LINE__, $compare, $compare, $mask if DEBUG;
 855
 856                 # These values are now spoken for.  Remove them from future
 857                 # consideration
 858                 foreach my $remove_count (sort keys %hash) {
 859                     foreach my $bits (sort keys %{$hash{$remove_count}}) {
 860                         foreach my $to_remove (@subset) {
 861                             @{$hash{$remove_count}{$bits}} = grep { $_ != $to_remove } @{$hash{$remove_count}{$bits}};
 862                         }
 863                     }
 864                 }
 865             }
 866         }
 867     }
 868
 869     # Any values that remain in the list are ones that have to be tested for
 870     # individually.
 871     my @individuals;
 872     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 873         foreach my $bits (sort keys $hash{$count}) {
 874             foreach my $remaining (@{$hash{$count}{$bits}}) {
 875
 876                 # If we already know about this value, just ignore it.
 877                 next if grep { $remaining == $_ } @individuals;
 878
 879                 # Otherwise it needs to be returned as something to match
 880                 # individually
 881                 push @final_results, [$remaining, undef];
 882                 push @individuals, $remaining;
 883             }
 884         }
 885     }
 886
 887     # Sort by increasing numeric value
 888     @final_results = sort { $a->[0] <=> $b->[0] } @final_results;
 889
 890     print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG;
 891
 892     return @final_results;
 893 }
 894
 895 # _cond_as_str
 896 # turn a list of conditions into a text expression
 897 # - merges ranges of conditions, and joins the result with ||
 898 sub _cond_as_str {
 899     my ( $self, $op, $combine, $opts_ref )= @_;
 900     my $cond= $op->{vals};
 901     my $test= $op->{test};
 902     my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
 903     return "( $test )" if !defined $cond;
 904
 905     # rangify the list.
 906     my @ranges;
 907     my $Update= sub {
 908         # We skip this if there are optimizations that
 909         # we can apply (below) to the individual ranges
 910         if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
 911             if ( $ranges[-1][0] == $ranges[-1][1] ) {
 912                 $ranges[-1]= $ranges[-1][0];
 913             } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
 914                 $ranges[-1]= $ranges[-1][0];
 915                 push @ranges, $ranges[-1] + 1;
 916             }
 917         }
 918     };
 919     for my $condition ( @$cond ) {
 920         if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
 921             $Update->();
 922             push @ranges, [ $condition, $condition ];
 923         } else {
 924             $ranges[-1][1]++;
 925         }
 926     }
 927     $Update->();
 928
 929     return $self->_combine( $test, @ranges )
 930       if $combine;
 931
 932     if ($is_cp_ret) {
 933         @ranges= map {
 934             ref $_
 935             ? sprintf(
 936                 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
 937                 @$_ )
 938             : sprintf( "$self->{val_fmt} == $test", $_ );
 939         } @ranges;
 940
 941         return "( " . join( " || ", @ranges ) . " )";
 942     }
 943
 944     # If the input set has certain characteristics, we can optimize tests
 945     # for it.  This doesn't apply if returning the code point, as we want
 946     # each element of the set individually.  The code above is for this
 947     # simpler case.
 948
 949     return 1 if @$cond == 256;  # If all bytes match, is trivially true
 950
 951     my @masks;
 952     if (@ranges > 1) {
 953
 954         # See if the entire set shares optimizable characteristics, and if so,
 955         # return the optimization.  We delay checking for this on sets with
 956         # just a single range, as there may be better optimizations available
 957         # in that case.
 958         @masks = calculate_mask(@$cond);
 959
 960         # Stringify the output of calculate_mask()
 961         if (@masks) {
 962             my @return;
 963             foreach my $mask_ref (@masks) {
 964                 if (defined $mask_ref->[1]) {
 965                     push @return, sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask_ref->[1], $mask_ref->[0];
 966                 }
 967                 else {  # An undefined mask means to use the value as-is
 968                     push @return, sprintf "$test == $self->{val_fmt}", $mask_ref->[0];
 969                 }
 970             }
 971
 972             # The best possible case below for specifying this set of values via
 973             # ranges is 1 branch per range.  If our mask method yielded better
 974             # results, there is no sense trying something that is bound to be
 975             # worse.
 976             if (@return < @ranges) {
 977                 return "( " . join( " || ", @return ) . " )";
 978             }
 979
 980             @masks = @return;
 981         }
 982     }
 983
 984     # Here, there was no entire-class optimization that was clearly better
 985     # than doing things by ranges.  Look at each range.
 986     my $range_count_extra = 0;
 987     for (my $i = 0; $i < @ranges; $i++) {
 988         if (! ref $ranges[$i]) {    # Trivial case: no range
 989             $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
 990         }
 991         elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
 992             $ranges[$i] =           # Trivial case: single element range
 993                     sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
 994         }
 995         else {
 996             my $output = "";
 997
 998             # Well-formed UTF-8 continuation bytes on ascii platforms must be
 999             # in the range 0x80 .. 0xBF.  If we know that the input is
1000             # well-formed (indicated by not trying to be 'safe'), we can omit
1001             # tests that verify that the input is within either of these
1002             # bounds.  (No legal UTF-8 character can begin with anything in
1003             # this range, so we don't have to worry about this being a
1004             # continuation byte or not.)
1005             if (ASCII_PLATFORM
1006                 && ! $opts_ref->{safe}
1007                 && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
1008             {
1009                 my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
1010                 my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
1011
1012                 # If the range is the entire legal range, it matches any legal
1013                 # byte, so we can omit both tests.  (This should happen only
1014                 # if the number of ranges is 1.)
1015                 if ($lower_limit_is_80 && $upper_limit_is_BF) {
1016                     return 1;
1017                 }
1018                 elsif ($lower_limit_is_80) { # Just use the upper limit test
1019                     $output = sprintf("( $test <= $self->{val_fmt} )",
1020                                         $ranges[$i]->[1]);
1021                 }
1022                 elsif ($upper_limit_is_BF) { # Just use the lower limit test
1023                     $output = sprintf("( $test >= $self->{val_fmt} )",
1024                                     $ranges[$i]->[0]);
1025                 }
1026             }
1027
1028             # If we didn't change to omit a test above, see if the number of
1029             # elements is a power of 2 (only a single bit in the
1030             # representation of its count will be set) and if so, it may be
1031             # that a mask/compare optimization is possible.
1032             if ($output eq ""
1033                 && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
1034             {
1035                 my @list;
1036                 push @list, $_  for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
1037                 my @this_masks = calculate_mask(@list);
1038
1039                 # Use the mask if there is just one for the whole range.
1040                 # Otherwise there is no savings over the two branches that can
1041                 # define the range.
1042                 if (@this_masks == 1 && defined $this_masks[0][1]) {
1043                     $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $this_masks[0][1], $this_masks[0][0];
1044                 }
1045             }
1046
1047             if ($output ne "") {  # Prefer any optimization
1048                 $ranges[$i] = $output;
1049             }
1050             else {
1051                 # No optimization happened.  We need a test that the code
1052                 # point is within both bounds.  But, if the bounds are
1053                 # adjacent code points, it is cleaner to say
1054                 # 'first == test || second == test'
1055                 # than it is to say
1056                 # 'first <= test && test <= second'
1057
1058                 $range_count_extra++;   # This range requires 2 branches to
1059                                         # represent
1060                 if ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
1061                     $ranges[$i] = "( "
1062                                 .  join( " || ", ( map
1063                                     { sprintf "$self->{val_fmt} == $test", $_ }
1064                                     @{$ranges[$i]} ) )
1065                                 . " )";
1066                 }
1067                 else {  # Full bounds checking
1068                     $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
1069                 }
1070             }
1071         }
1072     }
1073
1074     # We have generated the list of bytes in two ways; one trying to use masks
1075     # to cut the number of branches down, and the other to look at individual
1076     # ranges (some of which could be cut down by using a mask for just it).
1077     # We return whichever method uses the fewest branches.
1078     return "( "
1079            . join( " || ", (@masks && @masks < @ranges + $range_count_extra)
1080                             ? @masks
1081                             : @ranges)
1082            . " )";
1083 }
1084
1085 # _combine
1086 # recursively turn a list of conditions into a fast break-out condition
1087 # used by _cond_as_str() for 'cp' type macros.
1088 sub _combine {
1089     my ( $self, $test, @cond )= @_;
1090     return if !@cond;
1091     my $item= shift @cond;
1092     my ( $cstr, $gtv );
1093     if ( ref $item ) {
1094         $cstr=
1095           sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
1096             @$item );
1097         $gtv= sprintf "$self->{val_fmt}", $item->[1];
1098     } else {
1099         $cstr= sprintf( "$self->{val_fmt} == $test", $item );
1100         $gtv= sprintf "$self->{val_fmt}", $item;
1101     }
1102     if ( @cond ) {
1103         my $combine= $self->_combine( $test, @cond );
1104         if (@cond >1) {
1105             return "( $cstr || ( $gtv < $test &&\n"
1106                    . $combine . " ) )";
1107         } else {
1108             return "( $cstr || $combine )";
1109         }
1110     } else {
1111         return $cstr;
1112     }
1113 }
1114
1115 # _render()
1116 # recursively convert an optree to text with reasonably neat formatting
1117 sub _render {
1118     my ( $self, $op, $combine, $brace, $opts_ref, $def, $submacros )= @_;
1119     return 0 if ! defined $op;  # The set is empty
1120     if ( !ref $op ) {
1121         return $op;
1122     }
1123     my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
1124     #no warnings 'recursion';   # This would allow really really inefficient
1125                                 # code to be generated.  See pod
1126     my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref, $def, $submacros );
1127     return $yes if $cond eq '1';
1128
1129     my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref, $def, $submacros );
1130     return "( $cond )" if $yes eq '1' and $no eq '0';
1131     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
1132     return "$lb$cond ? $yes : $no$rb"
1133       if !ref( $op->{yes} ) && !ref( $op->{no} );
1134     my $ind1= " " x 4;
1135     my $ind= "\n" . ( $ind1 x $op->{depth} );
1136
1137     if ( ref $op->{yes} ) {
1138         $yes= $ind . $ind1 . $yes;
1139     } else {
1140         $yes= " " . $yes;
1141     }
1142
1143     my $str= "$lb$cond ?$yes$ind: $no$rb";
1144     if (length $str > 6000) {
1145         push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $yes_idx= 0+@$submacros), $yes;
1146         push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $no_idx= 0+@$submacros), $no;
1147         return sprintf "%s%s ? $def : $def%s", $lb, $cond, "_part$yes_idx", "_part$no_idx", $rb;
1148     }
1149     return $str;
1150 }
1151
1152 # $expr=render($op,$combine)
1153 #
1154 # convert an optree to text with reasonably neat formatting. If $combine
1155 # is true then the condition is created using "fast breakouts" which
1156 # produce uglier expressions that are more efficient for common case,
1157 # longer lists such as that resulting from type 'cp' output.
1158 # Currently only used for type 'cp' macros.
1159 sub render {
1160     my ( $self, $op, $combine, $opts_ref, $def_fmt )= @_;
1161
1162     my @submacros;
1163     my $macro= sprintf "#define $def_fmt\n( %s )", "", $self->_render( $op, $combine, 0, $opts_ref, $def_fmt, \@submacros );
1164
1165     return join "\n\n", map { "/*** GENERATED CODE ***/\n" . __macro( __clean( $_ ) ) } @submacros, $macro;
1166 }
1167
1168 # make_macro
1169 # make a macro of a given type.
1170 # calls into make_trie and (generic_|length_)optree as needed
1171 # Opts are:
1172 # type     : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
1173 # ret_type : 'cp' or 'len'
1174 # safe     : add length guards to macro
1175 #
1176 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
1177 # in which case it defaults to 'cp' as well.
1178 #
1179 # it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
1180 # sequences in it, as the generated macro will accept only a single codepoint
1181 # as an argument.
1182 #
1183 # returns the macro.
1184
1185
1186 sub make_macro {
1187     my $self= shift;
1188     my %opts= @_;
1189     my $type= $opts{type} || 'generic';
1190     die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
1191       if $type =~ /^cp/
1192       and $self->{has_multi};
1193     my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' );
1194     my $method;
1195     if ( $opts{safe} ) {
1196         $method= 'length_optree';
1197     } elsif ( $type =~ /generic/ ) {
1198         $method= 'generic_optree';
1199     } else {
1200         $method= 'optree';
1201     }
1202     my @args= $type =~ /^cp/ ? 'cp' : 's';
1203     push @args, "e" if $opts{safe};
1204     push @args, "is_utf8" if $type =~ /generic/;
1205     push @args, "len" if $ret_type eq 'both';
1206     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
1207              $ret_type eq 'cp'      ? 'what_'     : 'is_';
1208     my $ext= $type     =~ /generic/ ? ''          : '_' . lc( $type );
1209     $ext .= '_non_low' if $type eq 'generic_non_low';
1210     $ext .= "_safe" if $opts{safe};
1211     my $argstr= join ",", @args;
1212     my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
1213     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
1214     return $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts, $def_fmt );
1215 }
1216
1217 # if we aren't being used as a module (highly likely) then process
1218 # the __DATA__ below and produce macros in regcharclass.h
1219 # if an argument is provided to the script then it is assumed to
1220 # be the path of the file to output to, if the arg is '-' outputs
1221 # to STDOUT.
1222 if ( !caller ) {
1223     $|++;
1224     my $path= shift @ARGV || "regcharclass.h";
1225     my $out_fh;
1226     if ( $path eq '-' ) {
1227         $out_fh= \*STDOUT;
1228     } else {
1229         $out_fh = open_new( $path );
1230     }
1231     print $out_fh read_only_top( lang => 'C', by => $0,
1232                                  file => 'regcharclass.h', style => '*',
1233                                  copyright => [2007, 2011] );
1234     print $out_fh "\n#ifndef H_REGCHARCLASS   /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
1235
1236     my ( $op, $title, @txt, @types, %mods );
1237     my $doit= sub {
1238         return unless $op;
1239
1240         # Skip if to compile on a different platform.
1241         return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
1242         return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
1243
1244         print $out_fh "/*\n\t$op: $title\n\n";
1245         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1246         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
1247
1248         #die Dumper(\@types,\%mods);
1249
1250         my @mods;
1251         push @mods, 'safe' if delete $mods{safe};
1252         unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
1253                                                                 # do this one
1254                                                                 # first, as
1255                                                                 # traditional
1256         if (%mods) {
1257             die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods;
1258         }
1259
1260         foreach my $type_spec ( @types ) {
1261             my ( $type, $ret )= split /-/, $type_spec;
1262             $ret ||= 'len';
1263             foreach my $mod ( @mods ) {
1264                 next if $mod eq 'safe' and $type =~ /^cp/;
1265                 delete $mods{$mod};
1266                 my $macro= $obj->make_macro(
1267                     type     => $type,
1268                     ret_type => $ret,
1269                     safe     => $mod eq 'safe'
1270                 );
1271                 print $out_fh $macro, "\n";
1272             }
1273         }
1274     };
1275
1276     while ( <DATA> ) {
1277         s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
1278         next unless /\S/;
1279         chomp;
1280         if ( /^[A-Z]/ ) {
1281             $doit->();  # This starts a new definition; do the previous one
1282             ( $op, $title )= split /\s*:\s*/, $_, 2;
1283             @txt= ();
1284         } elsif ( s/^=>// ) {
1285             my ( $type, $modifier )= split /:/, $_;
1286             @types= split ' ', $type;
1287             undef %mods;
1288             map { $mods{$_} = 1 } split ' ',  $modifier;
1289         } else {
1290             push @txt, "$_";
1291         }
1292     }
1293     $doit->();
1294
1295     print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
1296
1297     if($path eq '-') {
1298         print $out_fh "/* ex: set ro: */\n";
1299     } else {
1300         read_only_bottom_close_and_rename($out_fh)
1301     }
1302 }
1303
1304 # The form of the input is a series of definitions to make macros for.
1305 # The first line gives the base name of the macro, followed by a colon, and
1306 # then text to be used in comments associated with the macro that are its
1307 # title or description.  In all cases the first (perhaps only) parameter to
1308 # the macro is a pointer to the first byte of the code point it is to test to
1309 # see if it is in the class determined by the macro.  In the case of non-UTF8,
1310 # the code point consists only of a single byte.
1311 #
1312 # The second line must begin with a '=>' and be followed by the types of
1313 # macro(s) to be generated; these are specified below.  A colon follows the
1314 # types, followed by the modifiers, also specified below.  At least one
1315 # modifier is required.
1316 #
1317 # The subsequent lines give what code points go into the class defined by the
1318 # macro.  Multiple characters may be specified via a string like "\x0D\x0A",
1319 # enclosed in quotes.  Otherwise the lines consist of one of:
1320 #   1)  a single Unicode code point, prefaced by 0x
1321 #   2)  a single range of Unicode code points separated by a minus (and
1322 #       optional space)
1323 #   3)  a single Unicode property specified in the standard Perl form
1324 #       "\p{...}"
1325 #   4)  a line like 'do path'.  This will do a 'do' on the file given by
1326 #       'path'.  It is assumed that this does nothing but load subroutines
1327 #       (See item 5 below).  The reason 'require path' is not used instead is
1328 #       because 'do' doesn't assume that path is in @INC.
1329 #   5)  a subroutine call
1330 #           &pkg::foo(arg1, ...)
1331 #       where pkg::foo was loaded by a 'do' line (item 4).  The subroutine
1332 #       returns an array of entries of forms like items 1-3 above.  This
1333 #       allows more complex inputs than achievable from the other input types.
1334 #
1335 # A blank line or one whose first non-blank character is '#' is a comment.
1336 # The definition of the macro is terminated by a line unlike those described.
1337 #
1338 # Valid types:
1339 #   low         generate a macro whose name is 'is_BASE_low' and defines a
1340 #               class that includes only ASCII-range chars.  (BASE is the
1341 #               input macro base name.)
1342 #   latin1      generate a macro whose name is 'is_BASE_latin1' and defines a
1343 #               class that includes only upper-Latin1-range chars.  It is not
1344 #               designed to take a UTF-8 input parameter.
1345 #   high        generate a macro whose name is 'is_BASE_high' and defines a
1346 #               class that includes all relevant code points that are above
1347 #               the Latin1 range.  This is for very specialized uses only.
1348 #               It is designed to take only an input UTF-8 parameter.
1349 #   utf8        generate a macro whose name is 'is_BASE_utf8' and defines a
1350 #               class that includes all relevant characters that aren't ASCII.
1351 #               It is designed to take only an input UTF-8 parameter.
1352 #   LATIN1      generate a macro whose name is 'is_BASE_latin1' and defines a
1353 #               class that includes both ASCII and upper-Latin1-range chars.
1354 #               It is not designed to take a UTF-8 input parameter.
1355 #   UTF8        generate a macro whose name is 'is_BASE_utf8' and defines a
1356 #               class that can include any code point, adding the 'low' ones
1357 #               to what 'utf8' works on.  It is designed to take only an input
1358 #               UTF-8 parameter.
1359 #   generic     generate a macro whose name is 'is_BASE".  It has a 2nd,
1360 #               boolean, parameter which indicates if the first one points to
1361 #               a UTF-8 string or not.  Thus it works in all circumstances.
1362 #   generic_non_low generate a macro whose name is 'is_BASE_non_low".  It has
1363 #               a 2nd, boolean, parameter which indicates if the first one
1364 #               points to a UTF-8 string or not.  It excludes any ASCII-range
1365 #               matches, but otherwise it works in all circumstances.
1366 #   cp          generate a macro whose name is 'is_BASE_cp' and defines a
1367 #               class that returns true if the UV parameter is a member of the
1368 #               class; false if not.
1369 #   cp_high     like cp, but it is assumed that it is known that the UV
1370 #               parameter is above Latin1.  The name of the generated macro is
1371 #               'is_BASE_cp_high'.  This is different from high-cp, derived
1372 #               below.
1373 # A macro of the given type is generated for each type listed in the input.
1374 # The default return value is the number of octets read to generate the match.
1375 # Append "-cp" to the type to have it instead return the matched codepoint.
1376 #               The macro name is changed to 'what_BASE...'.  See pod for
1377 #               caveats
1378 # Appending '-both" instead adds an extra parameter to the end of the argument
1379 #               list, which is a pointer as to where to store the number of
1380 #               bytes matched, while also returning the code point.  The macro
1381 #               name is changed to 'what_len_BASE...'.  See pod for caveats
1382 #
1383 # Valid modifiers:
1384 #   safe        The input string is not necessarily valid UTF-8.  In
1385 #               particular an extra parameter (always the 2nd) to the macro is
1386 #               required, which points to one beyond the end of the string.
1387 #               The macro will make sure not to read off the end of the
1388 #               string.  In the case of non-UTF8, it makes sure that the
1389 #               string has at least one byte in it.  The macro name has
1390 #               '_safe' appended to it.
1391 #   fast        The input string is valid UTF-8.  No bounds checking is done,
1392 #               and the macro can make assumptions that lead to faster
1393 #               execution.
1394 #   only_ascii_platform   Skip this definition if this program is being run on
1395 #               a non-ASCII platform.
1396 #   only_ebcdic_platform  Skip this definition if this program is being run on
1397 #               a non-EBCDIC platform.
1398 # No modifier need be specified; fast is assumed for this case.  If both
1399 # 'fast', and 'safe' are specified, two macros will be created for each
1400 # 'type'.
1401 #
1402 # If run on a non-ASCII platform will automatically convert the Unicode input
1403 # to native.  The documentation above is slightly wrong in this case.  'low'
1404 # actually refers to code points whose UTF-8 representation is the same as the
1405 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1406 # code points less than 256.
1407
1408 1; # in the unlikely case we are being used as a module
1409
1410 __DATA__
1411 # This is no longer used, but retained in case it is needed some day.
1412 # TRICKYFOLD: Problematic fold case letters.  When adding to this list, also should add them to regcomp.c and fold_grind.t
1413 # => generic cp generic-cp generic-both :fast safe
1414 # 0x00DF        # LATIN SMALL LETTER SHARP S
1415 # 0x0390        # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1416 # 0x03B0        # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1417 # 0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1418 # 0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1419 # 0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1420
1421 LNBREAK: Line Break: \R
1422 => generic UTF8 LATIN1 :fast safe
1423 "\x0D\x0A"      # CRLF - Network (Windows) line ending
1424 \p{VertSpace}
1425
1426 HORIZWS: Horizontal Whitespace: \h \H
1427 => generic UTF8 LATIN1 high cp cp_high :fast safe
1428 \p{HorizSpace}
1429
1430 VERTWS: Vertical Whitespace: \v \V
1431 => generic UTF8 high LATIN1 cp cp_high :fast safe
1432 \p{VertSpace}
1433
1434 XDIGIT: Hexadecimal digits
1435 => UTF8 high cp_high :fast
1436 \p{XDigit}
1437
1438 XPERLSPACE: \p{XPerlSpace}
1439 => generic UTF8 high cp_high :fast
1440 \p{XPerlSpace}
1441
1442 REPLACEMENT: Unicode REPLACEMENT CHARACTER
1443 => UTF8 :safe
1444 0xFFFD
1445
1446 NONCHAR: Non character code points
1447 => UTF8 :fast
1448 \p{Nchar}
1449
1450 SURROGATE: Surrogate characters
1451 => UTF8 :fast
1452 \p{Gc=Cs}
1453
1454 GCB_L: Grapheme_Cluster_Break=L
1455 => UTF8 :fast
1456 \p{_X_GCB_L}
1457
1458 GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
1459 => UTF8 :fast
1460 \p{_X_LV_LVT_V}
1461
1462 GCB_Prepend: Grapheme_Cluster_Break=Prepend
1463 => UTF8 :fast
1464 \p{_X_GCB_Prepend}
1465
1466 GCB_RI: Grapheme_Cluster_Break=RI
1467 => UTF8 :fast
1468 \p{_X_RI}
1469
1470 GCB_SPECIAL_BEGIN_START: Grapheme_Cluster_Break=special_begin_starts
1471 => UTF8 :fast
1472 \p{_X_Special_Begin_Start}
1473
1474 GCB_T: Grapheme_Cluster_Break=T
1475 => UTF8 :fast
1476 \p{_X_GCB_T}
1477
1478 GCB_V: Grapheme_Cluster_Break=V
1479 => UTF8 :fast
1480 \p{_X_GCB_V}
1481
1482 # This program was run with this enabled, and the results copied to utf8.h;
1483 # then this was commented out because it takes so long to figure out these 2
1484 # million code points.  The results would not change unless utf8.h decides it
1485 # wants a maximum other than 4 bytes, or this program creates better
1486 # optimizations
1487 #UTF8_CHAR: Matches utf8 from 1 to 4 bytes
1488 #=> UTF8 :safe only_ascii_platform
1489 #0x0 - 0x1FFFFF
1490
1491 # This hasn't been commented out, because we haven't an EBCDIC platform to run
1492 # it on, and the 3 types of EBCDIC allegedly supported by Perl would have
1493 # different results
1494 UTF8_CHAR: Matches utf8 from 1 to 5 bytes
1495 => UTF8 :safe only_ebcdic_platform
1496 0x0 - 0x3FFFFF:
1497
1498 QUOTEMETA: Meta-characters that \Q should quote
1499 => high :fast
1500 \p{_Perl_Quotemeta}
1501
1502 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1503 => UTF8 :safe
1504 do regen/regcharclass_multi_char_folds.pl
1505
1506 # 1 => All folds
1507 &regcharclass_multi_char_folds::multi_char_folds(1)
1508
1509 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1510 => LATIN1 :safe
1511
1512 &regcharclass_multi_char_folds::multi_char_folds(0)
1513 # 0 => Latin1-only
1514
1515 PATWS: pattern white space
1516 => generic generic_non_low cp : fast safe
1517 \p{PatWS}