regen/regcharclass.pl

   1 #!perl
   2 package CharClass::Matcher;
   3 use strict;
   4 use 5.008;
   5 use warnings;
   6 use warnings FATAL => 'all';
   7 no warnings 'experimental::autoderef';
   8 use Data::Dumper;
   9 $Data::Dumper::Useqq= 1;
  10 our $hex_fmt= "0x%02X";
  11
  12 sub DEBUG () { 0 }
  13 $|=1 if DEBUG;
  14
  15 require 'regen/regen_lib.pl';
  16 require 'regen/charset_translations.pl';
  17 require "regen/regcharclass_multi_char_folds.pl";
  18
  19 =head1 NAME
  20
  21 CharClass::Matcher -- Generate C macros that match character classes efficiently
  22
  23 =head1 SYNOPSIS
  24
  25     perl Porting/regcharclass.pl
  26
  27 =head1 DESCRIPTION
  28
  29 Dynamically generates macros for detecting special charclasses
  30 in latin-1, utf8, and codepoint forms. Macros can be set to return
  31 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
  32
  33 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
  34 are necessary.
  35
  36 Using WHATEVER as an example the following macros can be produced, depending
  37 on the input parameters (how to get each is described by internal comments at
  38 the C<__DATA__> line):
  39
  40 =over 4
  41
  42 =item C<is_WHATEVER(s,is_utf8)>
  43
  44 =item C<is_WHATEVER_safe(s,e,is_utf8)>
  45
  46 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
  47 comparisons involving octect<128 are done before checking the C<is_utf8>
  48 flag, hopefully saving time.
  49
  50 The version without the C<_safe> suffix should be used only when the input is
  51 known to be well-formed.
  52
  53 =item C<is_WHATEVER_utf8(s)>
  54
  55 =item C<is_WHATEVER_utf8_safe(s,e)>
  56
  57 Do a lookup assuming the string is encoded in (normalized) UTF8.
  58
  59 The version without the C<_safe> suffix should be used only when the input is
  60 known to be well-formed.
  61
  62 =item C<is_WHATEVER_latin1(s)>
  63
  64 =item C<is_WHATEVER_latin1_safe(s,e)>
  65
  66 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  67
  68 The version without the C<_safe> suffix should be used only when it is known
  69 that C<s> contains at least one character.
  70
  71 =item C<is_WHATEVER_cp(cp)>
  72
  73 Check to see if the string matches a given codepoint (hypothetically a
  74 U32). The condition is constructed as to "break out" as early as
  75 possible if the codepoint is out of range of the condition.
  76
  77 IOW:
  78
  79   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  80
  81 Thus if the character is X+1 only two comparisons will be done. Making
  82 matching lookups slower, but non-matching faster.
  83
  84 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
  85
  86 A variant form of each of the macro types described above can be generated, in
  87 which the code point is returned by the macro, and an extra parameter (in the
  88 final position) is added, which is a pointer for the macro to set the byte
  89 length of the returned code point.
  90
  91 These forms all have a C<what_len> prefix instead of the C<is_>, for example
  92 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
  93 C<what_len_WHATEVER_utf8(s,len)>.
  94
  95 These forms should not be used I<except> on small sets of mostly widely
  96 separated code points; otherwise the code generated is inefficient.  For these
  97 cases, it is best to use the C<is_> forms, and then find the code point with
  98 C<utf8_to_uvchr_buf>().  This program can fail with a "deep recursion"
  99 message on the worst of the inappropriate sets.  Examine the generated macro
 100 to see if it is acceptable.
 101
 102 =item C<what_WHATEVER_FOO(arg1, ...)>
 103
 104 A variant form of each of the C<is_> macro types described above can be generated, in
 105 which the code point and not the length is returned by the macro.  These have
 106 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
 107 not be used where the set contains a NULL, as 0 is returned for two different
 108 cases: a) the set doesn't include the input code point; b) the set does
 109 include it, and it is a NULL.
 110
 111 =back
 112
 113 The above isn't quite complete, as for specialized purposes one can get a
 114 macro like C<is_WHATEVER_utf8_no_length_checks(s)>, which assumes that it is
 115 already known that there is enough space to hold the character starting at
 116 C<s>, but otherwise checks that it is well-formed.  In other words, this is
 117 intermediary in checking between C<is_WHATEVER_utf8(s)> and
 118 C<is_WHATEVER_utf8_safe(s,e)>.
 119
 120 =head2 CODE FORMAT
 121
 122 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
 123
 124
 125 =head1 AUTHOR
 126
 127 Author: Yves Orton (demerphq) 2007.  Maintained by Perl5 Porters.
 128
 129 =head1 BUGS
 130
 131 No tests directly here (although the regex engine will fail tests
 132 if this code is broken). Insufficient documentation and no Getopts
 133 handler for using the module as a script.
 134
 135 =head1 LICENSE
 136
 137 You may distribute under the terms of either the GNU General Public
 138 License or the Artistic License, as specified in the README file.
 139
 140 =cut
 141
 142 # Sub naming convention:
 143 # __func : private subroutine, can not be called as a method
 144 # _func  : private method, not meant for external use
 145 # func   : public method.
 146
 147 # private subs
 148 #-------------------------------------------------------------------------------
 149 #
 150 # ($cp,$n,$l,$u)=__uni_latin($str);
 151 #
 152 # Return a list of arrays, each of which when interpreted correctly
 153 # represent the string in some given encoding with specific conditions.
 154 #
 155 # $cp - list of codepoints that make up the string.
 156 # $n  - list of octets that make up the string if all codepoints are invariant
 157 #       regardless of if the string is in UTF-8 or not.
 158 # $l  - list of octets that make up the string in latin1 encoding if all
 159 #       codepoints < 256, and at least one codepoint is UTF-8 variant.
 160 # $u  - list of octets that make up the string in utf8 if any codepoint is
 161 #       UTF-8 variant
 162 #
 163 #   High CP | Defined
 164 #-----------+----------
 165 #   0 - 127 : $n            (127/128 are the values for ASCII platforms)
 166 # 128 - 255 : $l, $u
 167 # 256 - ... : $u
 168 #
 169
 170 sub __uni_latin1 {
 171     my $charset= shift;
 172     my $str= shift;
 173     my $max= 0;
 174     my @cp;
 175     my @cp_high;
 176     my $only_has_invariants = 1;
 177     my $a2n = get_a2n($charset);
 178     for my $ch ( split //, $str ) {
 179         my $cp= ord $ch;
 180         $max= $cp if $max < $cp;
 181         if ($cp > 255) {
 182             push @cp, $cp;
 183             push @cp_high, $cp;
 184         }
 185         else {
 186             push @cp, $a2n->[$cp];
 187         }
 188     }
 189     my ( $n, $l, $u );
 190     $only_has_invariants = ($charset =~ /ascii/i) ? $max < 128 : $max < 160;
 191     if ($only_has_invariants) {
 192         $n= [@cp];
 193     } else {
 194         $l= [@cp] if $max && $max < 256;
 195
 196         my @u;
 197         for my $ch ( split //, $str ) {
 198             push @u, map { ord } split //, cp_2_utfbytes(ord $ch, $charset);
 199         }
 200         $u = \@u;
 201     }
 202     return ( \@cp, \@cp_high, $n, $l, $u );
 203 }
 204
 205 #
 206 # $clean= __clean($expr);
 207 #
 208 # Cleanup a ternary expression, removing unnecessary parens and apply some
 209 # simplifications using regexes.
 210 #
 211
 212 sub __clean {
 213     my ( $expr )= @_;
 214
 215     #return $expr;
 216
 217     our $parens;
 218     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 219
 220     ## remove redundant parens
 221     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 222
 223
 224     # repeatedly simplify conditions like
 225     #       ( (cond1) ? ( (cond2) ? X : Y ) : Y )
 226     # into
 227     #       ( ( (cond1) && (cond2) ) ? X : Y )
 228     # Also similarly handles expressions like:
 229     #       : (cond1) ? ( (cond2) ? X : Y ) : Y )
 230     # Note the inclusion of the close paren in ([:()]) and the open paren in ([()]) is
 231     # purely to ensure we have a balanced set of parens in the expression which makes
 232     # it easier to understand the pattern in an editor that understands paren's, we do
 233     # not expect either of these cases to actually fire. - Yves
 234     1 while $expr =~ s/
 235         ([:()])  \s*
 236             ($parens) \s*
 237             \? \s*
 238                 \( \s* ($parens) \s*
 239                     \? \s* ($parens|[^()?:\s]+?) \s*
 240                     :  \s* ($parens|[^()?:\s]+?) \s*
 241                 \) \s*
 242             : \s* \5 \s*
 243         ([()])
 244     /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
 245     #$expr=~s/\(\(U8\*\)s\)\[(\d+)\]/S$1/g if length $expr > 8000;
 246     #$expr=~s/\s+//g if length $expr > 8000;
 247
 248     die "Expression too long" if length $expr > 8000;
 249
 250     return $expr;
 251 }
 252
 253 #
 254 # $text= __macro(@args);
 255 # Join args together by newlines, and then neatly add backslashes to the end
 256 # of every  line as expected by the C pre-processor for #define's.
 257 #
 258
 259 sub __macro {
 260     my $str= join "\n", @_;
 261     $str =~ s/\s*$//;
 262     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 263     my $last= pop @lines;
 264     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 265     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 266     return $str . "\n";
 267 }
 268
 269 #
 270 # my $op=__incrdepth($op);
 271 #
 272 # take an 'op' hashref and add one to it and all its childrens depths.
 273 #
 274
 275 sub __incrdepth {
 276     my $op= shift;
 277     return unless ref $op;
 278     $op->{depth} += 1;
 279     __incrdepth( $op->{yes} );
 280     __incrdepth( $op->{no} );
 281     return $op;
 282 }
 283
 284 # join two branches of an opcode together with a condition, incrementing
 285 # the depth on the yes branch when we do so.
 286 # returns the new root opcode of the tree.
 287 sub __cond_join {
 288     my ( $cond, $yes, $no )= @_;
 289     if (ref $yes) {
 290         return {
 291             test  => $cond,
 292             yes   => __incrdepth( $yes ),
 293             no    => $no,
 294             depth => 0,
 295         };
 296     }
 297     else {
 298         return {
 299             test  => $cond,
 300             yes   => $yes,
 301             no    => __incrdepth($no),
 302             depth => 0,
 303         };
 304     }
 305 }
 306
 307 # Methods
 308
 309 # constructor
 310 #
 311 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 312 #
 313 # Create a new CharClass::Matcher object by parsing the text in
 314 # the txt array. Currently applies the following rules:
 315 #
 316 # Element starts with C<0x>, line is evaled the result treated as
 317 # a number which is passed to chr().
 318 #
 319 # Element starts with C<">, line is evaled and the result treated
 320 # as a string.
 321 #
 322 # Each string is then stored in the 'strs' subhash as a hash record
 323 # made up of the results of __uni_latin1, using the keynames
 324 # 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
 325 # 'UTF8' which hold a merge of 'low' and their lowercase equivalents.
 326 #
 327 # Size data is tracked per type in the 'size' subhash.
 328 #
 329 # Return an object
 330 #
 331 sub new {
 332     my $class= shift;
 333     my %opt= @_;
 334     for ( qw(op txt) ) {
 335         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 336           if !exists $opt{$_};
 337     }
 338
 339     my $self= bless {
 340         op    => $opt{op},
 341         title => $opt{title} || '',
 342     }, $class;
 343     foreach my $txt ( @{ $opt{txt} } ) {
 344         my $str= $txt;
 345         if ( $str =~ /^[""]/ ) {
 346             $str= eval $str;
 347         } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
 348                                     # list with its expansion
 349             my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
 350             die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
 351             foreach my $cp (hex $lower .. hex $upper) {
 352                 push @{$opt{txt}}, sprintf "0x%X", $cp;
 353             }
 354             next;
 355         } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
 356             # Otherwise undocumented, a leading N means is already in the
 357             # native character set; don't convert.
 358             $str= chr eval $str;
 359         } elsif ( $str =~ /^0x/ ) {
 360             $str= eval $str;
 361             $str = chr $str;
 362         } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
 363             my $property = $1;
 364             use Unicode::UCD qw(prop_invlist);
 365
 366             my @invlist = prop_invlist($property, '_perl_core_internal_ok');
 367             if (! @invlist) {
 368
 369                 # An empty return could mean an unknown property, or merely
 370                 # that it is empty.  Call in scalar context to differentiate
 371                 my $count = prop_invlist($property, '_perl_core_internal_ok');
 372                 die "$property not found" unless defined $count;
 373             }
 374
 375             # Replace this element on the list with the property's expansion
 376             for (my $i = 0; $i < @invlist; $i += 2) {
 377                 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
 378
 379                     # prop_invlist() returns native values; add leading 'N'
 380                     # to indicate that.
 381                     push @{$opt{txt}}, sprintf "N0x%X", $cp;
 382                 }
 383             }
 384             next;
 385         } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
 386             die "do '$1' failed: $!$@" if ! do $1 or $@;
 387             next;
 388         } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
 389             my @results = eval "$1";
 390             die "eval '$1' failed: $@" if $@;
 391             push @{$opt{txt}}, @results;
 392             next;
 393         } else {
 394             die "Unparsable line: $txt\n";
 395         }
 396         my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $opt{charset}, $str );
 397         my $UTF8= $low   || $utf8;
 398         my $LATIN1= $low || $latin1;
 399         my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
 400         #die Dumper($txt,$cp,$low,$latin1,$utf8)
 401         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 402
 403         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}=
 404           ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 );
 405         my $rec= $self->{strs}{$str};
 406         foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
 407             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 408               if $self->{strs}{$str}{$key};
 409         }
 410         $self->{has_multi} ||= @$cp > 1;
 411         $self->{has_ascii} ||= $latin1 && @$latin1;
 412         $self->{has_low}   ||= $low && @$low;
 413         $self->{has_high}  ||= !$low && !$latin1;
 414     }
 415     $self->{val_fmt}= $hex_fmt;
 416     $self->{count}= 0 + keys %{ $self->{strs} };
 417     return $self;
 418 }
 419
 420 # my $trie = make_trie($type,$maxlen);
 421 #
 422 # using the data stored in the object build a trie of a specific type,
 423 # and with specific maximum depth. The trie is made up the elements of
 424 # the given types array for each string in the object (assuming it is
 425 # not too long.)
 426 #
 427 # returns the trie, or undef if there was no relevant data in the object.
 428 #
 429
 430 sub make_trie {
 431     my ( $self, $type, $maxlen )= @_;
 432
 433     my $strs= $self->{strs};
 434     my %trie;
 435     foreach my $rec ( values %$strs ) {
 436         die "panic: unknown type '$type'"
 437           if !exists $rec->{$type};
 438         my $dat= $rec->{$type};
 439         next unless $dat;
 440         next if $maxlen && @$dat > $maxlen;
 441         my $node= \%trie;
 442         foreach my $elem ( @$dat ) {
 443             $node->{$elem} ||= {};
 444             $node= $node->{$elem};
 445         }
 446         $node->{''}= $rec->{str};
 447     }
 448     return 0 + keys( %trie ) ? \%trie : undef;
 449 }
 450
 451 sub pop_count ($) {
 452     my $word = shift;
 453
 454     # This returns a list of the positions of the bits in the input word that
 455     # are 1.
 456
 457     my @positions;
 458     my $position = 0;
 459     while ($word) {
 460         push @positions, $position if $word & 1;
 461         $position++;
 462         $word >>= 1;
 463     }
 464     return @positions;
 465 }
 466
 467 # my $optree= _optree()
 468 #
 469 # recursively convert a trie to an optree where every node represents
 470 # an if else branch.
 471 #
 472 #
 473
 474 sub _optree {
 475     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
 476     return unless defined $trie;
 477     if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
 478         die "Can't do 'cp' optree from multi-codepoint strings";
 479     }
 480     $ret_type ||= 'len';
 481     $else= 0  unless defined $else;
 482     $depth= 0 unless defined $depth;
 483
 484     # if we have an empty string as a key it means we are in an
 485     # accepting state and unless we can match further on should
 486     # return the value of the '' key.
 487     if (exists $trie->{''} ) {
 488         # we can now update the "else" value, anything failing to match
 489         # after this point should return the value from this.
 490         if ( $ret_type eq 'cp' ) {
 491             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 492             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 493         } elsif ( $ret_type eq 'len' ) {
 494             $else= $depth;
 495         } elsif ( $ret_type eq 'both') {
 496             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 497             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 498             $else= "len=$depth, $else";
 499         }
 500     }
 501     # extract the meaningful keys from the trie, filter out '' as
 502     # it means we are an accepting state (end of sequence).
 503     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 504
 505     # if we haven't any keys there is no further we can match and we
 506     # can return the "else" value.
 507     return $else if !@conds;
 508
 509     my $test = $test_type =~ /^cp/ ? "cp" : "((U8*)s)[$depth]";
 510
 511     # First we loop over the possible keys/conditions and find out what they
 512     # look like; we group conditions with the same optree together.
 513     my %dmp_res;
 514     my @res_order;
 515     local $Data::Dumper::Sortkeys=1;
 516     foreach my $cond ( @conds ) {
 517
 518         # get the optree for this child/condition
 519         my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else, $depth + 1 );
 520         # convert it to a string with Dumper
 521         my $res_code= Dumper( $res );
 522
 523         push @{$dmp_res{$res_code}{vals}}, $cond;
 524         if (!$dmp_res{$res_code}{optree}) {
 525             $dmp_res{$res_code}{optree}= $res;
 526             push @res_order, $res_code;
 527         }
 528     }
 529
 530     # now that we have deduped the optrees we construct a new optree containing the merged
 531     # results.
 532     my %root;
 533     my $node= \%root;
 534     foreach my $res_code_idx (0 .. $#res_order) {
 535         my $res_code= $res_order[$res_code_idx];
 536         $node->{vals}= $dmp_res{$res_code}{vals};
 537         $node->{test}= $test;
 538         $node->{yes}= $dmp_res{$res_code}{optree};
 539         $node->{depth}= $depth;
 540         if ($res_code_idx < $#res_order) {
 541             $node= $node->{no}= {};
 542         } else {
 543             $node->{no}= $else;
 544         }
 545     }
 546
 547     # return the optree.
 548     return \%root;
 549 }
 550
 551 # my $optree= optree(%opts);
 552 #
 553 # Convert a trie to an optree, wrapper for _optree
 554
 555 sub optree {
 556     my $self= shift;
 557     my %opt= @_;
 558     my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
 559     $opt{ret_type} ||= 'len';
 560     my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
 561     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
 562 }
 563
 564 # my $optree= generic_optree(%opts);
 565 #
 566 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 567 # sets of strings, including a branch for handling the string type check.
 568 #
 569
 570 sub generic_optree {
 571     my $self= shift;
 572     my %opt= @_;
 573
 574     $opt{ret_type} ||= 'len';
 575     my $test_type= 'depth';
 576     my $else= $opt{else} || 0;
 577
 578     my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
 579     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
 580
 581     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
 582       for $latin1, $utf8;
 583
 584     if ( $utf8 ) {
 585         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 586     } elsif ( $latin1 ) {
 587         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 588     }
 589     if ($opt{type} eq 'generic') {
 590         my $low= $self->make_trie( 'low', $opt{max_depth} );
 591         if ( $low ) {
 592             $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
 593         }
 594     }
 595
 596     return $else;
 597 }
 598
 599 # length_optree()
 600 #
 601 # create a string length guarded optree.
 602 #
 603
 604 sub length_optree {
 605     my $self= shift;
 606     my %opt= @_;
 607     my $type= $opt{type};
 608
 609     die "Can't do a length_optree on type 'cp', makes no sense."
 610       if $type =~ /^cp/;
 611
 612     my $else= ( $opt{else} ||= 0 );
 613
 614     my $method = $type =~ /generic/ ? 'generic_optree' : 'optree';
 615     if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) {
 616
 617         # Here is non-generic output (meaning that we are only generating one
 618         # type), and all things that match have the same number ('size') of
 619         # bytes.  The length guard is simply that we have that number of
 620         # bytes.
 621         my @size = keys %{$self->{size}{$type}};
 622         my $cond= "((e) - (s)) >= $size[0]";
 623         my $optree = $self->$method(%opt);
 624         $else= __cond_join( $cond, $optree, $else );
 625     }
 626     elsif ($self->{has_multi}) {
 627         my @size;
 628
 629         # Here, there can be a match of a multiple character string.  We use
 630         # the traditional method which is to have a branch for each possible
 631         # size (longest first) and test for the legal values for that size.
 632         my %sizes= (
 633             %{ $self->{size}{low}    || {} },
 634             %{ $self->{size}{latin1} || {} },
 635             %{ $self->{size}{utf8}   || {} }
 636         );
 637         if ($method eq 'generic_optree') {
 638             @size= sort { $a <=> $b } keys %sizes;
 639         } else {
 640             @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 641         }
 642         for my $size ( @size ) {
 643             my $optree= $self->$method( %opt, type => $type, max_depth => $size );
 644             my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 645             $else= __cond_join( $cond, $optree, $else );
 646         }
 647     }
 648     else {
 649         my $utf8;
 650
 651         # Here, has more than one possible size, and only matches a single
 652         # character.  For non-utf8, the needed length is 1; for utf8, it is
 653         # found by array lookup 'UTF8SKIP'.
 654
 655         # If want just the code points above 255, set up to look for those;
 656         # otherwise assume will be looking for all non-UTF-8-invariant code
 657         # poiints.
 658         my $trie_type = ($type eq 'high') ? 'high' : 'utf8';
 659
 660         # If we do want more than the 0-255 range, find those, and if they
 661         # exist...
 662         if ($opt{type} !~ /latin1/i && ($utf8 = $self->make_trie($trie_type, 0))) {
 663
 664             # ... get them into an optree, and set them up as the 'else' clause
 665             $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0 );
 666
 667             # We could make this
 668             #   UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))";
 669             # to avoid doing the UTF8SKIP and subsequent branches for invariants
 670             # that don't match.  But the current macros that get generated
 671             # have only a few things that can match past this, so I (khw)
 672             # don't think it is worth it.  (Even better would be to use
 673             # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it
 674             # if it saves a bunch.  We assume that input text likely to be
 675             # well-formed .
 676             my $cond = "LIKELY(((e) - (s)) >= UTF8SKIP(s))";
 677             $else = __cond_join($cond, $utf8, $else);
 678
 679             # For 'generic', we also will want the latin1 UTF-8 variants for
 680             # the case where the input isn't UTF-8.
 681             my $latin1;
 682             if ($method eq 'generic_optree') {
 683                 $latin1 = $self->make_trie( 'latin1', 1);
 684                 $latin1= $self->_optree( $latin1, 'depth', $opt{ret_type}, 0, 0 );
 685             }
 686
 687             # If we want the UTF-8 invariants, get those.
 688             my $low;
 689             if ($opt{type} !~ /non_low|high/
 690                 && ($low= $self->make_trie( 'low', 1)))
 691             {
 692                 $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0 );
 693
 694                 # Expand out the UTF-8 invariants as a string so that we
 695                 # can use them as the conditional
 696                 $low = $self->_cond_as_str( $low, 0, \%opt);
 697
 698                 # If there are Latin1 variants, add a test for them.
 699                 if ($latin1) {
 700                     $else = __cond_join("(! is_utf8 )", $latin1, $else);
 701                 }
 702                 elsif ($method eq 'generic_optree') {
 703
 704                     # Otherwise for 'generic' only we know that what
 705                     # follows must be valid for just UTF-8 strings,
 706                     $else->{test} = "( is_utf8 && $else->{test} )";
 707                 }
 708
 709                 # If the invariants match, we are done; otherwise we have
 710                 # to go to the 'else' clause.
 711                 $else = __cond_join($low, 1, $else);
 712             }
 713             elsif ($latin1) {   # Here, didn't want or didn't have invariants,
 714                                 # but we do have latin variants
 715                 $else = __cond_join("(! is_utf8)", $latin1, $else);
 716             }
 717
 718             # We need at least one byte available to start off the tests
 719             $else = __cond_join("LIKELY((e) > (s))", $else, 0);
 720         }
 721         else {  # Here, we don't want or there aren't any variants.  A single
 722                 # byte available is enough.
 723             my $cond= "((e) > (s))";
 724             my $optree = $self->$method(%opt);
 725             $else= __cond_join( $cond, $optree, $else );
 726         }
 727     }
 728
 729     return $else;
 730 }
 731
 732 sub calculate_mask(@) {
 733     # Look at the input list of byte values.  This routine returns an array of
 734     # mask/base pairs to generate that list.
 735
 736     my @list = @_;
 737     my $list_count = @list;
 738
 739     # Consider a set of byte values, A, B, C ....  If we want to determine if
 740     # <c> is one of them, we can write c==A || c==B || c==C ....  If the
 741     # values are consecutive, we can shorten that to A<=c && c<=Z, which uses
 742     # far fewer branches.  If only some of them are consecutive we can still
 743     # save some branches by creating range tests for just those that are
 744     # consecutive. _cond_as_str() does this work for looking for ranges.
 745     #
 746     # Another approach is to look at the bit patterns for A, B, C .... and see
 747     # if they have some commonalities.  That's what this function does.  For
 748     # example, consider a set consisting of the bytes
 749     # 0xF0, 0xF1, 0xF2, and 0xF3.  We could write:
 750     #   0xF0 <= c && c <= 0xF4
 751     # But the following mask/compare also works, and has just one test:
 752     #   (c & 0xFC) == 0xF0
 753     # The reason it works is that the set consists of exactly those bytes
 754     # whose first 4 bits are 1, and the next two are 0.  (The value of the
 755     # other 2 bits is immaterial in determining if a byte is in the set or
 756     # not.)  The mask masks out those 2 irrelevant bits, and the comparison
 757     # makes sure that the result matches all bytes which match those 6
 758     # material bits exactly.  In other words, the set of bytes contains
 759     # exactly those whose bottom two bit positions are either 0 or 1.  The
 760     # same principle applies to bit positions that are not necessarily
 761     # adjacent.  And it can be applied to bytes that differ in 1 through all 8
 762     # bit positions.  In order to be a candidate for this optimization, the
 763     # number of bytes in the set must be a power of 2.
 764     #
 765     # Consider a different example, the set 0x53, 0x54, 0x73, and 0x74.  That
 766     # requires 4 tests using either ranges or individual values, and even
 767     # though the number in the set is a power of 2, it doesn't qualify for the
 768     # mask optimization described above because the number of bits that are
 769     # different is too large for that.  However, the set can be expressed as
 770     # two branches with masks thusly:
 771     #   (c & 0xDF) == 0x53 || (c & 0xDF) == 0x54
 772     # a branch savings of 50%.  This is done by splitting the set into two
 773     # subsets each of which has 2 elements, and within each set the values
 774     # differ by 1 byte.
 775     #
 776     # This function attempts to find some way to save some branches using the
 777     # mask technique.  If not, it returns an empty list; if so, it
 778     # returns a list consisting of
 779     #   [ [compare1, mask1], [compare2, mask2], ...
 780     #     [compare_n, undef], [compare_m, undef], ...
 781     #   ]
 782     # The <mask> is undef in the above for those bytes that must be tested
 783     # for individually.
 784     #
 785     # This function does not attempt to find the optimal set.  To do so would
 786     # probably require testing all possible combinations, and keeping track of
 787     # the current best one.
 788     #
 789     # There are probably much better algorithms, but this is the one I (khw)
 790     # came up with.  We start with doing a bit-wise compare of every byte in
 791     # the set with every other byte.  The results are sorted into arrays of
 792     # all those that differ by the same bit positions.  These are stored in a
 793     # hash with the each key being the bits they differ in.  Here is the hash
 794     # for the 0x53, 0x54, 0x73, 0x74 set:
 795     # {
 796     #    4 => {
 797     #            "0,1,2,5" => [
 798     #                            83,
 799     #                            116,
 800     #                            84,
 801     #                            115
 802     #                        ]
 803     #        },
 804     #    3 => {
 805     #            "0,1,2" => [
 806     #                        83,
 807     #                        84,
 808     #                        115,
 809     #                        116
 810     #                        ]
 811     #        }
 812     #    1 => {
 813     #            5 => [
 814     #                    83,
 815     #                    115,
 816     #                    84,
 817     #                    116
 818     #                ]
 819     #        },
 820     # }
 821     #
 822     # The set consisting of values which differ in the 4 bit positions 0, 1,
 823     # 2, and 5 from some other value in the set consists of all 4 values.
 824     # Likewise all 4 values differ from some other value in the 3 bit
 825     # positions 0, 1, and 2; and all 4 values differ from some other value in
 826     # the single bit position 5.  The keys at the uppermost level in the above
 827     # hash, 1, 3, and 4, give the number of bit positions that each sub-key
 828     # below it has.  For example, the 4 key could have as its value an array
 829     # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were
 830     # such.  The best optimization will group the most values into a single
 831     # mask.  The most values will be the ones that differ in the most
 832     # positions, the ones with the largest value for the topmost key.  These
 833     # keys, are thus just for convenience of sorting by that number, and do
 834     # not have any bearing on the core of the algorithm.
 835     #
 836     # We start with an element from largest number of differing bits.  The
 837     # largest in this case is 4 bits, and there is only one situation in this
 838     # set which has 4 differing bits, "0,1,2,5".  We look for any subset of
 839     # this set which has 16 values that differ in these 4 bits.  There aren't
 840     # any, because there are only 4 values in the entire set.  We then look at
 841     # the next possible thing, which is 3 bits differing in positions "0,1,2".
 842     # We look for a subset that has 8 values that differ in these 3 bits.
 843     # Again there are none.  So we go to look for the next possible thing,
 844     # which is a subset of 2**1 values that differ only in bit position 5.  83
 845     # and 115 do, so we calculate a mask and base for those and remove them
 846     # from every set.  Since there is only the one set remaining, we remove
 847     # them from just this one.  We then look to see if there is another set of
 848     # 2 values that differ in bit position 5.  84 and 116 do, so we calculate
 849     # a mask and base for those and remove them from every set (again only
 850     # this set remains in this example).  The set is now empty, and there are
 851     # no more sets to look at, so we are done.
 852
 853     if ($list_count == 256) {   # All 256 is trivially masked
 854         return (0, 0);
 855     }
 856
 857     my %hash;
 858
 859     # Generate bits-differing lists for each element compared against each
 860     # other element
 861     for my $i (0 .. $list_count - 2) {
 862         for my $j ($i + 1 .. $list_count - 1) {
 863             my @bits_that_differ = pop_count($list[$i] ^ $list[$j]);
 864             my $differ_count = @bits_that_differ;
 865             my $key = join ",", @bits_that_differ;
 866             push @{$hash{$differ_count}{$key}}, $list[$i] unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}};
 867             push @{$hash{$differ_count}{$key}}, $list[$j];
 868         }
 869     }
 870
 871     print STDERR __LINE__, ": calculate_mask() called:  List of values grouped by differing bits: ", Dumper \%hash if DEBUG;
 872
 873     my @final_results;
 874     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 875         my $need = 2 ** $count;     # Need 8 values for 3 differing bits, etc
 876         foreach my $bits (sort keys $hash{$count}) {
 877
 878             print STDERR __LINE__, ": For $count bit(s) difference ($bits), need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
 879
 880             # Look only as long as there are at least as many elements in the
 881             # subset as are needed
 882             while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) {
 883
 884                 print STDERR __LINE__, ": Looking at bit positions ($bits): ", Dumper $hash{$count}{$bits} if DEBUG;
 885
 886                 # Start with the first element in it
 887                 my $try_base = $hash{$count}{$bits}[0];
 888                 my @subset = $try_base;
 889
 890                 # If it succeeds, we return a mask and a base to compare
 891                 # against the masked value.  That base will be the AND of
 892                 # every element in the subset.  Initialize to the one element
 893                 # we have so far.
 894                 my $compare = $try_base;
 895
 896                 # We are trying to find a subset of this that has <need>
 897                 # elements that differ in the bit positions given by the
 898                 # string $bits, which is comma separated.
 899                 my @bits = split ",", $bits;
 900
 901                 TRY: # Look through the remainder of the list for other
 902                      # elements that differ only by these bit positions.
 903
 904                 for (my $i = 1; $i < $cur_count; $i++) {
 905                     my $try_this = $hash{$count}{$bits}[$i];
 906                     my @positions = pop_count($try_base ^ $try_this);
 907
 908                     print STDERR __LINE__, ": $try_base vs $try_this: is (", join(',', @positions), ") a subset of ($bits)?" if DEBUG;;
 909
 910                     foreach my $pos (@positions) {
 911                         unless (grep { $pos == $_ } @bits) {
 912                             print STDERR "  No\n" if DEBUG;
 913                             my $remaining = $cur_count - $i - 1;
 914                             if ($remaining && @subset + $remaining < $need) {
 915                                 print STDERR __LINE__, ": Can stop trying $try_base, because even if all the remaining $remaining values work, they wouldn't add up to the needed $need when combined with the existing ", scalar @subset, " ones\n" if DEBUG;
 916                                 last TRY;
 917                             }
 918                             next TRY;
 919                         }
 920                     }
 921
 922                     print STDERR "  Yes\n" if DEBUG;
 923                     push @subset, $try_this;
 924
 925                     # Add this to the mask base, in case it ultimately
 926                     # succeeds,
 927                     $compare &= $try_this;
 928                 }
 929
 930                 print STDERR __LINE__, ": subset (", join(", ", @subset), ") has ", scalar @subset, " elements; needs $need\n" if DEBUG;
 931
 932                 if (@subset < $need) {
 933                     shift @{$hash{$count}{$bits}};
 934                     next;   # Try with next value
 935                 }
 936
 937                 # Create the mask
 938                 my $mask = 0;
 939                 foreach my $position (@bits) {
 940                     $mask |= 1 << $position;
 941                 }
 942                 $mask = ~$mask & 0xFF;
 943                 push @final_results, [$compare, $mask];
 944
 945                 printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n", __LINE__, $compare, $compare, $mask if DEBUG;
 946
 947                 # These values are now spoken for.  Remove them from future
 948                 # consideration
 949                 foreach my $remove_count (sort keys %hash) {
 950                     foreach my $bits (sort keys %{$hash{$remove_count}}) {
 951                         foreach my $to_remove (@subset) {
 952                             @{$hash{$remove_count}{$bits}} = grep { $_ != $to_remove } @{$hash{$remove_count}{$bits}};
 953                         }
 954                     }
 955                 }
 956             }
 957         }
 958     }
 959
 960     # Any values that remain in the list are ones that have to be tested for
 961     # individually.
 962     my @individuals;
 963     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 964         foreach my $bits (sort keys $hash{$count}) {
 965             foreach my $remaining (@{$hash{$count}{$bits}}) {
 966
 967                 # If we already know about this value, just ignore it.
 968                 next if grep { $remaining == $_ } @individuals;
 969
 970                 # Otherwise it needs to be returned as something to match
 971                 # individually
 972                 push @final_results, [$remaining, undef];
 973                 push @individuals, $remaining;
 974             }
 975         }
 976     }
 977
 978     # Sort by increasing numeric value
 979     @final_results = sort { $a->[0] <=> $b->[0] } @final_results;
 980
 981     print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG;
 982
 983     return @final_results;
 984 }
 985
 986 # _cond_as_str
 987 # turn a list of conditions into a text expression
 988 # - merges ranges of conditions, and joins the result with ||
 989 sub _cond_as_str {
 990     my ( $self, $op, $combine, $opts_ref )= @_;
 991     my $cond= $op->{vals};
 992     my $test= $op->{test};
 993     my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
 994     return "( $test )" if !defined $cond;
 995
 996     # rangify the list.
 997     my @ranges;
 998     my $Update= sub {
 999         # We skip this if there are optimizations that
1000         # we can apply (below) to the individual ranges
1001         if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
1002             if ( $ranges[-1][0] == $ranges[-1][1] ) {
1003                 $ranges[-1]= $ranges[-1][0];
1004             } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
1005                 $ranges[-1]= $ranges[-1][0];
1006                 push @ranges, $ranges[-1] + 1;
1007             }
1008         }
1009     };
1010     for my $condition ( @$cond ) {
1011         if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
1012             $Update->();
1013             push @ranges, [ $condition, $condition ];
1014         } else {
1015             $ranges[-1][1]++;
1016         }
1017     }
1018     $Update->();
1019
1020     return $self->_combine( $test, @ranges )
1021       if $combine;
1022
1023     if ($is_cp_ret) {
1024         @ranges= map {
1025             ref $_
1026             ? sprintf(
1027                 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
1028                 @$_ )
1029             : sprintf( "$self->{val_fmt} == $test", $_ );
1030         } @ranges;
1031
1032         return "( " . join( " || ", @ranges ) . " )";
1033     }
1034
1035     # If the input set has certain characteristics, we can optimize tests
1036     # for it.  This doesn't apply if returning the code point, as we want
1037     # each element of the set individually.  The code above is for this
1038     # simpler case.
1039
1040     return 1 if @$cond == 256;  # If all bytes match, is trivially true
1041
1042     my @masks;
1043     if (@ranges > 1) {
1044
1045         # See if the entire set shares optimizable characteristics, and if so,
1046         # return the optimization.  We delay checking for this on sets with
1047         # just a single range, as there may be better optimizations available
1048         # in that case.
1049         @masks = calculate_mask(@$cond);
1050
1051         # Stringify the output of calculate_mask()
1052         if (@masks) {
1053             my @return;
1054             foreach my $mask_ref (@masks) {
1055                 if (defined $mask_ref->[1]) {
1056                     push @return, sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask_ref->[1], $mask_ref->[0];
1057                 }
1058                 else {  # An undefined mask means to use the value as-is
1059                     push @return, sprintf "$test == $self->{val_fmt}", $mask_ref->[0];
1060                 }
1061             }
1062
1063             # The best possible case below for specifying this set of values via
1064             # ranges is 1 branch per range.  If our mask method yielded better
1065             # results, there is no sense trying something that is bound to be
1066             # worse.
1067             if (@return < @ranges) {
1068                 return "( " . join( " || ", @return ) . " )";
1069             }
1070
1071             @masks = @return;
1072         }
1073     }
1074
1075     # Here, there was no entire-class optimization that was clearly better
1076     # than doing things by ranges.  Look at each range.
1077     my $range_count_extra = 0;
1078     for (my $i = 0; $i < @ranges; $i++) {
1079         if (! ref $ranges[$i]) {    # Trivial case: no range
1080             $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
1081         }
1082         elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
1083             $ranges[$i] =           # Trivial case: single element range
1084                     sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
1085         }
1086         elsif ($ranges[$i]->[0] == 0) {
1087             # If the range matches all 256 possible bytes, it is trivially
1088             # true.
1089             return 1 if $ranges[0]->[1] == 0xFF;    # @ranges must be 1 in
1090                                                     # this case
1091             $ranges[$i] = sprintf "( $test <= $self->{val_fmt} )",
1092                                                                $ranges[$i]->[1];
1093         }
1094         elsif ($ranges[$i]->[1] == 255) {
1095
1096             # Similarly the max possible is 255, so can omit an upper bound
1097             # test if the calculated max is the max possible one.
1098             $ranges[$i] = sprintf "( $test >= $self->{val_fmt} )",
1099                                                                 $ranges[0]->[0];
1100         }
1101         else {
1102             my $output = "";
1103
1104             # Well-formed UTF-8 continuation bytes on ascii platforms must be
1105             # in the range 0x80 .. 0xBF.  If we know that the input is
1106             # well-formed (indicated by not trying to be 'safe'), we can omit
1107             # tests that verify that the input is within either of these
1108             # bounds.  (No legal UTF-8 character can begin with anything in
1109             # this range, so we don't have to worry about this being a
1110             # continuation byte or not.)
1111             if ($opts_ref->{charset} =~ /ascii/i
1112                 && (! $opts_ref->{safe} && ! $opts_ref->{no_length_checks})
1113                 && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
1114             {
1115                 my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
1116                 my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
1117
1118                 # If the range is the entire legal range, it matches any legal
1119                 # byte, so we can omit both tests.  (This should happen only
1120                 # if the number of ranges is 1.)
1121                 if ($lower_limit_is_80 && $upper_limit_is_BF) {
1122                     return 1;
1123                 }
1124                 elsif ($lower_limit_is_80) { # Just use the upper limit test
1125                     $output = sprintf("( $test <= $self->{val_fmt} )",
1126                                         $ranges[$i]->[1]);
1127                 }
1128                 elsif ($upper_limit_is_BF) { # Just use the lower limit test
1129                     $output = sprintf("( $test >= $self->{val_fmt} )",
1130                                     $ranges[$i]->[0]);
1131                 }
1132             }
1133
1134             # If we didn't change to omit a test above, see if the number of
1135             # elements is a power of 2 (only a single bit in the
1136             # representation of its count will be set) and if so, it may be
1137             # that a mask/compare optimization is possible.
1138             if ($output eq ""
1139                 && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
1140             {
1141                 my @list;
1142                 push @list, $_  for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
1143                 my @this_masks = calculate_mask(@list);
1144
1145                 # Use the mask if there is just one for the whole range.
1146                 # Otherwise there is no savings over the two branches that can
1147                 # define the range.
1148                 if (@this_masks == 1 && defined $this_masks[0][1]) {
1149                     $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $this_masks[0][1], $this_masks[0][0];
1150                 }
1151             }
1152
1153             if ($output ne "") {  # Prefer any optimization
1154                 $ranges[$i] = $output;
1155             }
1156             else {
1157                 # No optimization happened.  We need a test that the code
1158                 # point is within both bounds.  But, if the bounds are
1159                 # adjacent code points, it is cleaner to say
1160                 # 'first == test || second == test'
1161                 # than it is to say
1162                 # 'first <= test && test <= second'
1163
1164                 $range_count_extra++;   # This range requires 2 branches to
1165                                         # represent
1166                 if ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
1167                     $ranges[$i] = "( "
1168                                 .  join( " || ", ( map
1169                                     { sprintf "$self->{val_fmt} == $test", $_ }
1170                                     @{$ranges[$i]} ) )
1171                                 . " )";
1172                 }
1173                 else {  # Full bounds checking
1174                     $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
1175                 }
1176             }
1177         }
1178     }
1179
1180     # We have generated the list of bytes in two ways; one trying to use masks
1181     # to cut the number of branches down, and the other to look at individual
1182     # ranges (some of which could be cut down by using a mask for just it).
1183     # We return whichever method uses the fewest branches.
1184     return "( "
1185            . join( " || ", (@masks && @masks < @ranges + $range_count_extra)
1186                             ? @masks
1187                             : @ranges)
1188            . " )";
1189 }
1190
1191 # _combine
1192 # recursively turn a list of conditions into a fast break-out condition
1193 # used by _cond_as_str() for 'cp' type macros.
1194 sub _combine {
1195     my ( $self, $test, @cond )= @_;
1196     return if !@cond;
1197     my $item= shift @cond;
1198     my ( $cstr, $gtv );
1199     if ( ref $item ) {  # @item should be a 2-element array giving range start
1200                         # and end
1201         if ($item->[0] == 0) {  # UV's are never negative, so skip "0 <= "
1202                                 # test which could generate a compiler warning
1203                                 # that test is always true
1204             $cstr= sprintf( "$test <= $self->{val_fmt}", $item->[1] );
1205         }
1206         else {
1207             $cstr=
1208           sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
1209                    @$item );
1210         }
1211         $gtv= sprintf "$self->{val_fmt}", $item->[1];
1212     } else {
1213         $cstr= sprintf( "$self->{val_fmt} == $test", $item );
1214         $gtv= sprintf "$self->{val_fmt}", $item;
1215     }
1216     if ( @cond ) {
1217         my $combine= $self->_combine( $test, @cond );
1218         if (@cond >1) {
1219             return "( $cstr || ( $gtv < $test &&\n"
1220                    . $combine . " ) )";
1221         } else {
1222             return "( $cstr || $combine )";
1223         }
1224     } else {
1225         return $cstr;
1226     }
1227 }
1228
1229 # _render()
1230 # recursively convert an optree to text with reasonably neat formatting
1231 sub _render {
1232     my ( $self, $op, $combine, $brace, $opts_ref, $def, $submacros )= @_;
1233     return 0 if ! defined $op;  # The set is empty
1234     if ( !ref $op ) {
1235         return $op;
1236     }
1237     my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
1238     #no warnings 'recursion';   # This would allow really really inefficient
1239                                 # code to be generated.  See pod
1240     my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref, $def, $submacros );
1241     return $yes if $cond eq '1';
1242
1243     my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref, $def, $submacros );
1244     return "( $cond )" if $yes eq '1' and $no eq '0';
1245     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
1246     return "$lb$cond ? $yes : $no$rb"
1247       if !ref( $op->{yes} ) && !ref( $op->{no} );
1248     my $ind1= " " x 4;
1249     my $ind= "\n" . ( $ind1 x $op->{depth} );
1250
1251     if ( ref $op->{yes} ) {
1252         $yes= $ind . $ind1 . $yes;
1253     } else {
1254         $yes= " " . $yes;
1255     }
1256
1257     my $str= "$lb$cond ?$yes$ind: $no$rb";
1258     if (length $str > 6000) {
1259         push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $yes_idx= 0+@$submacros), $yes;
1260         push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $no_idx= 0+@$submacros), $no;
1261         return sprintf "%s%s ? $def : $def%s", $lb, $cond, "_part$yes_idx", "_part$no_idx", $rb;
1262     }
1263     return $str;
1264 }
1265
1266 # $expr=render($op,$combine)
1267 #
1268 # convert an optree to text with reasonably neat formatting. If $combine
1269 # is true then the condition is created using "fast breakouts" which
1270 # produce uglier expressions that are more efficient for common case,
1271 # longer lists such as that resulting from type 'cp' output.
1272 # Currently only used for type 'cp' macros.
1273 sub render {
1274     my ( $self, $op, $combine, $opts_ref, $def_fmt )= @_;
1275
1276     my @submacros;
1277     my $macro= sprintf "#define $def_fmt\n( %s )", "", $self->_render( $op, $combine, 0, $opts_ref, $def_fmt, \@submacros );
1278
1279     return join "\n\n", map { "/*** GENERATED CODE ***/\n" . __macro( __clean( $_ ) ) } @submacros, $macro;
1280 }
1281
1282 # make_macro
1283 # make a macro of a given type.
1284 # calls into make_trie and (generic_|length_)optree as needed
1285 # Opts are:
1286 # type             : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
1287 # ret_type         : 'cp' or 'len'
1288 # safe             : don't assume is well-formed UTF-8, so don't skip any range
1289 #                    checks, and add length guards to macro
1290 # no_length_checks : like safe, but don't add length guards.
1291 #
1292 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
1293 # in which case it defaults to 'cp' as well.
1294 #
1295 # It is illegal to do a type 'cp' macro on a pattern with multi-codepoint
1296 # sequences in it, as the generated macro will accept only a single codepoint
1297 # as an argument.
1298 #
1299 # It is also illegal to do a non-safe macro on a pattern with multi-codepoint
1300 # sequences in it, as even if it is known to be well-formed, we need to not
1301 # run off the end of the buffer when, say, the buffer ends with the first two
1302 # characters, but three are looked at by the macro.
1303 #
1304 # returns the macro.
1305
1306
1307 sub make_macro {
1308     my $self= shift;
1309     my %opts= @_;
1310     my $type= $opts{type} || 'generic';
1311     if ($self->{has_multi}) {
1312         if ($type =~ /^cp/) {
1313             die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
1314         }
1315         elsif (! $opts{safe}) {
1316             die "'safe' is required on multi-codepoint character class '$self->{op}'"
1317         }
1318     }
1319     my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' );
1320     my $method;
1321     if ( $opts{safe} ) {
1322         $method= 'length_optree';
1323     } elsif ( $type =~ /generic/ ) {
1324         $method= 'generic_optree';
1325     } else {
1326         $method= 'optree';
1327     }
1328     my @args= $type =~ /^cp/ ? 'cp' : 's';
1329     push @args, "e" if $opts{safe};
1330     push @args, "is_utf8" if $type =~ /generic/;
1331     push @args, "len" if $ret_type eq 'both';
1332     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
1333              $ret_type eq 'cp'      ? 'what_'     : 'is_';
1334     my $ext= $type     =~ /generic/ ? ''          : '_' . lc( $type );
1335     $ext .= '_non_low' if $type eq 'generic_non_low';
1336     $ext .= "_safe" if $opts{safe};
1337     $ext .= "_no_length_checks" if $opts{no_length_checks};
1338     my $argstr= join ",", @args;
1339     my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
1340     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
1341     return $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts, $def_fmt );
1342 }
1343
1344 # if we aren't being used as a module (highly likely) then process
1345 # the __DATA__ below and produce macros in regcharclass.h
1346 # if an argument is provided to the script then it is assumed to
1347 # be the path of the file to output to, if the arg is '-' outputs
1348 # to STDOUT.
1349 if ( !caller ) {
1350     $|++;
1351     my $path= shift @ARGV || "regcharclass.h";
1352     my $out_fh;
1353     if ( $path eq '-' ) {
1354         $out_fh= \*STDOUT;
1355     } else {
1356         $out_fh = open_new( $path );
1357     }
1358     print $out_fh read_only_top( lang => 'C', by => $0,
1359                                  file => 'regcharclass.h', style => '*',
1360                                  copyright => [2007, 2011],
1361                                  final => <<EOF,
1362 WARNING: These macros are for internal Perl core use only, and may be
1363 changed or removed without notice.
1364 EOF
1365     );
1366     print $out_fh "\n#ifndef H_REGCHARCLASS   /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n";
1367
1368     my ( $op, $title, @txt, @types, %mods );
1369     my $doit= sub ($) {
1370         return unless $op;
1371
1372         my $charset = shift;
1373
1374         # Skip if to compile on a different platform.
1375         return if delete $mods{only_ascii_platform} && $charset !~ /ascii/i;
1376         return if delete $mods{only_ebcdic_platform} && $charset !~ /ebcdic/i;
1377
1378         print $out_fh "/*\n\t$op: $title\n\n";
1379         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1380         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt, charset => $charset);
1381
1382         #die Dumper(\@types,\%mods);
1383
1384         my @mods;
1385         push @mods, 'safe' if delete $mods{safe};
1386         push @mods, 'no_length_checks' if delete $mods{no_length_checks};
1387         unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
1388                                                                 # do this one
1389                                                                 # first, as
1390                                                                 # traditional
1391         if (%mods) {
1392             die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods;
1393         }
1394
1395         foreach my $type_spec ( @types ) {
1396             my ( $type, $ret )= split /-/, $type_spec;
1397             $ret ||= 'len';
1398             foreach my $mod ( @mods ) {
1399
1400                 # 'safe' is irrelevant with code point macros, so skip if
1401                 # there is also a 'fast', but don't skip if this is the only
1402                 # way a cp macro will get generated.  Below we convert 'safe'
1403                 # to 'fast' in this instance
1404                 next if $type =~ /^cp/
1405                         && ($mod eq 'safe' || $mod eq 'no_length_checks')
1406                         && grep { 'fast' =~ $_ } @mods;
1407                 delete $mods{$mod};
1408                 my $macro= $obj->make_macro(
1409                     type     => $type,
1410                     ret_type => $ret,
1411                     safe     => $mod eq 'safe' && $type !~ /^cp/,
1412                     charset  => $charset,
1413                     no_length_checks => $mod eq 'no_length_checks' && $type !~ /^cp/,
1414                 );
1415                 print $out_fh $macro, "\n";
1416             }
1417         }
1418     };
1419
1420     my @data = <DATA>;
1421     foreach my $charset (get_supported_code_pages()) {
1422         my $first_time = 1;
1423         undef $op;
1424         undef $title;
1425         undef @txt;
1426         undef @types;
1427         undef %mods;
1428         print $out_fh "\n", get_conditional_compile_line_start($charset);
1429         my @data_copy = @data;
1430         for (@data_copy) {
1431             s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
1432             next unless /\S/;
1433             chomp;
1434             if ( /^[A-Z]/ ) {
1435                 $doit->($charset) unless $first_time;  # This starts a new
1436                                                        # definition; do the
1437                                                        # previous one
1438                 $first_time = 0;
1439                 ( $op, $title )= split /\s*:\s*/, $_, 2;
1440                 @txt= ();
1441             } elsif ( s/^=>// ) {
1442                 my ( $type, $modifier )= split /:/, $_;
1443                 @types= split ' ', $type;
1444                 undef %mods;
1445                 map { $mods{$_} = 1 } split ' ',  $modifier;
1446             } else {
1447                 push @txt, "$_";
1448             }
1449         }
1450         $doit->($charset);
1451         print $out_fh get_conditional_compile_line_end();
1452     }
1453
1454     print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
1455
1456     if($path eq '-') {
1457         print $out_fh "/* ex: set ro: */\n";
1458     } else {
1459         # Some of the sources for these macros come from Unicode tables
1460         my $sources_list = "lib/unicore/mktables.lst";
1461         my @sources = ($0, qw(lib/unicore/mktables
1462                               lib/Unicode/UCD.pm
1463                               regen/regcharclass_multi_char_folds.pl
1464                               regen/charset_translations.pl
1465                              ));
1466         {
1467             # Depend on mktables’ own sources.  It’s a shorter list of files than
1468             # those that Unicode::UCD uses.
1469             if (! open my $mktables_list, $sources_list) {
1470
1471                 # This should force a rebuild once $sources_list exists
1472                 push @sources, $sources_list;
1473             }
1474             else {
1475                 while(<$mktables_list>) {
1476                     last if /===/;
1477                     chomp;
1478                     push @sources, "lib/unicore/$_" if /^[^#]/;
1479                 }
1480             }
1481         }
1482         read_only_bottom_close_and_rename($out_fh, \@sources)
1483     }
1484 }
1485
1486 # The form of the input is a series of definitions to make macros for.
1487 # The first line gives the base name of the macro, followed by a colon, and
1488 # then text to be used in comments associated with the macro that are its
1489 # title or description.  In all cases the first (perhaps only) parameter to
1490 # the macro is a pointer to the first byte of the code point it is to test to
1491 # see if it is in the class determined by the macro.  In the case of non-UTF8,
1492 # the code point consists only of a single byte.
1493 #
1494 # The second line must begin with a '=>' and be followed by the types of
1495 # macro(s) to be generated; these are specified below.  A colon follows the
1496 # types, followed by the modifiers, also specified below.  At least one
1497 # modifier is required.
1498 #
1499 # The subsequent lines give what code points go into the class defined by the
1500 # macro.  Multiple characters may be specified via a string like "\x0D\x0A",
1501 # enclosed in quotes.  Otherwise the lines consist of one of:
1502 #   1)  a single Unicode code point, prefaced by 0x
1503 #   2)  a single range of Unicode code points separated by a minus (and
1504 #       optional space)
1505 #   3)  a single Unicode property specified in the standard Perl form
1506 #       "\p{...}"
1507 #   4)  a line like 'do path'.  This will do a 'do' on the file given by
1508 #       'path'.  It is assumed that this does nothing but load subroutines
1509 #       (See item 5 below).  The reason 'require path' is not used instead is
1510 #       because 'do' doesn't assume that path is in @INC.
1511 #   5)  a subroutine call
1512 #           &pkg::foo(arg1, ...)
1513 #       where pkg::foo was loaded by a 'do' line (item 4).  The subroutine
1514 #       returns an array of entries of forms like items 1-3 above.  This
1515 #       allows more complex inputs than achievable from the other input types.
1516 #
1517 # A blank line or one whose first non-blank character is '#' is a comment.
1518 # The definition of the macro is terminated by a line unlike those described.
1519 #
1520 # Valid types:
1521 #   low         generate a macro whose name is 'is_BASE_low' and defines a
1522 #               class that includes only ASCII-range chars.  (BASE is the
1523 #               input macro base name.)
1524 #   latin1      generate a macro whose name is 'is_BASE_latin1' and defines a
1525 #               class that includes only upper-Latin1-range chars.  It is not
1526 #               designed to take a UTF-8 input parameter.
1527 #   high        generate a macro whose name is 'is_BASE_high' and defines a
1528 #               class that includes all relevant code points that are above
1529 #               the Latin1 range.  This is for very specialized uses only.
1530 #               It is designed to take only an input UTF-8 parameter.
1531 #   utf8        generate a macro whose name is 'is_BASE_utf8' and defines a
1532 #               class that includes all relevant characters that aren't ASCII.
1533 #               It is designed to take only an input UTF-8 parameter.
1534 #   LATIN1      generate a macro whose name is 'is_BASE_latin1' and defines a
1535 #               class that includes both ASCII and upper-Latin1-range chars.
1536 #               It is not designed to take a UTF-8 input parameter.
1537 #   UTF8        generate a macro whose name is 'is_BASE_utf8' and defines a
1538 #               class that can include any code point, adding the 'low' ones
1539 #               to what 'utf8' works on.  It is designed to take only an input
1540 #               UTF-8 parameter.
1541 #   generic     generate a macro whose name is 'is_BASE".  It has a 2nd,
1542 #               boolean, parameter which indicates if the first one points to
1543 #               a UTF-8 string or not.  Thus it works in all circumstances.
1544 #   generic_non_low generate a macro whose name is 'is_BASE_non_low".  It has
1545 #               a 2nd, boolean, parameter which indicates if the first one
1546 #               points to a UTF-8 string or not.  It excludes any ASCII-range
1547 #               matches, but otherwise it works in all circumstances.
1548 #   cp          generate a macro whose name is 'is_BASE_cp' and defines a
1549 #               class that returns true if the UV parameter is a member of the
1550 #               class; false if not.
1551 #   cp_high     like cp, but it is assumed that it is known that the UV
1552 #               parameter is above Latin1.  The name of the generated macro is
1553 #               'is_BASE_cp_high'.  This is different from high-cp, derived
1554 #               below.
1555 # A macro of the given type is generated for each type listed in the input.
1556 # The default return value is the number of octets read to generate the match.
1557 # Append "-cp" to the type to have it instead return the matched codepoint.
1558 #               The macro name is changed to 'what_BASE...'.  See pod for
1559 #               caveats
1560 # Appending '-both" instead adds an extra parameter to the end of the argument
1561 #               list, which is a pointer as to where to store the number of
1562 #               bytes matched, while also returning the code point.  The macro
1563 #               name is changed to 'what_len_BASE...'.  See pod for caveats
1564 #
1565 # Valid modifiers:
1566 #   safe        The input string is not necessarily valid UTF-8.  In
1567 #               particular an extra parameter (always the 2nd) to the macro is
1568 #               required, which points to one beyond the end of the string.
1569 #               The macro will make sure not to read off the end of the
1570 #               string.  In the case of non-UTF8, it makes sure that the
1571 #               string has at least one byte in it.  The macro name has
1572 #               '_safe' appended to it.
1573 #   no_length_checks  The input string is not necessarily valid UTF-8, but it
1574 #               is to be assumed that the length has already been checked and
1575 #               found to be valid
1576 #   fast        The input string is valid UTF-8.  No bounds checking is done,
1577 #               and the macro can make assumptions that lead to faster
1578 #               execution.
1579 #   only_ascii_platform   Skip this definition if the character set is for
1580 #               a non-ASCII platform.
1581 #   only_ebcdic_platform  Skip this definition if the character set is for
1582 #               a non-EBCDIC platform.
1583 # No modifier need be specified; fast is assumed for this case.  If both
1584 # 'fast', and 'safe' are specified, two macros will be created for each
1585 # 'type'.
1586 #
1587 # If run on a non-ASCII platform will automatically convert the Unicode input
1588 # to native.  The documentation above is slightly wrong in this case.  'low'
1589 # actually refers to code points whose UTF-8 representation is the same as the
1590 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1591 # code points less than 256.
1592
1593 1; # in the unlikely case we are being used as a module
1594
1595 __DATA__
1596 # This is no longer used, but retained in case it is needed some day.
1597 # TRICKYFOLD: Problematic fold case letters.  When adding to this list, also should add them to regcomp.c and fold_grind.t
1598 # => generic cp generic-cp generic-both :fast safe
1599 # 0x00DF        # LATIN SMALL LETTER SHARP S
1600 # 0x0390        # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1601 # 0x03B0        # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1602 # 0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1603 # 0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1604 # 0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1605
1606 LNBREAK: Line Break: \R
1607 => generic UTF8 LATIN1 : safe
1608 "\x0D\x0A"      # CRLF - Network (Windows) line ending
1609 \p{VertSpace}
1610
1611 HORIZWS: Horizontal Whitespace: \h \H
1612 => high cp_high : fast
1613 \p{HorizSpace}
1614
1615 VERTWS: Vertical Whitespace: \v \V
1616 => high cp_high : fast
1617 \p{VertSpace}
1618
1619 XDIGIT: Hexadecimal digits
1620 => high cp_high : fast
1621 \p{XDigit}
1622
1623 XPERLSPACE: \p{XPerlSpace}
1624 => high cp_high : fast
1625 \p{XPerlSpace}
1626
1627 REPLACEMENT: Unicode REPLACEMENT CHARACTER
1628 => UTF8 :safe
1629 0xFFFD
1630
1631 NONCHAR: Non character code points
1632 => UTF8 :fast
1633 \p{Nchar}
1634
1635 SURROGATE: Surrogate characters
1636 => UTF8 :fast
1637 \p{Gc=Cs}
1638
1639 # This program was run with this enabled, and the results copied to utf8.h;
1640 # then this was commented out because it takes so long to figure out these 2
1641 # million code points.  The results would not change unless utf8.h decides it
1642 # wants a maximum other than 4 bytes, or this program creates better
1643 # optimizations.  Trying with 5 bytes used too much memory to calculate.
1644 #
1645 # We don't generate code for invariants here because the EBCDIC form is too
1646 # complicated and would slow things down; instead the user should test for
1647 # invariants first.
1648 #
1649 # NOTE: The number of bytes generated here must match the value in
1650 # IS_UTF8_CHAR_FAST in utf8.h
1651 #
1652 #UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes
1653 #=> UTF8 :no_length_checks only_ascii_platform
1654 #0x80 - 0x1FFFFF
1655
1656 # This hasn't been commented out, but the number of bytes it works on has been
1657 # cut down to 3, so it doesn't cover the full legal Unicode range.  Making it
1658 # 5 bytes would cover beyond the full range, but takes quite a bit of time and
1659 # memory to calculate.  The generated table varies depending on the EBCDIC
1660 # code page.
1661
1662 # NOTE: The number of bytes generated here must match the value in
1663 # IS_UTF8_CHAR_FAST in utf8.h
1664 #
1665 UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes
1666 => UTF8 :no_length_checks only_ebcdic_platform
1667 0xA0 - 0x3FFF
1668
1669 QUOTEMETA: Meta-characters that \Q should quote
1670 => high :fast
1671 \p{_Perl_Quotemeta}
1672
1673 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1674 => UTF8 :safe
1675
1676 # 1 => All folds
1677 &regcharclass_multi_char_folds::multi_char_folds(1)
1678
1679 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1680 => LATIN1 : safe
1681
1682 &regcharclass_multi_char_folds::multi_char_folds(0)
1683 # 0 => Latin1-only
1684
1685 FOLDS_TO_MULTI: characters that fold to multi-char strings
1686 => UTF8 :fast
1687 \p{_Perl_Folds_To_Multi_Char}
1688
1689 PROBLEMATIC_LOCALE_FOLD : characters whose fold is problematic under locale
1690 => UTF8 cp :fast
1691 \p{_Perl_Problematic_Locale_Folds}
1692
1693 PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are problematic under locale
1694 => UTF8 cp :fast
1695 \p{_Perl_Problematic_Locale_Foldeds_Start}
1696
1697 PATWS: pattern white space
1698 => generic cp : safe
1699 \p{PatWS}