regen/regcharclass.pl

   1 #!perl
   2 package CharClass::Matcher;
   3 use strict;
   4 use 5.008;
   5 use warnings;
   6 use warnings FATAL => 'all';
   7 use Data::Dumper;
   8 $Data::Dumper::Useqq= 1;
   9 our $hex_fmt= "0x%02X";
  10
  11 sub DEBUG () { 0 }
  12 $|=1 if DEBUG;
  13
  14 require 'regen/regen_lib.pl';
  15 require 'regen/charset_translations.pl';
  16 require "regen/regcharclass_multi_char_folds.pl";
  17
  18 =head1 NAME
  19
  20 CharClass::Matcher -- Generate C macros that match character classes efficiently
  21
  22 =head1 SYNOPSIS
  23
  24     perl Porting/regcharclass.pl
  25
  26 =head1 DESCRIPTION
  27
  28 Dynamically generates macros for detecting special charclasses
  29 in latin-1, utf8, and codepoint forms. Macros can be set to return
  30 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
  31
  32 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
  33 are necessary.
  34
  35 Using WHATEVER as an example the following macros can be produced, depending
  36 on the input parameters (how to get each is described by internal comments at
  37 the C<__DATA__> line):
  38
  39 =over 4
  40
  41 =item C<is_WHATEVER(s,is_utf8)>
  42
  43 =item C<is_WHATEVER_safe(s,e,is_utf8)>
  44
  45 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
  46 comparisons involving octect<128 are done before checking the C<is_utf8>
  47 flag, hopefully saving time.
  48
  49 The version without the C<_safe> suffix should be used only when the input is
  50 known to be well-formed.
  51
  52 =item C<is_WHATEVER_utf8(s)>
  53
  54 =item C<is_WHATEVER_utf8_safe(s,e)>
  55
  56 Do a lookup assuming the string is encoded in (normalized) UTF8.
  57
  58 The version without the C<_safe> suffix should be used only when the input is
  59 known to be well-formed.
  60
  61 =item C<is_WHATEVER_latin1(s)>
  62
  63 =item C<is_WHATEVER_latin1_safe(s,e)>
  64
  65 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  66
  67 The version without the C<_safe> suffix should be used only when it is known
  68 that C<s> contains at least one character.
  69
  70 =item C<is_WHATEVER_cp(cp)>
  71
  72 Check to see if the string matches a given codepoint (hypothetically a
  73 U32). The condition is constructed as to "break out" as early as
  74 possible if the codepoint is out of range of the condition.
  75
  76 IOW:
  77
  78   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  79
  80 Thus if the character is X+1 only two comparisons will be done. Making
  81 matching lookups slower, but non-matching faster.
  82
  83 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
  84
  85 A variant form of each of the macro types described above can be generated, in
  86 which the code point is returned by the macro, and an extra parameter (in the
  87 final position) is added, which is a pointer for the macro to set the byte
  88 length of the returned code point.
  89
  90 These forms all have a C<what_len> prefix instead of the C<is_>, for example
  91 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
  92 C<what_len_WHATEVER_utf8(s,len)>.
  93
  94 These forms should not be used I<except> on small sets of mostly widely
  95 separated code points; otherwise the code generated is inefficient.  For these
  96 cases, it is best to use the C<is_> forms, and then find the code point with
  97 C<utf8_to_uvchr_buf>().  This program can fail with a "deep recursion"
  98 message on the worst of the inappropriate sets.  Examine the generated macro
  99 to see if it is acceptable.
 100
 101 =item C<what_WHATEVER_FOO(arg1, ...)>
 102
 103 A variant form of each of the C<is_> macro types described above can be generated, in
 104 which the code point and not the length is returned by the macro.  These have
 105 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
 106 not be used where the set contains a NULL, as 0 is returned for two different
 107 cases: a) the set doesn't include the input code point; b) the set does
 108 include it, and it is a NULL.
 109
 110 =back
 111
 112 The above isn't quite complete, as for specialized purposes one can get a
 113 macro like C<is_WHATEVER_utf8_no_length_checks(s)>, which assumes that it is
 114 already known that there is enough space to hold the character starting at
 115 C<s>, but otherwise checks that it is well-formed.  In other words, this is
 116 intermediary in checking between C<is_WHATEVER_utf8(s)> and
 117 C<is_WHATEVER_utf8_safe(s,e)>.
 118
 119 =head2 CODE FORMAT
 120
 121 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
 122
 123
 124 =head1 AUTHOR
 125
 126 Author: Yves Orton (demerphq) 2007.  Maintained by Perl5 Porters.
 127
 128 =head1 BUGS
 129
 130 No tests directly here (although the regex engine will fail tests
 131 if this code is broken). Insufficient documentation and no Getopts
 132 handler for using the module as a script.
 133
 134 =head1 LICENSE
 135
 136 You may distribute under the terms of either the GNU General Public
 137 License or the Artistic License, as specified in the README file.
 138
 139 =cut
 140
 141 # Sub naming convention:
 142 # __func : private subroutine, can not be called as a method
 143 # _func  : private method, not meant for external use
 144 # func   : public method.
 145
 146 # private subs
 147 #-------------------------------------------------------------------------------
 148 #
 149 # ($cp,$n,$l,$u)=__uni_latin($str);
 150 #
 151 # Return a list of arrays, each of which when interpreted correctly
 152 # represent the string in some given encoding with specific conditions.
 153 #
 154 # $cp - list of codepoints that make up the string.
 155 # $n  - list of octets that make up the string if all codepoints are invariant
 156 #       regardless of if the string is in UTF-8 or not.
 157 # $l  - list of octets that make up the string in latin1 encoding if all
 158 #       codepoints < 256, and at least one codepoint is UTF-8 variant.
 159 # $u  - list of octets that make up the string in utf8 if any codepoint is
 160 #       UTF-8 variant
 161 #
 162 #   High CP | Defined
 163 #-----------+----------
 164 #   0 - 127 : $n            (127/128 are the values for ASCII platforms)
 165 # 128 - 255 : $l, $u
 166 # 256 - ... : $u
 167 #
 168
 169 sub __uni_latin1 {
 170     my $charset= shift;
 171     my $str= shift;
 172     my $max= 0;
 173     my @cp;
 174     my @cp_high;
 175     my $only_has_invariants = 1;
 176     my $a2n = get_a2n($charset);
 177     for my $ch ( split //, $str ) {
 178         my $cp= ord $ch;
 179         $max= $cp if $max < $cp;
 180         if ($cp > 255) {
 181             push @cp, $cp;
 182             push @cp_high, $cp;
 183         }
 184         else {
 185             push @cp, $a2n->[$cp];
 186         }
 187     }
 188     my ( $n, $l, $u );
 189     $only_has_invariants = ($charset =~ /ascii/i) ? $max < 128 : $max < 160;
 190     if ($only_has_invariants) {
 191         $n= [@cp];
 192     } else {
 193         $l= [@cp] if $max && $max < 256;
 194
 195         my @u;
 196         for my $ch ( split //, $str ) {
 197             push @u, map { ord } split //, cp_2_utfbytes(ord $ch, $charset);
 198         }
 199         $u = \@u;
 200     }
 201     return ( \@cp, \@cp_high, $n, $l, $u );
 202 }
 203
 204 #
 205 # $clean= __clean($expr);
 206 #
 207 # Cleanup a ternary expression, removing unnecessary parens and apply some
 208 # simplifications using regexes.
 209 #
 210
 211 sub __clean {
 212     my ( $expr )= @_;
 213
 214     #return $expr;
 215
 216     our $parens;
 217     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 218
 219     ## remove redundant parens
 220     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 221
 222
 223     # repeatedly simplify conditions like
 224     #       ( (cond1) ? ( (cond2) ? X : Y ) : Y )
 225     # into
 226     #       ( ( (cond1) && (cond2) ) ? X : Y )
 227     # Also similarly handles expressions like:
 228     #       : (cond1) ? ( (cond2) ? X : Y ) : Y )
 229     # Note the inclusion of the close paren in ([:()]) and the open paren in ([()]) is
 230     # purely to ensure we have a balanced set of parens in the expression which makes
 231     # it easier to understand the pattern in an editor that understands paren's, we do
 232     # not expect either of these cases to actually fire. - Yves
 233     1 while $expr =~ s/
 234         ([:()])  \s*
 235             ($parens) \s*
 236             \? \s*
 237                 \( \s* ($parens) \s*
 238                     \? \s* ($parens|[^()?:\s]+?) \s*
 239                     :  \s* ($parens|[^()?:\s]+?) \s*
 240                 \) \s*
 241             : \s* \5 \s*
 242         ([()])
 243     /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
 244     #$expr=~s/\(\(U8\*\)s\)\[(\d+)\]/S$1/g if length $expr > 8000;
 245     #$expr=~s/\s+//g if length $expr > 8000;
 246
 247     die "Expression too long" if length $expr > 8000;
 248
 249     return $expr;
 250 }
 251
 252 #
 253 # $text= __macro(@args);
 254 # Join args together by newlines, and then neatly add backslashes to the end
 255 # of every  line as expected by the C pre-processor for #define's.
 256 #
 257
 258 sub __macro {
 259     my $str= join "\n", @_;
 260     $str =~ s/\s*$//;
 261     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 262     my $last= pop @lines;
 263     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 264     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 265     return $str . "\n";
 266 }
 267
 268 #
 269 # my $op=__incrdepth($op);
 270 #
 271 # take an 'op' hashref and add one to it and all its childrens depths.
 272 #
 273
 274 sub __incrdepth {
 275     my $op= shift;
 276     return unless ref $op;
 277     $op->{depth} += 1;
 278     __incrdepth( $op->{yes} );
 279     __incrdepth( $op->{no} );
 280     return $op;
 281 }
 282
 283 # join two branches of an opcode together with a condition, incrementing
 284 # the depth on the yes branch when we do so.
 285 # returns the new root opcode of the tree.
 286 sub __cond_join {
 287     my ( $cond, $yes, $no )= @_;
 288     if (ref $yes) {
 289         return {
 290             test  => $cond,
 291             yes   => __incrdepth( $yes ),
 292             no    => $no,
 293             depth => 0,
 294         };
 295     }
 296     else {
 297         return {
 298             test  => $cond,
 299             yes   => $yes,
 300             no    => __incrdepth($no),
 301             depth => 0,
 302         };
 303     }
 304 }
 305
 306 # Methods
 307
 308 # constructor
 309 #
 310 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 311 #
 312 # Create a new CharClass::Matcher object by parsing the text in
 313 # the txt array. Currently applies the following rules:
 314 #
 315 # Element starts with C<0x>, line is evaled the result treated as
 316 # a number which is passed to chr().
 317 #
 318 # Element starts with C<">, line is evaled and the result treated
 319 # as a string.
 320 #
 321 # Each string is then stored in the 'strs' subhash as a hash record
 322 # made up of the results of __uni_latin1, using the keynames
 323 # 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
 324 # 'UTF8' which hold a merge of 'low' and their lowercase equivalents.
 325 #
 326 # Size data is tracked per type in the 'size' subhash.
 327 #
 328 # Return an object
 329 #
 330 sub new {
 331     my $class= shift;
 332     my %opt= @_;
 333     for ( qw(op txt) ) {
 334         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 335           if !exists $opt{$_};
 336     }
 337
 338     my $self= bless {
 339         op    => $opt{op},
 340         title => $opt{title} || '',
 341     }, $class;
 342     foreach my $txt ( @{ $opt{txt} } ) {
 343         my $str= $txt;
 344         if ( $str =~ /^[""]/ ) {
 345             $str= eval $str;
 346         } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
 347                                     # list with its expansion
 348             my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
 349             die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
 350             foreach my $cp (hex $lower .. hex $upper) {
 351                 push @{$opt{txt}}, sprintf "0x%X", $cp;
 352             }
 353             next;
 354         } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
 355             # Otherwise undocumented, a leading N means is already in the
 356             # native character set; don't convert.
 357             $str= chr eval $str;
 358         } elsif ( $str =~ /^0x/ ) {
 359             $str= eval $str;
 360             $str = chr $str;
 361         } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
 362             my $property = $1;
 363             use Unicode::UCD qw(prop_invlist);
 364
 365             my @invlist = prop_invlist($property, '_perl_core_internal_ok');
 366             if (! @invlist) {
 367
 368                 # An empty return could mean an unknown property, or merely
 369                 # that it is empty.  Call in scalar context to differentiate
 370                 my $count = prop_invlist($property, '_perl_core_internal_ok');
 371                 die "$property not found" unless defined $count;
 372             }
 373
 374             # Replace this element on the list with the property's expansion
 375             for (my $i = 0; $i < @invlist; $i += 2) {
 376                 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
 377
 378                     # prop_invlist() returns native values; add leading 'N'
 379                     # to indicate that.
 380                     push @{$opt{txt}}, sprintf "N0x%X", $cp;
 381                 }
 382             }
 383             next;
 384         } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
 385             die "do '$1' failed: $!$@" if ! do $1 or $@;
 386             next;
 387         } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
 388             my @results = eval "$1";
 389             die "eval '$1' failed: $@" if $@;
 390             push @{$opt{txt}}, @results;
 391             next;
 392         } else {
 393             die "Unparsable line: $txt\n";
 394         }
 395         my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $opt{charset}, $str );
 396         my $UTF8= $low   || $utf8;
 397         my $LATIN1= $low || $latin1;
 398         my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
 399         #die Dumper($txt,$cp,$low,$latin1,$utf8)
 400         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 401
 402         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}=
 403           ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 );
 404         my $rec= $self->{strs}{$str};
 405         foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
 406             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 407               if $self->{strs}{$str}{$key};
 408         }
 409         $self->{has_multi} ||= @$cp > 1;
 410         $self->{has_ascii} ||= $latin1 && @$latin1;
 411         $self->{has_low}   ||= $low && @$low;
 412         $self->{has_high}  ||= !$low && !$latin1;
 413     }
 414     $self->{val_fmt}= $hex_fmt;
 415     $self->{count}= 0 + keys %{ $self->{strs} };
 416     return $self;
 417 }
 418
 419 # my $trie = make_trie($type,$maxlen);
 420 #
 421 # using the data stored in the object build a trie of a specific type,
 422 # and with specific maximum depth. The trie is made up the elements of
 423 # the given types array for each string in the object (assuming it is
 424 # not too long.)
 425 #
 426 # returns the trie, or undef if there was no relevant data in the object.
 427 #
 428
 429 sub make_trie {
 430     my ( $self, $type, $maxlen )= @_;
 431
 432     my $strs= $self->{strs};
 433     my %trie;
 434     foreach my $rec ( values %$strs ) {
 435         die "panic: unknown type '$type'"
 436           if !exists $rec->{$type};
 437         my $dat= $rec->{$type};
 438         next unless $dat;
 439         next if $maxlen && @$dat > $maxlen;
 440         my $node= \%trie;
 441         foreach my $elem ( @$dat ) {
 442             $node->{$elem} ||= {};
 443             $node= $node->{$elem};
 444         }
 445         $node->{''}= $rec->{str};
 446     }
 447     return 0 + keys( %trie ) ? \%trie : undef;
 448 }
 449
 450 sub pop_count ($) {
 451     my $word = shift;
 452
 453     # This returns a list of the positions of the bits in the input word that
 454     # are 1.
 455
 456     my @positions;
 457     my $position = 0;
 458     while ($word) {
 459         push @positions, $position if $word & 1;
 460         $position++;
 461         $word >>= 1;
 462     }
 463     return @positions;
 464 }
 465
 466 # my $optree= _optree()
 467 #
 468 # recursively convert a trie to an optree where every node represents
 469 # an if else branch.
 470 #
 471 #
 472
 473 sub _optree {
 474     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
 475     return unless defined $trie;
 476     if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
 477         die "Can't do 'cp' optree from multi-codepoint strings";
 478     }
 479     $ret_type ||= 'len';
 480     $else= 0  unless defined $else;
 481     $depth= 0 unless defined $depth;
 482
 483     # if we have an empty string as a key it means we are in an
 484     # accepting state and unless we can match further on should
 485     # return the value of the '' key.
 486     if (exists $trie->{''} ) {
 487         # we can now update the "else" value, anything failing to match
 488         # after this point should return the value from this.
 489         if ( $ret_type eq 'cp' ) {
 490             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 491             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 492         } elsif ( $ret_type eq 'len' ) {
 493             $else= $depth;
 494         } elsif ( $ret_type eq 'both') {
 495             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 496             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 497             $else= "len=$depth, $else";
 498         }
 499     }
 500     # extract the meaningful keys from the trie, filter out '' as
 501     # it means we are an accepting state (end of sequence).
 502     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 503
 504     # if we haven't any keys there is no further we can match and we
 505     # can return the "else" value.
 506     return $else if !@conds;
 507
 508     my $test = $test_type =~ /^cp/ ? "cp" : "((U8*)s)[$depth]";
 509
 510     # First we loop over the possible keys/conditions and find out what they
 511     # look like; we group conditions with the same optree together.
 512     my %dmp_res;
 513     my @res_order;
 514     local $Data::Dumper::Sortkeys=1;
 515     foreach my $cond ( @conds ) {
 516
 517         # get the optree for this child/condition
 518         my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else, $depth + 1 );
 519         # convert it to a string with Dumper
 520         my $res_code= Dumper( $res );
 521
 522         push @{$dmp_res{$res_code}{vals}}, $cond;
 523         if (!$dmp_res{$res_code}{optree}) {
 524             $dmp_res{$res_code}{optree}= $res;
 525             push @res_order, $res_code;
 526         }
 527     }
 528
 529     # now that we have deduped the optrees we construct a new optree containing the merged
 530     # results.
 531     my %root;
 532     my $node= \%root;
 533     foreach my $res_code_idx (0 .. $#res_order) {
 534         my $res_code= $res_order[$res_code_idx];
 535         $node->{vals}= $dmp_res{$res_code}{vals};
 536         $node->{test}= $test;
 537         $node->{yes}= $dmp_res{$res_code}{optree};
 538         $node->{depth}= $depth;
 539         if ($res_code_idx < $#res_order) {
 540             $node= $node->{no}= {};
 541         } else {
 542             $node->{no}= $else;
 543         }
 544     }
 545
 546     # return the optree.
 547     return \%root;
 548 }
 549
 550 # my $optree= optree(%opts);
 551 #
 552 # Convert a trie to an optree, wrapper for _optree
 553
 554 sub optree {
 555     my $self= shift;
 556     my %opt= @_;
 557     my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
 558     $opt{ret_type} ||= 'len';
 559     my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
 560     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
 561 }
 562
 563 # my $optree= generic_optree(%opts);
 564 #
 565 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 566 # sets of strings, including a branch for handling the string type check.
 567 #
 568
 569 sub generic_optree {
 570     my $self= shift;
 571     my %opt= @_;
 572
 573     $opt{ret_type} ||= 'len';
 574     my $test_type= 'depth';
 575     my $else= $opt{else} || 0;
 576
 577     my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
 578     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
 579
 580     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
 581       for $latin1, $utf8;
 582
 583     if ( $utf8 ) {
 584         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 585     } elsif ( $latin1 ) {
 586         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 587     }
 588     if ($opt{type} eq 'generic') {
 589         my $low= $self->make_trie( 'low', $opt{max_depth} );
 590         if ( $low ) {
 591             $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
 592         }
 593     }
 594
 595     return $else;
 596 }
 597
 598 # length_optree()
 599 #
 600 # create a string length guarded optree.
 601 #
 602
 603 sub length_optree {
 604     my $self= shift;
 605     my %opt= @_;
 606     my $type= $opt{type};
 607
 608     die "Can't do a length_optree on type 'cp', makes no sense."
 609       if $type =~ /^cp/;
 610
 611     my $else= ( $opt{else} ||= 0 );
 612
 613     my $method = $type =~ /generic/ ? 'generic_optree' : 'optree';
 614     if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) {
 615
 616         # Here is non-generic output (meaning that we are only generating one
 617         # type), and all things that match have the same number ('size') of
 618         # bytes.  The length guard is simply that we have that number of
 619         # bytes.
 620         my @size = keys %{$self->{size}{$type}};
 621         my $cond= "((e) - (s)) >= $size[0]";
 622         my $optree = $self->$method(%opt);
 623         $else= __cond_join( $cond, $optree, $else );
 624     }
 625     elsif ($self->{has_multi}) {
 626         my @size;
 627
 628         # Here, there can be a match of a multiple character string.  We use
 629         # the traditional method which is to have a branch for each possible
 630         # size (longest first) and test for the legal values for that size.
 631         my %sizes= (
 632             %{ $self->{size}{low}    || {} },
 633             %{ $self->{size}{latin1} || {} },
 634             %{ $self->{size}{utf8}   || {} }
 635         );
 636         if ($method eq 'generic_optree') {
 637             @size= sort { $a <=> $b } keys %sizes;
 638         } else {
 639             @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 640         }
 641         for my $size ( @size ) {
 642             my $optree= $self->$method( %opt, type => $type, max_depth => $size );
 643             my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 644             $else= __cond_join( $cond, $optree, $else );
 645         }
 646     }
 647     else {
 648         my $utf8;
 649
 650         # Here, has more than one possible size, and only matches a single
 651         # character.  For non-utf8, the needed length is 1; for utf8, it is
 652         # found by array lookup 'UTF8SKIP'.
 653
 654         # If want just the code points above 255, set up to look for those;
 655         # otherwise assume will be looking for all non-UTF-8-invariant code
 656         # poiints.
 657         my $trie_type = ($type eq 'high') ? 'high' : 'utf8';
 658
 659         # If we do want more than the 0-255 range, find those, and if they
 660         # exist...
 661         if ($opt{type} !~ /latin1/i && ($utf8 = $self->make_trie($trie_type, 0))) {
 662
 663             # ... get them into an optree, and set them up as the 'else' clause
 664             $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0 );
 665
 666             # We could make this
 667             #   UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))";
 668             # to avoid doing the UTF8SKIP and subsequent branches for invariants
 669             # that don't match.  But the current macros that get generated
 670             # have only a few things that can match past this, so I (khw)
 671             # don't think it is worth it.  (Even better would be to use
 672             # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it
 673             # if it saves a bunch.  We assume that input text likely to be
 674             # well-formed .
 675             my $cond = "LIKELY(((e) - (s)) >= UTF8SKIP(s))";
 676             $else = __cond_join($cond, $utf8, $else);
 677
 678             # For 'generic', we also will want the latin1 UTF-8 variants for
 679             # the case where the input isn't UTF-8.
 680             my $latin1;
 681             if ($method eq 'generic_optree') {
 682                 $latin1 = $self->make_trie( 'latin1', 1);
 683                 $latin1= $self->_optree( $latin1, 'depth', $opt{ret_type}, 0, 0 );
 684             }
 685
 686             # If we want the UTF-8 invariants, get those.
 687             my $low;
 688             if ($opt{type} !~ /non_low|high/
 689                 && ($low= $self->make_trie( 'low', 1)))
 690             {
 691                 $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0 );
 692
 693                 # Expand out the UTF-8 invariants as a string so that we
 694                 # can use them as the conditional
 695                 $low = $self->_cond_as_str( $low, 0, \%opt);
 696
 697                 # If there are Latin1 variants, add a test for them.
 698                 if ($latin1) {
 699                     $else = __cond_join("(! is_utf8 )", $latin1, $else);
 700                 }
 701                 elsif ($method eq 'generic_optree') {
 702
 703                     # Otherwise for 'generic' only we know that what
 704                     # follows must be valid for just UTF-8 strings,
 705                     $else->{test} = "( is_utf8 && $else->{test} )";
 706                 }
 707
 708                 # If the invariants match, we are done; otherwise we have
 709                 # to go to the 'else' clause.
 710                 $else = __cond_join($low, 1, $else);
 711             }
 712             elsif ($latin1) {   # Here, didn't want or didn't have invariants,
 713                                 # but we do have latin variants
 714                 $else = __cond_join("(! is_utf8)", $latin1, $else);
 715             }
 716
 717             # We need at least one byte available to start off the tests
 718             $else = __cond_join("LIKELY((e) > (s))", $else, 0);
 719         }
 720         else {  # Here, we don't want or there aren't any variants.  A single
 721                 # byte available is enough.
 722             my $cond= "((e) > (s))";
 723             my $optree = $self->$method(%opt);
 724             $else= __cond_join( $cond, $optree, $else );
 725         }
 726     }
 727
 728     return $else;
 729 }
 730
 731 sub calculate_mask(@) {
 732     # Look at the input list of byte values.  This routine returns an array of
 733     # mask/base pairs to generate that list.
 734
 735     my @list = @_;
 736     my $list_count = @list;
 737
 738     # Consider a set of byte values, A, B, C ....  If we want to determine if
 739     # <c> is one of them, we can write c==A || c==B || c==C ....  If the
 740     # values are consecutive, we can shorten that to A<=c && c<=Z, which uses
 741     # far fewer branches.  If only some of them are consecutive we can still
 742     # save some branches by creating range tests for just those that are
 743     # consecutive. _cond_as_str() does this work for looking for ranges.
 744     #
 745     # Another approach is to look at the bit patterns for A, B, C .... and see
 746     # if they have some commonalities.  That's what this function does.  For
 747     # example, consider a set consisting of the bytes
 748     # 0xF0, 0xF1, 0xF2, and 0xF3.  We could write:
 749     #   0xF0 <= c && c <= 0xF4
 750     # But the following mask/compare also works, and has just one test:
 751     #   (c & 0xFC) == 0xF0
 752     # The reason it works is that the set consists of exactly those bytes
 753     # whose first 4 bits are 1, and the next two are 0.  (The value of the
 754     # other 2 bits is immaterial in determining if a byte is in the set or
 755     # not.)  The mask masks out those 2 irrelevant bits, and the comparison
 756     # makes sure that the result matches all bytes which match those 6
 757     # material bits exactly.  In other words, the set of bytes contains
 758     # exactly those whose bottom two bit positions are either 0 or 1.  The
 759     # same principle applies to bit positions that are not necessarily
 760     # adjacent.  And it can be applied to bytes that differ in 1 through all 8
 761     # bit positions.  In order to be a candidate for this optimization, the
 762     # number of bytes in the set must be a power of 2.
 763     #
 764     # Consider a different example, the set 0x53, 0x54, 0x73, and 0x74.  That
 765     # requires 4 tests using either ranges or individual values, and even
 766     # though the number in the set is a power of 2, it doesn't qualify for the
 767     # mask optimization described above because the number of bits that are
 768     # different is too large for that.  However, the set can be expressed as
 769     # two branches with masks thusly:
 770     #   (c & 0xDF) == 0x53 || (c & 0xDF) == 0x54
 771     # a branch savings of 50%.  This is done by splitting the set into two
 772     # subsets each of which has 2 elements, and within each set the values
 773     # differ by 1 byte.
 774     #
 775     # This function attempts to find some way to save some branches using the
 776     # mask technique.  If not, it returns an empty list; if so, it
 777     # returns a list consisting of
 778     #   [ [compare1, mask1], [compare2, mask2], ...
 779     #     [compare_n, undef], [compare_m, undef], ...
 780     #   ]
 781     # The <mask> is undef in the above for those bytes that must be tested
 782     # for individually.
 783     #
 784     # This function does not attempt to find the optimal set.  To do so would
 785     # probably require testing all possible combinations, and keeping track of
 786     # the current best one.
 787     #
 788     # There are probably much better algorithms, but this is the one I (khw)
 789     # came up with.  We start with doing a bit-wise compare of every byte in
 790     # the set with every other byte.  The results are sorted into arrays of
 791     # all those that differ by the same bit positions.  These are stored in a
 792     # hash with the each key being the bits they differ in.  Here is the hash
 793     # for the 0x53, 0x54, 0x73, 0x74 set:
 794     # {
 795     #    4 => {
 796     #            "0,1,2,5" => [
 797     #                            83,
 798     #                            116,
 799     #                            84,
 800     #                            115
 801     #                        ]
 802     #        },
 803     #    3 => {
 804     #            "0,1,2" => [
 805     #                        83,
 806     #                        84,
 807     #                        115,
 808     #                        116
 809     #                        ]
 810     #        }
 811     #    1 => {
 812     #            5 => [
 813     #                    83,
 814     #                    115,
 815     #                    84,
 816     #                    116
 817     #                ]
 818     #        },
 819     # }
 820     #
 821     # The set consisting of values which differ in the 4 bit positions 0, 1,
 822     # 2, and 5 from some other value in the set consists of all 4 values.
 823     # Likewise all 4 values differ from some other value in the 3 bit
 824     # positions 0, 1, and 2; and all 4 values differ from some other value in
 825     # the single bit position 5.  The keys at the uppermost level in the above
 826     # hash, 1, 3, and 4, give the number of bit positions that each sub-key
 827     # below it has.  For example, the 4 key could have as its value an array
 828     # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were
 829     # such.  The best optimization will group the most values into a single
 830     # mask.  The most values will be the ones that differ in the most
 831     # positions, the ones with the largest value for the topmost key.  These
 832     # keys, are thus just for convenience of sorting by that number, and do
 833     # not have any bearing on the core of the algorithm.
 834     #
 835     # We start with an element from largest number of differing bits.  The
 836     # largest in this case is 4 bits, and there is only one situation in this
 837     # set which has 4 differing bits, "0,1,2,5".  We look for any subset of
 838     # this set which has 16 values that differ in these 4 bits.  There aren't
 839     # any, because there are only 4 values in the entire set.  We then look at
 840     # the next possible thing, which is 3 bits differing in positions "0,1,2".
 841     # We look for a subset that has 8 values that differ in these 3 bits.
 842     # Again there are none.  So we go to look for the next possible thing,
 843     # which is a subset of 2**1 values that differ only in bit position 5.  83
 844     # and 115 do, so we calculate a mask and base for those and remove them
 845     # from every set.  Since there is only the one set remaining, we remove
 846     # them from just this one.  We then look to see if there is another set of
 847     # 2 values that differ in bit position 5.  84 and 116 do, so we calculate
 848     # a mask and base for those and remove them from every set (again only
 849     # this set remains in this example).  The set is now empty, and there are
 850     # no more sets to look at, so we are done.
 851
 852     if ($list_count == 256) {   # All 256 is trivially masked
 853         return (0, 0);
 854     }
 855
 856     my %hash;
 857
 858     # Generate bits-differing lists for each element compared against each
 859     # other element
 860     for my $i (0 .. $list_count - 2) {
 861         for my $j ($i + 1 .. $list_count - 1) {
 862             my @bits_that_differ = pop_count($list[$i] ^ $list[$j]);
 863             my $differ_count = @bits_that_differ;
 864             my $key = join ",", @bits_that_differ;
 865             push @{$hash{$differ_count}{$key}}, $list[$i] unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}};
 866             push @{$hash{$differ_count}{$key}}, $list[$j];
 867         }
 868     }
 869
 870     print STDERR __LINE__, ": calculate_mask() called:  List of values grouped by differing bits: ", Dumper \%hash if DEBUG;
 871
 872     my @final_results;
 873     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 874         my $need = 2 ** $count;     # Need 8 values for 3 differing bits, etc
 875         foreach my $bits (sort keys $hash{$count}->%*) {
 876
 877             print STDERR __LINE__, ": For $count bit(s) difference ($bits), need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
 878
 879             # Look only as long as there are at least as many elements in the
 880             # subset as are needed
 881             while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) {
 882
 883                 print STDERR __LINE__, ": Looking at bit positions ($bits): ", Dumper $hash{$count}{$bits} if DEBUG;
 884
 885                 # Start with the first element in it
 886                 my $try_base = $hash{$count}{$bits}[0];
 887                 my @subset = $try_base;
 888
 889                 # If it succeeds, we return a mask and a base to compare
 890                 # against the masked value.  That base will be the AND of
 891                 # every element in the subset.  Initialize to the one element
 892                 # we have so far.
 893                 my $compare = $try_base;
 894
 895                 # We are trying to find a subset of this that has <need>
 896                 # elements that differ in the bit positions given by the
 897                 # string $bits, which is comma separated.
 898                 my @bits = split ",", $bits;
 899
 900                 TRY: # Look through the remainder of the list for other
 901                      # elements that differ only by these bit positions.
 902
 903                 for (my $i = 1; $i < $cur_count; $i++) {
 904                     my $try_this = $hash{$count}{$bits}[$i];
 905                     my @positions = pop_count($try_base ^ $try_this);
 906
 907                     print STDERR __LINE__, ": $try_base vs $try_this: is (", join(',', @positions), ") a subset of ($bits)?" if DEBUG;;
 908
 909                     foreach my $pos (@positions) {
 910                         unless (grep { $pos == $_ } @bits) {
 911                             print STDERR "  No\n" if DEBUG;
 912                             my $remaining = $cur_count - $i - 1;
 913                             if ($remaining && @subset + $remaining < $need) {
 914                                 print STDERR __LINE__, ": Can stop trying $try_base, because even if all the remaining $remaining values work, they wouldn't add up to the needed $need when combined with the existing ", scalar @subset, " ones\n" if DEBUG;
 915                                 last TRY;
 916                             }
 917                             next TRY;
 918                         }
 919                     }
 920
 921                     print STDERR "  Yes\n" if DEBUG;
 922                     push @subset, $try_this;
 923
 924                     # Add this to the mask base, in case it ultimately
 925                     # succeeds,
 926                     $compare &= $try_this;
 927                 }
 928
 929                 print STDERR __LINE__, ": subset (", join(", ", @subset), ") has ", scalar @subset, " elements; needs $need\n" if DEBUG;
 930
 931                 if (@subset < $need) {
 932                     shift @{$hash{$count}{$bits}};
 933                     next;   # Try with next value
 934                 }
 935
 936                 # Create the mask
 937                 my $mask = 0;
 938                 foreach my $position (@bits) {
 939                     $mask |= 1 << $position;
 940                 }
 941                 $mask = ~$mask & 0xFF;
 942                 push @final_results, [$compare, $mask];
 943
 944                 printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n", __LINE__, $compare, $compare, $mask if DEBUG;
 945
 946                 # These values are now spoken for.  Remove them from future
 947                 # consideration
 948                 foreach my $remove_count (sort keys %hash) {
 949                     foreach my $bits (sort keys %{$hash{$remove_count}}) {
 950                         foreach my $to_remove (@subset) {
 951                             @{$hash{$remove_count}{$bits}} = grep { $_ != $to_remove } @{$hash{$remove_count}{$bits}};
 952                         }
 953                     }
 954                 }
 955             }
 956         }
 957     }
 958
 959     # Any values that remain in the list are ones that have to be tested for
 960     # individually.
 961     my @individuals;
 962     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 963         foreach my $bits (sort keys $hash{$count}->%*) {
 964             foreach my $remaining (@{$hash{$count}{$bits}}) {
 965
 966                 # If we already know about this value, just ignore it.
 967                 next if grep { $remaining == $_ } @individuals;
 968
 969                 # Otherwise it needs to be returned as something to match
 970                 # individually
 971                 push @final_results, [$remaining, undef];
 972                 push @individuals, $remaining;
 973             }
 974         }
 975     }
 976
 977     # Sort by increasing numeric value
 978     @final_results = sort { $a->[0] <=> $b->[0] } @final_results;
 979
 980     print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG;
 981
 982     return @final_results;
 983 }
 984
 985 # _cond_as_str
 986 # turn a list of conditions into a text expression
 987 # - merges ranges of conditions, and joins the result with ||
 988 sub _cond_as_str {
 989     my ( $self, $op, $combine, $opts_ref )= @_;
 990     my $cond= $op->{vals};
 991     my $test= $op->{test};
 992     my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
 993     return "( $test )" if !defined $cond;
 994
 995     # rangify the list.
 996     my @ranges;
 997     my $Update= sub {
 998         # We skip this if there are optimizations that
 999         # we can apply (below) to the individual ranges
1000         if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
1001             if ( $ranges[-1][0] == $ranges[-1][1] ) {
1002                 $ranges[-1]= $ranges[-1][0];
1003             } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
1004                 $ranges[-1]= $ranges[-1][0];
1005                 push @ranges, $ranges[-1] + 1;
1006             }
1007         }
1008     };
1009     for my $condition ( @$cond ) {
1010         if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
1011             $Update->();
1012             push @ranges, [ $condition, $condition ];
1013         } else {
1014             $ranges[-1][1]++;
1015         }
1016     }
1017     $Update->();
1018
1019     return $self->_combine( $test, @ranges )
1020       if $combine;
1021
1022     if ($is_cp_ret) {
1023         @ranges= map {
1024             ref $_
1025             ? sprintf(
1026                 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
1027                 @$_ )
1028             : sprintf( "$self->{val_fmt} == $test", $_ );
1029         } @ranges;
1030
1031         return "( " . join( " || ", @ranges ) . " )";
1032     }
1033
1034     # If the input set has certain characteristics, we can optimize tests
1035     # for it.  This doesn't apply if returning the code point, as we want
1036     # each element of the set individually.  The code above is for this
1037     # simpler case.
1038
1039     return 1 if @$cond == 256;  # If all bytes match, is trivially true
1040
1041     my @masks;
1042     if (@ranges > 1) {
1043
1044         # See if the entire set shares optimizable characteristics, and if so,
1045         # return the optimization.  We delay checking for this on sets with
1046         # just a single range, as there may be better optimizations available
1047         # in that case.
1048         @masks = calculate_mask(@$cond);
1049
1050         # Stringify the output of calculate_mask()
1051         if (@masks) {
1052             my @return;
1053             foreach my $mask_ref (@masks) {
1054                 if (defined $mask_ref->[1]) {
1055                     push @return, sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask_ref->[1], $mask_ref->[0];
1056                 }
1057                 else {  # An undefined mask means to use the value as-is
1058                     push @return, sprintf "$test == $self->{val_fmt}", $mask_ref->[0];
1059                 }
1060             }
1061
1062             # The best possible case below for specifying this set of values via
1063             # ranges is 1 branch per range.  If our mask method yielded better
1064             # results, there is no sense trying something that is bound to be
1065             # worse.
1066             if (@return < @ranges) {
1067                 return "( " . join( " || ", @return ) . " )";
1068             }
1069
1070             @masks = @return;
1071         }
1072     }
1073
1074     # Here, there was no entire-class optimization that was clearly better
1075     # than doing things by ranges.  Look at each range.
1076     my $range_count_extra = 0;
1077     for (my $i = 0; $i < @ranges; $i++) {
1078         if (! ref $ranges[$i]) {    # Trivial case: no range
1079             $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
1080         }
1081         elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
1082             $ranges[$i] =           # Trivial case: single element range
1083                     sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
1084         }
1085         elsif ($ranges[$i]->[0] == 0) {
1086             # If the range matches all 256 possible bytes, it is trivially
1087             # true.
1088             return 1 if $ranges[0]->[1] == 0xFF;    # @ranges must be 1 in
1089                                                     # this case
1090             $ranges[$i] = sprintf "( $test <= $self->{val_fmt} )",
1091                                                                $ranges[$i]->[1];
1092         }
1093         elsif ($ranges[$i]->[1] == 255) {
1094
1095             # Similarly the max possible is 255, so can omit an upper bound
1096             # test if the calculated max is the max possible one.
1097             $ranges[$i] = sprintf "( $test >= $self->{val_fmt} )",
1098                                                                 $ranges[0]->[0];
1099         }
1100         else {
1101             my $output = "";
1102
1103             # Well-formed UTF-8 continuation bytes on ascii platforms must be
1104             # in the range 0x80 .. 0xBF.  If we know that the input is
1105             # well-formed (indicated by not trying to be 'safe'), we can omit
1106             # tests that verify that the input is within either of these
1107             # bounds.  (No legal UTF-8 character can begin with anything in
1108             # this range, so we don't have to worry about this being a
1109             # continuation byte or not.)
1110             if ($opts_ref->{charset} =~ /ascii/i
1111                 && (! $opts_ref->{safe} && ! $opts_ref->{no_length_checks})
1112                 && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
1113             {
1114                 my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
1115                 my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
1116
1117                 # If the range is the entire legal range, it matches any legal
1118                 # byte, so we can omit both tests.  (This should happen only
1119                 # if the number of ranges is 1.)
1120                 if ($lower_limit_is_80 && $upper_limit_is_BF) {
1121                     return 1;
1122                 }
1123                 elsif ($lower_limit_is_80) { # Just use the upper limit test
1124                     $output = sprintf("( $test <= $self->{val_fmt} )",
1125                                         $ranges[$i]->[1]);
1126                 }
1127                 elsif ($upper_limit_is_BF) { # Just use the lower limit test
1128                     $output = sprintf("( $test >= $self->{val_fmt} )",
1129                                     $ranges[$i]->[0]);
1130                 }
1131             }
1132
1133             # If we didn't change to omit a test above, see if the number of
1134             # elements is a power of 2 (only a single bit in the
1135             # representation of its count will be set) and if so, it may be
1136             # that a mask/compare optimization is possible.
1137             if ($output eq ""
1138                 && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
1139             {
1140                 my @list;
1141                 push @list, $_  for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
1142                 my @this_masks = calculate_mask(@list);
1143
1144                 # Use the mask if there is just one for the whole range.
1145                 # Otherwise there is no savings over the two branches that can
1146                 # define the range.
1147                 if (@this_masks == 1 && defined $this_masks[0][1]) {
1148                     $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $this_masks[0][1], $this_masks[0][0];
1149                 }
1150             }
1151
1152             if ($output ne "") {  # Prefer any optimization
1153                 $ranges[$i] = $output;
1154             }
1155             else {
1156                 # No optimization happened.  We need a test that the code
1157                 # point is within both bounds.  But, if the bounds are
1158                 # adjacent code points, it is cleaner to say
1159                 # 'first == test || second == test'
1160                 # than it is to say
1161                 # 'first <= test && test <= second'
1162
1163                 $range_count_extra++;   # This range requires 2 branches to
1164                                         # represent
1165                 if ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
1166                     $ranges[$i] = "( "
1167                                 .  join( " || ", ( map
1168                                     { sprintf "$self->{val_fmt} == $test", $_ }
1169                                     @{$ranges[$i]} ) )
1170                                 . " )";
1171                 }
1172                 else {  # Full bounds checking
1173                     $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
1174                 }
1175             }
1176         }
1177     }
1178
1179     # We have generated the list of bytes in two ways; one trying to use masks
1180     # to cut the number of branches down, and the other to look at individual
1181     # ranges (some of which could be cut down by using a mask for just it).
1182     # We return whichever method uses the fewest branches.
1183     return "( "
1184            . join( " || ", (@masks && @masks < @ranges + $range_count_extra)
1185                             ? @masks
1186                             : @ranges)
1187            . " )";
1188 }
1189
1190 # _combine
1191 # recursively turn a list of conditions into a fast break-out condition
1192 # used by _cond_as_str() for 'cp' type macros.
1193 sub _combine {
1194     my ( $self, $test, @cond )= @_;
1195     return if !@cond;
1196     my $item= shift @cond;
1197     my ( $cstr, $gtv );
1198     if ( ref $item ) {  # @item should be a 2-element array giving range start
1199                         # and end
1200         if ($item->[0] == 0) {  # UV's are never negative, so skip "0 <= "
1201                                 # test which could generate a compiler warning
1202                                 # that test is always true
1203             $cstr= sprintf( "$test <= $self->{val_fmt}", $item->[1] );
1204         }
1205         else {
1206             $cstr=
1207           sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
1208                    @$item );
1209         }
1210         $gtv= sprintf "$self->{val_fmt}", $item->[1];
1211     } else {
1212         $cstr= sprintf( "$self->{val_fmt} == $test", $item );
1213         $gtv= sprintf "$self->{val_fmt}", $item;
1214     }
1215     if ( @cond ) {
1216         my $combine= $self->_combine( $test, @cond );
1217         if (@cond >1) {
1218             return "( $cstr || ( $gtv < $test &&\n"
1219                    . $combine . " ) )";
1220         } else {
1221             return "( $cstr || $combine )";
1222         }
1223     } else {
1224         return $cstr;
1225     }
1226 }
1227
1228 # _render()
1229 # recursively convert an optree to text with reasonably neat formatting
1230 sub _render {
1231     my ( $self, $op, $combine, $brace, $opts_ref, $def, $submacros )= @_;
1232     return 0 if ! defined $op;  # The set is empty
1233     if ( !ref $op ) {
1234         return $op;
1235     }
1236     my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
1237     #no warnings 'recursion';   # This would allow really really inefficient
1238                                 # code to be generated.  See pod
1239     my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref, $def, $submacros );
1240     return $yes if $cond eq '1';
1241
1242     my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref, $def, $submacros );
1243     return "( $cond )" if $yes eq '1' and $no eq '0';
1244     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
1245     return "$lb$cond ? $yes : $no$rb"
1246       if !ref( $op->{yes} ) && !ref( $op->{no} );
1247     my $ind1= " " x 4;
1248     my $ind= "\n" . ( $ind1 x $op->{depth} );
1249
1250     if ( ref $op->{yes} ) {
1251         $yes= $ind . $ind1 . $yes;
1252     } else {
1253         $yes= " " . $yes;
1254     }
1255
1256     my $str= "$lb$cond ?$yes$ind: $no$rb";
1257     if (length $str > 6000) {
1258         push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $yes_idx= 0+@$submacros), $yes;
1259         push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $no_idx= 0+@$submacros), $no;
1260         return sprintf "%s%s ? $def : $def%s", $lb, $cond, "_part$yes_idx", "_part$no_idx", $rb;
1261     }
1262     return $str;
1263 }
1264
1265 # $expr=render($op,$combine)
1266 #
1267 # convert an optree to text with reasonably neat formatting. If $combine
1268 # is true then the condition is created using "fast breakouts" which
1269 # produce uglier expressions that are more efficient for common case,
1270 # longer lists such as that resulting from type 'cp' output.
1271 # Currently only used for type 'cp' macros.
1272 sub render {
1273     my ( $self, $op, $combine, $opts_ref, $def_fmt )= @_;
1274
1275     my @submacros;
1276     my $macro= sprintf "#define $def_fmt\n( %s )", "", $self->_render( $op, $combine, 0, $opts_ref, $def_fmt, \@submacros );
1277
1278     return join "\n\n", map { "/*** GENERATED CODE ***/\n" . __macro( __clean( $_ ) ) } @submacros, $macro;
1279 }
1280
1281 # make_macro
1282 # make a macro of a given type.
1283 # calls into make_trie and (generic_|length_)optree as needed
1284 # Opts are:
1285 # type             : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
1286 # ret_type         : 'cp' or 'len'
1287 # safe             : don't assume is well-formed UTF-8, so don't skip any range
1288 #                    checks, and add length guards to macro
1289 # no_length_checks : like safe, but don't add length guards.
1290 #
1291 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
1292 # in which case it defaults to 'cp' as well.
1293 #
1294 # It is illegal to do a type 'cp' macro on a pattern with multi-codepoint
1295 # sequences in it, as the generated macro will accept only a single codepoint
1296 # as an argument.
1297 #
1298 # It is also illegal to do a non-safe macro on a pattern with multi-codepoint
1299 # sequences in it, as even if it is known to be well-formed, we need to not
1300 # run off the end of the buffer when, say, the buffer ends with the first two
1301 # characters, but three are looked at by the macro.
1302 #
1303 # returns the macro.
1304
1305
1306 sub make_macro {
1307     my $self= shift;
1308     my %opts= @_;
1309     my $type= $opts{type} || 'generic';
1310     if ($self->{has_multi}) {
1311         if ($type =~ /^cp/) {
1312             die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
1313         }
1314         elsif (! $opts{safe}) {
1315             die "'safe' is required on multi-codepoint character class '$self->{op}'"
1316         }
1317     }
1318     my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' );
1319     my $method;
1320     if ( $opts{safe} ) {
1321         $method= 'length_optree';
1322     } elsif ( $type =~ /generic/ ) {
1323         $method= 'generic_optree';
1324     } else {
1325         $method= 'optree';
1326     }
1327     my @args= $type =~ /^cp/ ? 'cp' : 's';
1328     push @args, "e" if $opts{safe};
1329     push @args, "is_utf8" if $type =~ /generic/;
1330     push @args, "len" if $ret_type eq 'both';
1331     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
1332              $ret_type eq 'cp'      ? 'what_'     : 'is_';
1333     my $ext= $type     =~ /generic/ ? ''          : '_' . lc( $type );
1334     $ext .= '_non_low' if $type eq 'generic_non_low';
1335     $ext .= "_safe" if $opts{safe};
1336     $ext .= "_no_length_checks" if $opts{no_length_checks};
1337     my $argstr= join ",", @args;
1338     my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
1339     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
1340     return $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts, $def_fmt );
1341 }
1342
1343 # if we aren't being used as a module (highly likely) then process
1344 # the __DATA__ below and produce macros in regcharclass.h
1345 # if an argument is provided to the script then it is assumed to
1346 # be the path of the file to output to, if the arg is '-' outputs
1347 # to STDOUT.
1348 if ( !caller ) {
1349     $|++;
1350     my $path= shift @ARGV || "regcharclass.h";
1351     my $out_fh;
1352     if ( $path eq '-' ) {
1353         $out_fh= \*STDOUT;
1354     } else {
1355         $out_fh = open_new( $path );
1356     }
1357     print $out_fh read_only_top( lang => 'C', by => $0,
1358                                  file => 'regcharclass.h', style => '*',
1359                                  copyright => [2007, 2011],
1360                                  final => <<EOF,
1361 WARNING: These macros are for internal Perl core use only, and may be
1362 changed or removed without notice.
1363 EOF
1364     );
1365     print $out_fh "\n#ifndef H_REGCHARCLASS   /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n";
1366
1367     my ( $op, $title, @txt, @types, %mods );
1368     my $doit= sub ($) {
1369         return unless $op;
1370
1371         my $charset = shift;
1372
1373         # Skip if to compile on a different platform.
1374         return if delete $mods{only_ascii_platform} && $charset !~ /ascii/i;
1375         return if delete $mods{only_ebcdic_platform} && $charset !~ /ebcdic/i;
1376
1377         print $out_fh "/*\n\t$op: $title\n\n";
1378         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1379         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt, charset => $charset);
1380
1381         #die Dumper(\@types,\%mods);
1382
1383         my @mods;
1384         push @mods, 'safe' if delete $mods{safe};
1385         push @mods, 'no_length_checks' if delete $mods{no_length_checks};
1386         unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
1387                                                                 # do this one
1388                                                                 # first, as
1389                                                                 # traditional
1390         if (%mods) {
1391             die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods;
1392         }
1393
1394         foreach my $type_spec ( @types ) {
1395             my ( $type, $ret )= split /-/, $type_spec;
1396             $ret ||= 'len';
1397             foreach my $mod ( @mods ) {
1398
1399                 # 'safe' is irrelevant with code point macros, so skip if
1400                 # there is also a 'fast', but don't skip if this is the only
1401                 # way a cp macro will get generated.  Below we convert 'safe'
1402                 # to 'fast' in this instance
1403                 next if $type =~ /^cp/
1404                         && ($mod eq 'safe' || $mod eq 'no_length_checks')
1405                         && grep { 'fast' =~ $_ } @mods;
1406                 delete $mods{$mod};
1407                 my $macro= $obj->make_macro(
1408                     type     => $type,
1409                     ret_type => $ret,
1410                     safe     => $mod eq 'safe' && $type !~ /^cp/,
1411                     charset  => $charset,
1412                     no_length_checks => $mod eq 'no_length_checks' && $type !~ /^cp/,
1413                 );
1414                 print $out_fh $macro, "\n";
1415             }
1416         }
1417     };
1418
1419     my @data = <DATA>;
1420     foreach my $charset (get_supported_code_pages()) {
1421         my $first_time = 1;
1422         undef $op;
1423         undef $title;
1424         undef @txt;
1425         undef @types;
1426         undef %mods;
1427         print $out_fh "\n", get_conditional_compile_line_start($charset);
1428         my @data_copy = @data;
1429         for (@data_copy) {
1430             s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
1431             next unless /\S/;
1432             chomp;
1433             if ( /^[A-Z]/ ) {
1434                 $doit->($charset) unless $first_time;  # This starts a new
1435                                                        # definition; do the
1436                                                        # previous one
1437                 $first_time = 0;
1438                 ( $op, $title )= split /\s*:\s*/, $_, 2;
1439                 @txt= ();
1440             } elsif ( s/^=>// ) {
1441                 my ( $type, $modifier )= split /:/, $_;
1442                 @types= split ' ', $type;
1443                 undef %mods;
1444                 map { $mods{$_} = 1 } split ' ',  $modifier;
1445             } else {
1446                 push @txt, "$_";
1447             }
1448         }
1449         $doit->($charset);
1450         print $out_fh get_conditional_compile_line_end();
1451     }
1452
1453     print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
1454
1455     if($path eq '-') {
1456         print $out_fh "/* ex: set ro: */\n";
1457     } else {
1458         # Some of the sources for these macros come from Unicode tables
1459         my $sources_list = "lib/unicore/mktables.lst";
1460         my @sources = ($0, qw(lib/unicore/mktables
1461                               lib/Unicode/UCD.pm
1462                               regen/regcharclass_multi_char_folds.pl
1463                               regen/charset_translations.pl
1464                              ));
1465         {
1466             # Depend on mktables’ own sources.  It’s a shorter list of files than
1467             # those that Unicode::UCD uses.
1468             if (! open my $mktables_list, $sources_list) {
1469
1470                 # This should force a rebuild once $sources_list exists
1471                 push @sources, $sources_list;
1472             }
1473             else {
1474                 while(<$mktables_list>) {
1475                     last if /===/;
1476                     chomp;
1477                     push @sources, "lib/unicore/$_" if /^[^#]/;
1478                 }
1479             }
1480         }
1481         read_only_bottom_close_and_rename($out_fh, \@sources)
1482     }
1483 }
1484
1485 # The form of the input is a series of definitions to make macros for.
1486 # The first line gives the base name of the macro, followed by a colon, and
1487 # then text to be used in comments associated with the macro that are its
1488 # title or description.  In all cases the first (perhaps only) parameter to
1489 # the macro is a pointer to the first byte of the code point it is to test to
1490 # see if it is in the class determined by the macro.  In the case of non-UTF8,
1491 # the code point consists only of a single byte.
1492 #
1493 # The second line must begin with a '=>' and be followed by the types of
1494 # macro(s) to be generated; these are specified below.  A colon follows the
1495 # types, followed by the modifiers, also specified below.  At least one
1496 # modifier is required.
1497 #
1498 # The subsequent lines give what code points go into the class defined by the
1499 # macro.  Multiple characters may be specified via a string like "\x0D\x0A",
1500 # enclosed in quotes.  Otherwise the lines consist of one of:
1501 #   1)  a single Unicode code point, prefaced by 0x
1502 #   2)  a single range of Unicode code points separated by a minus (and
1503 #       optional space)
1504 #   3)  a single Unicode property specified in the standard Perl form
1505 #       "\p{...}"
1506 #   4)  a line like 'do path'.  This will do a 'do' on the file given by
1507 #       'path'.  It is assumed that this does nothing but load subroutines
1508 #       (See item 5 below).  The reason 'require path' is not used instead is
1509 #       because 'do' doesn't assume that path is in @INC.
1510 #   5)  a subroutine call
1511 #           &pkg::foo(arg1, ...)
1512 #       where pkg::foo was loaded by a 'do' line (item 4).  The subroutine
1513 #       returns an array of entries of forms like items 1-3 above.  This
1514 #       allows more complex inputs than achievable from the other input types.
1515 #
1516 # A blank line or one whose first non-blank character is '#' is a comment.
1517 # The definition of the macro is terminated by a line unlike those described.
1518 #
1519 # Valid types:
1520 #   low         generate a macro whose name is 'is_BASE_low' and defines a
1521 #               class that includes only ASCII-range chars.  (BASE is the
1522 #               input macro base name.)
1523 #   latin1      generate a macro whose name is 'is_BASE_latin1' and defines a
1524 #               class that includes only upper-Latin1-range chars.  It is not
1525 #               designed to take a UTF-8 input parameter.
1526 #   high        generate a macro whose name is 'is_BASE_high' and defines a
1527 #               class that includes all relevant code points that are above
1528 #               the Latin1 range.  This is for very specialized uses only.
1529 #               It is designed to take only an input UTF-8 parameter.
1530 #   utf8        generate a macro whose name is 'is_BASE_utf8' and defines a
1531 #               class that includes all relevant characters that aren't ASCII.
1532 #               It is designed to take only an input UTF-8 parameter.
1533 #   LATIN1      generate a macro whose name is 'is_BASE_latin1' and defines a
1534 #               class that includes both ASCII and upper-Latin1-range chars.
1535 #               It is not designed to take a UTF-8 input parameter.
1536 #   UTF8        generate a macro whose name is 'is_BASE_utf8' and defines a
1537 #               class that can include any code point, adding the 'low' ones
1538 #               to what 'utf8' works on.  It is designed to take only an input
1539 #               UTF-8 parameter.
1540 #   generic     generate a macro whose name is 'is_BASE".  It has a 2nd,
1541 #               boolean, parameter which indicates if the first one points to
1542 #               a UTF-8 string or not.  Thus it works in all circumstances.
1543 #   generic_non_low generate a macro whose name is 'is_BASE_non_low".  It has
1544 #               a 2nd, boolean, parameter which indicates if the first one
1545 #               points to a UTF-8 string or not.  It excludes any ASCII-range
1546 #               matches, but otherwise it works in all circumstances.
1547 #   cp          generate a macro whose name is 'is_BASE_cp' and defines a
1548 #               class that returns true if the UV parameter is a member of the
1549 #               class; false if not.
1550 #   cp_high     like cp, but it is assumed that it is known that the UV
1551 #               parameter is above Latin1.  The name of the generated macro is
1552 #               'is_BASE_cp_high'.  This is different from high-cp, derived
1553 #               below.
1554 # A macro of the given type is generated for each type listed in the input.
1555 # The default return value is the number of octets read to generate the match.
1556 # Append "-cp" to the type to have it instead return the matched codepoint.
1557 #               The macro name is changed to 'what_BASE...'.  See pod for
1558 #               caveats
1559 # Appending '-both" instead adds an extra parameter to the end of the argument
1560 #               list, which is a pointer as to where to store the number of
1561 #               bytes matched, while also returning the code point.  The macro
1562 #               name is changed to 'what_len_BASE...'.  See pod for caveats
1563 #
1564 # Valid modifiers:
1565 #   safe        The input string is not necessarily valid UTF-8.  In
1566 #               particular an extra parameter (always the 2nd) to the macro is
1567 #               required, which points to one beyond the end of the string.
1568 #               The macro will make sure not to read off the end of the
1569 #               string.  In the case of non-UTF8, it makes sure that the
1570 #               string has at least one byte in it.  The macro name has
1571 #               '_safe' appended to it.
1572 #   no_length_checks  The input string is not necessarily valid UTF-8, but it
1573 #               is to be assumed that the length has already been checked and
1574 #               found to be valid
1575 #   fast        The input string is valid UTF-8.  No bounds checking is done,
1576 #               and the macro can make assumptions that lead to faster
1577 #               execution.
1578 #   only_ascii_platform   Skip this definition if the character set is for
1579 #               a non-ASCII platform.
1580 #   only_ebcdic_platform  Skip this definition if the character set is for
1581 #               a non-EBCDIC platform.
1582 # No modifier need be specified; fast is assumed for this case.  If both
1583 # 'fast', and 'safe' are specified, two macros will be created for each
1584 # 'type'.
1585 #
1586 # If run on a non-ASCII platform will automatically convert the Unicode input
1587 # to native.  The documentation above is slightly wrong in this case.  'low'
1588 # actually refers to code points whose UTF-8 representation is the same as the
1589 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1590 # code points less than 256.
1591
1592 1; # in the unlikely case we are being used as a module
1593
1594 __DATA__
1595 # This is no longer used, but retained in case it is needed some day.
1596 # TRICKYFOLD: Problematic fold case letters.  When adding to this list, also should add them to regcomp.c and fold_grind.t
1597 # => generic cp generic-cp generic-both :fast safe
1598 # 0x00DF        # LATIN SMALL LETTER SHARP S
1599 # 0x0390        # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1600 # 0x03B0        # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1601 # 0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1602 # 0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1603 # 0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1604
1605 LNBREAK: Line Break: \R
1606 => generic UTF8 LATIN1 : safe
1607 "\x0D\x0A"      # CRLF - Network (Windows) line ending
1608 \p{VertSpace}
1609
1610 HORIZWS: Horizontal Whitespace: \h \H
1611 => high cp_high : fast
1612 \p{HorizSpace}
1613
1614 VERTWS: Vertical Whitespace: \v \V
1615 => high cp_high : fast
1616 \p{VertSpace}
1617
1618 XDIGIT: Hexadecimal digits
1619 => high cp_high : fast
1620 \p{XDigit}
1621
1622 XPERLSPACE: \p{XPerlSpace}
1623 => high cp_high : fast
1624 \p{XPerlSpace}
1625
1626 REPLACEMENT: Unicode REPLACEMENT CHARACTER
1627 => UTF8 :safe
1628 0xFFFD
1629
1630 NONCHAR: Non character code points
1631 => UTF8 :fast
1632 \p{Nchar}
1633
1634 SURROGATE: Surrogate characters
1635 => UTF8 :fast
1636 \p{Gc=Cs}
1637
1638 # This program was run with this enabled, and the results copied to utf8.h;
1639 # then this was commented out because it takes so long to figure out these 2
1640 # million code points.  The results would not change unless utf8.h decides it
1641 # wants a maximum other than 4 bytes, or this program creates better
1642 # optimizations.  Trying with 5 bytes used too much memory to calculate.
1643 #
1644 # We don't generate code for invariants here because the EBCDIC form is too
1645 # complicated and would slow things down; instead the user should test for
1646 # invariants first.
1647 #
1648 # NOTE: The number of bytes generated here must match the value in
1649 # IS_UTF8_CHAR_FAST in utf8.h
1650 #
1651 #UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes
1652 #=> UTF8 :no_length_checks only_ascii_platform
1653 #0x80 - 0x1FFFFF
1654
1655 # This hasn't been commented out, but the number of bytes it works on has been
1656 # cut down to 3, so it doesn't cover the full legal Unicode range.  Making it
1657 # 5 bytes would cover beyond the full range, but takes quite a bit of time and
1658 # memory to calculate.  The generated table varies depending on the EBCDIC
1659 # code page.
1660
1661 # NOTE: The number of bytes generated here must match the value in
1662 # IS_UTF8_CHAR_FAST in utf8.h
1663 #
1664 UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes
1665 => UTF8 :no_length_checks only_ebcdic_platform
1666 0xA0 - 0x3FFF
1667
1668 QUOTEMETA: Meta-characters that \Q should quote
1669 => high :fast
1670 \p{_Perl_Quotemeta}
1671
1672 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1673 => UTF8 :safe
1674
1675 # 1 => All folds
1676 &regcharclass_multi_char_folds::multi_char_folds(1)
1677
1678 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1679 => LATIN1 : safe
1680
1681 &regcharclass_multi_char_folds::multi_char_folds(0)
1682 # 0 => Latin1-only
1683
1684 FOLDS_TO_MULTI: characters that fold to multi-char strings
1685 => UTF8 :fast
1686 \p{_Perl_Folds_To_Multi_Char}
1687
1688 PROBLEMATIC_LOCALE_FOLD : characters whose fold is problematic under locale
1689 => UTF8 cp :fast
1690 \p{_Perl_Problematic_Locale_Folds}
1691
1692 PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are problematic under locale
1693 => UTF8 cp :fast
1694 \p{_Perl_Problematic_Locale_Foldeds_Start}
1695
1696 PATWS: pattern white space
1697 => generic cp : safe
1698 \p{PatWS}