regen/regcharclass.pl

   1 #!perl
   2 package CharClass::Matcher;
   3 use strict;
   4 use 5.008;
   5 use warnings;
   6 use warnings FATAL => 'all';
   7 use Data::Dumper;
   8 $Data::Dumper::Useqq= 1;
   9
  10 sub DEBUG () { 0 }
  11 $|=1 if DEBUG;
  12
  13 require './regen/regen_lib.pl';
  14 require './regen/charset_translations.pl';
  15 require "./regen/regcharclass_multi_char_folds.pl";
  16
  17 =head1 NAME
  18
  19 CharClass::Matcher -- Generate C macros that match character classes efficiently
  20
  21 =head1 SYNOPSIS
  22
  23     perl regen/regcharclass.pl
  24
  25 =head1 DESCRIPTION
  26
  27 Dynamically generates macros for detecting special charclasses
  28 in latin-1, utf8, and codepoint forms. Macros can be set to return
  29 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
  30
  31 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
  32 are necessary.
  33
  34 Using WHATEVER as an example the following macros can be produced, depending
  35 on the input parameters (how to get each is described by internal comments at
  36 the C<__DATA__> line):
  37
  38 =over 4
  39
  40 =item C<is_WHATEVER(s,is_utf8)>
  41
  42 =item C<is_WHATEVER_safe(s,e,is_utf8)>
  43
  44 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
  45 comparisons involving octet<128 are done before checking the C<is_utf8>
  46 flag, hopefully saving time.
  47
  48 The version without the C<_safe> suffix should be used only when the input is
  49 known to be well-formed.
  50
  51 =item C<is_WHATEVER_utf8(s)>
  52
  53 =item C<is_WHATEVER_utf8_safe(s,e)>
  54
  55 Do a lookup assuming the string is encoded in (normalized) UTF8.
  56
  57 The version without the C<_safe> suffix should be used only when the input is
  58 known to be well-formed.
  59
  60 =item C<is_WHATEVER_latin1(s)>
  61
  62 =item C<is_WHATEVER_latin1_safe(s,e)>
  63
  64 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  65
  66 The version without the C<_safe> suffix should be used only when it is known
  67 that C<s> contains at least one character.
  68
  69 =item C<is_WHATEVER_cp(cp)>
  70
  71 Check to see if the string matches a given codepoint (hypothetically a
  72 U32). The condition is constructed as to "break out" as early as
  73 possible if the codepoint is out of range of the condition.
  74
  75 IOW:
  76
  77   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  78
  79 Thus if the character is X+1 only two comparisons will be done. Making
  80 matching lookups slower, but non-matching faster.
  81
  82 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
  83
  84 A variant form of each of the macro types described above can be generated, in
  85 which the code point is returned by the macro, and an extra parameter (in the
  86 final position) is added, which is a pointer for the macro to set the byte
  87 length of the returned code point.
  88
  89 These forms all have a C<what_len> prefix instead of the C<is_>, for example
  90 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
  91 C<what_len_WHATEVER_utf8(s,len)>.
  92
  93 These forms should not be used I<except> on small sets of mostly widely
  94 separated code points; otherwise the code generated is inefficient.  For these
  95 cases, it is best to use the C<is_> forms, and then find the code point with
  96 C<utf8_to_uvchr_buf>().  This program can fail with a "deep recursion"
  97 message on the worst of the inappropriate sets.  Examine the generated macro
  98 to see if it is acceptable.
  99
 100 =item C<what_WHATEVER_FOO(arg1, ...)>
 101
 102 A variant form of each of the C<is_> macro types described above can be generated, in
 103 which the code point and not the length is returned by the macro.  These have
 104 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
 105 not be used where the set contains a NULL, as 0 is returned for two different
 106 cases: a) the set doesn't include the input code point; b) the set does
 107 include it, and it is a NULL.
 108
 109 =back
 110
 111 The above isn't quite complete, as for specialized purposes one can get a
 112 macro like C<is_WHATEVER_utf8_no_length_checks(s)>, which assumes that it is
 113 already known that there is enough space to hold the character starting at
 114 C<s>, but otherwise checks that it is well-formed.  In other words, this is
 115 intermediary in checking between C<is_WHATEVER_utf8(s)> and
 116 C<is_WHATEVER_utf8_safe(s,e)>.
 117
 118 =head2 CODE FORMAT
 119
 120 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
 121
 122
 123 =head1 AUTHOR
 124
 125 Author: Yves Orton (demerphq) 2007.  Maintained by Perl5 Porters.
 126
 127 =head1 BUGS
 128
 129 No tests directly here (although the regex engine will fail tests
 130 if this code is broken). Insufficient documentation and no Getopts
 131 handler for using the module as a script.
 132
 133 =head1 LICENSE
 134
 135 You may distribute under the terms of either the GNU General Public
 136 License or the Artistic License, as specified in the README file.
 137
 138 =cut
 139
 140 # Sub naming convention:
 141 # __func : private subroutine, can not be called as a method
 142 # _func  : private method, not meant for external use
 143 # func   : public method.
 144
 145 # private subs
 146 #-------------------------------------------------------------------------------
 147 #
 148 # ($cp,$n,$l,$u)=__uni_latin($str);
 149 #
 150 # Return a list of arrays, each of which when interpreted correctly
 151 # represent the string in some given encoding with specific conditions.
 152 #
 153 # $cp - list of codepoints that make up the string.
 154 # $n  - list of octets that make up the string if all codepoints are invariant
 155 #       regardless of if the string is in UTF-8 or not.
 156 # $l  - list of octets that make up the string in latin1 encoding if all
 157 #       codepoints < 256, and at least one codepoint is UTF-8 variant.
 158 # $u  - list of octets that make up the string in utf8 if any codepoint is
 159 #       UTF-8 variant
 160 #
 161 #   High CP | Defined
 162 #-----------+----------
 163 #   0 - 127 : $n            (127/128 are the values for ASCII platforms)
 164 # 128 - 255 : $l, $u
 165 # 256 - ... : $u
 166 #
 167
 168 sub __uni_latin1 {
 169     my $charset = shift;
 170     my $a2n= shift;
 171     my $str= shift;
 172     my $max= 0;
 173     my @cp;
 174     my @cp_high;
 175     my $only_has_invariants = 1;
 176     for my $ch ( split //, $str ) {
 177         my $cp= ord $ch;
 178         $max= $cp if $max < $cp;
 179         if ($cp > 255) {
 180             push @cp, $cp;
 181             push @cp_high, $cp;
 182         }
 183         else {
 184             push @cp, $a2n->[$cp];
 185         }
 186     }
 187     my ( $n, $l, $u );
 188     $only_has_invariants = ($charset =~ /ascii/i) ? $max < 128 : $max < 160;
 189     if ($only_has_invariants) {
 190         $n= [@cp];
 191     } else {
 192         $l= [@cp] if $max && $max < 256;
 193
 194         my @u;
 195         for my $ch ( split //, $str ) {
 196             push @u, map { ord } split //, cp_2_utfbytes(ord $ch, $charset);
 197         }
 198         $u = \@u;
 199     }
 200     return ( \@cp, \@cp_high, $n, $l, $u );
 201 }
 202
 203 #
 204 # $clean= __clean($expr);
 205 #
 206 # Cleanup a ternary expression, removing unnecessary parens and apply some
 207 # simplifications using regexes.
 208 #
 209
 210 sub __clean {
 211     my ( $expr )= @_;
 212
 213     #return $expr;
 214
 215     our $parens;
 216     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 217
 218     ## remove redundant parens
 219     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 220
 221
 222     # repeatedly simplify conditions like
 223     #       ( (cond1) ? ( (cond2) ? X : Y ) : Y )
 224     # into
 225     #       ( ( (cond1) && (cond2) ) ? X : Y )
 226     # Also similarly handles expressions like:
 227     #       : (cond1) ? ( (cond2) ? X : Y ) : Y )
 228     # Note the inclusion of the close paren in ([:()]) and the open paren in
 229     # ([()]) is purely to ensure we have a balanced set of parens in the
 230     # expression which makes it easier to understand the pattern in an editor
 231     # that understands paren's, we do not expect either of these cases to
 232     # actually fire. - Yves
 233     1 while $expr =~ s/
 234         ([:()])  \s*
 235             ($parens) \s*
 236             \? \s*
 237                 \( \s* ($parens) \s*
 238                     \? \s* ($parens|[^()?:\s]+?) \s*
 239                     :  \s* ($parens|[^()?:\s]+?) \s*
 240                 \) \s*
 241             : \s* \5 \s*
 242         ([()])
 243     /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
 244     #$expr=~s/\(\(U8\*\)s\)\[(\d+)\]/S$1/g if length $expr > 8000;
 245     #$expr=~s/\s+//g if length $expr > 8000;
 246
 247     die "Expression too long" if length $expr > 8000;
 248
 249     return $expr;
 250 }
 251
 252 #
 253 # $text= __macro(@args);
 254 # Join args together by newlines, and then neatly add backslashes to the end
 255 # of every  line as expected by the C pre-processor for #define's.
 256 #
 257
 258 sub __macro {
 259     my $str= join "\n", @_;
 260     $str =~ s/\s*$//;
 261     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 262     my $last= pop @lines;
 263     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 264     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 265     return $str . "\n";
 266 }
 267
 268 #
 269 # my $op=__incrdepth($op);
 270 #
 271 # take an 'op' hashref and add one to it and all its childrens depths.
 272 #
 273
 274 sub __incrdepth {
 275     my $op= shift;
 276     return unless ref $op;
 277     $op->{depth} += 1;
 278     __incrdepth( $op->{yes} );
 279     __incrdepth( $op->{no} );
 280     return $op;
 281 }
 282
 283 # join two branches of an opcode together with a condition, incrementing
 284 # the depth on the yes branch when we do so.
 285 # returns the new root opcode of the tree.
 286 sub __cond_join {
 287     my ( $cond, $yes, $no )= @_;
 288     if (ref $yes) {
 289         return {
 290             test  => $cond,
 291             yes   => __incrdepth( $yes ),
 292             no    => $no,
 293             depth => 0,
 294         };
 295     }
 296     else {
 297         return {
 298             test  => $cond,
 299             yes   => $yes,
 300             no    => __incrdepth($no),
 301             depth => 0,
 302         };
 303     }
 304 }
 305
 306 my $hex_fmt= "0x%02X";
 307
 308 sub val_fmt
 309 {
 310     my $self = shift;
 311     my $arg = shift;
 312
 313     # Format 'arg' using the printable character if it has one, or a %x if
 314     # not, returning a string containing the result
 315
 316     # Return what always returned for an unexpected argument
 317     return $hex_fmt unless defined $arg && $arg !~ /\D/;
 318
 319     # We convert only things inside Latin1
 320     if ($arg < 256) {
 321
 322         # Find the ASCII equivalent of this argument (as the current character
 323         # set might not be ASCII)
 324         my $char = chr $self->{n2a}->[$arg];
 325
 326         # If printable, return it, escaping \ and '
 327         return "'$char'" if $char =~ /[^\\'[:^print:]]/a;
 328         return "'\\\\'" if $char eq "\\";
 329         return "'\''" if $char eq "'";
 330
 331         # Handle the mnemonic controls
 332         my $pos = index("\a\b\e\f\n\r\t\cK", $char);
 333         return "'\\" . substr("abefnrtv", $pos, 1) . "'" if $pos >= 0;
 334     }
 335
 336     # Otherwise, just the input, formatted
 337     return sprintf $hex_fmt, $arg;
 338 }
 339
 340 # Methods
 341
 342 # constructor
 343 #
 344 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 345 #
 346 # Create a new CharClass::Matcher object by parsing the text in
 347 # the txt array. Currently applies the following rules:
 348 #
 349 # Element starts with C<0x>, line is evaled the result treated as
 350 # a number which is passed to chr().
 351 #
 352 # Element starts with C<">, line is evaled and the result treated
 353 # as a string.
 354 #
 355 # Each string is then stored in the 'strs' subhash as a hash record
 356 # made up of the results of __uni_latin1, using the keynames
 357 # 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
 358 # 'UTF8' which hold a merge of 'low' and their lowercase equivalents.
 359 #
 360 # Size data is tracked per type in the 'size' subhash.
 361 #
 362 # Return an object
 363
 364 my %n2a;    # Inversion of a2n, for each character set
 365
 366 sub new {
 367     my $class= shift;
 368     my %opt= @_;
 369     my %hash_return;
 370     for ( qw(op txt) ) {
 371         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 372           if !exists $opt{$_};
 373     }
 374
 375     my $self= bless {
 376         op    => $opt{op},
 377         title => $opt{title} || '',
 378     }, $class;
 379
 380     my $charset = $opt{charset};
 381     my $a2n = get_a2n($charset);
 382
 383     # We need to construct the map going the other way if not already done
 384     unless (defined $n2a{$charset}) {
 385         for (my $i = 0; $i < 256; $i++) {
 386             $n2a{$charset}->[$a2n->[$i]] = $i;
 387         }
 388     }
 389
 390     foreach my $txt ( @{ $opt{txt} } ) {
 391         my $str= $txt;
 392         if ( $str =~ /^[""]/ ) {
 393             $str= eval $str;
 394         } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
 395                                     # list with its expansion
 396             my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
 397             die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'"
 398                                         if ! defined $lower || ! defined $upper;
 399             foreach my $cp (hex $lower .. hex $upper) {
 400                 push @{$opt{txt}}, sprintf "0x%X", $cp;
 401             }
 402             next;
 403         } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
 404             # Otherwise undocumented, a leading N means is already in the
 405             # native character set; don't convert.
 406             $str= chr eval $str;
 407         } elsif ( $str =~ /^0x/ ) {
 408             $str= eval $str;
 409             $str = chr $str;
 410         } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
 411             my $property = $1;
 412             use Unicode::UCD qw(prop_invlist);
 413
 414             my @invlist = prop_invlist($property, '_perl_core_internal_ok');
 415             if (! @invlist) {
 416
 417                 # An empty return could mean an unknown property, or merely
 418                 # that it is empty.  Call in scalar context to differentiate
 419                 my $count = prop_invlist($property, '_perl_core_internal_ok');
 420                 die "$property not found" unless defined $count;
 421             }
 422
 423             # Replace this element on the list with the property's expansion
 424             for (my $i = 0; $i < @invlist; $i += 2) {
 425                 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
 426
 427                     # prop_invlist() returns native values; add leading 'N'
 428                     # to indicate that.
 429                     push @{$opt{txt}}, sprintf "N0x%X", $cp;
 430                 }
 431             }
 432             next;
 433         } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
 434             die "do '$1' failed: $!$@" if ! do $1 or $@;
 435             next;
 436         } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
 437             my @results = eval "$1";
 438             die "eval '$1' failed: $@" if $@;
 439             push @{$opt{txt}}, @results;
 440             next;
 441         } elsif ($str =~ / ^ % \s* ( .* ) /x) { # user-furnished sub() call
 442             %hash_return = eval "$1";
 443             die "eval '$1' failed: $@" if $@;
 444             push @{$opt{txt}}, keys %hash_return;
 445             die "Only one multi character expansion currently allowed per rule"
 446                                                         if  $self->{multi_maps};
 447             next;
 448         } else {
 449             die "Unparsable line: $txt\n";
 450         }
 451         my ( $cp, $cp_high, $low, $latin1, $utf8 )
 452                                         = __uni_latin1($charset, $a2n, $str );
 453         my $from;
 454         if (defined $hash_return{"\"$str\""}) {
 455             $from = $hash_return{"\"$str\""};
 456             $from = $a2n->[$from] if $from < 256;
 457         }
 458         my $UTF8= $low   || $utf8;
 459         my $LATIN1= $low || $latin1;
 460         my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
 461         #die Dumper($txt,$cp,$low,$latin1,$utf8)
 462         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 463
 464         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 from )}=
 465           ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1, $from );
 466         my $rec= $self->{strs}{$str};
 467         foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
 468             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 469               if $self->{strs}{$str}{$key};
 470         }
 471         $self->{has_multi} ||= @$cp > 1;
 472         $self->{has_ascii} ||= $latin1 && @$latin1;
 473         $self->{has_low}   ||= $low && @$low;
 474         $self->{has_high}  ||= !$low && !$latin1;
 475     }
 476     $self->{n2a} = $n2a{$charset};
 477     $self->{count}= 0 + keys %{ $self->{strs} };
 478     return $self;
 479 }
 480
 481 # my $trie = make_trie($type,$maxlen);
 482 #
 483 # using the data stored in the object build a trie of a specific type,
 484 # and with specific maximum depth. The trie is made up the elements of
 485 # the given types array for each string in the object (assuming it is
 486 # not too long.)
 487 #
 488 # returns the trie, or undef if there was no relevant data in the object.
 489 #
 490
 491 sub make_trie {
 492     my ( $self, $type, $maxlen )= @_;
 493
 494     my $strs= $self->{strs};
 495     my %trie;
 496     foreach my $rec ( values %$strs ) {
 497         die "panic: unknown type '$type'"
 498           if !exists $rec->{$type};
 499         my $dat= $rec->{$type};
 500         next unless $dat;
 501         next if $maxlen && @$dat > $maxlen;
 502         my $node= \%trie;
 503         foreach my $elem ( @$dat ) {
 504             $node->{$elem} ||= {};
 505             $node= $node->{$elem};
 506         }
 507         $node->{''}= $rec->{str};
 508     }
 509     return 0 + keys( %trie ) ? \%trie : undef;
 510 }
 511
 512 sub pop_count ($) {
 513     my $word = shift;
 514
 515     # This returns a list of the positions of the bits in the input word that
 516     # are 1.
 517
 518     my @positions;
 519     my $position = 0;
 520     while ($word) {
 521         push @positions, $position if $word & 1;
 522         $position++;
 523         $word >>= 1;
 524     }
 525     return @positions;
 526 }
 527
 528 # my $optree= _optree()
 529 #
 530 # recursively convert a trie to an optree where every node represents
 531 # an if else branch.
 532 #
 533 #
 534
 535 sub _optree {
 536     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
 537     return unless defined $trie;
 538     $ret_type ||= 'len';
 539     $else= 0  unless defined $else;
 540     $depth= 0 unless defined $depth;
 541
 542     # if we have an empty string as a key it means we are in an
 543     # accepting state and unless we can match further on should
 544     # return the value of the '' key.
 545     if (exists $trie->{''} ) {
 546         # we can now update the "else" value, anything failing to match
 547         # after this point should return the value from this.
 548         my $prefix = $self->{strs}{ $trie->{''} };
 549         if ( $ret_type eq 'cp' ) {
 550             $else= $prefix->{from};
 551             $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
 552             $else= $self->val_fmt($else) if $else > 9;
 553         } elsif ( $ret_type eq 'len' ) {
 554             $else= $depth;
 555         } elsif ( $ret_type eq 'both') {
 556             $else= $prefix->{from};
 557             $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
 558             $else= $self->val_fmt($else) if $else > 9;
 559             $else= "len=$depth, $else";
 560         }
 561     }
 562     # extract the meaningful keys from the trie, filter out '' as
 563     # it means we are an accepting state (end of sequence).
 564     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 565
 566     # if we haven't any keys there is no further we can match and we
 567     # can return the "else" value.
 568     return $else if !@conds;
 569
 570     my $test = $test_type =~ /^cp/ ? "cp" : "((const U8*)s)[$depth]";
 571
 572     # First we loop over the possible keys/conditions and find out what they
 573     # look like; we group conditions with the same optree together.
 574     my %dmp_res;
 575     my @res_order;
 576     local $Data::Dumper::Sortkeys=1;
 577     foreach my $cond ( @conds ) {
 578
 579         # get the optree for this child/condition
 580         my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type,
 581                                                             $else, $depth + 1 );
 582         # convert it to a string with Dumper
 583         my $res_code= Dumper( $res );
 584
 585         push @{$dmp_res{$res_code}{vals}}, $cond;
 586         if (!$dmp_res{$res_code}{optree}) {
 587             $dmp_res{$res_code}{optree}= $res;
 588             push @res_order, $res_code;
 589         }
 590     }
 591
 592     # now that we have deduped the optrees we construct a new optree
 593     # containing the merged
 594     # results.
 595     my %root;
 596     my $node= \%root;
 597     foreach my $res_code_idx (0 .. $#res_order) {
 598         my $res_code= $res_order[$res_code_idx];
 599         $node->{vals}= $dmp_res{$res_code}{vals};
 600         $node->{test}= $test;
 601         $node->{yes}= $dmp_res{$res_code}{optree};
 602         $node->{depth}= $depth;
 603         if ($res_code_idx < $#res_order) {
 604             $node= $node->{no}= {};
 605         } else {
 606             $node->{no}= $else;
 607         }
 608     }
 609
 610     # return the optree.
 611     return \%root;
 612 }
 613
 614 # my $optree= optree(%opts);
 615 #
 616 # Convert a trie to an optree, wrapper for _optree
 617
 618 sub optree {
 619     my $self= shift;
 620     my %opt= @_;
 621     my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
 622     $opt{ret_type} ||= 'len';
 623     my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
 624     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
 625 }
 626
 627 # my $optree= generic_optree(%opts);
 628 #
 629 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 630 # sets of strings, including a branch for handling the string type check.
 631 #
 632
 633 sub generic_optree {
 634     my $self= shift;
 635     my %opt= @_;
 636
 637     $opt{ret_type} ||= 'len';
 638     my $test_type= 'depth';
 639     my $else= $opt{else} || 0;
 640
 641     my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
 642     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
 643
 644     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
 645       for $latin1, $utf8;
 646
 647     if ( $utf8 ) {
 648         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 649     } elsif ( $latin1 ) {
 650         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 651     }
 652     if ($opt{type} eq 'generic') {
 653         my $low= $self->make_trie( 'low', $opt{max_depth} );
 654         if ( $low ) {
 655             $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
 656         }
 657     }
 658
 659     return $else;
 660 }
 661
 662 # length_optree()
 663 #
 664 # create a string length guarded optree.
 665 #
 666
 667 sub length_optree {
 668     my $self= shift;
 669     my %opt= @_;
 670     my $type= $opt{type};
 671
 672     die "Can't do a length_optree on type 'cp', makes no sense."
 673       if $type =~ /^cp/;
 674
 675     my $else= ( $opt{else} ||= 0 );
 676
 677     return $else if $self->{count} == 0;
 678
 679     my $method = $type =~ /generic/ ? 'generic_optree' : 'optree';
 680     if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) {
 681
 682         # Here is non-generic output (meaning that we are only generating one
 683         # type), and all things that match have the same number ('size') of
 684         # bytes.  The length guard is simply that we have that number of
 685         # bytes.
 686         my @size = keys %{$self->{size}{$type}};
 687         my $cond= "((e) - (s)) >= $size[0]";
 688         my $optree = $self->$method(%opt);
 689         $else= __cond_join( $cond, $optree, $else );
 690     }
 691     elsif ($self->{has_multi}) {
 692         my @size;
 693
 694         # Here, there can be a match of a multiple character string.  We use
 695         # the traditional method which is to have a branch for each possible
 696         # size (longest first) and test for the legal values for that size.
 697         my %sizes= (
 698             %{ $self->{size}{low}    || {} },
 699             %{ $self->{size}{latin1} || {} },
 700             %{ $self->{size}{utf8}   || {} }
 701         );
 702         if ($method eq 'generic_optree') {
 703             @size= sort { $a <=> $b } keys %sizes;
 704         } else {
 705             @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 706         }
 707         for my $size ( @size ) {
 708             my $optree= $self->$method(%opt, type => $type, max_depth => $size);
 709             my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 710             $else= __cond_join( $cond, $optree, $else );
 711         }
 712     }
 713     else {
 714         my $utf8;
 715
 716         # Here, has more than one possible size, and only matches a single
 717         # character.  For non-utf8, the needed length is 1; for utf8, it is
 718         # found by array lookup 'UTF8SKIP'.
 719
 720         # If want just the code points above 255, set up to look for those;
 721         # otherwise assume will be looking for all non-UTF-8-invariant code
 722         # poiints.
 723         my $trie_type = ($type eq 'high') ? 'high' : 'utf8';
 724
 725         # If we do want more than the 0-255 range, find those, and if they
 726         # exist...
 727         if (   $opt{type} !~ /latin1/i
 728             && ($utf8 = $self->make_trie($trie_type, 0)))
 729         {
 730
 731             # ... get them into an optree, and set them up as the 'else' clause
 732             $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0 );
 733
 734             # We could make this
 735             #   UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))";
 736             # to avoid doing the UTF8SKIP and subsequent branches for invariants
 737             # that don't match.  But the current macros that get generated
 738             # have only a few things that can match past this, so I (khw)
 739             # don't think it is worth it.  (Even better would be to use
 740             # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it
 741             # if it saves a bunch.  We assume that input text likely to be
 742             # well-formed .
 743             my $cond = "LIKELY(((e) - (s)) >= UTF8SKIP(s))";
 744             $else = __cond_join($cond, $utf8, $else);
 745
 746             # For 'generic', we also will want the latin1 UTF-8 variants for
 747             # the case where the input isn't UTF-8.
 748             my $latin1;
 749             if ($method eq 'generic_optree') {
 750                 $latin1 = $self->make_trie( 'latin1', 1);
 751                 $latin1= $self->_optree($latin1, 'depth', $opt{ret_type}, 0, 0);
 752             }
 753
 754             # If we want the UTF-8 invariants, get those.
 755             my $low;
 756             if ($opt{type} !~ /non_low|high/
 757                 && ($low= $self->make_trie( 'low', 1)))
 758             {
 759                 $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0 );
 760
 761                 # Expand out the UTF-8 invariants as a string so that we
 762                 # can use them as the conditional
 763                 $low = $self->_cond_as_str( $low, 0, \%opt);
 764
 765                 # If there are Latin1 variants, add a test for them.
 766                 if ($latin1) {
 767                     $else = __cond_join("(! is_utf8 )", $latin1, $else);
 768                 }
 769                 elsif ($method eq 'generic_optree') {
 770
 771                     # Otherwise for 'generic' only we know that what
 772                     # follows must be valid for just UTF-8 strings,
 773                     $else->{test} = "( is_utf8 && $else->{test} )";
 774                 }
 775
 776                 # If the invariants match, we are done; otherwise we have
 777                 # to go to the 'else' clause.
 778                 $else = __cond_join($low, 1, $else);
 779             }
 780             elsif ($latin1) {   # Here, didn't want or didn't have invariants,
 781                                 # but we do have latin variants
 782                 $else = __cond_join("(! is_utf8)", $latin1, $else);
 783             }
 784
 785             # We need at least one byte available to start off the tests
 786             $else = __cond_join("LIKELY((e) > (s))", $else, 0);
 787         }
 788         else {  # Here, we don't want or there aren't any variants.  A single
 789                 # byte available is enough.
 790             my $cond= "((e) > (s))";
 791             my $optree = $self->$method(%opt);
 792             $else= __cond_join( $cond, $optree, $else );
 793         }
 794     }
 795
 796     return $else;
 797 }
 798
 799 sub calculate_mask(@) {
 800     # Look at the input list of byte values.  This routine returns an array of
 801     # mask/base pairs to generate that list.
 802
 803     my @list = @_;
 804     my $list_count = @list;
 805
 806     # Consider a set of byte values, A, B, C ....  If we want to determine if
 807     # <c> is one of them, we can write c==A || c==B || c==C ....  If the
 808     # values are consecutive, we can shorten that to inRANGE(c, 'A', 'Z'),
 809     # which uses far fewer branches.  If only some of them are consecutive we
 810     # can still save some branches by creating range tests for just those that
 811     # are consecutive. _cond_as_str() does this work for looking for ranges.
 812     #
 813     # Another approach is to look at the bit patterns for A, B, C .... and see
 814     # if they have some commonalities.  That's what this function does.  For
 815     # example, consider a set consisting of the bytes
 816     # 0x42, 0x43, 0x62, and 0x63.  We could write:
 817     #   inRANGE(c, 0x42, 0x43) || inRANGE(c, 0x62, 0x63)
 818     # which through the magic of casting has not 4, but 2 tests.  But the
 819     # following mask/compare also works, and has just one test:
 820     #   (c & 0xDE) == 0x42
 821     # The reason it works is that the set consists of exactly the 4 bit
 822     # patterns which have either 0 or 1 in the two bit positions that are 0 in
 823     # the mask.  They have the same value in each bit position where the mask
 824     # is 1.  The comparison makes sure that the result matches all bytes which
 825     # match those six 1 bits exactly.  This can be applied to bytes that
 826     # differ in 1 through all 8 bit positions.  In order to be a candidate for
 827     # this optimization, the number of bytes in the set must be a power of 2.
 828     #
 829     # It may be that the bytes needing to be matched can't be done with a
 830     # single mask.  But it may be possible to have two (or more) sets, each
 831     # with a separate mask.  This function attempts to find some way to save
 832     # some branches using the mask technique.  If not, it returns an empty
 833     # list; if so, it returns a list consisting of
 834     #   [ [compare1, mask1], [compare2, mask2], ...
 835     #     [compare_n, undef], [compare_m, undef], ...
 836     #   ]
 837     # The <mask> is undef in the above for those bytes that must be tested
 838     # for individually.
 839     #
 840     # This function does not attempt to find the optimal set.  To do so would
 841     # probably require testing all possible combinations, and keeping track of
 842     # the current best one.
 843     #
 844     # There are probably much better algorithms, but this is the one I (khw)
 845     # came up with.  We start with doing a bit-wise compare of every byte in
 846     # the set with every other byte.  The results are sorted into arrays of
 847     # all those that differ by the same bit positions.  These are stored in a
 848     # hash with the each key being the bits they differ in.  Here is the hash
 849     # for the 0x53, 0x54, 0x73, 0x74 set:
 850     # {
 851     #    4 => {
 852     #            "0,1,2,5" => [
 853     #                            83,
 854     #                            116,
 855     #                            84,
 856     #                            115
 857     #                        ]
 858     #        },
 859     #    3 => {
 860     #            "0,1,2" => [
 861     #                        83,
 862     #                        84,
 863     #                        115,
 864     #                        116
 865     #                        ]
 866     #        }
 867     #    1 => {
 868     #            5 => [
 869     #                    83,
 870     #                    115,
 871     #                    84,
 872     #                    116
 873     #                ]
 874     #        },
 875     # }
 876     #
 877     # The set consisting of values which differ in the 4 bit positions 0, 1,
 878     # 2, and 5 from some other value in the set consists of all 4 values.
 879     # Likewise all 4 values differ from some other value in the 3 bit
 880     # positions 0, 1, and 2; and all 4 values differ from some other value in
 881     # the single bit position 5.  The keys at the uppermost level in the above
 882     # hash, 1, 3, and 4, give the number of bit positions that each sub-key
 883     # below it has.  For example, the 4 key could have as its value an array
 884     # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were
 885     # such.  The best optimization will group the most values into a single
 886     # mask.  The most values will be the ones that differ in the most
 887     # positions, the ones with the largest value for the topmost key.  These
 888     # keys, are thus just for convenience of sorting by that number, and do
 889     # not have any bearing on the core of the algorithm.
 890     #
 891     # We start with an element from largest number of differing bits.  The
 892     # largest in this case is 4 bits, and there is only one situation in this
 893     # set which has 4 differing bits, "0,1,2,5".  We look for any subset of
 894     # this set which has 16 values that differ in these 4 bits.  There aren't
 895     # any, because there are only 4 values in the entire set.  We then look at
 896     # the next possible thing, which is 3 bits differing in positions "0,1,2".
 897     # We look for a subset that has 8 values that differ in these 3 bits.
 898     # Again there are none.  So we go to look for the next possible thing,
 899     # which is a subset of 2**1 values that differ only in bit position 5.  83
 900     # and 115 do, so we calculate a mask and base for those and remove them
 901     # from every set.  Since there is only the one set remaining, we remove
 902     # them from just this one.  We then look to see if there is another set of
 903     # 2 values that differ in bit position 5.  84 and 116 do, so we calculate
 904     # a mask and base for those and remove them from every set (again only
 905     # this set remains in this example).  The set is now empty, and there are
 906     # no more sets to look at, so we are done.
 907
 908     if ($list_count == 256) {   # All 256 is trivially masked
 909         return (0, 0);
 910     }
 911
 912     my %hash;
 913
 914     # Generate bits-differing lists for each element compared against each
 915     # other element
 916     for my $i (0 .. $list_count - 2) {
 917         for my $j ($i + 1 .. $list_count - 1) {
 918             my @bits_that_differ = pop_count($list[$i] ^ $list[$j]);
 919             my $differ_count = @bits_that_differ;
 920             my $key = join ",", @bits_that_differ;
 921             push @{$hash{$differ_count}{$key}}, $list[$i]
 922                 unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}};
 923             push @{$hash{$differ_count}{$key}}, $list[$j];
 924         }
 925     }
 926
 927     print STDERR __LINE__, ": calculate_mask() called:  List of values grouped",
 928                                 " by differing bits: ", Dumper \%hash if DEBUG;
 929
 930     my @final_results;
 931     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 932         my $need = 2 ** $count;     # Need 8 values for 3 differing bits, etc
 933         foreach my $bits (sort keys $hash{$count}->%*) {
 934
 935             print STDERR __LINE__, ": For $count bit(s) difference ($bits),",
 936             " need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
 937
 938             # Look only as long as there are at least as many elements in the
 939             # subset as are needed
 940             while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) {
 941
 942                 print STDERR __LINE__, ": Looking at bit positions ($bits): ",
 943                                           Dumper $hash{$count}{$bits} if DEBUG;
 944
 945                 # Start with the first element in it
 946                 my $try_base = $hash{$count}{$bits}[0];
 947                 my @subset = $try_base;
 948
 949                 # If it succeeds, we return a mask and a base to compare
 950                 # against the masked value.  That base will be the AND of
 951                 # every element in the subset.  Initialize to the one element
 952                 # we have so far.
 953                 my $compare = $try_base;
 954
 955                 # We are trying to find a subset of this that has <need>
 956                 # elements that differ in the bit positions given by the
 957                 # string $bits, which is comma separated.
 958                 my @bits = split ",", $bits;
 959
 960                 TRY: # Look through the remainder of the list for other
 961                      # elements that differ only by these bit positions.
 962
 963                 for (my $i = 1; $i < $cur_count; $i++) {
 964                     my $try_this = $hash{$count}{$bits}[$i];
 965                     my @positions = pop_count($try_base ^ $try_this);
 966
 967                     print STDERR __LINE__, ": $try_base vs $try_this: is (",
 968                       join(',', @positions), ") a subset of ($bits)?" if DEBUG;
 969
 970                     foreach my $pos (@positions) {
 971                         unless (grep { $pos == $_ } @bits) {
 972                             print STDERR "  No\n" if DEBUG;
 973                             my $remaining = $cur_count - $i - 1;
 974                             if ($remaining && @subset + $remaining < $need) {
 975                                 print STDERR __LINE__, ": Can stop trying",
 976                                     " $try_base, because even if all the",
 977                                     " remaining $remaining values work, they",
 978                                     " wouldn't add up to the needed $need when",
 979                                     " combined with the existing ",
 980                                             scalar @subset, " ones\n" if DEBUG;
 981                                 last TRY;
 982                             }
 983                             next TRY;
 984                         }
 985                     }
 986
 987                     print STDERR "  Yes\n" if DEBUG;
 988                     push @subset, $try_this;
 989
 990                     # Add this to the mask base, in case it ultimately
 991                     # succeeds,
 992                     $compare &= $try_this;
 993                 }
 994
 995                 print STDERR __LINE__, ": subset (", join(", ", @subset),
 996                  ") has ", scalar @subset, " elements; needs $need\n" if DEBUG;
 997
 998                 if (@subset < $need) {
 999                     shift @{$hash{$count}{$bits}};
1000                     next;   # Try with next value
1001                 }
1002
1003                 # Create the mask
1004                 my $mask = 0;
1005                 foreach my $position (@bits) {
1006                     $mask |= 1 << $position;
1007                 }
1008                 $mask = ~$mask & 0xFF;
1009                 push @final_results, [$compare, $mask];
1010
1011                 printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n",
1012                                 __LINE__, $compare, $compare, $mask if DEBUG;
1013
1014                 # These values are now spoken for.  Remove them from future
1015                 # consideration
1016                 foreach my $remove_count (sort keys %hash) {
1017                     foreach my $bits (sort keys %{$hash{$remove_count}}) {
1018                         foreach my $to_remove (@subset) {
1019                             @{$hash{$remove_count}{$bits}}
1020                                     = grep { $_ != $to_remove }
1021                                                 @{$hash{$remove_count}{$bits}};
1022                         }
1023                     }
1024                 }
1025             }
1026         }
1027     }
1028
1029     # Any values that remain in the list are ones that have to be tested for
1030     # individually.
1031     my @individuals;
1032     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
1033         foreach my $bits (sort keys $hash{$count}->%*) {
1034             foreach my $remaining (@{$hash{$count}{$bits}}) {
1035
1036                 # If we already know about this value, just ignore it.
1037                 next if grep { $remaining == $_ } @individuals;
1038
1039                 # Otherwise it needs to be returned as something to match
1040                 # individually
1041                 push @final_results, [$remaining, undef];
1042                 push @individuals, $remaining;
1043             }
1044         }
1045     }
1046
1047     # Sort by increasing numeric value
1048     @final_results = sort { $a->[0] <=> $b->[0] } @final_results;
1049
1050     print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG;
1051
1052     return @final_results;
1053 }
1054
1055 # _cond_as_str
1056 # turn a list of conditions into a text expression
1057 # - merges ranges of conditions, and joins the result with ||
1058 sub _cond_as_str {
1059     my ( $self, $op, $combine, $opts_ref )= @_;
1060     my $cond= $op->{vals};
1061     my $test= $op->{test};
1062     my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
1063     return "( $test )" if !defined $cond;
1064
1065     # rangify the list.  As we encounter a new value, it is placed in a new
1066     # subarray by itself.  If the next value is adjacent to it, the end point
1067     # of the subarray is merely incremented; and so on.  When the next value
1068     # that isn't adjacent to the previous one is encountered, Update() is
1069     # called to hoist any single-element subarray to be a scalar.
1070     my @ranges;
1071     my $Update= sub {
1072         # We skip this if there are optimizations that
1073         # we can apply (below) to the individual ranges
1074         if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
1075             $ranges[-1] = $ranges[-1][0] if $ranges[-1][0] == $ranges[-1][1];
1076         }
1077     };
1078     for my $condition ( @$cond ) {
1079         if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
1080             # Not adjacent to the existing range.  Remove that from being a
1081             # range if only a single value;
1082             $Update->();
1083             push @ranges, [ $condition, $condition ];
1084         } else {    # Adjacent to the existing range; add to the range
1085             $ranges[-1][1]++;
1086         }
1087     }
1088     $Update->();
1089
1090     return $self->_combine( $test, @ranges )
1091       if $combine;
1092
1093     # If the input set has certain characteristics, we can optimize tests
1094     # for it.
1095
1096     return 1 if @$cond == 256;  # If all bytes match, is trivially true
1097
1098     my @masks;
1099     if (@ranges > 1) {
1100
1101         # See if the entire set shares optimizable characteristics, and if so,
1102         # return the optimization.  There is no need to do this on sets with
1103         # just a single range, as that can be expressed with a single
1104         # conditional.
1105         @masks = calculate_mask(@$cond);
1106
1107         # Stringify the output of calculate_mask()
1108         if (@masks) {
1109             my @return;
1110             foreach my $mask_ref (@masks) {
1111                 if (defined $mask_ref->[1]) {
1112                     push @return, "( ( $test & "
1113                                 . $self->val_fmt($mask_ref->[1]) . " ) == "
1114                                 . $self->val_fmt($mask_ref->[0]) . " )";
1115                 }
1116                 else {  # An undefined mask means to use the value as-is
1117                     push @return, "$test == " . $self->val_fmt($mask_ref->[0]);
1118                 }
1119             }
1120
1121             # The best possible case below for specifying this set of values via
1122             # ranges is 1 branch per range.  If our mask method yielded better
1123             # results, there is no sense trying something that is bound to be
1124             # worse.
1125             if (@return < @ranges) {
1126                 return "( " . join( " || ", @return ) . " )";
1127             }
1128
1129             @masks = @return;
1130         }
1131     }
1132
1133     # Here, there was no entire-class optimization that was clearly better
1134     # than doing things by ranges.  Look at each range.
1135     my $range_count_extra = 0;
1136     for (my $i = 0; $i < @ranges; $i++) {
1137         if (! ref $ranges[$i]) {    # Trivial case: no range
1138             $ranges[$i] = $self->val_fmt($ranges[$i]) . " == $test";
1139         }
1140         elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
1141             $ranges[$i] =           # Trivial case: single element range
1142                     $self->val_fmt($ranges[$i]->[0]) . " == $test";
1143         }
1144         elsif ($ranges[$i]->[0] == 0) {
1145             # If the range matches all 256 possible bytes, it is trivially
1146             # true.
1147             return 1 if $ranges[0]->[1] == 0xFF;    # @ranges must be 1 in
1148                                                     # this case
1149             $ranges[$i] = "( $test <= "
1150                         . $self->val_fmt($ranges[$i]->[1]) . " )";
1151         }
1152         elsif ($ranges[$i]->[1] == 255) {
1153
1154             # Similarly the max possible is 255, so can omit an upper bound
1155             # test if the calculated max is the max possible one.
1156             $ranges[$i] = "( $test >= " . $self->val_fmt($ranges[0]->[0]) . " )";
1157         }
1158         else {
1159             my $output = "";
1160
1161             # Well-formed UTF-8 continuation bytes on ascii platforms must be
1162             # in the range 0x80 .. 0xBF.  If we know that the input is
1163             # well-formed (indicated by not trying to be 'safe'), we can omit
1164             # tests that verify that the input is within either of these
1165             # bounds.  (No legal UTF-8 character can begin with anything in
1166             # this range, so we don't have to worry about this being a
1167             # continuation byte or not.)
1168             if ($opts_ref->{charset} =~ /ascii/i
1169                 && (! $opts_ref->{safe} && ! $opts_ref->{no_length_checks})
1170                 && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
1171             {
1172                 # If the range is the entire legal range, it matches any legal
1173                 # byte, so we can omit both tests.  (This should happen only
1174                 # if the number of ranges is 1.)
1175                 if ($ranges[$i]->[0] == 0x80 && $ranges[$i]->[1] == 0xBF) {
1176                     return 1;
1177                 }
1178             }
1179
1180             # Here, it isn't the full range of legal continuation bytes.  We
1181             # could just assume that there's nothing outside of the legal
1182             # bounds.  But inRANGE() allows us to have a single conditional,
1183             # so the only cost of making sure it's a legal UTF-8 continuation
1184             # byte is an extra subtraction instruction, a trivial expense.
1185             $ranges[$i] = "inRANGE_helper_(U8, $test, "
1186                         . $self->val_fmt($ranges[$i]->[0]) .", "
1187                         . $self->val_fmt($ranges[$i]->[1]) . ")";
1188         }
1189     }
1190
1191     # We have generated the list of bytes in two ways; one trying to use masks
1192     # to cut the number of branches down, and the other to look at individual
1193     # ranges (some of which could be cut down by using a mask for just it).
1194     # We return whichever method uses the fewest branches.
1195     return "( "
1196            . join( " || ", (@masks && @masks < @ranges + $range_count_extra)
1197                             ? @masks
1198                             : @ranges)
1199            . " )";
1200 }
1201
1202 # _combine
1203 # recursively turn a list of conditions into a fast break-out condition
1204 # used by _cond_as_str() for 'cp' type macros.
1205 sub _combine {
1206     my ( $self, $test, @cond )= @_;
1207     return if !@cond;
1208     my $item= shift @cond;
1209     my ( $cstr, $gtv );
1210     if ( ref $item ) {  # @item should be a 2-element array giving range start
1211                         # and end
1212         if ($item->[0] == 0) {  # UV's are never negative, so skip "0 <= "
1213                                 # test which could generate a compiler warning
1214                                 # that test is always true
1215             $cstr= "$test <= " . $self->val_fmt($item->[1]);
1216         }
1217         else {
1218             $cstr = "inRANGE_helper_(UV, $test, "
1219                   . $self->val_fmt($item->[0]) . ", "
1220                   . $self->val_fmt($item->[1]) . ")";
1221         }
1222         $gtv= $self->val_fmt($item->[1]);
1223     } else {
1224         $cstr= $self->val_fmt($item) . " == $test";
1225         $gtv= $self->val_fmt($item)
1226     }
1227     if ( @cond ) {
1228         my $combine= $self->_combine( $test, @cond );
1229         if (@cond >1) {
1230             return "( $cstr || ( $gtv < $test &&\n"
1231                    . $combine . " ) )";
1232         } else {
1233             return "( $cstr || $combine )";
1234         }
1235     } else {
1236         return $cstr;
1237     }
1238 }
1239
1240 # _render()
1241 # recursively convert an optree to text with reasonably neat formatting
1242 sub _render {
1243     my ( $self, $op, $combine, $brace, $opts_ref, $def, $submacros )= @_;
1244     return 0 if ! defined $op;  # The set is empty
1245     if ( !ref $op ) {
1246         return $op;
1247     }
1248     my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
1249     #no warnings 'recursion';   # This would allow really really inefficient
1250                                 # code to be generated.  See pod
1251     my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref, $def,
1252                                                                     $submacros);
1253     return $yes if $cond eq '1';
1254
1255     my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref, $def,
1256                                                                     $submacros);
1257     return "( $cond )" if $yes eq '1' and $no eq '0';
1258     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
1259     return "$lb$cond ? $yes : $no$rb"
1260       if !ref( $op->{yes} ) && !ref( $op->{no} );
1261     my $ind1= " " x 4;
1262     my $ind= "\n" . ( $ind1 x $op->{depth} );
1263
1264     if ( ref $op->{yes} ) {
1265         $yes= $ind . $ind1 . $yes;
1266     } else {
1267         $yes= " " . $yes;
1268     }
1269
1270     my $str= "$lb$cond ?$yes$ind: $no$rb";
1271     if (length $str > 6000) {
1272         push @$submacros, sprintf "#define $def\n( %s )", "_part"
1273                                   . (my $yes_idx= 0+@$submacros) . "_", $yes;
1274         push @$submacros, sprintf "#define $def\n( %s )", "_part"
1275                                   . (my $no_idx= 0+@$submacros) . "_", $no;
1276         return sprintf "%s%s ? $def : $def%s", $lb, $cond,
1277                                     "_part${yes_idx}_", "_part${no_idx}_", $rb;
1278     }
1279     return $str;
1280 }
1281
1282 # $expr=render($op,$combine)
1283 #
1284 # convert an optree to text with reasonably neat formatting. If $combine
1285 # is true then the condition is created using "fast breakouts" which
1286 # produce uglier expressions that are more efficient for common case,
1287 # longer lists such as that resulting from type 'cp' output.
1288 # Currently only used for type 'cp' macros.
1289 sub render {
1290     my ( $self, $op, $combine, $opts_ref, $def_fmt )= @_;
1291
1292     my @submacros;
1293     my $macro= sprintf "#define $def_fmt\n( %s )", "",
1294                        $self->_render( $op, $combine, 0, $opts_ref, $def_fmt,
1295                                                                  \@submacros);
1296
1297     return join "\n\n",
1298             map { "/*** GENERATED CODE ***/\n" . __macro( __clean( $_ ) ) }
1299                                                             @submacros, $macro;
1300 }
1301
1302 # make_macro
1303 # make a macro of a given type.
1304 # calls into make_trie and (generic_|length_)optree as needed
1305 # Opts are:
1306 # type             : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
1307 # ret_type         : 'cp' or 'len'
1308 # safe             : don't assume is well-formed UTF-8, so don't skip any range
1309 #                    checks, and add length guards to macro
1310 # no_length_checks : like safe, but don't add length guards.
1311 #
1312 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
1313 # in which case it defaults to 'cp' as well.
1314 #
1315 # It is illegal to do a type 'cp' macro on a pattern with multi-codepoint
1316 # sequences in it, as the generated macro will accept only a single codepoint
1317 # as an argument.
1318 #
1319 # It is also illegal to do a non-safe macro on a pattern with multi-codepoint
1320 # sequences in it, as even if it is known to be well-formed, we need to not
1321 # run off the end of the buffer when, say, the buffer ends with the first two
1322 # characters, but three are looked at by the macro.
1323 #
1324 # returns the macro.
1325
1326
1327 sub make_macro {
1328     my $self= shift;
1329     my %opts= @_;
1330     my $type= $opts{type} || 'generic';
1331     if ($self->{has_multi}) {
1332         if ($type =~ /^cp/) {
1333             die "Can't do a 'cp' on multi-codepoint character class"
1334               . " '$self->{op}'"
1335         }
1336         elsif (! $opts{safe}) {
1337             die "'safe' is required on multi-codepoint character class"
1338                ." '$self->{op}'"
1339         }
1340     }
1341     my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' );
1342     my $method;
1343     if ( $opts{safe} ) {
1344         $method= 'length_optree';
1345     } elsif ( $type =~ /generic/ ) {
1346         $method= 'generic_optree';
1347     } else {
1348         $method= 'optree';
1349     }
1350     my @args= $type =~ /^cp/ ? 'cp' : 's';
1351     push @args, "e" if $opts{safe};
1352     push @args, "is_utf8" if $type =~ /generic/;
1353     push @args, "len" if $ret_type eq 'both';
1354     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
1355              $ret_type eq 'cp'      ? 'what_'     : 'is_';
1356     my $ext= $type     =~ /generic/ ? ''          : '_' . lc( $type );
1357     $ext .= '_non_low' if $type eq 'generic_non_low';
1358     $ext .= "_safe" if $opts{safe};
1359     $ext .= "_no_length_checks" if $opts{no_length_checks};
1360     my $argstr= join ",", @args;
1361     my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
1362     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
1363     return $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts, $def_fmt );
1364 }
1365
1366 # if we aren't being used as a module (highly likely) then process
1367 # the __DATA__ below and produce macros in regcharclass.h
1368 # if an argument is provided to the script then it is assumed to
1369 # be the path of the file to output to, if the arg is '-' outputs
1370 # to STDOUT.
1371 if ( !caller ) {
1372     $|++;
1373     my $path= shift @ARGV || "regcharclass.h";
1374     my $out_fh;
1375     if ( $path eq '-' ) {
1376         $out_fh= \*STDOUT;
1377     } else {
1378         $out_fh = open_new( $path );
1379     }
1380     print $out_fh read_only_top( lang => 'C', by => $0,
1381                                  file => 'regcharclass.h', style => '*',
1382                                  copyright => [2007, 2011],
1383                                  final => <<EOF,
1384 WARNING: These macros are for internal Perl core use only, and may be
1385 changed or removed without notice.
1386 EOF
1387     );
1388     print $out_fh "\n#ifndef PERL_REGCHARCLASS_H_ /* Guard against nested",
1389                   " #includes */\n#define PERL_REGCHARCLASS_H_\n";
1390
1391     my ( $op, $title, @txt, @types, %mods );
1392     my $doit= sub ($) {
1393         return unless $op;
1394
1395         my $charset = shift;
1396
1397         # Skip if to compile on a different platform.
1398         return if delete $mods{only_ascii_platform} && $charset !~ /ascii/i;
1399         return if delete $mods{only_ebcdic_platform} && $charset !~ /ebcdic/i;
1400
1401         print $out_fh "/*\n\t$op: $title\n\n";
1402         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1403         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt,
1404                                                         charset => $charset);
1405
1406         #die Dumper(\@types,\%mods);
1407
1408         my @mods;
1409         push @mods, 'safe' if delete $mods{safe};
1410         push @mods, 'no_length_checks' if delete $mods{no_length_checks};
1411
1412         # Default to 'fast' do this one first, as traditional
1413         unshift @mods, 'fast' if delete $mods{fast} || ! @mods;
1414         if (%mods) {
1415             die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods;
1416         }
1417
1418         foreach my $type_spec ( @types ) {
1419             my ( $type, $ret )= split /-/, $type_spec;
1420             $ret ||= 'len';
1421             foreach my $mod ( @mods ) {
1422
1423                 # 'safe' is irrelevant with code point macros, so skip if
1424                 # there is also a 'fast', but don't skip if this is the only
1425                 # way a cp macro will get generated.  Below we convert 'safe'
1426                 # to 'fast' in this instance
1427                 next if $type =~ /^cp/
1428                         && ($mod eq 'safe' || $mod eq 'no_length_checks')
1429                         && grep { 'fast' =~ $_ } @mods;
1430                 delete $mods{$mod};
1431                 my $macro= $obj->make_macro(
1432                     type     => $type,
1433                     ret_type => $ret,
1434                     safe     => $mod eq 'safe' && $type !~ /^cp/,
1435                     charset  => $charset,
1436                     no_length_checks => $mod eq 'no_length_checks'
1437                                      && $type !~ /^cp/,
1438                 );
1439                 print $out_fh $macro, "\n";
1440             }
1441         }
1442     };
1443
1444     my @data = <DATA>;
1445     foreach my $charset (get_supported_code_pages()) {
1446         my $first_time = 1;
1447         undef $op;
1448         undef $title;
1449         undef @txt;
1450         undef @types;
1451         undef %mods;
1452         print $out_fh "\n", get_conditional_compile_line_start($charset);
1453         my @data_copy = @data;
1454         for (@data_copy) {
1455             s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
1456             next unless /\S/;
1457             chomp;
1458             if ( /^[A-Z]/ ) {
1459                 $doit->($charset) unless $first_time;  # This starts a new
1460                                                        # definition; do the
1461                                                        # previous one
1462                 $first_time = 0;
1463                 ( $op, $title )= split /\s*:\s*/, $_, 2;
1464                 @txt= ();
1465             } elsif ( s/^=>// ) {
1466                 my ( $type, $modifier )= split /:/, $_;
1467                 @types= split ' ', $type;
1468                 undef %mods;
1469                 map { $mods{$_} = 1 } split ' ',  $modifier;
1470             } else {
1471                 push @txt, "$_";
1472             }
1473         }
1474         $doit->($charset);
1475         print $out_fh get_conditional_compile_line_end();
1476     }
1477
1478     print $out_fh "\n#endif /* PERL_REGCHARCLASS_H_ */\n";
1479
1480     if($path eq '-') {
1481         print $out_fh "/* ex: set ro: */\n";
1482     } else {
1483         # Some of the sources for these macros come from Unicode tables
1484         my $sources_list = "lib/unicore/mktables.lst";
1485         my @sources = ($0, qw(lib/unicore/mktables
1486                               lib/Unicode/UCD.pm
1487                               regen/regcharclass_multi_char_folds.pl
1488                               regen/charset_translations.pl
1489                              ));
1490         {
1491             # Depend on mktables’ own sources.  It’s a shorter list of files than
1492             # those that Unicode::UCD uses.
1493             if (! open my $mktables_list, '<', $sources_list) {
1494
1495                 # This should force a rebuild once $sources_list exists
1496                 push @sources, $sources_list;
1497             }
1498             else {
1499                 while(<$mktables_list>) {
1500                     last if /===/;
1501                     chomp;
1502                     push @sources, "lib/unicore/$_" if /^[^#]/;
1503                 }
1504             }
1505         }
1506         read_only_bottom_close_and_rename($out_fh, \@sources)
1507     }
1508 }
1509
1510 # The form of the input is a series of definitions to make macros for.
1511 # The first line gives the base name of the macro, followed by a colon, and
1512 # then text to be used in comments associated with the macro that are its
1513 # title or description.  In all cases the first (perhaps only) parameter to
1514 # the macro is a pointer to the first byte of the code point it is to test to
1515 # see if it is in the class determined by the macro.  In the case of non-UTF8,
1516 # the code point consists only of a single byte.
1517 #
1518 # The second line must begin with a '=>' and be followed by the types of
1519 # macro(s) to be generated; these are specified below.  A colon follows the
1520 # types, followed by the modifiers, also specified below.  At least one
1521 # modifier is required.
1522 #
1523 # The subsequent lines give what code points go into the class defined by the
1524 # macro.  Multiple characters may be specified via a string like "\x0D\x0A",
1525 # enclosed in quotes.  Otherwise the lines consist of one of:
1526 #   1)  a single Unicode code point, prefaced by 0x
1527 #   2)  a single range of Unicode code points separated by a minus (and
1528 #       optional space)
1529 #   3)  a single Unicode property specified in the standard Perl form
1530 #       "\p{...}"
1531 #   4)  a line like 'do path'.  This will do a 'do' on the file given by
1532 #       'path'.  It is assumed that this does nothing but load subroutines
1533 #       (See item 5 below).  The reason 'require path' is not used instead is
1534 #       because 'do' doesn't assume that path is in @INC.
1535 #   5)  a subroutine call
1536 #           &pkg::foo(arg1, ...)
1537 #       where pkg::foo was loaded by a 'do' line (item 4).  The subroutine
1538 #       returns an array of entries of forms like items 1-3 above.  This
1539 #       allows more complex inputs than achievable from the other input types.
1540 #
1541 # A blank line or one whose first non-blank character is '#' is a comment.
1542 # The definition of the macro is terminated by a line unlike those described.
1543 #
1544 # Valid types:
1545 #   low         generate a macro whose name is 'is_BASE_low' and defines a
1546 #               class that includes only ASCII-range chars.  (BASE is the
1547 #               input macro base name.)
1548 #   latin1      generate a macro whose name is 'is_BASE_latin1' and defines a
1549 #               class that includes only upper-Latin1-range chars.  It is not
1550 #               designed to take a UTF-8 input parameter.
1551 #   high        generate a macro whose name is 'is_BASE_high' and defines a
1552 #               class that includes all relevant code points that are above
1553 #               the Latin1 range.  This is for very specialized uses only.
1554 #               It is designed to take only an input UTF-8 parameter.
1555 #   utf8        generate a macro whose name is 'is_BASE_utf8' and defines a
1556 #               class that includes all relevant characters that aren't ASCII.
1557 #               It is designed to take only an input UTF-8 parameter.
1558 #   LATIN1      generate a macro whose name is 'is_BASE_latin1' and defines a
1559 #               class that includes both ASCII and upper-Latin1-range chars.
1560 #               It is not designed to take a UTF-8 input parameter.
1561 #   UTF8        generate a macro whose name is 'is_BASE_utf8' and defines a
1562 #               class that can include any code point, adding the 'low' ones
1563 #               to what 'utf8' works on.  It is designed to take only an input
1564 #               UTF-8 parameter.
1565 #   generic     generate a macro whose name is 'is_BASE".  It has a 2nd,
1566 #               boolean, parameter which indicates if the first one points to
1567 #               a UTF-8 string or not.  Thus it works in all circumstances.
1568 #   generic_non_low generate a macro whose name is 'is_BASE_non_low".  It has
1569 #               a 2nd, boolean, parameter which indicates if the first one
1570 #               points to a UTF-8 string or not.  It excludes any ASCII-range
1571 #               matches, but otherwise it works in all circumstances.
1572 #   cp          generate a macro whose name is 'is_BASE_cp' and defines a
1573 #               class that returns true if the UV parameter is a member of the
1574 #               class; false if not.
1575 #   cp_high     like cp, but it is assumed that it is known that the UV
1576 #               parameter is above Latin1.  The name of the generated macro is
1577 #               'is_BASE_cp_high'.  This is different from high-cp, derived
1578 #               below.
1579 # A macro of the given type is generated for each type listed in the input.
1580 # The default return value is the number of octets read to generate the match.
1581 # Append "-cp" to the type to have it instead return the matched codepoint.
1582 #               The macro name is changed to 'what_BASE...'.  See pod for
1583 #               caveats
1584 # Appending '-both" instead adds an extra parameter to the end of the argument
1585 #               list, which is a pointer as to where to store the number of
1586 #               bytes matched, while also returning the code point.  The macro
1587 #               name is changed to 'what_len_BASE...'.  See pod for caveats
1588 #
1589 # Valid modifiers:
1590 #   safe        The input string is not necessarily valid UTF-8.  In
1591 #               particular an extra parameter (always the 2nd) to the macro is
1592 #               required, which points to one beyond the end of the string.
1593 #               The macro will make sure not to read off the end of the
1594 #               string.  In the case of non-UTF8, it makes sure that the
1595 #               string has at least one byte in it.  The macro name has
1596 #               '_safe' appended to it.
1597 #   no_length_checks  The input string is not necessarily valid UTF-8, but it
1598 #               is to be assumed that the length has already been checked and
1599 #               found to be valid
1600 #   fast        The input string is valid UTF-8.  No bounds checking is done,
1601 #               and the macro can make assumptions that lead to faster
1602 #               execution.
1603 #   only_ascii_platform   Skip this definition if the character set is for
1604 #               a non-ASCII platform.
1605 #   only_ebcdic_platform  Skip this definition if the character set is for
1606 #               a non-EBCDIC platform.
1607 # No modifier need be specified; fast is assumed for this case.  If both
1608 # 'fast', and 'safe' are specified, two macros will be created for each
1609 # 'type'.
1610 #
1611 # If run on a non-ASCII platform will automatically convert the Unicode input
1612 # to native.  The documentation above is slightly wrong in this case.  'low'
1613 # actually refers to code points whose UTF-8 representation is the same as the
1614 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1615 # code points less than 256.
1616
1617 1; # in the unlikely case we are being used as a module
1618
1619 __DATA__
1620 # This is no longer used, but retained in case it is needed some day.
1621 # TRICKYFOLD: Problematic fold case letters.  When adding to this list, also should add them to regcomp.c and fold_grind.t
1622 # => generic cp generic-cp generic-both :fast safe
1623 # 0x00DF        # LATIN SMALL LETTER SHARP S
1624 # 0x0390        # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1625 # 0x03B0        # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1626 # 0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1627 # 0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1628 # 0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1629
1630 LNBREAK: Line Break: \R
1631 => generic UTF8 LATIN1 : safe
1632 "\x0D\x0A"      # CRLF - Network (Windows) line ending
1633 \p{VertSpace}
1634
1635 HORIZWS: Horizontal Whitespace: \h \H
1636 => high cp_high : fast
1637 \p{HorizSpace}
1638
1639 VERTWS: Vertical Whitespace: \v \V
1640 => high cp_high : fast
1641 \p{VertSpace}
1642
1643 XDIGIT: Hexadecimal digits
1644 => high cp_high : fast
1645 \p{XDigit}
1646
1647 XPERLSPACE: \p{XPerlSpace}
1648 => high cp_high : fast
1649 \p{XPerlSpace}
1650
1651 NONCHAR: Non character code points
1652 => UTF8 :safe
1653 \p{_Perl_Nchar}
1654
1655 SURROGATE: Surrogate code points
1656 => UTF8 :safe
1657 \p{_Perl_Surrogate}
1658
1659 QUOTEMETA: Meta-characters that \Q should quote
1660 => high :fast
1661 \p{_Perl_Quotemeta}
1662
1663 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1664 => UTF8 UTF8-cp :safe
1665 %regcharclass_multi_char_folds::multi_char_folds('u', 'a')
1666
1667 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1668 => LATIN1 LATIN1-cp : safe
1669 %regcharclass_multi_char_folds::multi_char_folds('l', 'a')
1670
1671 THREE_CHAR_FOLD: A three-character multi-char fold
1672 => UTF8 :safe
1673 %regcharclass_multi_char_folds::multi_char_folds('u', '3')
1674
1675 THREE_CHAR_FOLD: A three-character multi-char fold
1676 => LATIN1 :safe
1677 %regcharclass_multi_char_folds::multi_char_folds('l', '3')
1678
1679 THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
1680 => UTF8 :safe
1681 %regcharclass_multi_char_folds::multi_char_folds('u', 'h')
1682
1683 THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
1684 => LATIN1 :safe
1685 %regcharclass_multi_char_folds::multi_char_folds('l', 'h')
1686 #
1687 #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
1688 #=> UTF8 :safe
1689 #%regcharclass_multi_char_folds::multi_char_folds('u', 'fm')
1690 #
1691 #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
1692 #=> LATIN1 :safe
1693 #%regcharclass_multi_char_folds::multi_char_folds('l', 'fm')
1694
1695 FOLDS_TO_MULTI: characters that fold to multi-char strings
1696 => UTF8 :fast
1697 \p{_Perl_Folds_To_Multi_Char}
1698
1699 PROBLEMATIC_LOCALE_FOLD : characters whose fold is problematic under locale
1700 => UTF8 cp :fast
1701 \p{_Perl_Problematic_Locale_Folds}
1702
1703 PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are problematic under locale
1704 => UTF8 cp :fast
1705 \p{_Perl_Problematic_Locale_Foldeds_Start}
1706
1707 PATWS: pattern white space
1708 => generic : safe
1709 \p{_Perl_PatWS}
1710
1711 HANGUL_ED: Hangul syllables whose first character is \xED
1712 => UTF8 :only_ascii_platform safe
1713 0xD000 - 0xD7FF