regen/regcharclass.pl

   1 #!perl
   2 package CharClass::Matcher;
   3 use strict;
   4 use 5.008;
   5 use warnings;
   6 use warnings FATAL => 'all';
   7 use Text::Wrap qw(wrap);
   8 use Data::Dumper;
   9 $Data::Dumper::Useqq= 1;
  10 our $hex_fmt= "0x%02X";
  11
  12 sub DEBUG () { 0 }
  13 $|=1 if DEBUG;
  14
  15 sub ASCII_PLATFORM { (ord('A') == 65) }
  16
  17 require 'regen/regen_lib.pl';
  18
  19 =head1 NAME
  20
  21 CharClass::Matcher -- Generate C macros that match character classes efficiently
  22
  23 =head1 SYNOPSIS
  24
  25     perl Porting/regcharclass.pl
  26
  27 =head1 DESCRIPTION
  28
  29 Dynamically generates macros for detecting special charclasses
  30 in latin-1, utf8, and codepoint forms. Macros can be set to return
  31 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
  32
  33 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
  34 are necessary.
  35
  36 Using WHATEVER as an example the following macros can be produced, depending
  37 on the input parameters (how to get each is described by internal comments at
  38 the C<__DATA__> line):
  39
  40 =over 4
  41
  42 =item C<is_WHATEVER(s,is_utf8)>
  43
  44 =item C<is_WHATEVER_safe(s,e,is_utf8)>
  45
  46 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
  47 comparisons involving octect<128 are done before checking the C<is_utf8>
  48 flag, hopefully saving time.
  49
  50 The version without the C<_safe> suffix should be used only when the input is
  51 known to be well-formed.
  52
  53 =item C<is_WHATEVER_utf8(s)>
  54
  55 =item C<is_WHATEVER_utf8_safe(s,e)>
  56
  57 Do a lookup assuming the string is encoded in (normalized) UTF8.
  58
  59 The version without the C<_safe> suffix should be used only when the input is
  60 known to be well-formed.
  61
  62 =item C<is_WHATEVER_latin1(s)>
  63
  64 =item C<is_WHATEVER_latin1_safe(s,e)>
  65
  66 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  67
  68 The version without the C<_safe> suffix should be used only when it is known
  69 that C<s> contains at least one character.
  70
  71 =item C<is_WHATEVER_cp(cp)>
  72
  73 Check to see if the string matches a given codepoint (hypothetically a
  74 U32). The condition is constructed as as to "break out" as early as
  75 possible if the codepoint is out of range of the condition.
  76
  77 IOW:
  78
  79   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  80
  81 Thus if the character is X+1 only two comparisons will be done. Making
  82 matching lookups slower, but non-matching faster.
  83
  84 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
  85
  86 A variant form of each of the macro types described above can be generated, in
  87 which the code point is returned by the macro, and an extra parameter (in the
  88 final position) is added, which is a pointer for the macro to set the byte
  89 length of the returned code point.
  90
  91 These forms all have a C<what_len> prefix instead of the C<is_>, for example
  92 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
  93 C<what_len_WHATEVER_utf8(s,len)>.
  94
  95 These forms should not be used I<except> on small sets of mostly widely
  96 separated code points; otherwise the code generated is inefficient.  For these
  97 cases, it is best to use the C<is_> forms, and then find the code point with
  98 C<utf8_to_uvchr_buf>().  This program can fail with a "deep recursion"
  99 message on the worst of the inappropriate sets.  Examine the generated macro
 100 to see if it is acceptable.
 101
 102 =item C<what_WHATEVER_FOO(arg1, ...)>
 103
 104 A variant form of each of the C<is_> macro types described above can be generated, in
 105 which the code point and not the length is returned by the macro.  These have
 106 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
 107 not be used where the set contains a NULL, as 0 is returned for two different
 108 cases: a) the set doesn't include the input code point; b) the set does
 109 include it, and it is a NULL.
 110
 111 =back
 112
 113 =head2 CODE FORMAT
 114
 115 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
 116
 117
 118 =head1 AUTHOR
 119
 120 Author: Yves Orton (demerphq) 2007.  Maintained by Perl5 Porters.
 121
 122 =head1 BUGS
 123
 124 No tests directly here (although the regex engine will fail tests
 125 if this code is broken). Insufficient documentation and no Getopts
 126 handler for using the module as a script.
 127
 128 =head1 LICENSE
 129
 130 You may distribute under the terms of either the GNU General Public
 131 License or the Artistic License, as specified in the README file.
 132
 133 =cut
 134
 135 # Sub naming convention:
 136 # __func : private subroutine, can not be called as a method
 137 # _func  : private method, not meant for external use
 138 # func   : public method.
 139
 140 # private subs
 141 #-------------------------------------------------------------------------------
 142 #
 143 # ($cp,$n,$l,$u)=__uni_latin($str);
 144 #
 145 # Return a list of arrays, each of which when interpreted correctly
 146 # represent the string in some given encoding with specific conditions.
 147 #
 148 # $cp - list of codepoints that make up the string.
 149 # $n  - list of octets that make up the string if all codepoints are invariant
 150 #       regardless of if the string is in UTF-8 or not.
 151 # $l  - list of octets that make up the string in latin1 encoding if all
 152 #       codepoints < 256, and at least one codepoint is UTF-8 variant.
 153 # $u  - list of octets that make up the string in utf8 if any codepoint is
 154 #       UTF-8 variant
 155 #
 156 #   High CP | Defined
 157 #-----------+----------
 158 #   0 - 127 : $n            (127/128 are the values for ASCII platforms)
 159 # 128 - 255 : $l, $u
 160 # 256 - ... : $u
 161 #
 162
 163 sub __uni_latin1 {
 164     my $str= shift;
 165     my $max= 0;
 166     my @cp;
 167     my @cp_high;
 168     my $only_has_invariants = 1;
 169     for my $ch ( split //, $str ) {
 170         my $cp= ord $ch;
 171         push @cp, $cp;
 172         push @cp_high, $cp if $cp > 255;
 173         $max= $cp if $max < $cp;
 174         if (! ASCII_PLATFORM && $only_has_invariants) {
 175             if ($cp > 255) {
 176                 $only_has_invariants = 0;
 177             }
 178             else {
 179                 my $temp = chr($cp);
 180                 utf8::upgrade($temp);
 181                 my @utf8 = unpack "U0C*", $temp;
 182                 $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
 183             }
 184         }
 185     }
 186     my ( $n, $l, $u );
 187     $only_has_invariants = $max < 128 if ASCII_PLATFORM;
 188     if ($only_has_invariants) {
 189         $n= [@cp];
 190     } else {
 191         $l= [@cp] if $max && $max < 256;
 192
 193         $u= $str;
 194         utf8::upgrade($u);
 195         $u= [ unpack "U0C*", $u ] if defined $u;
 196     }
 197     return ( \@cp, \@cp_high, $n, $l, $u );
 198 }
 199
 200 #
 201 # $clean= __clean($expr);
 202 #
 203 # Cleanup a ternary expression, removing unnecessary parens and apply some
 204 # simplifications using regexes.
 205 #
 206
 207 sub __clean {
 208     my ( $expr )= @_;
 209
 210     #return $expr;
 211
 212     our $parens;
 213     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 214
 215     ## remove redundant parens
 216     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 217
 218
 219     # repeatedly simplify conditions like
 220     #       ( (cond1) ? ( (cond2) ? X : Y ) : Y )
 221     # into
 222     #       ( ( (cond1) && (cond2) ) ? X : Y )
 223     # Also similarly handles expressions like:
 224     #       : (cond1) ? ( (cond2) ? X : Y ) : Y )
 225     # Note the inclusion of the close paren in ([:()]) and the open paren in ([()]) is
 226     # purely to ensure we have a balanced set of parens in the expression which makes
 227     # it easier to understand the pattern in an editor that understands paren's, we do
 228     # not expect either of these cases to actually fire. - Yves
 229     1 while $expr =~ s/
 230         ([:()])  \s*
 231             ($parens) \s*
 232             \? \s*
 233                 \( \s* ($parens) \s*
 234                     \? \s* ($parens|[^()?:\s]+?) \s*
 235                     :  \s* ($parens|[^()?:\s]+?) \s*
 236                 \) \s*
 237             : \s* \5 \s*
 238         ([()])
 239     /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
 240
 241     return $expr;
 242 }
 243
 244 #
 245 # $text= __macro(@args);
 246 # Join args together by newlines, and then neatly add backslashes to the end
 247 # of every  line as expected by the C pre-processor for #define's.
 248 #
 249
 250 sub __macro {
 251     my $str= join "\n", @_;
 252     $str =~ s/\s*$//;
 253     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 254     my $last= pop @lines;
 255     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 256     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 257     return $str . "\n";
 258 }
 259
 260 #
 261 # my $op=__incrdepth($op);
 262 #
 263 # take an 'op' hashref and add one to it and all its childrens depths.
 264 #
 265
 266 sub __incrdepth {
 267     my $op= shift;
 268     return unless ref $op;
 269     $op->{depth} += 1;
 270     __incrdepth( $op->{yes} );
 271     __incrdepth( $op->{no} );
 272     return $op;
 273 }
 274
 275 # join two branches of an opcode together with a condition, incrementing
 276 # the depth on the yes branch when we do so.
 277 # returns the new root opcode of the tree.
 278 sub __cond_join {
 279     my ( $cond, $yes, $no )= @_;
 280     return {
 281         test  => $cond,
 282         yes   => __incrdepth( $yes ),
 283         no    => $no,
 284         depth => 0,
 285     };
 286 }
 287
 288 # Methods
 289
 290 # constructor
 291 #
 292 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 293 #
 294 # Create a new CharClass::Matcher object by parsing the text in
 295 # the txt array. Currently applies the following rules:
 296 #
 297 # Element starts with C<0x>, line is evaled the result treated as
 298 # a number which is passed to chr().
 299 #
 300 # Element starts with C<">, line is evaled and the result treated
 301 # as a string.
 302 #
 303 # Each string is then stored in the 'strs' subhash as a hash record
 304 # made up of the results of __uni_latin1, using the keynames
 305 # 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
 306 # 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
 307 #
 308 # Size data is tracked per type in the 'size' subhash.
 309 #
 310 # Return an object
 311 #
 312 sub new {
 313     my $class= shift;
 314     my %opt= @_;
 315     for ( qw(op txt) ) {
 316         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 317           if !exists $opt{$_};
 318     }
 319
 320     my $self= bless {
 321         op    => $opt{op},
 322         title => $opt{title} || '',
 323     }, $class;
 324     foreach my $txt ( @{ $opt{txt} } ) {
 325         my $str= $txt;
 326         if ( $str =~ /^[""]/ ) {
 327             $str= eval $str;
 328         } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
 329                                     # list with its expansion
 330             my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
 331             die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
 332             foreach my $cp (hex $lower .. hex $upper) {
 333                 push @{$opt{txt}}, sprintf "0x%X", $cp;
 334             }
 335             next;
 336         } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
 337             # Otherwise undocumented, a leading N means is already in the
 338             # native character set; don't convert.
 339             $str= chr eval $str;
 340         } elsif ( $str =~ /^0x/ ) {
 341             $str= eval $str;
 342
 343             # Convert from Unicode/ASCII to native, if necessary
 344             $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
 345                                                     && $str <= 0xFF;
 346             $str = chr $str;
 347         } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
 348             my $property = $1;
 349             use Unicode::UCD qw(prop_invlist);
 350
 351             my @invlist = prop_invlist($property, '_perl_core_internal_ok');
 352             if (! @invlist) {
 353
 354                 # An empty return could mean an unknown property, or merely
 355                 # that it is empty.  Call in scalar context to differentiate
 356                 my $count = prop_invlist($property, '_perl_core_internal_ok');
 357                 die "$property not found" unless defined $count;
 358             }
 359
 360             # Replace this element on the list with the property's expansion
 361             for (my $i = 0; $i < @invlist; $i += 2) {
 362                 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
 363
 364                     # prop_invlist() returns native values; add leading 'N'
 365                     # to indicate that.
 366                     push @{$opt{txt}}, sprintf "N0x%X", $cp;
 367                 }
 368             }
 369             next;
 370         } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
 371             die "do '$1' failed: $!$@" if ! do $1 or $@;
 372             next;
 373         } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
 374             my @results = eval "$1";
 375             die "eval '$1' failed: $@" if $@;
 376             push @{$opt{txt}}, @results;
 377             next;
 378         } else {
 379             die "Unparsable line: $txt\n";
 380         }
 381         my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $str );
 382         my $UTF8= $low   || $utf8;
 383         my $LATIN1= $low || $latin1;
 384         my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
 385         #die Dumper($txt,$cp,$low,$latin1,$utf8)
 386         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 387
 388         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}=
 389           ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 );
 390         my $rec= $self->{strs}{$str};
 391         foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
 392             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 393               if $self->{strs}{$str}{$key};
 394         }
 395         $self->{has_multi} ||= @$cp > 1;
 396         $self->{has_ascii} ||= $latin1 && @$latin1;
 397         $self->{has_low}   ||= $low && @$low;
 398         $self->{has_high}  ||= !$low && !$latin1;
 399     }
 400     $self->{val_fmt}= $hex_fmt;
 401     $self->{count}= 0 + keys %{ $self->{strs} };
 402     return $self;
 403 }
 404
 405 # my $trie = make_trie($type,$maxlen);
 406 #
 407 # using the data stored in the object build a trie of a specific type,
 408 # and with specific maximum depth. The trie is made up the elements of
 409 # the given types array for each string in the object (assuming it is
 410 # not too long.)
 411 #
 412 # returns the trie, or undef if there was no relevant data in the object.
 413 #
 414
 415 sub make_trie {
 416     my ( $self, $type, $maxlen )= @_;
 417
 418     my $strs= $self->{strs};
 419     my %trie;
 420     foreach my $rec ( values %$strs ) {
 421         die "panic: unknown type '$type'"
 422           if !exists $rec->{$type};
 423         my $dat= $rec->{$type};
 424         next unless $dat;
 425         next if $maxlen && @$dat > $maxlen;
 426         my $node= \%trie;
 427         foreach my $elem ( @$dat ) {
 428             $node->{$elem} ||= {};
 429             $node= $node->{$elem};
 430         }
 431         $node->{''}= $rec->{str};
 432     }
 433     return 0 + keys( %trie ) ? \%trie : undef;
 434 }
 435
 436 sub pop_count ($) {
 437     my $word = shift;
 438
 439     # This returns a list of the positions of the bits in the input word that
 440     # are 1.
 441
 442     my @positions;
 443     my $position = 0;
 444     while ($word) {
 445         push @positions, $position if $word & 1;
 446         $position++;
 447         $word >>= 1;
 448     }
 449     return @positions;
 450 }
 451
 452 # my $optree= _optree()
 453 #
 454 # recursively convert a trie to an optree where every node represents
 455 # an if else branch.
 456 #
 457 #
 458
 459 sub _optree {
 460     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
 461     return unless defined $trie;
 462     if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
 463         die "Can't do 'cp' optree from multi-codepoint strings";
 464     }
 465     $ret_type ||= 'len';
 466     $else= 0  unless defined $else;
 467     $depth= 0 unless defined $depth;
 468
 469     # if we have an emptry string as a key it means we are in an
 470     # accepting state and unless we can match further on should
 471     # return the value of the '' key.
 472     if (exists $trie->{''} ) {
 473         # we can now update the "else" value, anything failing to match
 474         # after this point should return the value from this.
 475         if ( $ret_type eq 'cp' ) {
 476             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 477             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 478         } elsif ( $ret_type eq 'len' ) {
 479             $else= $depth;
 480         } elsif ( $ret_type eq 'both') {
 481             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 482             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 483             $else= "len=$depth, $else";
 484         }
 485     }
 486     # extract the meaningful keys from the trie, filter out '' as
 487     # it means we are an accepting state (end of sequence).
 488     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 489
 490     # if we havent any keys there is no further we can match and we
 491     # can return the "else" value.
 492     return $else if !@conds;
 493
 494
 495     my $test= $test_type =~ /^cp/ ? "cp" : "((U8*)s)[$depth]";
 496     # first we loop over the possible keys/conditions and find out what they look like
 497     # we group conditions with the same optree together.
 498     my %dmp_res;
 499     my @res_order;
 500     local $Data::Dumper::Sortkeys=1;
 501     foreach my $cond ( @conds ) {
 502
 503         # get the optree for this child/condition
 504         my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else, $depth + 1 );
 505         # convert it to a string with Dumper
 506         my $res_code= Dumper( $res );
 507
 508         push @{$dmp_res{$res_code}{vals}}, $cond;
 509         if (!$dmp_res{$res_code}{optree}) {
 510             $dmp_res{$res_code}{optree}= $res;
 511             push @res_order, $res_code;
 512         }
 513     }
 514
 515     # now that we have deduped the optrees we construct a new optree containing the merged
 516     # results.
 517     my %root;
 518     my $node= \%root;
 519     foreach my $res_code_idx (0 .. $#res_order) {
 520         my $res_code= $res_order[$res_code_idx];
 521         $node->{vals}= $dmp_res{$res_code}{vals};
 522         $node->{test}= $test;
 523         $node->{yes}= $dmp_res{$res_code}{optree};
 524         $node->{depth}= $depth;
 525         if ($res_code_idx < $#res_order) {
 526             $node= $node->{no}= {};
 527         } else {
 528             $node->{no}= $else;
 529         }
 530     }
 531
 532     # return the optree.
 533     return \%root;
 534 }
 535
 536 # my $optree= optree(%opts);
 537 #
 538 # Convert a trie to an optree, wrapper for _optree
 539
 540 sub optree {
 541     my $self= shift;
 542     my %opt= @_;
 543     my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
 544     $opt{ret_type} ||= 'len';
 545     my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
 546     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
 547 }
 548
 549 # my $optree= generic_optree(%opts);
 550 #
 551 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 552 # sets of strings, including a branch for handling the string type check.
 553 #
 554
 555 sub generic_optree {
 556     my $self= shift;
 557     my %opt= @_;
 558
 559     $opt{ret_type} ||= 'len';
 560     my $test_type= 'depth';
 561     my $else= $opt{else} || 0;
 562
 563     my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
 564     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
 565
 566     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
 567       for $latin1, $utf8;
 568
 569     if ( $utf8 ) {
 570         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 571     } elsif ( $latin1 ) {
 572         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 573     }
 574     my $low= $self->make_trie( 'low', $opt{max_depth} );
 575     if ( $low ) {
 576         $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
 577     }
 578
 579     return $else;
 580 }
 581
 582 # length_optree()
 583 #
 584 # create a string length guarded optree.
 585 #
 586
 587 sub length_optree {
 588     my $self= shift;
 589     my %opt= @_;
 590     my $type= $opt{type};
 591
 592     die "Can't do a length_optree on type 'cp', makes no sense."
 593       if $type =~ /^cp/;
 594
 595     my ( @size, $method );
 596
 597     if ( $type eq 'generic' ) {
 598         $method= 'generic_optree';
 599         my %sizes= (
 600             %{ $self->{size}{low}    || {} },
 601             %{ $self->{size}{latin1} || {} },
 602             %{ $self->{size}{utf8}   || {} }
 603         );
 604         @size= sort { $a <=> $b } keys %sizes;
 605     } else {
 606         $method= 'optree';
 607         @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 608     }
 609
 610     my $else= ( $opt{else} ||= 0 );
 611     for my $size ( @size ) {
 612         my $optree= $self->$method( %opt, type => $type, max_depth => $size );
 613         my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 614         $else= __cond_join( $cond, $optree, $else );
 615     }
 616     return $else;
 617 }
 618
 619 sub calculate_mask(@) {
 620     # Look at the input list of byte values.  This routine returns an array of
 621     # mask/base pairs to generate that list.
 622
 623     my @list = @_;
 624     my $list_count = @list;
 625
 626     # Consider a set of byte values, A, B, C ....  If we want to determine if
 627     # <c> is one of them, we can write c==A || c==B || c==C ....  If the
 628     # values are consecutive, we can shorten that to A<=c && c<=Z, which uses
 629     # far fewer branches.  If only some of them are consecutive we can still
 630     # save some branches by creating range tests for just those that are
 631     # consecutive. _cond_as_str() does this work for looking for ranges.
 632     #
 633     # Another approach is to look at the bit patterns for A, B, C .... and see
 634     # if they have some commonalities.  That's what this function does.  For
 635     # example, consider a set consisting of the bytes
 636     # 0xF0, 0xF1, 0xF2, and 0xF3.  We could write:
 637     #   0xF0 <= c && c <= 0xF4
 638     # But the following mask/compare also works, and has just one test:
 639     #   (c & 0xFC) == 0xF0
 640     # The reason it works is that the set consists of exactly those bytes
 641     # whose first 4 bits are 1, and the next two are 0.  (The value of the
 642     # other 2 bits is immaterial in determining if a byte is in the set or
 643     # not.)  The mask masks out those 2 irrelevant bits, and the comparison
 644     # makes sure that the result matches all bytes which match those 6
 645     # material bits exactly.  In other words, the set of bytes contains
 646     # exactly those whose bottom two bit positions are either 0 or 1.  The
 647     # same principle applies to bit positions that are not necessarily
 648     # adjacent.  And it can be applied to bytes that differ in 1 through all 8
 649     # bit positions.  In order to be a candidate for this optimization, the
 650     # number of bytes in the set must be a power of 2.
 651     #
 652     # Consider a different example, the set 0x53, 0x54, 0x73, and 0x74.  That
 653     # requires 4 tests using either ranges or individual values, and even
 654     # though the number in the set is a power of 2, it doesn't qualify for the
 655     # mask optimization described above because the number of bits that are
 656     # different is too large for that.  However, the set can be expressed as
 657     # two branches with masks thusly:
 658     #   (c & 0xDF) == 0x53 || (c & 0xDF) == 0x54
 659     # a branch savings of 50%.  This is done by splitting the set into two
 660     # subsets each of which has 2 elements, and within each set the values
 661     # differ by 1 byte.
 662     #
 663     # This function attempts to find some way to save some branches using the
 664     # mask technique.  If not, it returns an empty list; if so, it
 665     # returns a list consisting of
 666     #   [ [compare1, mask1], [compare2, mask2], ...
 667     #     [compare_n, undef], [compare_m, undef], ...
 668     #   ]
 669     # The <mask> is undef in the above for those bytes that must be tested
 670     # for individually.
 671     #
 672     # This function does not attempt to find the optimal set.  To do so would
 673     # probably require testing all possible combinations, and keeping track of
 674     # the current best one.
 675     #
 676     # There are probably much better algorithms, but this is the one I (khw)
 677     # came up with.  We start with doing a bit-wise compare of every byte in
 678     # the set with every other byte.  The results are sorted into arrays of
 679     # all those that differ by the same bit positions.  These are stored in a
 680     # hash with the each key being the bits they differ in.  Here is the hash
 681     # for the 0x53, 0x54, 0x73, 0x74 set:
 682     # {
 683     #    4 => {
 684     #            "0,1,2,5" => [
 685     #                            83,
 686     #                            116,
 687     #                            84,
 688     #                            115
 689     #                        ]
 690     #        },
 691     #    3 => {
 692     #            "0,1,2" => [
 693     #                        83,
 694     #                        84,
 695     #                        115,
 696     #                        116
 697     #                        ]
 698     #        }
 699     #    1 => {
 700     #            5 => [
 701     #                    83,
 702     #                    115,
 703     #                    84,
 704     #                    116
 705     #                ]
 706     #        },
 707     # }
 708     #
 709     # The set consisting of values which differ in the 4 bit positions 0, 1,
 710     # 2, and 5 from some other value in the set consists of all 4 values.
 711     # Likewise all 4 values differ from some other value in the 3 bit
 712     # positions 0, 1, and 2; and all 4 values differ from some other value in
 713     # the single bit position 5.  The keys at the uppermost level in the above
 714     # hash, 1, 3, and 4, give the number of bit positions that each sub-key
 715     # below it has.  For example, the 4 key could have as its value an array
 716     # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were
 717     # such.  The best optimization will group the most values into a single
 718     # mask.  The most values will be the ones that differ in the most
 719     # positions, the ones with the largest value for the topmost key.  These
 720     # keys, are thus just for convenience of sorting by that number, and do
 721     # not have any bearing on the core of the algorithm.
 722     #
 723     # We start with an element from largest number of differing bits.  The
 724     # largest in this case is 4 bits, and there is only one situation in this
 725     # set which has 4 differing bits, "0,1,2,5".  We look for any subset of
 726     # this set which has 16 values that differ in these 4 bits.  There aren't
 727     # any, because there are only 4 values in the entire set.  We then look at
 728     # the next possible thing, which is 3 bits differing in positions "0,1,2".
 729     # We look for a subset that has 8 values that differ in these 3 bits.
 730     # Again there are none.  So we go to look for the next possible thing,
 731     # which is a subset of 2**1 values that differ only in bit position 5.  83
 732     # and 115 do, so we calculate a mask and base for those and remove them
 733     # from every set.  Since there is only the one set remaining, we remove
 734     # them from just this one.  We then look to see if there is another set of
 735     # 2 values that differ in bit position 5.  84 and 116 do, so we calculate
 736     # a mask and base for those and remove them from every set (again only
 737     # this set remains in this example).  The set is now empty, and there are
 738     # no more sets to look at, so we are done.
 739
 740     if ($list_count == 256) {   # All 256 is trivially masked
 741         return (0, 0);
 742     }
 743
 744     my %hash;
 745
 746     # Generate bits-differing lists for each element compared against each
 747     # other element
 748     for my $i (0 .. $list_count - 2) {
 749         for my $j ($i + 1 .. $list_count - 1) {
 750             my @bits_that_differ = pop_count($list[$i] ^ $list[$j]);
 751             my $differ_count = @bits_that_differ;
 752             my $key = join ",", @bits_that_differ;
 753             push @{$hash{$differ_count}{$key}}, $list[$i] unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}};
 754             push @{$hash{$differ_count}{$key}}, $list[$j];
 755         }
 756     }
 757
 758     print STDERR __LINE__, ": calculate_mask() called:  List of values grouped by differing bits: ", Dumper \%hash if DEBUG;
 759
 760     my @final_results;
 761     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 762         my $need = 2 ** $count;     # Need 8 values for 3 differing bits, etc
 763         foreach my $bits (sort keys $hash{$count}) {
 764
 765             print STDERR __LINE__, ": For $count bit(s) difference ($bits), need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
 766
 767             # Look only as long as there are at least as many elements in the
 768             # subset as are needed
 769             while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) {
 770
 771                 print STDERR __LINE__, ": Looking at bit positions ($bits): ", Dumper $hash{$count}{$bits} if DEBUG;
 772
 773                 # Start with the first element in it
 774                 my $try_base = $hash{$count}{$bits}[0];
 775                 my @subset = $try_base;
 776
 777                 # If it succeeds, we return a mask and a base to compare
 778                 # against the masked value.  That base will be the AND of
 779                 # every element in the subset.  Initialize to the one element
 780                 # we have so far.
 781                 my $compare = $try_base;
 782
 783                 # We are trying to find a subset of this that has <need>
 784                 # elements that differ in the bit positions given by the
 785                 # string $bits, which is comma separated.
 786                 my @bits = split ",", $bits;
 787
 788                 TRY: # Look through the remainder of the list for other
 789                      # elements that differ only by these bit positions.
 790
 791                 for (my $i = 1; $i < $cur_count; $i++) {
 792                     my $try_this = $hash{$count}{$bits}[$i];
 793                     my @positions = pop_count($try_base ^ $try_this);
 794
 795                     print STDERR __LINE__, ": $try_base vs $try_this: is (", join(',', @positions), ") a subset of ($bits)?" if DEBUG;;
 796
 797                     foreach my $pos (@positions) {
 798                         unless (grep { $pos == $_ } @bits) {
 799                             print STDERR "  No\n" if DEBUG;
 800                             my $remaining = $cur_count - $i - 1;
 801                             if ($remaining && @subset + $remaining < $need) {
 802                                 print STDERR __LINE__, ": Can stop trying $try_base, because even if all the remaining $remaining values work, they wouldn't add up to the needed $need when combined with the existing ", scalar @subset, " ones\n" if DEBUG;
 803                                 last TRY;
 804                             }
 805                             next TRY;
 806                         }
 807                     }
 808
 809                     print STDERR "  Yes\n" if DEBUG;
 810                     push @subset, $try_this;
 811
 812                     # Add this to the mask base, in case it ultimately
 813                     # succeeds,
 814                     $compare &= $try_this;
 815                 }
 816
 817                 print STDERR __LINE__, ": subset (", join(", ", @subset), ") has ", scalar @subset, " elements; needs $need\n" if DEBUG;
 818
 819                 if (@subset < $need) {
 820                     shift @{$hash{$count}{$bits}};
 821                     next;   # Try with next value
 822                 }
 823
 824                 # Create the mask
 825                 my $mask = 0;
 826                 foreach my $position (@bits) {
 827                     $mask |= 1 << $position;
 828                 }
 829                 $mask = ~$mask & 0xFF;
 830                 push @final_results, [$compare, $mask];
 831
 832                 printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n", __LINE__, $compare, $compare, $mask if DEBUG;
 833
 834                 # These values are now spoken for.  Remove them from future
 835                 # consideration
 836                 foreach my $remove_count (sort keys %hash) {
 837                     foreach my $bits (sort keys %{$hash{$remove_count}}) {
 838                         foreach my $to_remove (@subset) {
 839                             @{$hash{$remove_count}{$bits}} = grep { $_ != $to_remove } @{$hash{$remove_count}{$bits}};
 840                         }
 841                     }
 842                 }
 843             }
 844         }
 845     }
 846
 847     # Any values that remain in the list are ones that have to be tested for
 848     # individually.
 849     my @individuals;
 850     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 851         foreach my $bits (sort keys $hash{$count}) {
 852             foreach my $remaining (@{$hash{$count}{$bits}}) {
 853
 854                 # If we already know about this value, just ignore it.
 855                 next if grep { $remaining == $_ } @individuals;
 856
 857                 # Otherwise it needs to be returned as something to match
 858                 # individually
 859                 push @final_results, [$remaining, undef];
 860                 push @individuals, $remaining;
 861             }
 862         }
 863     }
 864
 865     # Sort by increasing numeric value
 866     @final_results = sort { $a->[0] <=> $b->[0] } @final_results;
 867
 868     print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG;
 869
 870     return @final_results;
 871 }
 872
 873 # _cond_as_str
 874 # turn a list of conditions into a text expression
 875 # - merges ranges of conditions, and joins the result with ||
 876 sub _cond_as_str {
 877     my ( $self, $op, $combine, $opts_ref )= @_;
 878     my $cond= $op->{vals};
 879     my $test= $op->{test};
 880     my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
 881     return "( $test )" if !defined $cond;
 882
 883     # rangify the list.
 884     my @ranges;
 885     my $Update= sub {
 886         # We skip this if there are optimizations that
 887         # we can apply (below) to the individual ranges
 888         if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
 889             if ( $ranges[-1][0] == $ranges[-1][1] ) {
 890                 $ranges[-1]= $ranges[-1][0];
 891             } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
 892                 $ranges[-1]= $ranges[-1][0];
 893                 push @ranges, $ranges[-1] + 1;
 894             }
 895         }
 896     };
 897     for my $condition ( @$cond ) {
 898         if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
 899             $Update->();
 900             push @ranges, [ $condition, $condition ];
 901         } else {
 902             $ranges[-1][1]++;
 903         }
 904     }
 905     $Update->();
 906
 907     return $self->_combine( $test, @ranges )
 908       if $combine;
 909
 910     if ($is_cp_ret) {
 911         @ranges= map {
 912             ref $_
 913             ? sprintf(
 914                 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
 915                 @$_ )
 916             : sprintf( "$self->{val_fmt} == $test", $_ );
 917         } @ranges;
 918
 919         return "( " . join( " || ", @ranges ) . " )";
 920     }
 921
 922     # If the input set has certain characteristics, we can optimize tests
 923     # for it.  This doesn't apply if returning the code point, as we want
 924     # each element of the set individually.  The code above is for this
 925     # simpler case.
 926
 927     return 1 if @$cond == 256;  # If all bytes match, is trivially true
 928
 929     my @masks;
 930     if (@ranges > 1) {
 931
 932         # See if the entire set shares optimizable characterstics, and if so,
 933         # return the optimization.  We delay checking for this on sets with
 934         # just a single range, as there may be better optimizations available
 935         # in that case.
 936         @masks = calculate_mask(@$cond);
 937
 938         # Stringify the output of calculate_mask()
 939         if (@masks) {
 940             my @return;
 941             foreach my $mask_ref (@masks) {
 942                 if (defined $mask_ref->[1]) {
 943                     push @return, sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask_ref->[1], $mask_ref->[0];
 944                 }
 945                 else {  # An undefined mask means to use the value as-is
 946                     push @return, sprintf "$test == $self->{val_fmt}", $mask_ref->[0];
 947                 }
 948             }
 949
 950             # The best possible case below for specifying this set of values via
 951             # ranges is 1 branch per range.  If our mask method yielded better
 952             # results, there is no sense trying something that is bound to be
 953             # worse.
 954             if (@return < @ranges) {
 955                 return "( " . join( " || ", @return ) . " )";
 956             }
 957
 958             @masks = @return;
 959         }
 960     }
 961
 962     # Here, there was no entire-class optimization that was clearly better
 963     # than doing things by ranges.  Look at each range.
 964     my $range_count_extra = 0;
 965     for (my $i = 0; $i < @ranges; $i++) {
 966         if (! ref $ranges[$i]) {    # Trivial case: no range
 967             $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
 968         }
 969         elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
 970             $ranges[$i] =           # Trivial case: single element range
 971                     sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
 972         }
 973         else {
 974             my $output = "";
 975
 976             # Well-formed UTF-8 continuation bytes on ascii platforms must be
 977             # in the range 0x80 .. 0xBF.  If we know that the input is
 978             # well-formed (indicated by not trying to be 'safe'), we can omit
 979             # tests that verify that the input is within either of these
 980             # bounds.  (No legal UTF-8 character can begin with anything in
 981             # this range, so we don't have to worry about this being a
 982             # continuation byte or not.)
 983             if (ASCII_PLATFORM
 984                 && ! $opts_ref->{safe}
 985                 && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
 986             {
 987                 my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
 988                 my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
 989
 990                 # If the range is the entire legal range, it matches any legal
 991                 # byte, so we can omit both tests.  (This should happen only
 992                 # if the number of ranges is 1.)
 993                 if ($lower_limit_is_80 && $upper_limit_is_BF) {
 994                     return 1;
 995                 }
 996                 elsif ($lower_limit_is_80) { # Just use the upper limit test
 997                     $output = sprintf("( $test <= $self->{val_fmt} )",
 998                                         $ranges[$i]->[1]);
 999                 }
1000                 elsif ($upper_limit_is_BF) { # Just use the lower limit test
1001                     $output = sprintf("( $test >= $self->{val_fmt} )",
1002                                     $ranges[$i]->[0]);
1003                 }
1004             }
1005
1006             # If we didn't change to omit a test above, see if the number of
1007             # elements is a power of 2 (only a single bit in the
1008             # representation of its count will be set) and if so, it may be
1009             # that a mask/compare optimization is possible.
1010             if ($output eq ""
1011                 && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
1012             {
1013                 my @list;
1014                 push @list, $_  for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
1015                 my @this_masks = calculate_mask(@list);
1016
1017                 # Use the mask if there is just one for the whole range.
1018                 # Otherwise there is no savings over the two branches that can
1019                 # define the range.
1020                 if (@this_masks == 1 && defined $this_masks[0][1]) {
1021                     $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $this_masks[0][1], $this_masks[0][0];
1022                 }
1023             }
1024
1025             if ($output ne "") {  # Prefer any optimization
1026                 $ranges[$i] = $output;
1027             }
1028             else {
1029                 # No optimization happened.  We need a test that the code
1030                 # point is within both bounds.  But, if the bounds are
1031                 # adjacent code points, it is cleaner to say
1032                 # 'first == test || second == test'
1033                 # than it is to say
1034                 # 'first <= test && test <= second'
1035
1036                 $range_count_extra++;   # This range requires 2 branches to
1037                                         # represent
1038                 if ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
1039                     $ranges[$i] = "( "
1040                                 .  join( " || ", ( map
1041                                     { sprintf "$self->{val_fmt} == $test", $_ }
1042                                     @{$ranges[$i]} ) )
1043                                 . " )";
1044                 }
1045                 else {  # Full bounds checking
1046                     $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
1047                 }
1048             }
1049         }
1050     }
1051
1052     # We have generated the list of bytes in two ways; one trying to use masks
1053     # to cut the number of branches down, and the other to look at individual
1054     # ranges (some of which could be cut down by using a mask for just it).
1055     # We return whichever method uses the fewest branches.
1056     return "( "
1057            . join( " || ", (@masks && @masks < @ranges + $range_count_extra)
1058                             ? @masks
1059                             : @ranges)
1060            . " )";
1061 }
1062
1063 # _combine
1064 # recursively turn a list of conditions into a fast break-out condition
1065 # used by _cond_as_str() for 'cp' type macros.
1066 sub _combine {
1067     my ( $self, $test, @cond )= @_;
1068     return if !@cond;
1069     my $item= shift @cond;
1070     my ( $cstr, $gtv );
1071     if ( ref $item ) {
1072         $cstr=
1073           sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
1074             @$item );
1075         $gtv= sprintf "$self->{val_fmt}", $item->[1];
1076     } else {
1077         $cstr= sprintf( "$self->{val_fmt} == $test", $item );
1078         $gtv= sprintf "$self->{val_fmt}", $item;
1079     }
1080     if ( @cond ) {
1081         my $combine= $self->_combine( $test, @cond );
1082         if (@cond >1) {
1083             return "( $cstr || ( $gtv < $test &&\n"
1084                    . $combine . " ) )";
1085         } else {
1086             return "( $cstr || $combine )";
1087         }
1088     } else {
1089         return $cstr;
1090     }
1091 }
1092
1093 # _render()
1094 # recursively convert an optree to text with reasonably neat formatting
1095 sub _render {
1096     my ( $self, $op, $combine, $brace, $opts_ref )= @_;
1097     return 0 if ! defined $op;  # The set is empty
1098     if ( !ref $op ) {
1099         return $op;
1100     }
1101     my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
1102     #no warnings 'recursion';   # This would allow really really inefficient
1103                                 # code to be generated.  See pod
1104     my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref );
1105     return $yes if $cond eq '1';
1106
1107     my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref );
1108     return "( $cond )" if $yes eq '1' and $no eq '0';
1109     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
1110     return "$lb$cond ? $yes : $no$rb"
1111       if !ref( $op->{yes} ) && !ref( $op->{no} );
1112     my $ind1= " " x 4;
1113     my $ind= "\n" . ( $ind1 x $op->{depth} );
1114
1115     if ( ref $op->{yes} ) {
1116         $yes= $ind . $ind1 . $yes;
1117     } else {
1118         $yes= " " . $yes;
1119     }
1120
1121     return "$lb$cond ?$yes$ind: $no$rb";
1122 }
1123
1124 # $expr=render($op,$combine)
1125 #
1126 # convert an optree to text with reasonably neat formatting. If $combine
1127 # is true then the condition is created using "fast breakouts" which
1128 # produce uglier expressions that are more efficient for common case,
1129 # longer lists such as that resulting from type 'cp' output.
1130 # Currently only used for type 'cp' macros.
1131 sub render {
1132     my ( $self, $op, $combine, $opts_ref )= @_;
1133     my $str= "( " . $self->_render( $op, $combine, 0, $opts_ref ) . " )";
1134     return __clean( $str );
1135 }
1136
1137 # make_macro
1138 # make a macro of a given type.
1139 # calls into make_trie and (generic_|length_)optree as needed
1140 # Opts are:
1141 # type     : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
1142 # ret_type : 'cp' or 'len'
1143 # safe     : add length guards to macro
1144 #
1145 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
1146 # in which case it defaults to 'cp' as well.
1147 #
1148 # it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
1149 # sequences in it, as the generated macro will accept only a single codepoint
1150 # as an argument.
1151 #
1152 # returns the macro.
1153
1154
1155 sub make_macro {
1156     my $self= shift;
1157     my %opts= @_;
1158     my $type= $opts{type} || 'generic';
1159     die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
1160       if $type =~ /^cp/
1161       and $self->{has_multi};
1162     my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' );
1163     my $method;
1164     if ( $opts{safe} ) {
1165         $method= 'length_optree';
1166     } elsif ( $type eq 'generic' ) {
1167         $method= 'generic_optree';
1168     } else {
1169         $method= 'optree';
1170     }
1171     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
1172     my $text= $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts );
1173     my @args= $type =~ /^cp/ ? 'cp' : 's';
1174     push @args, "e" if $opts{safe};
1175     push @args, "is_utf8" if $type eq 'generic';
1176     push @args, "len" if $ret_type eq 'both';
1177     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
1178              $ret_type eq 'cp'      ? 'what_'     : 'is_';
1179     my $ext= $type     eq 'generic' ? ''          : '_' . lc( $type );
1180     $ext .= "_safe" if $opts{safe};
1181     my $argstr= join ",", @args;
1182     return "/*** GENERATED CODE ***/\n"
1183       . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
1184 }
1185
1186 # if we arent being used as a module (highly likely) then process
1187 # the __DATA__ below and produce macros in regcharclass.h
1188 # if an argument is provided to the script then it is assumed to
1189 # be the path of the file to output to, if the arg is '-' outputs
1190 # to STDOUT.
1191 if ( !caller ) {
1192     $|++;
1193     my $path= shift @ARGV || "regcharclass.h";
1194     my $out_fh;
1195     if ( $path eq '-' ) {
1196         $out_fh= \*STDOUT;
1197     } else {
1198         $out_fh = open_new( $path );
1199     }
1200     print $out_fh read_only_top( lang => 'C', by => $0,
1201                                  file => 'regcharclass.h', style => '*',
1202                                  copyright => [2007, 2011] );
1203     print $out_fh "\n#ifndef H_REGCHARCLASS   /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
1204
1205     my ( $op, $title, @txt, @types, %mods );
1206     my $doit= sub {
1207         return unless $op;
1208
1209         # Skip if to compile on a different platform.
1210         return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
1211         return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
1212
1213         print $out_fh "/*\n\t$op: $title\n\n";
1214         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1215         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
1216
1217         #die Dumper(\@types,\%mods);
1218
1219         my @mods;
1220         push @mods, 'safe' if delete $mods{safe};
1221         unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
1222                                                                 # do this one
1223                                                                 # first, as
1224                                                                 # traditional
1225         if (%mods) {
1226             die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods;
1227         }
1228
1229         foreach my $type_spec ( @types ) {
1230             my ( $type, $ret )= split /-/, $type_spec;
1231             $ret ||= 'len';
1232             foreach my $mod ( @mods ) {
1233                 next if $mod eq 'safe' and $type =~ /^cp/;
1234                 delete $mods{$mod};
1235                 my $macro= $obj->make_macro(
1236                     type     => $type,
1237                     ret_type => $ret,
1238                     safe     => $mod eq 'safe'
1239                 );
1240                 print $out_fh $macro, "\n";
1241             }
1242         }
1243     };
1244
1245     while ( <DATA> ) {
1246         s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
1247         next unless /\S/;
1248         chomp;
1249         if ( /^[A-Z]/ ) {
1250             $doit->();  # This starts a new definition; do the previous one
1251             ( $op, $title )= split /\s*:\s*/, $_, 2;
1252             @txt= ();
1253         } elsif ( s/^=>// ) {
1254             my ( $type, $modifier )= split /:/, $_;
1255             @types= split ' ', $type;
1256             undef %mods;
1257             map { $mods{$_} = 1 } split ' ',  $modifier;
1258         } else {
1259             push @txt, "$_";
1260         }
1261     }
1262     $doit->();
1263
1264     print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
1265
1266     if($path eq '-') {
1267         print $out_fh "/* ex: set ro: */\n";
1268     } else {
1269         read_only_bottom_close_and_rename($out_fh)
1270     }
1271 }
1272
1273 # The form of the input is a series of definitions to make macros for.
1274 # The first line gives the base name of the macro, followed by a colon, and
1275 # then text to be used in comments associated with the macro that are its
1276 # title or description.  In all cases the first (perhaps only) parameter to
1277 # the macro is a pointer to the first byte of the code point it is to test to
1278 # see if it is in the class determined by the macro.  In the case of non-UTF8,
1279 # the code point consists only of a single byte.
1280 #
1281 # The second line must begin with a '=>' and be followed by the types of
1282 # macro(s) to be generated; these are specified below.  A colon follows the
1283 # types, followed by the modifiers, also specified below.  At least one
1284 # modifier is required.
1285 #
1286 # The subsequent lines give what code points go into the class defined by the
1287 # macro.  Multiple characters may be specified via a string like "\x0D\x0A",
1288 # enclosed in quotes.  Otherwise the lines consist of one of:
1289 #   1)  a single Unicode code point, prefaced by 0x
1290 #   2)  a single range of Unicode code points separated by a minus (and
1291 #       optional space)
1292 #   3)  a single Unicode property specified in the standard Perl form
1293 #       "\p{...}"
1294 #   4)  a line like 'do path'.  This will do a 'do' on the file given by
1295 #       'path'.  It is assumed that this does nothing but load subroutines
1296 #       (See item 5 below).  The reason 'require path' is not used instead is
1297 #       because 'do' doesn't assume that path is in @INC.
1298 #   5)  a subroutine call
1299 #           &pkg::foo(arg1, ...)
1300 #       where pkg::foo was loaded by a 'do' line (item 4).  The subroutine
1301 #       returns an array of entries of forms like items 1-3 above.  This
1302 #       allows more complex inputs than achievable from the other input types.
1303 #
1304 # A blank line or one whose first non-blank character is '#' is a comment.
1305 # The definition of the macro is terminated by a line unlike those described.
1306 #
1307 # Valid types:
1308 #   low         generate a macro whose name is 'is_BASE_low' and defines a
1309 #               class that includes only ASCII-range chars.  (BASE is the
1310 #               input macro base name.)
1311 #   latin1      generate a macro whose name is 'is_BASE_latin1' and defines a
1312 #               class that includes only upper-Latin1-range chars.  It is not
1313 #               designed to take a UTF-8 input parameter.
1314 #   high        generate a macro whose name is 'is_BASE_high' and defines a
1315 #               class that includes all relevant code points that are above
1316 #               the Latin1 range.  This is for very specialized uses only.
1317 #               It is designed to take only an input UTF-8 parameter.
1318 #   utf8        generate a macro whose name is 'is_BASE_utf8' and defines a
1319 #               class that includes all relevant characters that aren't ASCII.
1320 #               It is designed to take only an input UTF-8 parameter.
1321 #   LATIN1      generate a macro whose name is 'is_BASE_latin1' and defines a
1322 #               class that includes both ASCII and upper-Latin1-range chars.
1323 #               It is not designed to take a UTF-8 input parameter.
1324 #   UTF8        generate a macro whose name is 'is_BASE_utf8' and defines a
1325 #               class that can include any code point, adding the 'low' ones
1326 #               to what 'utf8' works on.  It is designed to take only an input
1327 #               UTF-8 parameter.
1328 #   generic     generate a macro whose name is 'is_BASE".  It has a 2nd,
1329 #               boolean, parameter which indicates if the first one points to
1330 #               a UTF-8 string or not.  Thus it works in all circumstances.
1331 #   cp          generate a macro whose name is 'is_BASE_cp' and defines a
1332 #               class that returns true if the UV parameter is a member of the
1333 #               class; false if not.
1334 #   cp_high     like cp, but it is assumed that it is known that the UV
1335 #               parameter is above Latin1.  The name of the generated macro is
1336 #               'is_BASE_cp_high'.  This is different from high-cp, derived
1337 #               below.
1338 # A macro of the given type is generated for each type listed in the input.
1339 # The default return value is the number of octets read to generate the match.
1340 # Append "-cp" to the type to have it instead return the matched codepoint.
1341 #               The macro name is changed to 'what_BASE...'.  See pod for
1342 #               caveats
1343 # Appending '-both" instead adds an extra parameter to the end of the argument
1344 #               list, which is a pointer as to where to store the number of
1345 #               bytes matched, while also returning the code point.  The macro
1346 #               name is changed to 'what_len_BASE...'.  See pod for caveats
1347 #
1348 # Valid modifiers:
1349 #   safe        The input string is not necessarily valid UTF-8.  In
1350 #               particular an extra parameter (always the 2nd) to the macro is
1351 #               required, which points to one beyond the end of the string.
1352 #               The macro will make sure not to read off the end of the
1353 #               string.  In the case of non-UTF8, it makes sure that the
1354 #               string has at least one byte in it.  The macro name has
1355 #               '_safe' appended to it.
1356 #   fast        The input string is valid UTF-8.  No bounds checking is done,
1357 #               and the macro can make assumptions that lead to faster
1358 #               execution.
1359 #   only_ascii_platform   Skip this definition if this program is being run on
1360 #               a non-ASCII platform.
1361 #   only_ebcdic_platform  Skip this definition if this program is being run on
1362 #               a non-EBCDIC platform.
1363 # No modifier need be specified; fast is assumed for this case.  If both
1364 # 'fast', and 'safe' are specified, two macros will be created for each
1365 # 'type'.
1366 #
1367 # If run on a non-ASCII platform will automatically convert the Unicode input
1368 # to native.  The documentation above is slightly wrong in this case.  'low'
1369 # actually refers to code points whose UTF-8 representation is the same as the
1370 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1371 # code points less than 256.
1372
1373 1; # in the unlikely case we are being used as a module
1374
1375 __DATA__
1376 # This is no longer used, but retained in case it is needed some day.
1377 # TRICKYFOLD: Problematic fold case letters.  When adding to this list, also should add them to regcomp.c and fold_grind.t
1378 # => generic cp generic-cp generic-both :fast safe
1379 # 0x00DF        # LATIN SMALL LETTER SHARP S
1380 # 0x0390        # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1381 # 0x03B0        # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1382 # 0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1383 # 0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1384 # 0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1385
1386 LNBREAK: Line Break: \R
1387 => generic UTF8 LATIN1 :fast safe
1388 "\x0D\x0A"      # CRLF - Network (Windows) line ending
1389 \p{VertSpace}
1390
1391 HORIZWS: Horizontal Whitespace: \h \H
1392 => generic UTF8 LATIN1 high cp cp_high :fast safe
1393 \p{HorizSpace}
1394
1395 VERTWS: Vertical Whitespace: \v \V
1396 => generic UTF8 high LATIN1 cp cp_high :fast safe
1397 \p{VertSpace}
1398
1399 XDIGIT: Hexadecimal digits
1400 => UTF8 high cp_high :fast
1401 \p{XDigit}
1402
1403 XPERLSPACE: \p{XPerlSpace}
1404 => generic UTF8 high cp_high :fast
1405 \p{XPerlSpace}
1406
1407 REPLACEMENT: Unicode REPLACEMENT CHARACTER
1408 => UTF8 :safe
1409 0xFFFD
1410
1411 NONCHAR: Non character code points
1412 => UTF8 :fast
1413 \p{Nchar}
1414
1415 SURROGATE: Surrogate characters
1416 => UTF8 :fast
1417 \p{Gc=Cs}
1418
1419 GCB_L: Grapheme_Cluster_Break=L
1420 => UTF8 :fast
1421 \p{_X_GCB_L}
1422
1423 GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
1424 => UTF8 :fast
1425 \p{_X_LV_LVT_V}
1426
1427 GCB_Prepend: Grapheme_Cluster_Break=Prepend
1428 => UTF8 :fast
1429 \p{_X_GCB_Prepend}
1430
1431 GCB_RI: Grapheme_Cluster_Break=RI
1432 => UTF8 :fast
1433 \p{_X_RI}
1434
1435 GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins
1436 => UTF8 :fast
1437 \p{_X_Special_Begin}
1438
1439 GCB_T: Grapheme_Cluster_Break=T
1440 => UTF8 :fast
1441 \p{_X_GCB_T}
1442
1443 GCB_V: Grapheme_Cluster_Break=V
1444 => UTF8 :fast
1445 \p{_X_GCB_V}
1446
1447 # This program was run with this enabled, and the results copied to utf8.h;
1448 # then this was commented out because it takes so long to figure out these 2
1449 # million code points.  The results would not change unless utf8.h decides it
1450 # wants a maximum other than 4 bytes, or this program creates better
1451 # optimizations
1452 #UTF8_CHAR: Matches utf8 from 1 to 4 bytes
1453 #=> UTF8 :safe only_ascii_platform
1454 #0x0 - 0x1FFFFF
1455
1456 # This hasn't been commented out, because we haven't an EBCDIC platform to run
1457 # it on, and the 3 types of EBCDIC allegedly supported by Perl would have
1458 # different results
1459 UTF8_CHAR: Matches utf8 from 1 to 5 bytes
1460 => UTF8 :safe only_ebcdic_platform
1461 0x0 - 0x3FFFFF:
1462
1463 QUOTEMETA: Meta-characters that \Q should quote
1464 => high :fast
1465 \p{_Perl_Quotemeta}
1466
1467 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1468 => UTF8 :safe
1469 do regen/regcharclass_multi_char_folds.pl
1470
1471 # 1 => All folds
1472 &regcharclass_multi_char_folds::multi_char_folds(1)
1473
1474 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1475 => LATIN1 :safe
1476
1477 &regcharclass_multi_char_folds::multi_char_folds(0)
1478 # 0 => Latin1-only