regen/regcharclass.pl

   1 #!perl
   2 package CharClass::Matcher;
   3 use strict;
   4 use 5.008;
   5 use warnings;
   6 use warnings FATAL => 'all';
   7 use Data::Dumper;
   8 $Data::Dumper::Useqq= 1;
   9
  10 sub DEBUG () { 0 }
  11 $|=1 if DEBUG;
  12
  13 require './regen/regen_lib.pl';
  14 require './regen/charset_translations.pl';
  15 require "./regen/regcharclass_multi_char_folds.pl";
  16
  17 =head1 NAME
  18
  19 CharClass::Matcher -- Generate C macros that match character classes efficiently
  20
  21 =head1 SYNOPSIS
  22
  23     perl regen/regcharclass.pl
  24
  25 =head1 DESCRIPTION
  26
  27 Dynamically generates macros for detecting special charclasses
  28 in latin-1, utf8, and codepoint forms. Macros can be set to return
  29 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
  30
  31 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
  32 are necessary.
  33
  34 Using WHATEVER as an example the following macros can be produced, depending
  35 on the input parameters (how to get each is described by internal comments at
  36 the C<__DATA__> line):
  37
  38 =over 4
  39
  40 =item C<is_WHATEVER(s,is_utf8)>
  41
  42 =item C<is_WHATEVER_safe(s,e,is_utf8)>
  43
  44 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
  45 comparisons involving octet<128 are done before checking the C<is_utf8>
  46 flag, hopefully saving time.
  47
  48 The version without the C<_safe> suffix should be used only when the input is
  49 known to be well-formed.
  50
  51 =item C<is_WHATEVER_utf8(s)>
  52
  53 =item C<is_WHATEVER_utf8_safe(s,e)>
  54
  55 Do a lookup assuming the string is encoded in (normalized) UTF8.
  56
  57 The version without the C<_safe> suffix should be used only when the input is
  58 known to be well-formed.
  59
  60 =item C<is_WHATEVER_latin1(s)>
  61
  62 =item C<is_WHATEVER_latin1_safe(s,e)>
  63
  64 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  65
  66 The version without the C<_safe> suffix should be used only when it is known
  67 that C<s> contains at least one character.
  68
  69 =item C<is_WHATEVER_cp(cp)>
  70
  71 Check to see if the string matches a given codepoint (hypothetically a
  72 U32). The condition is constructed as to "break out" as early as
  73 possible if the codepoint is out of range of the condition.
  74
  75 IOW:
  76
  77   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  78
  79 Thus if the character is X+1 only two comparisons will be done. Making
  80 matching lookups slower, but non-matching faster.
  81
  82 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
  83
  84 A variant form of each of the macro types described above can be generated, in
  85 which the code point is returned by the macro, and an extra parameter (in the
  86 final position) is added, which is a pointer for the macro to set the byte
  87 length of the returned code point.
  88
  89 These forms all have a C<what_len> prefix instead of the C<is_>, for example
  90 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
  91 C<what_len_WHATEVER_utf8(s,len)>.
  92
  93 These forms should not be used I<except> on small sets of mostly widely
  94 separated code points; otherwise the code generated is inefficient.  For these
  95 cases, it is best to use the C<is_> forms, and then find the code point with
  96 C<utf8_to_uvchr_buf>().  This program can fail with a "deep recursion"
  97 message on the worst of the inappropriate sets.  Examine the generated macro
  98 to see if it is acceptable.
  99
 100 =item C<what_WHATEVER_FOO(arg1, ...)>
 101
 102 A variant form of each of the C<is_> macro types described above can be generated, in
 103 which the code point and not the length is returned by the macro.  These have
 104 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
 105 not be used where the set contains a NULL, as 0 is returned for two different
 106 cases: a) the set doesn't include the input code point; b) the set does
 107 include it, and it is a NULL.
 108
 109 =back
 110
 111 The above isn't quite complete, as for specialized purposes one can get a
 112 macro like C<is_WHATEVER_utf8_no_length_checks(s)>, which assumes that it is
 113 already known that there is enough space to hold the character starting at
 114 C<s>, but otherwise checks that it is well-formed.  In other words, this is
 115 intermediary in checking between C<is_WHATEVER_utf8(s)> and
 116 C<is_WHATEVER_utf8_safe(s,e)>.
 117
 118 =head2 CODE FORMAT
 119
 120 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
 121
 122
 123 =head1 AUTHOR
 124
 125 Author: Yves Orton (demerphq) 2007.  Maintained by Perl5 Porters.
 126
 127 =head1 BUGS
 128
 129 No tests directly here (although the regex engine will fail tests
 130 if this code is broken). Insufficient documentation and no Getopts
 131 handler for using the module as a script.
 132
 133 =head1 LICENSE
 134
 135 You may distribute under the terms of either the GNU General Public
 136 License or the Artistic License, as specified in the README file.
 137
 138 =cut
 139
 140 # Sub naming convention:
 141 # __func : private subroutine, can not be called as a method
 142 # _func  : private method, not meant for external use
 143 # func   : public method.
 144
 145 # private subs
 146 #-------------------------------------------------------------------------------
 147 #
 148 # ($cp,$n,$l,$u)=__uni_latin($str);
 149 #
 150 # Return a list of arrays, each of which when interpreted correctly
 151 # represent the string in some given encoding with specific conditions.
 152 #
 153 # $cp - list of codepoints that make up the string.
 154 # $n  - list of octets that make up the string if all codepoints are invariant
 155 #       regardless of if the string is in UTF-8 or not.
 156 # $l  - list of octets that make up the string in latin1 encoding if all
 157 #       codepoints < 256, and at least one codepoint is UTF-8 variant.
 158 # $u  - list of octets that make up the string in utf8 if any codepoint is
 159 #       UTF-8 variant
 160 #
 161 #   High CP | Defined
 162 #-----------+----------
 163 #   0 - 127 : $n            (127/128 are the values for ASCII platforms)
 164 # 128 - 255 : $l, $u
 165 # 256 - ... : $u
 166 #
 167
 168 sub __uni_latin1 {
 169     my $charset = shift;
 170     my $a2n= shift;
 171     my $str= shift;
 172     my $max= 0;
 173     my @cp;
 174     my @cp_high;
 175     my $only_has_invariants = 1;
 176     for my $ch ( split //, $str ) {
 177         my $cp= ord $ch;
 178         $max= $cp if $max < $cp;
 179         if ($cp > 255) {
 180             push @cp, $cp;
 181             push @cp_high, $cp;
 182         }
 183         else {
 184             push @cp, $a2n->[$cp];
 185         }
 186     }
 187     my ( $n, $l, $u );
 188     $only_has_invariants = ($charset =~ /ascii/i) ? $max < 128 : $max < 160;
 189     if ($only_has_invariants) {
 190         $n= [@cp];
 191     } else {
 192         $l= [@cp] if $max && $max < 256;
 193
 194         my @u;
 195         for my $ch ( split //, $str ) {
 196             push @u, map { ord } split //, cp_2_utfbytes(ord $ch, $charset);
 197         }
 198         $u = \@u;
 199     }
 200     return ( \@cp, \@cp_high, $n, $l, $u );
 201 }
 202
 203 #
 204 # $clean= __clean($expr);
 205 #
 206 # Cleanup a ternary expression, removing unnecessary parens and apply some
 207 # simplifications using regexes.
 208 #
 209
 210 sub __clean {
 211     my ( $expr )= @_;
 212
 213     #return $expr;
 214
 215     our $parens;
 216     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 217
 218     ## remove redundant parens
 219     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 220
 221
 222     # repeatedly simplify conditions like
 223     #       ( (cond1) ? ( (cond2) ? X : Y ) : Y )
 224     # into
 225     #       ( ( (cond1) && (cond2) ) ? X : Y )
 226     # Also similarly handles expressions like:
 227     #       : (cond1) ? ( (cond2) ? X : Y ) : Y )
 228     # Note the inclusion of the close paren in ([:()]) and the open paren in
 229     # ([()]) is purely to ensure we have a balanced set of parens in the
 230     # expression which makes it easier to understand the pattern in an editor
 231     # that understands paren's, we do not expect either of these cases to
 232     # actually fire. - Yves
 233     1 while $expr =~ s/
 234         ([:()])  \s*
 235             ($parens) \s*
 236             \? \s*
 237                 \( \s* ($parens) \s*
 238                     \? \s* ($parens|[^()?:\s]+?) \s*
 239                     :  \s* ($parens|[^()?:\s]+?) \s*
 240                 \) \s*
 241             : \s* \5 \s*
 242         ([()])
 243     /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
 244     #$expr=~s/\(\(U8\*\)s\)\[(\d+)\]/S$1/g if length $expr > 8000;
 245     #$expr=~s/\s+//g if length $expr > 8000;
 246
 247     die "Expression too long" if length $expr > 8000;
 248
 249     return $expr;
 250 }
 251
 252 #
 253 # $text= __macro(@args);
 254 # Join args together by newlines, and then neatly add backslashes to the end
 255 # of every  line as expected by the C pre-processor for #define's.
 256 #
 257
 258 sub __macro {
 259     my $str= join "\n", @_;
 260     $str =~ s/\s*$//;
 261     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 262     my $last= pop @lines;
 263     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 264     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 265     return $str . "\n";
 266 }
 267
 268 #
 269 # my $op=__incrdepth($op);
 270 #
 271 # take an 'op' hashref and add one to it and all its childrens depths.
 272 #
 273
 274 sub __incrdepth {
 275     my $op= shift;
 276     return unless ref $op;
 277     $op->{depth} += 1;
 278     __incrdepth( $op->{yes} );
 279     __incrdepth( $op->{no} );
 280     return $op;
 281 }
 282
 283 # join two branches of an opcode together with a condition, incrementing
 284 # the depth on the yes branch when we do so.
 285 # returns the new root opcode of the tree.
 286 sub __cond_join {
 287     my ( $cond, $yes, $no )= @_;
 288     if (ref $yes) {
 289         return {
 290             test  => $cond,
 291             yes   => __incrdepth( $yes ),
 292             no    => $no,
 293             depth => 0,
 294         };
 295     }
 296     else {
 297         return {
 298             test  => $cond,
 299             yes   => $yes,
 300             no    => __incrdepth($no),
 301             depth => 0,
 302         };
 303     }
 304 }
 305
 306 my $hex_fmt= "0x%02X";
 307
 308 sub val_fmt
 309 {
 310     my $self = shift;
 311     my $arg = shift;
 312     my $always_hex = shift // 0;    # Use \x{}; don't look for a mnemonic
 313
 314     # Format 'arg' using the printable character if it has one, or a %x if
 315     # not, returning a string containing the result
 316
 317     # Return what always returned for an unexpected argument
 318     return $hex_fmt unless defined $arg && $arg !~ /\D/;
 319
 320     # We convert only things inside Latin1
 321     if (! $always_hex && $arg < 256) {
 322
 323         # Find the ASCII equivalent of this argument (as the current character
 324         # set might not be ASCII)
 325         my $char = chr $self->{n2a}->[$arg];
 326
 327         # If printable, return it, escaping \ and '
 328         return "'$char'" if $char =~ /[^\\'[:^print:]]/a;
 329         return "'\\\\'" if $char eq "\\";
 330         return "'\''" if $char eq "'";
 331
 332         # Handle the mnemonic controls
 333         my $pos = index("\a\b\e\f\n\r\t\cK", $char);
 334         return "'\\" . substr("abefnrtv", $pos, 1) . "'" if $pos >= 0;
 335     }
 336
 337     # Otherwise, just the input, formatted
 338     return sprintf $hex_fmt, $arg;
 339 }
 340
 341 # Methods
 342
 343 # constructor
 344 #
 345 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 346 #
 347 # Create a new CharClass::Matcher object by parsing the text in
 348 # the txt array. Currently applies the following rules:
 349 #
 350 # Element starts with C<0x>, line is evaled the result treated as
 351 # a number which is passed to chr().
 352 #
 353 # Element starts with C<">, line is evaled and the result treated
 354 # as a string.
 355 #
 356 # Each string is then stored in the 'strs' subhash as a hash record
 357 # made up of the results of __uni_latin1, using the keynames
 358 # 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
 359 # 'UTF8' which hold a merge of 'low' and their lowercase equivalents.
 360 #
 361 # Size data is tracked per type in the 'size' subhash.
 362 #
 363 # Return an object
 364
 365 my %a2n;
 366 my %n2a;        # Inversion of a2n, for each character set
 367 my %I8_2_utf;
 368 my %utf_2_I8;   # Inversion of I8_2_utf, for each EBCDIC character set
 369 my @identity = (0..255);
 370
 371 sub new {
 372     my $class= shift;
 373     my %opt= @_;
 374     my %hash_return;
 375     for ( qw(op txt) ) {
 376         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 377           if !exists $opt{$_};
 378     }
 379
 380     my $self= bless {
 381         op    => $opt{op},
 382         title => $opt{title} || '',
 383     }, $class;
 384
 385     my $charset = $opt{charset};
 386     $a2n{$charset} = get_a2n($charset);
 387
 388     # We need to construct the maps going the other way if not already done
 389     unless (defined $n2a{$charset}) {
 390         for (my $i = 0; $i < 256; $i++) {
 391             $n2a{$charset}->[$a2n{$charset}->[$i]] = $i;
 392         }
 393     }
 394
 395     if ($charset =~ /ebcdic/i) {
 396         $I8_2_utf{$charset} = get_I8_2_utf($charset);
 397         unless (defined $utf_2_I8{$charset}) {
 398             for (my $i = 0; $i < 256; $i++) {
 399                 $utf_2_I8{$charset}->[$I8_2_utf{$charset}->[$i]] = $i;
 400             }
 401         }
 402     }
 403
 404     foreach my $txt ( @{ $opt{txt} } ) {
 405         my $str= $txt;
 406         if ( $str =~ /^[""]/ ) {
 407             $str= eval $str;
 408         } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
 409                                     # list with its expansion
 410             my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
 411             die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'"
 412                                         if ! defined $lower || ! defined $upper;
 413             foreach my $cp (hex $lower .. hex $upper) {
 414                 push @{$opt{txt}}, sprintf "0x%X", $cp;
 415             }
 416             next;
 417         } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
 418             # Otherwise undocumented, a leading N means is already in the
 419             # native character set; don't convert.
 420             $str= chr eval $str;
 421         } elsif ( $str =~ /^0x/ ) {
 422             $str= eval $str;
 423             $str = chr $str;
 424         } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
 425             my $property = $1;
 426             use Unicode::UCD qw(prop_invlist);
 427
 428             my @invlist = prop_invlist($property, '_perl_core_internal_ok');
 429             if (! @invlist) {
 430
 431                 # An empty return could mean an unknown property, or merely
 432                 # that it is empty.  Call in scalar context to differentiate
 433                 my $count = prop_invlist($property, '_perl_core_internal_ok');
 434                 die "$property not found" unless defined $count;
 435             }
 436
 437             # Replace this element on the list with the property's expansion
 438             for (my $i = 0; $i < @invlist; $i += 2) {
 439                 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
 440
 441                     # prop_invlist() returns native values; add leading 'N'
 442                     # to indicate that.
 443                     push @{$opt{txt}}, sprintf "N0x%X", $cp;
 444                 }
 445             }
 446             next;
 447         } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
 448             die "do '$1' failed: $!$@" if ! do $1 or $@;
 449             next;
 450         } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
 451             my @results = eval "$1";
 452             die "eval '$1' failed: $@" if $@;
 453             push @{$opt{txt}}, @results;
 454             next;
 455         } elsif ($str =~ / ^ % \s* ( .* ) /x) { # user-furnished sub() call
 456             %hash_return = eval "$1";
 457             die "eval '$1' failed: $@" if $@;
 458             push @{$opt{txt}}, keys %hash_return;
 459             die "Only one multi character expansion currently allowed per rule"
 460                                                         if  $self->{multi_maps};
 461             next;
 462         } else {
 463             die "Unparsable line: $txt\n";
 464         }
 465         my ( $cp, $cp_high, $low, $latin1, $utf8 )
 466                                     = __uni_latin1($charset, $a2n{$charset}, $str );
 467         my $from;
 468         if (defined $hash_return{"\"$str\""}) {
 469             $from = $hash_return{"\"$str\""};
 470             $from = $a2n{$charset}->[$from] if $from < 256;
 471         }
 472         my $UTF8= $low   || $utf8;
 473         my $LATIN1= $low || $latin1;
 474         my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
 475         #die Dumper($txt,$cp,$low,$latin1,$utf8)
 476         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 477
 478         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 from )}=
 479           ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1, $from );
 480         my $rec= $self->{strs}{$str};
 481         foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
 482             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 483               if $self->{strs}{$str}{$key};
 484         }
 485         $self->{has_multi} ||= @$cp > 1;
 486         $self->{has_ascii} ||= $latin1 && @$latin1;
 487         $self->{has_low}   ||= $low && @$low;
 488         $self->{has_high}  ||= !$low && !$latin1;
 489     }
 490     $self->{n2a} = $n2a{$charset};
 491     $self->{count}= 0 + keys %{ $self->{strs} };
 492     return $self;
 493 }
 494
 495 # my $trie = make_trie($type,$maxlen);
 496 #
 497 # using the data stored in the object build a trie of a specific type,
 498 # and with specific maximum depth. The trie is made up the elements of
 499 # the given types array for each string in the object (assuming it is
 500 # not too long.)
 501 #
 502 # returns the trie, or undef if there was no relevant data in the object.
 503 #
 504
 505 sub make_trie {
 506     my ( $self, $type, $maxlen )= @_;
 507
 508     my $strs= $self->{strs};
 509     my %trie;
 510     foreach my $rec ( values %$strs ) {
 511         die "panic: unknown type '$type'"
 512           if !exists $rec->{$type};
 513         my $dat= $rec->{$type};
 514         next unless $dat;
 515         next if $maxlen && @$dat > $maxlen;
 516         my $node= \%trie;
 517         foreach my $elem ( @$dat ) {
 518             $node->{$elem} ||= {};
 519             $node= $node->{$elem};
 520         }
 521         $node->{''}= $rec->{str};
 522     }
 523     return 0 + keys( %trie ) ? \%trie : undef;
 524 }
 525
 526 sub pop_count ($) {
 527     my $word = shift;
 528
 529     # This returns a list of the positions of the bits in the input word that
 530     # are 1.
 531
 532     my @positions;
 533     my $position = 0;
 534     while ($word) {
 535         push @positions, $position if $word & 1;
 536         $position++;
 537         $word >>= 1;
 538     }
 539     return @positions;
 540 }
 541
 542 # my $optree= _optree()
 543 #
 544 # recursively convert a trie to an optree where every node represents
 545 # an if else branch.
 546 #
 547 #
 548
 549 sub _optree {
 550     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
 551     return unless defined $trie;
 552     $ret_type ||= 'len';
 553     $else= 0  unless defined $else;
 554     $depth= 0 unless defined $depth;
 555
 556     # if we have an empty string as a key it means we are in an
 557     # accepting state and unless we can match further on should
 558     # return the value of the '' key.
 559     if (exists $trie->{''} ) {
 560         # we can now update the "else" value, anything failing to match
 561         # after this point should return the value from this.
 562         my $prefix = $self->{strs}{ $trie->{''} };
 563         if ( $ret_type eq 'cp' ) {
 564             $else= $prefix->{from};
 565             $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
 566             $else= $self->val_fmt($else) if $else > 9;
 567         } elsif ( $ret_type eq 'len' ) {
 568             $else= $depth;
 569         } elsif ( $ret_type eq 'both') {
 570             $else= $prefix->{from};
 571             $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
 572             $else= $self->val_fmt($else) if $else > 9;
 573             $else= "len=$depth, $else";
 574         }
 575     }
 576     # extract the meaningful keys from the trie, filter out '' as
 577     # it means we are an accepting state (end of sequence).
 578     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 579
 580     # if we haven't any keys there is no further we can match and we
 581     # can return the "else" value.
 582     return $else if !@conds;
 583
 584     my $test = $test_type =~ /^cp/ ? "cp" : "((const U8*)s)[$depth]";
 585
 586     # First we loop over the possible keys/conditions and find out what they
 587     # look like; we group conditions with the same optree together.
 588     my %dmp_res;
 589     my @res_order;
 590     local $Data::Dumper::Sortkeys=1;
 591     foreach my $cond ( @conds ) {
 592
 593         # get the optree for this child/condition
 594         my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type,
 595                                                             $else, $depth + 1 );
 596         # convert it to a string with Dumper
 597         my $res_code= Dumper( $res );
 598
 599         push @{$dmp_res{$res_code}{vals}}, $cond;
 600         if (!$dmp_res{$res_code}{optree}) {
 601             $dmp_res{$res_code}{optree}= $res;
 602             push @res_order, $res_code;
 603         }
 604     }
 605
 606     # now that we have deduped the optrees we construct a new optree
 607     # containing the merged
 608     # results.
 609     my %root;
 610     my $node= \%root;
 611     foreach my $res_code_idx (0 .. $#res_order) {
 612         my $res_code= $res_order[$res_code_idx];
 613         $node->{vals}= $dmp_res{$res_code}{vals};
 614         $node->{test}= $test;
 615         $node->{yes}= $dmp_res{$res_code}{optree};
 616         $node->{depth}= $depth;
 617         if ($res_code_idx < $#res_order) {
 618             $node= $node->{no}= {};
 619         } else {
 620             $node->{no}= $else;
 621         }
 622     }
 623
 624     # return the optree.
 625     return \%root;
 626 }
 627
 628 # my $optree= optree(%opts);
 629 #
 630 # Convert a trie to an optree, wrapper for _optree
 631
 632 sub optree {
 633     my $self= shift;
 634     my %opt= @_;
 635     my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
 636     $opt{ret_type} ||= 'len';
 637     my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
 638     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
 639 }
 640
 641 # my $optree= generic_optree(%opts);
 642 #
 643 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 644 # sets of strings, including a branch for handling the string type check.
 645 #
 646
 647 sub generic_optree {
 648     my $self= shift;
 649     my %opt= @_;
 650
 651     $opt{ret_type} ||= 'len';
 652     my $test_type= 'depth';
 653     my $else= $opt{else} || 0;
 654
 655     my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
 656     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
 657
 658     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
 659       for $latin1, $utf8;
 660
 661     if ( $utf8 ) {
 662         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 663     } elsif ( $latin1 ) {
 664         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 665     }
 666     if ($opt{type} eq 'generic') {
 667         my $low= $self->make_trie( 'low', $opt{max_depth} );
 668         if ( $low ) {
 669             $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
 670         }
 671     }
 672
 673     return $else;
 674 }
 675
 676 # length_optree()
 677 #
 678 # create a string length guarded optree.
 679 #
 680
 681 sub length_optree {
 682     my $self= shift;
 683     my %opt= @_;
 684     my $type= $opt{type};
 685
 686     die "Can't do a length_optree on type 'cp', makes no sense."
 687       if $type =~ /^cp/;
 688
 689     my $else= ( $opt{else} ||= 0 );
 690
 691     return $else if $self->{count} == 0;
 692
 693     my $method = $type =~ /generic/ ? 'generic_optree' : 'optree';
 694     if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) {
 695
 696         # Here is non-generic output (meaning that we are only generating one
 697         # type), and all things that match have the same number ('size') of
 698         # bytes.  The length guard is simply that we have that number of
 699         # bytes.
 700         my @size = keys %{$self->{size}{$type}};
 701         my $cond= "((e) - (s)) >= $size[0]";
 702         my $optree = $self->$method(%opt);
 703         $else= __cond_join( $cond, $optree, $else );
 704     }
 705     elsif ($self->{has_multi}) {
 706         my @size;
 707
 708         # Here, there can be a match of a multiple character string.  We use
 709         # the traditional method which is to have a branch for each possible
 710         # size (longest first) and test for the legal values for that size.
 711         my %sizes= (
 712             %{ $self->{size}{low}    || {} },
 713             %{ $self->{size}{latin1} || {} },
 714             %{ $self->{size}{utf8}   || {} }
 715         );
 716         if ($method eq 'generic_optree') {
 717             @size= sort { $a <=> $b } keys %sizes;
 718         } else {
 719             @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 720         }
 721         for my $size ( @size ) {
 722             my $optree= $self->$method(%opt, type => $type, max_depth => $size);
 723             my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 724             $else= __cond_join( $cond, $optree, $else );
 725         }
 726     }
 727     else {
 728         my $utf8;
 729
 730         # Here, has more than one possible size, and only matches a single
 731         # character.  For non-utf8, the needed length is 1; for utf8, it is
 732         # found by array lookup 'UTF8SKIP'.
 733
 734         # If want just the code points above 255, set up to look for those;
 735         # otherwise assume will be looking for all non-UTF-8-invariant code
 736         # poiints.
 737         my $trie_type = ($type eq 'high') ? 'high' : 'utf8';
 738
 739         # If we do want more than the 0-255 range, find those, and if they
 740         # exist...
 741         if (   $opt{type} !~ /latin1/i
 742             && ($utf8 = $self->make_trie($trie_type, 0)))
 743         {
 744
 745             # ... get them into an optree, and set them up as the 'else' clause
 746             $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0 );
 747
 748             # We could make this
 749             #   UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))";
 750             # to avoid doing the UTF8SKIP and subsequent branches for invariants
 751             # that don't match.  But the current macros that get generated
 752             # have only a few things that can match past this, so I (khw)
 753             # don't think it is worth it.  (Even better would be to use
 754             # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it
 755             # if it saves a bunch.  We assume that input text likely to be
 756             # well-formed .
 757             my $cond = "LIKELY(((e) - (s)) >= UTF8SKIP(s))";
 758             $else = __cond_join($cond, $utf8, $else);
 759
 760             # For 'generic', we also will want the latin1 UTF-8 variants for
 761             # the case where the input isn't UTF-8.
 762             my $latin1;
 763             if ($method eq 'generic_optree') {
 764                 $latin1 = $self->make_trie( 'latin1', 1);
 765                 $latin1= $self->_optree($latin1, 'depth', $opt{ret_type}, 0, 0);
 766             }
 767
 768             # If we want the UTF-8 invariants, get those.
 769             my $low;
 770             if ($opt{type} !~ /non_low|high/
 771                 && ($low= $self->make_trie( 'low', 1)))
 772             {
 773                 $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0 );
 774
 775                 # Expand out the UTF-8 invariants as a string so that we
 776                 # can use them as the conditional
 777                 $low = $self->_cond_as_str( $low, 0, \%opt);
 778
 779                 # If there are Latin1 variants, add a test for them.
 780                 if ($latin1) {
 781                     $else = __cond_join("(! is_utf8 )", $latin1, $else);
 782                 }
 783                 elsif ($method eq 'generic_optree') {
 784
 785                     # Otherwise for 'generic' only we know that what
 786                     # follows must be valid for just UTF-8 strings,
 787                     $else->{test} = "( is_utf8 && $else->{test} )";
 788                 }
 789
 790                 # If the invariants match, we are done; otherwise we have
 791                 # to go to the 'else' clause.
 792                 $else = __cond_join($low, 1, $else);
 793             }
 794             elsif ($latin1) {   # Here, didn't want or didn't have invariants,
 795                                 # but we do have latin variants
 796                 $else = __cond_join("(! is_utf8)", $latin1, $else);
 797             }
 798
 799             # We need at least one byte available to start off the tests
 800             $else = __cond_join("LIKELY((e) > (s))", $else, 0);
 801         }
 802         else {  # Here, we don't want or there aren't any variants.  A single
 803                 # byte available is enough.
 804             my $cond= "((e) > (s))";
 805             my $optree = $self->$method(%opt);
 806             $else= __cond_join( $cond, $optree, $else );
 807         }
 808     }
 809
 810     return $else;
 811 }
 812
 813 sub calculate_mask(@) {
 814     # Look at the input list of byte values.  This routine returns an array of
 815     # mask/base pairs to generate that list.
 816
 817     my @list = @_;
 818     my $list_count = @list;
 819
 820     # Consider a set of byte values, A, B, C ....  If we want to determine if
 821     # <c> is one of them, we can write c==A || c==B || c==C ....  If the
 822     # values are consecutive, we can shorten that to inRANGE(c, 'A', 'Z'),
 823     # which uses far fewer branches.  If only some of them are consecutive we
 824     # can still save some branches by creating range tests for just those that
 825     # are consecutive. _cond_as_str() does this work for looking for ranges.
 826     #
 827     # Another approach is to look at the bit patterns for A, B, C .... and see
 828     # if they have some commonalities.  That's what this function does.  For
 829     # example, consider a set consisting of the bytes
 830     # 0x42, 0x43, 0x62, and 0x63.  We could write:
 831     #   inRANGE(c, 0x42, 0x43) || inRANGE(c, 0x62, 0x63)
 832     # which through the magic of casting has not 4, but 2 tests.  But the
 833     # following mask/compare also works, and has just one test:
 834     #   (c & 0xDE) == 0x42
 835     # The reason it works is that the set consists of exactly the 4 bit
 836     # patterns which have either 0 or 1 in the two bit positions that are 0 in
 837     # the mask.  They have the same value in each bit position where the mask
 838     # is 1.  The comparison makes sure that the result matches all bytes which
 839     # match those six 1 bits exactly.  This can be applied to bytes that
 840     # differ in 1 through all 8 bit positions.  In order to be a candidate for
 841     # this optimization, the number of bytes in the set must be a power of 2.
 842     #
 843     # It may be that the bytes needing to be matched can't be done with a
 844     # single mask.  But it may be possible to have two (or more) sets, each
 845     # with a separate mask.  This function attempts to find some way to save
 846     # some branches using the mask technique.  If not, it returns an empty
 847     # list; if so, it returns a list consisting of
 848     #   [ [compare1, mask1], [compare2, mask2], ...
 849     #     [compare_n, undef], [compare_m, undef], ...
 850     #   ]
 851     # The <mask> is undef in the above for those bytes that must be tested
 852     # for individually.
 853     #
 854     # This function does not attempt to find the optimal set.  To do so would
 855     # probably require testing all possible combinations, and keeping track of
 856     # the current best one.
 857     #
 858     # There are probably much better algorithms, but this is the one I (khw)
 859     # came up with.  We start with doing a bit-wise compare of every byte in
 860     # the set with every other byte.  The results are sorted into arrays of
 861     # all those that differ by the same bit positions.  These are stored in a
 862     # hash with the each key being the bits they differ in.  Here is the hash
 863     # for the 0x53, 0x54, 0x73, 0x74 set:
 864     # {
 865     #    4 => {
 866     #            "0,1,2,5" => [
 867     #                            83,
 868     #                            116,
 869     #                            84,
 870     #                            115
 871     #                        ]
 872     #        },
 873     #    3 => {
 874     #            "0,1,2" => [
 875     #                        83,
 876     #                        84,
 877     #                        115,
 878     #                        116
 879     #                        ]
 880     #        }
 881     #    1 => {
 882     #            5 => [
 883     #                    83,
 884     #                    115,
 885     #                    84,
 886     #                    116
 887     #                ]
 888     #        },
 889     # }
 890     #
 891     # The set consisting of values which differ in the 4 bit positions 0, 1,
 892     # 2, and 5 from some other value in the set consists of all 4 values.
 893     # Likewise all 4 values differ from some other value in the 3 bit
 894     # positions 0, 1, and 2; and all 4 values differ from some other value in
 895     # the single bit position 5.  The keys at the uppermost level in the above
 896     # hash, 1, 3, and 4, give the number of bit positions that each sub-key
 897     # below it has.  For example, the 4 key could have as its value an array
 898     # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were
 899     # such.  The best optimization will group the most values into a single
 900     # mask.  The most values will be the ones that differ in the most
 901     # positions, the ones with the largest value for the topmost key.  These
 902     # keys, are thus just for convenience of sorting by that number, and do
 903     # not have any bearing on the core of the algorithm.
 904     #
 905     # We start with an element from largest number of differing bits.  The
 906     # largest in this case is 4 bits, and there is only one situation in this
 907     # set which has 4 differing bits, "0,1,2,5".  We look for any subset of
 908     # this set which has 16 values that differ in these 4 bits.  There aren't
 909     # any, because there are only 4 values in the entire set.  We then look at
 910     # the next possible thing, which is 3 bits differing in positions "0,1,2".
 911     # We look for a subset that has 8 values that differ in these 3 bits.
 912     # Again there are none.  So we go to look for the next possible thing,
 913     # which is a subset of 2**1 values that differ only in bit position 5.  83
 914     # and 115 do, so we calculate a mask and base for those and remove them
 915     # from every set.  Since there is only the one set remaining, we remove
 916     # them from just this one.  We then look to see if there is another set of
 917     # 2 values that differ in bit position 5.  84 and 116 do, so we calculate
 918     # a mask and base for those and remove them from every set (again only
 919     # this set remains in this example).  The set is now empty, and there are
 920     # no more sets to look at, so we are done.
 921
 922     if ($list_count == 256) {   # All 256 is trivially masked
 923         return (0, 0);
 924     }
 925
 926     my %hash;
 927
 928     # Generate bits-differing lists for each element compared against each
 929     # other element
 930     for my $i (0 .. $list_count - 2) {
 931         for my $j ($i + 1 .. $list_count - 1) {
 932             my @bits_that_differ = pop_count($list[$i] ^ $list[$j]);
 933             my $differ_count = @bits_that_differ;
 934             my $key = join ",", @bits_that_differ;
 935             push @{$hash{$differ_count}{$key}}, $list[$i]
 936                 unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}};
 937             push @{$hash{$differ_count}{$key}}, $list[$j];
 938         }
 939     }
 940
 941     print STDERR __LINE__, ": calculate_mask() called:  List of values grouped",
 942                                 " by differing bits: ", Dumper \%hash if DEBUG;
 943
 944     my @final_results;
 945     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 946         my $need = 2 ** $count;     # Need 8 values for 3 differing bits, etc
 947         foreach my $bits (sort keys $hash{$count}->%*) {
 948
 949             print STDERR __LINE__, ": For $count bit(s) difference ($bits),",
 950             " need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
 951
 952             # Look only as long as there are at least as many elements in the
 953             # subset as are needed
 954             while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) {
 955
 956                 print STDERR __LINE__, ": Looking at bit positions ($bits): ",
 957                                           Dumper $hash{$count}{$bits} if DEBUG;
 958
 959                 # Start with the first element in it
 960                 my $try_base = $hash{$count}{$bits}[0];
 961                 my @subset = $try_base;
 962
 963                 # If it succeeds, we return a mask and a base to compare
 964                 # against the masked value.  That base will be the AND of
 965                 # every element in the subset.  Initialize to the one element
 966                 # we have so far.
 967                 my $compare = $try_base;
 968
 969                 # We are trying to find a subset of this that has <need>
 970                 # elements that differ in the bit positions given by the
 971                 # string $bits, which is comma separated.
 972                 my @bits = split ",", $bits;
 973
 974                 TRY: # Look through the remainder of the list for other
 975                      # elements that differ only by these bit positions.
 976
 977                 for (my $i = 1; $i < $cur_count; $i++) {
 978                     my $try_this = $hash{$count}{$bits}[$i];
 979                     my @positions = pop_count($try_base ^ $try_this);
 980
 981                     print STDERR __LINE__, ": $try_base vs $try_this: is (",
 982                       join(',', @positions), ") a subset of ($bits)?" if DEBUG;
 983
 984                     foreach my $pos (@positions) {
 985                         unless (grep { $pos == $_ } @bits) {
 986                             print STDERR "  No\n" if DEBUG;
 987                             my $remaining = $cur_count - $i - 1;
 988                             if ($remaining && @subset + $remaining < $need) {
 989                                 print STDERR __LINE__, ": Can stop trying",
 990                                     " $try_base, because even if all the",
 991                                     " remaining $remaining values work, they",
 992                                     " wouldn't add up to the needed $need when",
 993                                     " combined with the existing ",
 994                                             scalar @subset, " ones\n" if DEBUG;
 995                                 last TRY;
 996                             }
 997                             next TRY;
 998                         }
 999                     }
1000
1001                     print STDERR "  Yes\n" if DEBUG;
1002                     push @subset, $try_this;
1003
1004                     # Add this to the mask base, in case it ultimately
1005                     # succeeds,
1006                     $compare &= $try_this;
1007                 }
1008
1009                 print STDERR __LINE__, ": subset (", join(", ", @subset),
1010                  ") has ", scalar @subset, " elements; needs $need\n" if DEBUG;
1011
1012                 if (@subset < $need) {
1013                     shift @{$hash{$count}{$bits}};
1014                     next;   # Try with next value
1015                 }
1016
1017                 # Create the mask
1018                 my $mask = 0;
1019                 foreach my $position (@bits) {
1020                     $mask |= 1 << $position;
1021                 }
1022                 $mask = ~$mask & 0xFF;
1023                 push @final_results, [$compare, $mask];
1024
1025                 printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n",
1026                                 __LINE__, $compare, $compare, $mask if DEBUG;
1027
1028                 # These values are now spoken for.  Remove them from future
1029                 # consideration
1030                 foreach my $remove_count (sort keys %hash) {
1031                     foreach my $bits (sort keys %{$hash{$remove_count}}) {
1032                         foreach my $to_remove (@subset) {
1033                             @{$hash{$remove_count}{$bits}}
1034                                     = grep { $_ != $to_remove }
1035                                                 @{$hash{$remove_count}{$bits}};
1036                         }
1037                     }
1038                 }
1039             }
1040         }
1041     }
1042
1043     # Any values that remain in the list are ones that have to be tested for
1044     # individually.
1045     my @individuals;
1046     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
1047         foreach my $bits (sort keys $hash{$count}->%*) {
1048             foreach my $remaining (@{$hash{$count}{$bits}}) {
1049
1050                 # If we already know about this value, just ignore it.
1051                 next if grep { $remaining == $_ } @individuals;
1052
1053                 # Otherwise it needs to be returned as something to match
1054                 # individually
1055                 push @final_results, [$remaining, undef];
1056                 push @individuals, $remaining;
1057             }
1058         }
1059     }
1060
1061     # Sort by increasing numeric value
1062     @final_results = sort { $a->[0] <=> $b->[0] } @final_results;
1063
1064     print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG;
1065
1066     return @final_results;
1067 }
1068
1069 # _cond_as_str
1070 # turn a list of conditions into a text expression
1071 # - merges ranges of conditions, and joins the result with ||
1072 sub _cond_as_str {
1073     my ( $self, $op, $combine, $opts_ref )= @_;
1074     my @cond = ();
1075     @cond = $op->{vals}->@* if defined $op->{vals};
1076     my $test= $op->{test};
1077     my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
1078     my $charset = $opts_ref->{charset};
1079     return "( $test )" unless @cond;
1080
1081     my (@ranges, @native_ranges);
1082     my @native_conds;
1083
1084     # rangify the list.  As we encounter a new value, it is placed in a new
1085     # subarray by itself.  If the next value is adjacent to it, the end point
1086     # of the subarray is merely incremented; and so on.  When the next value
1087     # that isn't adjacent to the previous one is encountered, Update() is
1088     # called to hoist any single-element subarray to be a scalar.
1089     my $Update= sub {
1090         # We skip this if there are optimizations that
1091         # we can apply (below) to the individual ranges
1092         if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
1093             $ranges[-1] = $ranges[-1][0] if $ranges[-1][0] == $ranges[-1][1];
1094         }
1095     };
1096
1097     # Parse things twice, using different approaches for representing things,
1098     # afterwards choosing the alternative with the fewest branches
1099     for my $i (0, 1) {
1100
1101         # Should we avoid using mnemonics for code points?
1102         my $always_hex = 0;
1103
1104         # The second pass is all about using a transformation to see if it
1105         # creates contiguous blocks that lead to fewer ranges or masking.  But
1106         # single element ranges don't have any benefit, and so the transform
1107         # is just extra work for them.  '$range_test' includes the transform
1108         # for multi-element ranges, and '$original' maps a byte back to what
1109         # it was without being transformed.  Thus we use '$range_test' and the
1110         # transormed bytes on multi-element ranges, and plain '$test' and
1111         # '$original' on single ones.  In the first pass these are effectively
1112         # no-ops.
1113         my $range_test = $test;
1114         my $original = \@identity;
1115
1116         if ($i) {   # 2nd pass
1117             # The second pass is only for non-ascii character sets, to see if
1118             # a transform to Unicode/ASCII saves anything.
1119             last if $charset =~ /ascii/i;
1120
1121             # If the first pass came up with a single range, we won't be able
1122             # to do better than that, so don't try.
1123             last if @ranges == 1;
1124
1125             # We calculated the native values the first iteration
1126             @native_ranges = @ranges;
1127             @native_conds = @cond;
1128
1129             # Start fresh
1130             undef @ranges;
1131             undef @cond;
1132
1133             # Determine the translation function, to/from UTF-8 or Latin1, and
1134             # the corresponding transform of the condition to match
1135             my $lookup;
1136             if ($opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi) {
1137                 $lookup = $utf_2_I8{$charset};
1138                 $original = $I8_2_utf{$charset};
1139                 $range_test = "NATIVE_UTF8_TO_I8($test)";
1140             }
1141             else {
1142                 $lookup = $n2a{$charset};
1143                 $original = $a2n{$charset};
1144                 $range_test = "NATIVE_TO_LATIN1($test)";
1145             }
1146
1147             # Translate the native conditions (bytes) into the Unicode ones
1148             for my $condition (@native_conds) {
1149                 push @cond, $lookup->[$condition];
1150             }
1151
1152             # 'f' won't be the expected 'f' on this box
1153             $always_hex = 1;
1154         }
1155
1156         # Go through the code points (@cond) and collapse them as much as
1157         # possible into ranges
1158         for my $condition ( @cond ) {
1159             if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
1160                 # Not adjacent to the existing range.  Remove that from being a
1161                 # range if only a single value;
1162                 $Update->();
1163                 push @ranges, [ $condition, $condition ];
1164             } else {    # Adjacent to the existing range; add to the range
1165                 $ranges[-1][1]++;
1166             }
1167         }
1168         $Update->();
1169
1170         # _combine is used for cp type matching.  By having it here return, no
1171         # second pass is done.  It could conceivably be restructured to have a
1172         # second pass, but no current uses of script would actually gain any
1173         # advantage by doing so, so the work hasn't been further considered.
1174         return $self->_combine( $test, @ranges ) if $combine;
1175
1176         # If the input set has certain characteristics, we can optimize tests
1177         # for it.
1178
1179         # If all bytes match, is trivially true; we don't need a 2nd pass
1180         return 1 if @cond == 256;
1181
1182         # If this is a single UTF-8 range which includes all possible
1183         # continuation bytes, and we aren't checking for well-formedness, this
1184         # is trivially true.
1185         #
1186         # (In EBCDIC, this won't happen until the 2nd pass transforms the
1187         # disjoint continuation byte ranges into a single I8 one.)
1188         if (     @ranges == 1
1189             && ! $opts_ref->{safe}
1190             && ! $opts_ref->{no_length_checks}
1191             &&   $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi
1192             &&   $ranges[0]->[1] == 0xBF
1193             &&   $ranges[0]->[0] == (($charset =~ /ascii/i)
1194                                         ? 0x80 : 0xA0))
1195         {
1196             return 1;
1197         }
1198
1199         my $loop_start = 0;
1200         if (ref $ranges[0] && $ranges[0]->[0] == 0) {
1201
1202             # If the first range matches all 256 possible bytes, it is
1203             # trivially true.
1204             if ($ranges[0]->[1] == 0xFF) {
1205                 die "Range spanning all bytes must be the only one"
1206                                                                 if @ranges > 1;
1207                 return 1;
1208             }
1209
1210             # Here, the first range starts at 0, but doesn't match everything.
1211             # But the condition doesn't have to worry about being < 0
1212             $ranges[0] = "( $test <= "
1213                         . $self->val_fmt($ranges[0]->[1], $always_hex) . " )";
1214             $loop_start++;
1215         }
1216
1217         my $loop_end = @ranges;
1218         if (   @ranges
1219             && ref $ranges[-1]
1220             && $ranges[-1]->[1] == 0xFF
1221             && $ranges[-1]->[0] != 0xFF)
1222         {
1223             # If the final range consists of more than one byte ending with
1224             # the highest possible one, the condition doesn't have to worry
1225             # about being > FF
1226             $ranges[-1] = "( $test >= "
1227                         . $self->val_fmt($ranges[-1]->[0], $always_hex) . " )";
1228             $loop_end--;
1229         }
1230
1231         # Look at each range to see if there any optimizations.  The
1232         # formatting may be thrown away, so might be wasted effort; and khw
1233         # supposes this could be restructured to delay that until the final
1234         # method is chosen.  But that would be more coding work than
1235         # warranted, as this is executed not that many times during a
1236         # development cycle.
1237         for (my $i = $loop_start; $i < $loop_end; $i++) {
1238             if (! ref $ranges[$i]) {    # Trivial case: no range
1239                 $ranges[$i] =
1240                     $self->val_fmt($original->[$ranges[$i]], $always_hex)
1241                   . " == $test";
1242             }
1243             elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
1244                 $ranges[$i] =           # Trivial case: single element range
1245                      $self->val_fmt($original->[$ranges[$i]->[0]], $always_hex)
1246                    . " == $test";
1247             }
1248             else {
1249                 $ranges[$i] = "inRANGE_helper_(U8, $range_test, "
1250                         . $self->val_fmt($ranges[$i]->[0], $always_hex) .", "
1251                         . $self->val_fmt($ranges[$i]->[1], $always_hex) . ")";
1252             }
1253         }
1254
1255         # Here, have collapsed the matched code points into ranges.  This code
1256         # also sees if some of those different ranges have bit patterns which
1257         # causes them to be combinable by ANDing with a mask.  There's no need
1258         # to do this if we are already down to a single range.
1259         next unless @ranges > 1;
1260
1261         my @masks = calculate_mask(@cond);
1262
1263         # Stringify the output of calculate_mask()
1264         if (@masks) {
1265             my @masked;
1266             foreach my $mask_ref (@masks) {
1267                 if (defined $mask_ref->[1]) {
1268                     push @masked, "( ( $range_test & "
1269                         . $self->val_fmt($mask_ref->[1], $always_hex) . " ) == "
1270                         . $self->val_fmt($mask_ref->[0], $always_hex) . " )";
1271                 }
1272                 else {  # An undefined mask means to use the value as-is
1273                     push @masked, "$test == "
1274                     . $self->val_fmt($original->[$mask_ref->[0]], $always_hex);
1275                 }
1276             }
1277
1278             # The best possible case below for specifying this set of values via
1279             # ranges is 1 branch per range.  If our mask method yielded better
1280             # results, there is no sense trying something that is bound to be
1281             # worse.
1282             if (@masked < @ranges) {
1283                 @ranges = @masked;
1284                 next;
1285             }
1286
1287             @masks = @masked;
1288         }
1289
1290         # If we found some mask possibilities, and they have fewer
1291         # conditionals in them than the plain range method, convert to use the
1292         # masks.
1293         @ranges = @masks if @masks && @masks < @ranges;
1294     }  # End of both passes
1295
1296     # If the two passes came up with two sets, use the one with the fewest
1297     # conditionals (the number of ranges is a proxy for that).  If both have
1298     # the same number, prefer the native, as that omits transformations.
1299     if (@native_ranges && @native_ranges <= @ranges) {
1300         @ranges = @native_ranges;
1301         @cond = @native_conds;
1302     }
1303
1304     return "( " . join( " || ", @ranges) . " )";
1305 }
1306
1307 # _combine
1308 # recursively turn a list of conditions into a fast break-out condition
1309 # used by _cond_as_str() for 'cp' type macros.
1310 sub _combine {
1311     my ( $self, $test, @cond )= @_;
1312     return if !@cond;
1313     my $item= shift @cond;
1314     my ( $cstr, $gtv );
1315     if ( ref $item ) {  # @item should be a 2-element array giving range start
1316                         # and end
1317         if ($item->[0] == 0) {  # UV's are never negative, so skip "0 <= "
1318                                 # test which could generate a compiler warning
1319                                 # that test is always true
1320             $cstr= "$test <= " . $self->val_fmt($item->[1]);
1321         }
1322         else {
1323             $cstr = "inRANGE_helper_(UV, $test, "
1324                   . $self->val_fmt($item->[0]) . ", "
1325                   . $self->val_fmt($item->[1]) . ")";
1326         }
1327         $gtv= $self->val_fmt($item->[1]);
1328     } else {
1329         $cstr= $self->val_fmt($item) . " == $test";
1330         $gtv= $self->val_fmt($item)
1331     }
1332     if ( @cond ) {
1333         my $combine= $self->_combine( $test, @cond );
1334         if (@cond >1) {
1335             return "( $cstr || ( $gtv < $test &&\n"
1336                    . $combine . " ) )";
1337         } else {
1338             return "( $cstr || $combine )";
1339         }
1340     } else {
1341         return $cstr;
1342     }
1343 }
1344
1345 # _render()
1346 # recursively convert an optree to text with reasonably neat formatting
1347 sub _render {
1348     my ( $self, $op, $combine, $brace, $opts_ref, $def, $submacros )= @_;
1349     return 0 if ! defined $op;  # The set is empty
1350     if ( !ref $op ) {
1351         return $op;
1352     }
1353     my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
1354     #no warnings 'recursion';   # This would allow really really inefficient
1355                                 # code to be generated.  See pod
1356     my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref, $def,
1357                                                                     $submacros);
1358     return $yes if $cond eq '1';
1359
1360     my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref, $def,
1361                                                                     $submacros);
1362     return "( $cond )" if $yes eq '1' and $no eq '0';
1363     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
1364     return "$lb$cond ? $yes : $no$rb"
1365       if !ref( $op->{yes} ) && !ref( $op->{no} );
1366     my $ind1= " " x 4;
1367     my $ind= "\n" . ( $ind1 x $op->{depth} );
1368
1369     if ( ref $op->{yes} ) {
1370         $yes= $ind . $ind1 . $yes;
1371     } else {
1372         $yes= " " . $yes;
1373     }
1374
1375     my $str= "$lb$cond ?$yes$ind: $no$rb";
1376     if (length $str > 6000) {
1377         push @$submacros, sprintf "#define $def\n( %s )", "_part"
1378                                   . (my $yes_idx= 0+@$submacros) . "_", $yes;
1379         push @$submacros, sprintf "#define $def\n( %s )", "_part"
1380                                   . (my $no_idx= 0+@$submacros) . "_", $no;
1381         return sprintf "%s%s ? $def : $def%s", $lb, $cond,
1382                                     "_part${yes_idx}_", "_part${no_idx}_", $rb;
1383     }
1384     return $str;
1385 }
1386
1387 # $expr=render($op,$combine)
1388 #
1389 # convert an optree to text with reasonably neat formatting. If $combine
1390 # is true then the condition is created using "fast breakouts" which
1391 # produce uglier expressions that are more efficient for common case,
1392 # longer lists such as that resulting from type 'cp' output.
1393 # Currently only used for type 'cp' macros.
1394 sub render {
1395     my ( $self, $op, $combine, $opts_ref, $def_fmt )= @_;
1396
1397     my @submacros;
1398     my $macro= sprintf "#define $def_fmt\n( %s )", "",
1399                        $self->_render( $op, $combine, 0, $opts_ref, $def_fmt,
1400                                                                  \@submacros);
1401
1402     return join "\n\n",
1403             map { "/*** GENERATED CODE ***/\n" . __macro( __clean( $_ ) ) }
1404                                                             @submacros, $macro;
1405 }
1406
1407 # make_macro
1408 # make a macro of a given type.
1409 # calls into make_trie and (generic_|length_)optree as needed
1410 # Opts are:
1411 # type             : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
1412 # ret_type         : 'cp' or 'len'
1413 # safe             : don't assume is well-formed UTF-8, so don't skip any range
1414 #                    checks, and add length guards to macro
1415 # no_length_checks : like safe, but don't add length guards.
1416 #
1417 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
1418 # in which case it defaults to 'cp' as well.
1419 #
1420 # It is illegal to do a type 'cp' macro on a pattern with multi-codepoint
1421 # sequences in it, as the generated macro will accept only a single codepoint
1422 # as an argument.
1423 #
1424 # It is also illegal to do a non-safe macro on a pattern with multi-codepoint
1425 # sequences in it, as even if it is known to be well-formed, we need to not
1426 # run off the end of the buffer when, say, the buffer ends with the first two
1427 # characters, but three are looked at by the macro.
1428 #
1429 # returns the macro.
1430
1431
1432 sub make_macro {
1433     my $self= shift;
1434     my %opts= @_;
1435     my $type= $opts{type} || 'generic';
1436     if ($self->{has_multi}) {
1437         if ($type =~ /^cp/) {
1438             die "Can't do a 'cp' on multi-codepoint character class"
1439               . " '$self->{op}'"
1440         }
1441         elsif (! $opts{safe}) {
1442             die "'safe' is required on multi-codepoint character class"
1443                ." '$self->{op}'"
1444         }
1445     }
1446     my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' );
1447     my $method;
1448     if ( $opts{safe} ) {
1449         $method= 'length_optree';
1450     } elsif ( $type =~ /generic/ ) {
1451         $method= 'generic_optree';
1452     } else {
1453         $method= 'optree';
1454     }
1455     my @args= $type =~ /^cp/ ? 'cp' : 's';
1456     push @args, "e" if $opts{safe};
1457     push @args, "is_utf8" if $type =~ /generic/;
1458     push @args, "len" if $ret_type eq 'both';
1459     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
1460              $ret_type eq 'cp'      ? 'what_'     : 'is_';
1461     my $ext= $type     =~ /generic/ ? ''          : '_' . lc( $type );
1462     $ext .= '_non_low' if $type eq 'generic_non_low';
1463     $ext .= "_safe" if $opts{safe};
1464     $ext .= "_no_length_checks" if $opts{no_length_checks};
1465     my $argstr= join ",", @args;
1466     my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
1467     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
1468     return $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts, $def_fmt );
1469 }
1470
1471 # if we aren't being used as a module (highly likely) then process
1472 # the __DATA__ below and produce macros in regcharclass.h
1473 # if an argument is provided to the script then it is assumed to
1474 # be the path of the file to output to, if the arg is '-' outputs
1475 # to STDOUT.
1476 if ( !caller ) {
1477     $|++;
1478     my $path= shift @ARGV || "regcharclass.h";
1479     my $out_fh;
1480     if ( $path eq '-' ) {
1481         $out_fh= \*STDOUT;
1482     } else {
1483         $out_fh = open_new( $path );
1484     }
1485     print $out_fh read_only_top( lang => 'C', by => $0,
1486                                  file => 'regcharclass.h', style => '*',
1487                                  copyright => [2007, 2011],
1488                                  final => <<EOF,
1489 WARNING: These macros are for internal Perl core use only, and may be
1490 changed or removed without notice.
1491 EOF
1492     );
1493     print $out_fh "\n#ifndef PERL_REGCHARCLASS_H_ /* Guard against nested",
1494                   " #includes */\n#define PERL_REGCHARCLASS_H_\n";
1495
1496     my ( $op, $title, @txt, @types, %mods );
1497     my $doit= sub ($) {
1498         return unless $op;
1499
1500         my $charset = shift;
1501
1502         # Skip if to compile on a different platform.
1503         return if delete $mods{only_ascii_platform} && $charset !~ /ascii/i;
1504         return if delete $mods{only_ebcdic_platform} && $charset !~ /ebcdic/i;
1505
1506         print $out_fh "/*\n\t$op: $title\n\n";
1507         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1508         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt,
1509                                                         charset => $charset);
1510
1511         #die Dumper(\@types,\%mods);
1512
1513         my @mods;
1514         push @mods, 'safe' if delete $mods{safe};
1515         push @mods, 'no_length_checks' if delete $mods{no_length_checks};
1516
1517         # Default to 'fast' do this one first, as traditional
1518         unshift @mods, 'fast' if delete $mods{fast} || ! @mods;
1519         if (%mods) {
1520             die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods;
1521         }
1522
1523         foreach my $type_spec ( @types ) {
1524             my ( $type, $ret )= split /-/, $type_spec;
1525             $ret ||= 'len';
1526             foreach my $mod ( @mods ) {
1527
1528                 # 'safe' is irrelevant with code point macros, so skip if
1529                 # there is also a 'fast', but don't skip if this is the only
1530                 # way a cp macro will get generated.  Below we convert 'safe'
1531                 # to 'fast' in this instance
1532                 next if $type =~ /^cp/
1533                         && ($mod eq 'safe' || $mod eq 'no_length_checks')
1534                         && grep { 'fast' =~ $_ } @mods;
1535                 delete $mods{$mod};
1536                 my $macro= $obj->make_macro(
1537                     type     => $type,
1538                     ret_type => $ret,
1539                     safe     => $mod eq 'safe' && $type !~ /^cp/,
1540                     charset  => $charset,
1541                     no_length_checks => $mod eq 'no_length_checks'
1542                                      && $type !~ /^cp/,
1543                 );
1544                 print $out_fh $macro, "\n";
1545             }
1546         }
1547     };
1548
1549     my @data = <DATA>;
1550     foreach my $charset (get_supported_code_pages()) {
1551         my $first_time = 1;
1552         undef $op;
1553         undef $title;
1554         undef @txt;
1555         undef @types;
1556         undef %mods;
1557         print $out_fh "\n", get_conditional_compile_line_start($charset);
1558         my @data_copy = @data;
1559         for (@data_copy) {
1560             s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
1561             next unless /\S/;
1562             chomp;
1563             if ( /^[A-Z]/ ) {
1564                 $doit->($charset) unless $first_time;  # This starts a new
1565                                                        # definition; do the
1566                                                        # previous one
1567                 $first_time = 0;
1568                 ( $op, $title )= split /\s*:\s*/, $_, 2;
1569                 @txt= ();
1570             } elsif ( s/^=>// ) {
1571                 my ( $type, $modifier )= split /:/, $_;
1572                 @types= split ' ', $type;
1573                 undef %mods;
1574                 map { $mods{$_} = 1 } split ' ',  $modifier;
1575             } else {
1576                 push @txt, "$_";
1577             }
1578         }
1579         $doit->($charset);
1580         print $out_fh get_conditional_compile_line_end();
1581     }
1582
1583     print $out_fh "\n#endif /* PERL_REGCHARCLASS_H_ */\n";
1584
1585     if($path eq '-') {
1586         print $out_fh "/* ex: set ro: */\n";
1587     } else {
1588         # Some of the sources for these macros come from Unicode tables
1589         my $sources_list = "lib/unicore/mktables.lst";
1590         my @sources = ($0, qw(lib/unicore/mktables
1591                               lib/Unicode/UCD.pm
1592                               regen/regcharclass_multi_char_folds.pl
1593                               regen/charset_translations.pl
1594                              ));
1595         {
1596             # Depend on mktables’ own sources.  It’s a shorter list of files than
1597             # those that Unicode::UCD uses.
1598             if (! open my $mktables_list, '<', $sources_list) {
1599
1600                 # This should force a rebuild once $sources_list exists
1601                 push @sources, $sources_list;
1602             }
1603             else {
1604                 while(<$mktables_list>) {
1605                     last if /===/;
1606                     chomp;
1607                     push @sources, "lib/unicore/$_" if /^[^#]/;
1608                 }
1609             }
1610         }
1611         read_only_bottom_close_and_rename($out_fh, \@sources)
1612     }
1613 }
1614
1615 # The form of the input is a series of definitions to make macros for.
1616 # The first line gives the base name of the macro, followed by a colon, and
1617 # then text to be used in comments associated with the macro that are its
1618 # title or description.  In all cases the first (perhaps only) parameter to
1619 # the macro is a pointer to the first byte of the code point it is to test to
1620 # see if it is in the class determined by the macro.  In the case of non-UTF8,
1621 # the code point consists only of a single byte.
1622 #
1623 # The second line must begin with a '=>' and be followed by the types of
1624 # macro(s) to be generated; these are specified below.  A colon follows the
1625 # types, followed by the modifiers, also specified below.  At least one
1626 # modifier is required.
1627 #
1628 # The subsequent lines give what code points go into the class defined by the
1629 # macro.  Multiple characters may be specified via a string like "\x0D\x0A",
1630 # enclosed in quotes.  Otherwise the lines consist of one of:
1631 #   1)  a single Unicode code point, prefaced by 0x
1632 #   2)  a single range of Unicode code points separated by a minus (and
1633 #       optional space)
1634 #   3)  a single Unicode property specified in the standard Perl form
1635 #       "\p{...}"
1636 #   4)  a line like 'do path'.  This will do a 'do' on the file given by
1637 #       'path'.  It is assumed that this does nothing but load subroutines
1638 #       (See item 5 below).  The reason 'require path' is not used instead is
1639 #       because 'do' doesn't assume that path is in @INC.
1640 #   5)  a subroutine call
1641 #           &pkg::foo(arg1, ...)
1642 #       where pkg::foo was loaded by a 'do' line (item 4).  The subroutine
1643 #       returns an array of entries of forms like items 1-3 above.  This
1644 #       allows more complex inputs than achievable from the other input types.
1645 #
1646 # A blank line or one whose first non-blank character is '#' is a comment.
1647 # The definition of the macro is terminated by a line unlike those described.
1648 #
1649 # Valid types:
1650 #   low         generate a macro whose name is 'is_BASE_low' and defines a
1651 #               class that includes only ASCII-range chars.  (BASE is the
1652 #               input macro base name.)
1653 #   latin1      generate a macro whose name is 'is_BASE_latin1' and defines a
1654 #               class that includes only upper-Latin1-range chars.  It is not
1655 #               designed to take a UTF-8 input parameter.
1656 #   high        generate a macro whose name is 'is_BASE_high' and defines a
1657 #               class that includes all relevant code points that are above
1658 #               the Latin1 range.  This is for very specialized uses only.
1659 #               It is designed to take only an input UTF-8 parameter.
1660 #   utf8        generate a macro whose name is 'is_BASE_utf8' and defines a
1661 #               class that includes all relevant characters that aren't ASCII.
1662 #               It is designed to take only an input UTF-8 parameter.
1663 #   LATIN1      generate a macro whose name is 'is_BASE_latin1' and defines a
1664 #               class that includes both ASCII and upper-Latin1-range chars.
1665 #               It is not designed to take a UTF-8 input parameter.
1666 #   UTF8        generate a macro whose name is 'is_BASE_utf8' and defines a
1667 #               class that can include any code point, adding the 'low' ones
1668 #               to what 'utf8' works on.  It is designed to take only an input
1669 #               UTF-8 parameter.
1670 #   generic     generate a macro whose name is 'is_BASE".  It has a 2nd,
1671 #               boolean, parameter which indicates if the first one points to
1672 #               a UTF-8 string or not.  Thus it works in all circumstances.
1673 #   generic_non_low generate a macro whose name is 'is_BASE_non_low".  It has
1674 #               a 2nd, boolean, parameter which indicates if the first one
1675 #               points to a UTF-8 string or not.  It excludes any ASCII-range
1676 #               matches, but otherwise it works in all circumstances.
1677 #   cp          generate a macro whose name is 'is_BASE_cp' and defines a
1678 #               class that returns true if the UV parameter is a member of the
1679 #               class; false if not.
1680 #   cp_high     like cp, but it is assumed that it is known that the UV
1681 #               parameter is above Latin1.  The name of the generated macro is
1682 #               'is_BASE_cp_high'.  This is different from high-cp, derived
1683 #               below.
1684 # A macro of the given type is generated for each type listed in the input.
1685 # The default return value is the number of octets read to generate the match.
1686 # Append "-cp" to the type to have it instead return the matched codepoint.
1687 #               The macro name is changed to 'what_BASE...'.  See pod for
1688 #               caveats
1689 # Appending '-both" instead adds an extra parameter to the end of the argument
1690 #               list, which is a pointer as to where to store the number of
1691 #               bytes matched, while also returning the code point.  The macro
1692 #               name is changed to 'what_len_BASE...'.  See pod for caveats
1693 #
1694 # Valid modifiers:
1695 #   safe        The input string is not necessarily valid UTF-8.  In
1696 #               particular an extra parameter (always the 2nd) to the macro is
1697 #               required, which points to one beyond the end of the string.
1698 #               The macro will make sure not to read off the end of the
1699 #               string.  In the case of non-UTF8, it makes sure that the
1700 #               string has at least one byte in it.  The macro name has
1701 #               '_safe' appended to it.
1702 #   no_length_checks  The input string is not necessarily valid UTF-8, but it
1703 #               is to be assumed that the length has already been checked and
1704 #               found to be valid
1705 #   fast        The input string is valid UTF-8.  No bounds checking is done,
1706 #               and the macro can make assumptions that lead to faster
1707 #               execution.
1708 #   only_ascii_platform   Skip this definition if the character set is for
1709 #               a non-ASCII platform.
1710 #   only_ebcdic_platform  Skip this definition if the character set is for
1711 #               a non-EBCDIC platform.
1712 # No modifier need be specified; fast is assumed for this case.  If both
1713 # 'fast', and 'safe' are specified, two macros will be created for each
1714 # 'type'.
1715 #
1716 # If run on a non-ASCII platform will automatically convert the Unicode input
1717 # to native.  The documentation above is slightly wrong in this case.  'low'
1718 # actually refers to code points whose UTF-8 representation is the same as the
1719 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1720 # code points less than 256.
1721
1722 1; # in the unlikely case we are being used as a module
1723
1724 __DATA__
1725 # This is no longer used, but retained in case it is needed some day.
1726 # TRICKYFOLD: Problematic fold case letters.  When adding to this list, also should add them to regcomp.c and fold_grind.t
1727 # => generic cp generic-cp generic-both :fast safe
1728 # 0x00DF        # LATIN SMALL LETTER SHARP S
1729 # 0x0390        # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1730 # 0x03B0        # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1731 # 0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1732 # 0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1733 # 0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1734
1735 LNBREAK: Line Break: \R
1736 => generic UTF8 LATIN1 : safe
1737 "\x0D\x0A"      # CRLF - Network (Windows) line ending
1738 \p{VertSpace}
1739
1740 HORIZWS: Horizontal Whitespace: \h \H
1741 => high cp_high : fast
1742 \p{HorizSpace}
1743
1744 VERTWS: Vertical Whitespace: \v \V
1745 => high cp_high : fast
1746 \p{VertSpace}
1747
1748 XDIGIT: Hexadecimal digits
1749 => high cp_high : fast
1750 \p{XDigit}
1751
1752 XPERLSPACE: \p{XPerlSpace}
1753 => high cp_high : fast
1754 \p{XPerlSpace}
1755
1756 NONCHAR: Non character code points
1757 => UTF8 :safe
1758 \p{_Perl_Nchar}
1759
1760 SHORTER_NON_CHARS:  # 3 bytes
1761 => UTF8 :only_ascii_platform fast
1762 0xFDD0 - 0xFDEF
1763 0xFFFE - 0xFFFF
1764
1765 LARGER_NON_CHARS:   # 4 bytes
1766 => UTF8 :only_ascii_platform fast
1767 0x1FFFE - 0x1FFFF
1768 0x2FFFE - 0x2FFFF
1769 0x3FFFE - 0x3FFFF
1770 0x4FFFE - 0x4FFFF
1771 0x5FFFE - 0x5FFFF
1772 0x6FFFE - 0x6FFFF
1773 0x7FFFE - 0x7FFFF
1774 0x8FFFE - 0x8FFFF
1775 0x9FFFE - 0x9FFFF
1776 0xAFFFE - 0xAFFFF
1777 0xBFFFE - 0xBFFFF
1778 0xCFFFE - 0xCFFFF
1779 0xDFFFE - 0xDFFFF
1780 0xEFFFE - 0xEFFFF
1781 0xFFFFE - 0xFFFFF
1782 0x10FFFE - 0x10FFFF
1783
1784 SHORTER_NON_CHARS:  # 4 bytes
1785 => UTF8 :only_ebcdic_platform fast
1786 0xFDD0 - 0xFDEF
1787 0xFFFE - 0xFFFF
1788 0x1FFFE - 0x1FFFF
1789 0x2FFFE - 0x2FFFF
1790 0x3FFFE - 0x3FFFF
1791
1792 LARGER_NON_CHARS:   # 5 bytes
1793 => UTF8 :only_ebcdic_platform fast
1794 0x4FFFE - 0x4FFFF
1795 0x5FFFE - 0x5FFFF
1796 0x6FFFE - 0x6FFFF
1797 0x7FFFE - 0x7FFFF
1798 0x8FFFE - 0x8FFFF
1799 0x9FFFE - 0x9FFFF
1800 0xAFFFE - 0xAFFFF
1801 0xBFFFE - 0xBFFFF
1802 0xCFFFE - 0xCFFFF
1803 0xDFFFE - 0xDFFFF
1804 0xEFFFE - 0xEFFFF
1805 0xFFFFE - 0xFFFFF
1806 0x10FFFE - 0x10FFFF
1807
1808 # Note that code in utf8.c is counting on the 'fast' version to look at no
1809 # more than two bytes
1810 SURROGATE: Surrogate code points
1811 => UTF8 :safe fast
1812 \p{_Perl_Surrogate}
1813
1814 QUOTEMETA: Meta-characters that \Q should quote
1815 => high :fast
1816 \p{_Perl_Quotemeta}
1817
1818 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1819 => UTF8 UTF8-cp :safe
1820 %regcharclass_multi_char_folds::multi_char_folds('u', 'a')
1821
1822 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1823 => LATIN1 LATIN1-cp : safe
1824 %regcharclass_multi_char_folds::multi_char_folds('l', 'a')
1825
1826 THREE_CHAR_FOLD: A three-character multi-char fold
1827 => UTF8 :safe
1828 %regcharclass_multi_char_folds::multi_char_folds('u', '3')
1829
1830 THREE_CHAR_FOLD: A three-character multi-char fold
1831 => LATIN1 :safe
1832 %regcharclass_multi_char_folds::multi_char_folds('l', '3')
1833
1834 THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
1835 => UTF8 :safe
1836 %regcharclass_multi_char_folds::multi_char_folds('u', 'h')
1837
1838 THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
1839 => LATIN1 :safe
1840 %regcharclass_multi_char_folds::multi_char_folds('l', 'h')
1841 #
1842 #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
1843 #=> UTF8 :safe
1844 #%regcharclass_multi_char_folds::multi_char_folds('u', 'fm')
1845 #
1846 #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
1847 #=> LATIN1 :safe
1848 #%regcharclass_multi_char_folds::multi_char_folds('l', 'fm')
1849
1850 FOLDS_TO_MULTI: characters that fold to multi-char strings
1851 => UTF8 :fast
1852 \p{_Perl_Folds_To_Multi_Char}
1853
1854 PROBLEMATIC_LOCALE_FOLD : characters whose fold is problematic under locale
1855 => UTF8 cp :fast
1856 \p{_Perl_Problematic_Locale_Folds}
1857
1858 PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are problematic under locale
1859 => UTF8 cp :fast
1860 \p{_Perl_Problematic_Locale_Foldeds_Start}
1861
1862 PATWS: pattern white space
1863 => generic : safe
1864 \p{_Perl_PatWS}
1865
1866 HANGUL_ED: Hangul syllables whose first UTF-8 byte is \xED
1867 => UTF8 :only_ascii_platform safe
1868 0xD000 - 0xD7FF
1869
1870 HANGUL_ED: Hangul syllables whose first UTF-8 byte is \xED
1871 => UTF8 :only_ebcdic_platform safe
1872 0x1 - 0x0
1873 # Alows fails on EBCDIC; there are no ED Hanguls there