regen/regcharclass.pl

   1 #!perl
   2 package CharClass::Matcher;
   3 use strict;
   4 use 5.008;
   5 use warnings;
   6 use warnings FATAL => 'all';
   7 use Data::Dumper;
   8 $Data::Dumper::Useqq= 1;
   9
  10 sub DEBUG () { 0 }
  11 $|=1 if DEBUG;
  12
  13 require './regen/regen_lib.pl';
  14 require './regen/charset_translations.pl';
  15 require "./regen/regcharclass_multi_char_folds.pl";
  16
  17 =head1 NAME
  18
  19 CharClass::Matcher -- Generate C macros that match character classes efficiently
  20
  21 =head1 SYNOPSIS
  22
  23     perl regen/regcharclass.pl
  24
  25 =head1 DESCRIPTION
  26
  27 Dynamically generates macros for detecting special charclasses
  28 in latin-1, utf8, and codepoint forms. Macros can be set to return
  29 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
  30
  31 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
  32 are necessary.
  33
  34 Using WHATEVER as an example the following macros can be produced, depending
  35 on the input parameters (how to get each is described by internal comments at
  36 the C<__DATA__> line):
  37
  38 =over 4
  39
  40 =item C<is_WHATEVER(s,is_utf8)>
  41
  42 =item C<is_WHATEVER_safe(s,e,is_utf8)>
  43
  44 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
  45 comparisons involving octet<128 are done before checking the C<is_utf8>
  46 flag, hopefully saving time.
  47
  48 The version without the C<_safe> suffix should be used only when the input is
  49 known to be well-formed.
  50
  51 =item C<is_WHATEVER_utf8(s)>
  52
  53 =item C<is_WHATEVER_utf8_safe(s,e)>
  54
  55 Do a lookup assuming the string is encoded in (normalized) UTF8.
  56
  57 The version without the C<_safe> suffix should be used only when the input is
  58 known to be well-formed.
  59
  60 =item C<is_WHATEVER_latin1(s)>
  61
  62 =item C<is_WHATEVER_latin1_safe(s,e)>
  63
  64 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  65
  66 The version without the C<_safe> suffix should be used only when it is known
  67 that C<s> contains at least one character.
  68
  69 =item C<is_WHATEVER_cp(cp)>
  70
  71 Check to see if the string matches a given codepoint (hypothetically a
  72 U32). The condition is constructed as to "break out" as early as
  73 possible if the codepoint is out of range of the condition.
  74
  75 IOW:
  76
  77   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  78
  79 Thus if the character is X+1 only two comparisons will be done. Making
  80 matching lookups slower, but non-matching faster.
  81
  82 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
  83
  84 A variant form of each of the macro types described above can be generated, in
  85 which the code point is returned by the macro, and an extra parameter (in the
  86 final position) is added, which is a pointer for the macro to set the byte
  87 length of the returned code point.
  88
  89 These forms all have a C<what_len> prefix instead of the C<is_>, for example
  90 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
  91 C<what_len_WHATEVER_utf8(s,len)>.
  92
  93 These forms should not be used I<except> on small sets of mostly widely
  94 separated code points; otherwise the code generated is inefficient.  For these
  95 cases, it is best to use the C<is_> forms, and then find the code point with
  96 C<utf8_to_uvchr_buf>().  This program can fail with a "deep recursion"
  97 message on the worst of the inappropriate sets.  Examine the generated macro
  98 to see if it is acceptable.
  99
 100 =item C<what_WHATEVER_FOO(arg1, ...)>
 101
 102 A variant form of each of the C<is_> macro types described above can be generated, in
 103 which the code point and not the length is returned by the macro.  These have
 104 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
 105 not be used where the set contains a NULL, as 0 is returned for two different
 106 cases: a) the set doesn't include the input code point; b) the set does
 107 include it, and it is a NULL.
 108
 109 =back
 110
 111 The above isn't quite complete, as for specialized purposes one can get a
 112 macro like C<is_WHATEVER_utf8_no_length_checks(s)>, which assumes that it is
 113 already known that there is enough space to hold the character starting at
 114 C<s>, but otherwise checks that it is well-formed.  In other words, this is
 115 intermediary in checking between C<is_WHATEVER_utf8(s)> and
 116 C<is_WHATEVER_utf8_safe(s,e)>.
 117
 118 =head2 CODE FORMAT
 119
 120 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
 121
 122
 123 =head1 AUTHOR
 124
 125 Author: Yves Orton (demerphq) 2007.  Maintained by Perl5 Porters.
 126
 127 =head1 BUGS
 128
 129 No tests directly here (although the regex engine will fail tests
 130 if this code is broken). Insufficient documentation and no Getopts
 131 handler for using the module as a script.
 132
 133 =head1 LICENSE
 134
 135 You may distribute under the terms of either the GNU General Public
 136 License or the Artistic License, as specified in the README file.
 137
 138 =cut
 139
 140 # Sub naming convention:
 141 # __func : private subroutine, can not be called as a method
 142 # _func  : private method, not meant for external use
 143 # func   : public method.
 144
 145 # private subs
 146 #-------------------------------------------------------------------------------
 147 #
 148 # ($cp,$n,$l,$u)=__uni_latin($str);
 149 #
 150 # Return a list of arrays, each of which when interpreted correctly
 151 # represent the string in some given encoding with specific conditions.
 152 #
 153 # $cp - list of codepoints that make up the string.
 154 # $n  - list of octets that make up the string if all codepoints are invariant
 155 #       regardless of if the string is in UTF-8 or not.
 156 # $l  - list of octets that make up the string in latin1 encoding if all
 157 #       codepoints < 256, and at least one codepoint is UTF-8 variant.
 158 # $u  - list of octets that make up the string in utf8 if any codepoint is
 159 #       UTF-8 variant
 160 #
 161 #   High CP | Defined
 162 #-----------+----------
 163 #   0 - 127 : $n            (127/128 are the values for ASCII platforms)
 164 # 128 - 255 : $l, $u
 165 # 256 - ... : $u
 166 #
 167
 168 sub __uni_latin1 {
 169     my $charset = shift;
 170     my $a2n= shift;
 171     my $str= shift;
 172     my $max= 0;
 173     my @cp;
 174     my @cp_high;
 175     my $only_has_invariants = 1;
 176     for my $ch ( split //, $str ) {
 177         my $cp= ord $ch;
 178         $max= $cp if $max < $cp;
 179         if ($cp > 255) {
 180             push @cp, $cp;
 181             push @cp_high, $cp;
 182         }
 183         else {
 184             push @cp, $a2n->[$cp];
 185         }
 186     }
 187     my ( $n, $l, $u );
 188     $only_has_invariants = ($charset =~ /ascii/i) ? $max < 128 : $max < 160;
 189     if ($only_has_invariants) {
 190         $n= [@cp];
 191     } else {
 192         $l= [@cp] if $max && $max < 256;
 193
 194         my @u;
 195         for my $ch ( split //, $str ) {
 196             push @u, map { ord } split //, cp_2_utfbytes(ord $ch, $charset);
 197         }
 198         $u = \@u;
 199     }
 200     return ( \@cp, \@cp_high, $n, $l, $u );
 201 }
 202
 203 #
 204 # $clean= __clean($expr);
 205 #
 206 # Cleanup a ternary expression, removing unnecessary parens and apply some
 207 # simplifications using regexes.
 208 #
 209
 210 sub __clean {
 211     my ( $expr )= @_;
 212
 213     #return $expr;
 214
 215     our $parens;
 216     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 217
 218     ## remove redundant parens
 219     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 220
 221
 222     # repeatedly simplify conditions like
 223     #       ( (cond1) ? ( (cond2) ? X : Y ) : Y )
 224     # into
 225     #       ( ( (cond1) && (cond2) ) ? X : Y )
 226     # Also similarly handles expressions like:
 227     #       : (cond1) ? ( (cond2) ? X : Y ) : Y )
 228     # Note the inclusion of the close paren in ([:()]) and the open paren in
 229     # ([()]) is purely to ensure we have a balanced set of parens in the
 230     # expression which makes it easier to understand the pattern in an editor
 231     # that understands paren's, we do not expect either of these cases to
 232     # actually fire. - Yves
 233     1 while $expr =~ s/
 234         ([:()])  \s*
 235             ($parens) \s*
 236             \? \s*
 237                 \( \s* ($parens) \s*
 238                     \? \s* ($parens|[^()?:\s]+?) \s*
 239                     :  \s* ($parens|[^()?:\s]+?) \s*
 240                 \) \s*
 241             : \s* \5 \s*
 242         ([()])
 243     /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
 244     #$expr=~s/\(\(U8\*\)s\)\[(\d+)\]/S$1/g if length $expr > 8000;
 245     #$expr=~s/\s+//g if length $expr > 8000;
 246
 247     die "Expression too long" if length $expr > 8000;
 248
 249     return $expr;
 250 }
 251
 252 #
 253 # $text= __macro(@args);
 254 # Join args together by newlines, and then neatly add backslashes to the end
 255 # of every  line as expected by the C pre-processor for #define's.
 256 #
 257
 258 sub __macro {
 259     my $str= join "\n", @_;
 260     $str =~ s/\s*$//;
 261     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 262     my $last= pop @lines;
 263     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 264     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 265     return $str . "\n";
 266 }
 267
 268 #
 269 # my $op=__incrdepth($op);
 270 #
 271 # take an 'op' hashref and add one to it and all its childrens depths.
 272 #
 273
 274 sub __incrdepth {
 275     my $op= shift;
 276     return unless ref $op;
 277     $op->{depth} += 1;
 278     __incrdepth( $op->{yes} );
 279     __incrdepth( $op->{no} );
 280     return $op;
 281 }
 282
 283 # join two branches of an opcode together with a condition, incrementing
 284 # the depth on the yes branch when we do so.
 285 # returns the new root opcode of the tree.
 286 sub __cond_join {
 287     my ( $cond, $yes, $no )= @_;
 288     if (ref $yes) {
 289         return {
 290             test  => $cond,
 291             yes   => __incrdepth( $yes ),
 292             no    => $no,
 293             depth => 0,
 294         };
 295     }
 296     else {
 297         return {
 298             test  => $cond,
 299             yes   => $yes,
 300             no    => __incrdepth($no),
 301             depth => 0,
 302         };
 303     }
 304 }
 305
 306 my $hex_fmt= "0x%02X";
 307
 308 sub val_fmt
 309 {
 310     my $self = shift;
 311     my $arg = shift;
 312     my $always_hex = shift // 0;    # Use \x{}; don't look for a mnemonic
 313
 314     # Format 'arg' using the printable character if it has one, or a %x if
 315     # not, returning a string containing the result
 316
 317     # Return what always returned for an unexpected argument
 318     return $hex_fmt unless defined $arg && $arg !~ /\D/;
 319
 320     # We convert only things inside Latin1
 321     if (! $always_hex && $arg < 256) {
 322
 323         # Find the ASCII equivalent of this argument (as the current character
 324         # set might not be ASCII)
 325         my $char = chr $self->{n2a}->[$arg];
 326
 327         # If printable, return it, escaping \ and '
 328         return "'$char'" if $char =~ /[^\\'[:^print:]]/a;
 329         return "'\\\\'" if $char eq "\\";
 330         return "'\''" if $char eq "'";
 331
 332         # Handle the mnemonic controls
 333         my $pos = index("\a\b\e\f\n\r\t\cK", $char);
 334         return "'\\" . substr("abefnrtv", $pos, 1) . "'" if $pos >= 0;
 335     }
 336
 337     # Otherwise, just the input, formatted
 338     return sprintf $hex_fmt, $arg;
 339 }
 340
 341 # Methods
 342
 343 # constructor
 344 #
 345 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 346 #
 347 # Create a new CharClass::Matcher object by parsing the text in
 348 # the txt array. Currently applies the following rules:
 349 #
 350 # Element starts with C<0x>, line is evaled the result treated as
 351 # a number which is passed to chr().
 352 #
 353 # Element starts with C<">, line is evaled and the result treated
 354 # as a string.
 355 #
 356 # Each string is then stored in the 'strs' subhash as a hash record
 357 # made up of the results of __uni_latin1, using the keynames
 358 # 'low', 'latin1', 'utf8', as well as the synthesized 'LATIN1', 'high',
 359 # 'UTF8', and 'backwards_UTF8' which hold a merge of 'low' and their lowercase
 360 # equivalents.
 361 #
 362 # Size data is tracked per type in the 'size' subhash.
 363 #
 364 # Return an object
 365
 366 my %a2n;
 367 my %n2a;        # Inversion of a2n, for each character set
 368 my %I8_2_utf;
 369 my %utf_2_I8;   # Inversion of I8_2_utf, for each EBCDIC character set
 370 my @identity = (0..255);
 371
 372 sub new {
 373     my $class= shift;
 374     my %opt= @_;
 375     my %hash_return;
 376     for ( qw(op txt) ) {
 377         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 378           if !exists $opt{$_};
 379     }
 380
 381     my $self= bless {
 382         op    => $opt{op},
 383         title => $opt{title} || '',
 384     }, $class;
 385
 386     my $charset = $opt{charset};
 387     $a2n{$charset} = get_a2n($charset);
 388
 389     # We need to construct the maps going the other way if not already done
 390     unless (defined $n2a{$charset}) {
 391         for (my $i = 0; $i < 256; $i++) {
 392             $n2a{$charset}->[$a2n{$charset}->[$i]] = $i;
 393         }
 394     }
 395
 396     if ($charset =~ /ebcdic/i) {
 397         $I8_2_utf{$charset} = get_I8_2_utf($charset);
 398         unless (defined $utf_2_I8{$charset}) {
 399             for (my $i = 0; $i < 256; $i++) {
 400                 $utf_2_I8{$charset}->[$I8_2_utf{$charset}->[$i]] = $i;
 401             }
 402         }
 403     }
 404
 405     foreach my $txt ( @{ $opt{txt} } ) {
 406         my $str= $txt;
 407         if ( $str =~ /^[""]/ ) {
 408             $str= eval $str;
 409         } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
 410                                     # list with its expansion
 411             my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
 412             die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'"
 413                                         if ! defined $lower || ! defined $upper;
 414             foreach my $cp (hex $lower .. hex $upper) {
 415                 push @{$opt{txt}}, sprintf "0x%X", $cp;
 416             }
 417             next;
 418         } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
 419             # Otherwise undocumented, a leading N means is already in the
 420             # native character set; don't convert.
 421             $str= chr eval $str;
 422         } elsif ( $str =~ /^0x/ ) {
 423             $str= eval $str;
 424             $str = chr $str;
 425         } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
 426             my $property = $1;
 427             use Unicode::UCD qw(prop_invlist);
 428
 429             my @invlist = prop_invlist($property, '_perl_core_internal_ok');
 430             if (! @invlist) {
 431
 432                 # An empty return could mean an unknown property, or merely
 433                 # that it is empty.  Call in scalar context to differentiate
 434                 my $count = prop_invlist($property, '_perl_core_internal_ok');
 435                 die "$property not found" unless defined $count;
 436             }
 437
 438             # Replace this element on the list with the property's expansion
 439             for (my $i = 0; $i < @invlist; $i += 2) {
 440                 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
 441
 442                     # prop_invlist() returns native values; add leading 'N'
 443                     # to indicate that.
 444                     push @{$opt{txt}}, sprintf "N0x%X", $cp;
 445                 }
 446             }
 447             next;
 448         } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
 449             die "do '$1' failed: $!$@" if ! do $1 or $@;
 450             next;
 451         } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
 452             my @results = eval "$1";
 453             die "eval '$1' failed: $@" if $@;
 454             push @{$opt{txt}}, @results;
 455             next;
 456         } elsif ($str =~ / ^ % \s* ( .* ) /x) { # user-furnished sub() call
 457             %hash_return = eval "$1";
 458             die "eval '$1' failed: $@" if $@;
 459             push @{$opt{txt}}, keys %hash_return;
 460             die "Only one multi character expansion currently allowed per rule"
 461                                                         if  $self->{multi_maps};
 462             next;
 463         } else {
 464             die "Unparsable line: $txt\n";
 465         }
 466         my ( $cp, $cp_high, $low, $latin1, $utf8 )
 467                                     = __uni_latin1($charset, $a2n{$charset}, $str );
 468         my $from;
 469         if (defined $hash_return{"\"$str\""}) {
 470             $from = $hash_return{"\"$str\""};
 471             $from = $a2n{$charset}->[$from] if $from < 256;
 472         }
 473         my $UTF8= $low   || $utf8;
 474         my $LATIN1= $low || $latin1;
 475         my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
 476         #die Dumper($txt,$cp,$low,$latin1,$utf8)
 477         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 478
 479         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 from )}=
 480           ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1, $from );
 481         my $rec= $self->{strs}{$str};
 482         foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
 483             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 484               if $self->{strs}{$str}{$key};
 485         }
 486         $self->{has_multi} ||= @$cp > 1;
 487         $self->{has_ascii} ||= $latin1 && @$latin1;
 488         $self->{has_low}   ||= $low && @$low;
 489         $self->{has_high}  ||= !$low && !$latin1;
 490     }
 491     $self->{n2a} = $n2a{$charset};
 492     $self->{count}= 0 + keys %{ $self->{strs} };
 493     return $self;
 494 }
 495
 496 # my $trie = make_trie($type,$maxlen);
 497 #
 498 # using the data stored in the object build a trie of a specific type,
 499 # and with specific maximum depth. The trie is made up the elements of
 500 # the given types array for each string in the object (assuming it is
 501 # not too long.)
 502 #
 503 # returns the trie, or undef if there was no relevant data in the object.
 504 #
 505
 506 sub make_trie {
 507     my ( $self, $type, $maxlen, $backwards )= @_;
 508
 509     my $strs= $self->{strs};
 510     my %trie;
 511     foreach my $rec ( values %$strs ) {
 512         die "panic: unknown type '$type'"
 513           if !exists $rec->{$type};
 514         my $dat= $rec->{$type};
 515         next unless $dat;
 516         next if $maxlen && @$dat > $maxlen;
 517         my $node= \%trie;
 518         my @ordered_dat = ($backwards) ? reverse @$dat : @$dat;
 519         foreach my $elem ( @ordered_dat ) {
 520             $node->{$elem} ||= {};
 521             $node= $node->{$elem};
 522         }
 523         $node->{''}= $rec->{str};
 524     }
 525     return 0 + keys( %trie ) ? \%trie : undef;
 526 }
 527
 528 sub pop_count ($) {
 529     my $word = shift;
 530
 531     # This returns a list of the positions of the bits in the input word that
 532     # are 1.
 533
 534     my @positions;
 535     my $position = 0;
 536     while ($word) {
 537         push @positions, $position if $word & 1;
 538         $position++;
 539         $word >>= 1;
 540     }
 541     return @positions;
 542 }
 543
 544 # my $optree= _optree()
 545 #
 546 # recursively convert a trie to an optree where every node represents
 547 # an if else branch.
 548 #
 549 #
 550
 551 sub _optree {
 552     my ( $self, $trie, $test_type, $ret_type, $else, $depth, $backwards )= @_;
 553     return unless defined $trie;
 554     $ret_type ||= 'len';
 555     $else= 0  unless defined $else;
 556     $depth= 0 unless defined $depth;
 557
 558     # if we have an empty string as a key it means we are in an
 559     # accepting state and unless we can match further on should
 560     # return the value of the '' key.
 561     if (exists $trie->{''} ) {
 562         # we can now update the "else" value, anything failing to match
 563         # after this point should return the value from this.
 564         my $prefix = $self->{strs}{ $trie->{''} };
 565         if ( $ret_type eq 'cp' ) {
 566             $else= $prefix->{from};
 567             $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
 568             $else= $self->val_fmt($else) if $else > 9;
 569         } elsif ( $ret_type eq 'len' ) {
 570             $else= $depth;
 571         } elsif ( $ret_type eq 'both') {
 572             $else= $prefix->{from};
 573             $else= $self->{strs}{ $trie->{''} }{cp}[0] unless defined $else;
 574             $else= $self->val_fmt($else) if $else > 9;
 575             $else= "len=$depth, $else";
 576         }
 577     }
 578     # extract the meaningful keys from the trie, filter out '' as
 579     # it means we are an accepting state (end of sequence).
 580     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 581
 582     # if we haven't any keys there is no further we can match and we
 583     # can return the "else" value.
 584     return $else if !@conds;
 585
 586     my $test;
 587     if ($test_type =~ /^cp/) {
 588         $test = "cp";
 589     }
 590     elsif ($backwards) {
 591         $test = "*((const U8*)s - " . ($depth + 1) . ")";
 592     }
 593     else {
 594         $test = "((const U8*)s)[$depth]";
 595     }
 596
 597     # First we loop over the possible keys/conditions and find out what they
 598     # look like; we group conditions with the same optree together.
 599     my %dmp_res;
 600     my @res_order;
 601     local $Data::Dumper::Sortkeys=1;
 602     foreach my $cond ( @conds ) {
 603
 604         # get the optree for this child/condition
 605         my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type,
 606                                                 $else, $depth + 1, $backwards );
 607         # convert it to a string with Dumper
 608         my $res_code= Dumper( $res );
 609
 610         push @{$dmp_res{$res_code}{vals}}, $cond;
 611         if (!$dmp_res{$res_code}{optree}) {
 612             $dmp_res{$res_code}{optree}= $res;
 613             push @res_order, $res_code;
 614         }
 615     }
 616
 617     # now that we have deduped the optrees we construct a new optree
 618     # containing the merged
 619     # results.
 620     my %root;
 621     my $node= \%root;
 622     foreach my $res_code_idx (0 .. $#res_order) {
 623         my $res_code= $res_order[$res_code_idx];
 624         $node->{vals}= $dmp_res{$res_code}{vals};
 625         $node->{test}= $test;
 626         $node->{yes}= $dmp_res{$res_code}{optree};
 627         $node->{depth}= $depth;
 628         if ($res_code_idx < $#res_order) {
 629             $node= $node->{no}= {};
 630         } else {
 631             $node->{no}= $else;
 632         }
 633     }
 634
 635     # return the optree.
 636     return \%root;
 637 }
 638
 639 # my $optree= optree(%opts);
 640 #
 641 # Convert a trie to an optree, wrapper for _optree
 642
 643 sub optree {
 644     my $self= shift;
 645     my %opt= @_;
 646     my $trie= $self->make_trie( $opt{type}, $opt{max_depth}, $opt{backwards} );
 647     $opt{ret_type} ||= 'len';
 648     my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
 649     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0,
 650                                                                     $opt{backwards} );
 651 }
 652
 653 # my $optree= generic_optree(%opts);
 654 #
 655 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 656 # sets of strings, including a branch for handling the string type check.
 657 #
 658
 659 sub generic_optree {
 660     my $self= shift;
 661     my %opt= @_;
 662
 663     $opt{ret_type} ||= 'len';
 664     my $test_type= 'depth';
 665     my $else= $opt{else} || 0;
 666
 667     my $latin1= $self->make_trie( 'latin1', $opt{max_depth}, $opt{backwards} );
 668     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth}, $opt{backwards} );
 669
 670     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0, $opt{backwards} )
 671       for $latin1, $utf8;
 672
 673     if ( $utf8 ) {
 674         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 675     } elsif ( $latin1 ) {
 676         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 677     }
 678     if ($opt{type} eq 'generic') {
 679         my $low= $self->make_trie( 'low', $opt{max_depth}, $opt{backwards} );
 680         if ( $low ) {
 681             $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0,
 682                                                                     $opt{backwards} );
 683         }
 684     }
 685
 686     return $else;
 687 }
 688
 689 # length_optree()
 690 #
 691 # create a string length guarded optree.
 692 #
 693
 694 sub length_optree {
 695     my $self= shift;
 696     my %opt= @_;
 697     my $type= $opt{type};
 698
 699     die "Can't do a length_optree on type 'cp', makes no sense."
 700       if $type =~ /^cp/;
 701
 702     my $else= ( $opt{else} ||= 0 );
 703
 704     return $else if $self->{count} == 0;
 705
 706     my $method = $type =~ /generic/ ? 'generic_optree' : 'optree';
 707     if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) {
 708
 709         # Here is non-generic output (meaning that we are only generating one
 710         # type), and all things that match have the same number ('size') of
 711         # bytes.  The length guard is simply that we have that number of
 712         # bytes.
 713         my @size = keys %{$self->{size}{$type}};
 714         my $cond= "((e) - (s)) >= $size[0]";
 715         my $optree = $self->$method(%opt);
 716         $else= __cond_join( $cond, $optree, $else );
 717     }
 718     elsif ($self->{has_multi}) {
 719         my @size;
 720
 721         # Here, there can be a match of a multiple character string.  We use
 722         # the traditional method which is to have a branch for each possible
 723         # size (longest first) and test for the legal values for that size.
 724         my %sizes= (
 725             %{ $self->{size}{low}    || {} },
 726             %{ $self->{size}{latin1} || {} },
 727             %{ $self->{size}{utf8}   || {} }
 728         );
 729         if ($method eq 'generic_optree') {
 730             @size= sort { $a <=> $b } keys %sizes;
 731         } else {
 732             @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 733         }
 734         for my $size ( @size ) {
 735             my $optree= $self->$method(%opt, type => $type, max_depth => $size);
 736             my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 737             $else= __cond_join( $cond, $optree, $else );
 738         }
 739     }
 740     elsif ($opt{backwards}) {
 741         my @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 742         for my $size ( @size ) {
 743             my $optree= $self->$method(%opt, type => $type, max_depth => $size);
 744             my $cond= "((s) - (e) > " . ( $size - 1 ).")";
 745             $else= __cond_join( $cond, $optree, $else );
 746         }
 747     }
 748     else {
 749         my $utf8;
 750
 751         # Here, has more than one possible size, and only matches a single
 752         # character.  For non-utf8, the needed length is 1; for utf8, it is
 753         # found by array lookup 'UTF8SKIP'.
 754
 755         # If want just the code points above 255, set up to look for those;
 756         # otherwise assume will be looking for all non-UTF-8-invariant code
 757         # poiints.
 758         my $trie_type = ($type eq 'high') ? 'high' : 'utf8';
 759
 760         # If we do want more than the 0-255 range, find those, and if they
 761         # exist...
 762         if (   $opt{type} !~ /latin1/i
 763             && ($utf8 = $self->make_trie($trie_type, 0, $opt{backwards})))
 764         {
 765
 766             # ... get them into an optree, and set them up as the 'else' clause
 767             $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0,
 768                                                                     $opt{backwards} );
 769
 770             # We could make this
 771             #   UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))";
 772             # to avoid doing the UTF8SKIP and subsequent branches for invariants
 773             # that don't match.  But the current macros that get generated
 774             # have only a few things that can match past this, so I (khw)
 775             # don't think it is worth it.  (Even better would be to use
 776             # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it
 777             # if it saves a bunch.  We assume that input text likely to be
 778             # well-formed .
 779             my $cond = "LIKELY(((e) - (s)) >= UTF8SKIP(s))";
 780             $else = __cond_join($cond, $utf8, $else);
 781
 782             # For 'generic', we also will want the latin1 UTF-8 variants for
 783             # the case where the input isn't UTF-8.
 784             my $latin1;
 785             if ($method eq 'generic_optree') {
 786                 $latin1 = $self->make_trie( 'latin1', 1, $opt{backwards});
 787                 $latin1= $self->_optree($latin1, 'depth', $opt{ret_type}, 0, 0,
 788                                                                     $opt{backwards});
 789             }
 790
 791             # If we want the UTF-8 invariants, get those.
 792             my $low;
 793             if ($opt{type} !~ /non_low|high/
 794                 && ($low= $self->make_trie( 'low', 1, 0)))
 795             {
 796                 $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0,
 797                                                                     $opt{backwards} );
 798
 799                 # Expand out the UTF-8 invariants as a string so that we
 800                 # can use them as the conditional
 801                 $low = $self->_cond_as_str( $low, 0, \%opt);
 802
 803                 # If there are Latin1 variants, add a test for them.
 804                 if ($latin1) {
 805                     $else = __cond_join("(! is_utf8 )", $latin1, $else);
 806                 }
 807                 elsif ($method eq 'generic_optree') {
 808
 809                     # Otherwise for 'generic' only we know that what
 810                     # follows must be valid for just UTF-8 strings,
 811                     $else->{test} = "( is_utf8 && $else->{test} )";
 812                 }
 813
 814                 # If the invariants match, we are done; otherwise we have
 815                 # to go to the 'else' clause.
 816                 $else = __cond_join($low, 1, $else);
 817             }
 818             elsif ($latin1) {   # Here, didn't want or didn't have invariants,
 819                                 # but we do have latin variants
 820                 $else = __cond_join("(! is_utf8)", $latin1, $else);
 821             }
 822
 823             # We need at least one byte available to start off the tests
 824             $else = __cond_join("LIKELY((e) > (s))", $else, 0);
 825         }
 826         else {  # Here, we don't want or there aren't any variants.  A single
 827                 # byte available is enough.
 828             my $cond= "((e) > (s))";
 829             my $optree = $self->$method(%opt);
 830             $else= __cond_join( $cond, $optree, $else );
 831         }
 832     }
 833
 834     return $else;
 835 }
 836
 837 sub calculate_mask(@) {
 838     # Look at the input list of byte values.  This routine returns an array of
 839     # mask/base pairs to generate that list.
 840
 841     my @list = @_;
 842     my $list_count = @list;
 843
 844     # Consider a set of byte values, A, B, C ....  If we want to determine if
 845     # <c> is one of them, we can write c==A || c==B || c==C ....  If the
 846     # values are consecutive, we can shorten that to inRANGE(c, 'A', 'Z'),
 847     # which uses far fewer branches.  If only some of them are consecutive we
 848     # can still save some branches by creating range tests for just those that
 849     # are consecutive. _cond_as_str() does this work for looking for ranges.
 850     #
 851     # Another approach is to look at the bit patterns for A, B, C .... and see
 852     # if they have some commonalities.  That's what this function does.  For
 853     # example, consider a set consisting of the bytes
 854     # 0x42, 0x43, 0x62, and 0x63.  We could write:
 855     #   inRANGE(c, 0x42, 0x43) || inRANGE(c, 0x62, 0x63)
 856     # which through the magic of casting has not 4, but 2 tests.  But the
 857     # following mask/compare also works, and has just one test:
 858     #   (c & 0xDE) == 0x42
 859     # The reason it works is that the set consists of exactly the 4 bit
 860     # patterns which have either 0 or 1 in the two bit positions that are 0 in
 861     # the mask.  They have the same value in each bit position where the mask
 862     # is 1.  The comparison makes sure that the result matches all bytes which
 863     # match those six 1 bits exactly.  This can be applied to bytes that
 864     # differ in 1 through all 8 bit positions.  In order to be a candidate for
 865     # this optimization, the number of bytes in the set must be a power of 2.
 866     #
 867     # It may be that the bytes needing to be matched can't be done with a
 868     # single mask.  But it may be possible to have two (or more) sets, each
 869     # with a separate mask.  This function attempts to find some way to save
 870     # some branches using the mask technique.  If not, it returns an empty
 871     # list; if so, it returns a list consisting of
 872     #   [ [compare1, mask1], [compare2, mask2], ...
 873     #     [compare_n, undef], [compare_m, undef], ...
 874     #   ]
 875     # The <mask> is undef in the above for those bytes that must be tested
 876     # for individually.
 877     #
 878     # This function does not attempt to find the optimal set.  To do so would
 879     # probably require testing all possible combinations, and keeping track of
 880     # the current best one.
 881     #
 882     # There are probably much better algorithms, but this is the one I (khw)
 883     # came up with.  We start with doing a bit-wise compare of every byte in
 884     # the set with every other byte.  The results are sorted into arrays of
 885     # all those that differ by the same bit positions.  These are stored in a
 886     # hash with the each key being the bits they differ in.  Here is the hash
 887     # for the 0x53, 0x54, 0x73, 0x74 set:
 888     # {
 889     #    4 => {
 890     #            "0,1,2,5" => [
 891     #                            83,
 892     #                            116,
 893     #                            84,
 894     #                            115
 895     #                        ]
 896     #        },
 897     #    3 => {
 898     #            "0,1,2" => [
 899     #                        83,
 900     #                        84,
 901     #                        115,
 902     #                        116
 903     #                        ]
 904     #        }
 905     #    1 => {
 906     #            5 => [
 907     #                    83,
 908     #                    115,
 909     #                    84,
 910     #                    116
 911     #                ]
 912     #        },
 913     # }
 914     #
 915     # The set consisting of values which differ in the 4 bit positions 0, 1,
 916     # 2, and 5 from some other value in the set consists of all 4 values.
 917     # Likewise all 4 values differ from some other value in the 3 bit
 918     # positions 0, 1, and 2; and all 4 values differ from some other value in
 919     # the single bit position 5.  The keys at the uppermost level in the above
 920     # hash, 1, 3, and 4, give the number of bit positions that each sub-key
 921     # below it has.  For example, the 4 key could have as its value an array
 922     # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were
 923     # such.  The best optimization will group the most values into a single
 924     # mask.  The most values will be the ones that differ in the most
 925     # positions, the ones with the largest value for the topmost key.  These
 926     # keys, are thus just for convenience of sorting by that number, and do
 927     # not have any bearing on the core of the algorithm.
 928     #
 929     # We start with an element from largest number of differing bits.  The
 930     # largest in this case is 4 bits, and there is only one situation in this
 931     # set which has 4 differing bits, "0,1,2,5".  We look for any subset of
 932     # this set which has 16 values that differ in these 4 bits.  There aren't
 933     # any, because there are only 4 values in the entire set.  We then look at
 934     # the next possible thing, which is 3 bits differing in positions "0,1,2".
 935     # We look for a subset that has 8 values that differ in these 3 bits.
 936     # Again there are none.  So we go to look for the next possible thing,
 937     # which is a subset of 2**1 values that differ only in bit position 5.  83
 938     # and 115 do, so we calculate a mask and base for those and remove them
 939     # from every set.  Since there is only the one set remaining, we remove
 940     # them from just this one.  We then look to see if there is another set of
 941     # 2 values that differ in bit position 5.  84 and 116 do, so we calculate
 942     # a mask and base for those and remove them from every set (again only
 943     # this set remains in this example).  The set is now empty, and there are
 944     # no more sets to look at, so we are done.
 945
 946     if ($list_count == 256) {   # All 256 is trivially masked
 947         return (0, 0);
 948     }
 949
 950     my %hash;
 951
 952     # Generate bits-differing lists for each element compared against each
 953     # other element
 954     for my $i (0 .. $list_count - 2) {
 955         for my $j ($i + 1 .. $list_count - 1) {
 956             my @bits_that_differ = pop_count($list[$i] ^ $list[$j]);
 957             my $differ_count = @bits_that_differ;
 958             my $key = join ",", @bits_that_differ;
 959             push @{$hash{$differ_count}{$key}}, $list[$i]
 960                 unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}};
 961             push @{$hash{$differ_count}{$key}}, $list[$j];
 962         }
 963     }
 964
 965     print STDERR __LINE__, ": calculate_mask() called:  List of values grouped",
 966                                 " by differing bits: ", Dumper \%hash if DEBUG;
 967
 968     my @final_results;
 969     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
 970         my $need = 2 ** $count;     # Need 8 values for 3 differing bits, etc
 971         foreach my $bits (sort keys $hash{$count}->%*) {
 972
 973             print STDERR __LINE__, ": For $count bit(s) difference ($bits),",
 974             " need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
 975
 976             # Look only as long as there are at least as many elements in the
 977             # subset as are needed
 978             while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) {
 979
 980                 print STDERR __LINE__, ": Looking at bit positions ($bits): ",
 981                                           Dumper $hash{$count}{$bits} if DEBUG;
 982
 983                 # Start with the first element in it
 984                 my $try_base = $hash{$count}{$bits}[0];
 985                 my @subset = $try_base;
 986
 987                 # If it succeeds, we return a mask and a base to compare
 988                 # against the masked value.  That base will be the AND of
 989                 # every element in the subset.  Initialize to the one element
 990                 # we have so far.
 991                 my $compare = $try_base;
 992
 993                 # We are trying to find a subset of this that has <need>
 994                 # elements that differ in the bit positions given by the
 995                 # string $bits, which is comma separated.
 996                 my @bits = split ",", $bits;
 997
 998                 TRY: # Look through the remainder of the list for other
 999                      # elements that differ only by these bit positions.
1000
1001                 for (my $i = 1; $i < $cur_count; $i++) {
1002                     my $try_this = $hash{$count}{$bits}[$i];
1003                     my @positions = pop_count($try_base ^ $try_this);
1004
1005                     print STDERR __LINE__, ": $try_base vs $try_this: is (",
1006                       join(',', @positions), ") a subset of ($bits)?" if DEBUG;
1007
1008                     foreach my $pos (@positions) {
1009                         unless (grep { $pos == $_ } @bits) {
1010                             print STDERR "  No\n" if DEBUG;
1011                             my $remaining = $cur_count - $i - 1;
1012                             if ($remaining && @subset + $remaining < $need) {
1013                                 print STDERR __LINE__, ": Can stop trying",
1014                                     " $try_base, because even if all the",
1015                                     " remaining $remaining values work, they",
1016                                     " wouldn't add up to the needed $need when",
1017                                     " combined with the existing ",
1018                                             scalar @subset, " ones\n" if DEBUG;
1019                                 last TRY;
1020                             }
1021                             next TRY;
1022                         }
1023                     }
1024
1025                     print STDERR "  Yes\n" if DEBUG;
1026                     push @subset, $try_this;
1027
1028                     # Add this to the mask base, in case it ultimately
1029                     # succeeds,
1030                     $compare &= $try_this;
1031                 }
1032
1033                 print STDERR __LINE__, ": subset (", join(", ", @subset),
1034                  ") has ", scalar @subset, " elements; needs $need\n" if DEBUG;
1035
1036                 if (@subset < $need) {
1037                     shift @{$hash{$count}{$bits}};
1038                     next;   # Try with next value
1039                 }
1040
1041                 # Create the mask
1042                 my $mask = 0;
1043                 foreach my $position (@bits) {
1044                     $mask |= 1 << $position;
1045                 }
1046                 $mask = ~$mask & 0xFF;
1047                 push @final_results, [$compare, $mask];
1048
1049                 printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n",
1050                                 __LINE__, $compare, $compare, $mask if DEBUG;
1051
1052                 # These values are now spoken for.  Remove them from future
1053                 # consideration
1054                 foreach my $remove_count (sort keys %hash) {
1055                     foreach my $bits (sort keys %{$hash{$remove_count}}) {
1056                         foreach my $to_remove (@subset) {
1057                             @{$hash{$remove_count}{$bits}}
1058                                     = grep { $_ != $to_remove }
1059                                                 @{$hash{$remove_count}{$bits}};
1060                         }
1061                     }
1062                 }
1063             }
1064         }
1065     }
1066
1067     # Any values that remain in the list are ones that have to be tested for
1068     # individually.
1069     my @individuals;
1070     foreach my $count (reverse sort { $a <=> $b } keys %hash) {
1071         foreach my $bits (sort keys $hash{$count}->%*) {
1072             foreach my $remaining (@{$hash{$count}{$bits}}) {
1073
1074                 # If we already know about this value, just ignore it.
1075                 next if grep { $remaining == $_ } @individuals;
1076
1077                 # Otherwise it needs to be returned as something to match
1078                 # individually
1079                 push @final_results, [$remaining, undef];
1080                 push @individuals, $remaining;
1081             }
1082         }
1083     }
1084
1085     # Sort by increasing numeric value
1086     @final_results = sort { $a->[0] <=> $b->[0] } @final_results;
1087
1088     print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG;
1089
1090     return @final_results;
1091 }
1092
1093 # _cond_as_str
1094 # turn a list of conditions into a text expression
1095 # - merges ranges of conditions, and joins the result with ||
1096 sub _cond_as_str {
1097     my ( $self, $op, $combine, $opts_ref )= @_;
1098     my @cond = ();
1099     @cond = $op->{vals}->@* if defined $op->{vals};
1100     my $test= $op->{test};
1101     my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
1102     my $charset = $opts_ref->{charset};
1103     return "( $test )" unless @cond;
1104
1105     my (@ranges, @native_ranges);
1106     my @native_conds;
1107
1108     # rangify the list.  As we encounter a new value, it is placed in a new
1109     # subarray by itself.  If the next value is adjacent to it, the end point
1110     # of the subarray is merely incremented; and so on.  When the next value
1111     # that isn't adjacent to the previous one is encountered, Update() is
1112     # called to hoist any single-element subarray to be a scalar.
1113     my $Update= sub {
1114         # We skip this if there are optimizations that
1115         # we can apply (below) to the individual ranges
1116         if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
1117             $ranges[-1] = $ranges[-1][0] if $ranges[-1][0] == $ranges[-1][1];
1118         }
1119     };
1120
1121     # Parse things twice, using different approaches for representing things,
1122     # afterwards choosing the alternative with the fewest branches
1123     for my $i (0, 1) {
1124
1125         # Should we avoid using mnemonics for code points?
1126         my $always_hex = 0;
1127
1128         # The second pass is all about using a transformation to see if it
1129         # creates contiguous blocks that lead to fewer ranges or masking.  But
1130         # single element ranges don't have any benefit, and so the transform
1131         # is just extra work for them.  '$range_test' includes the transform
1132         # for multi-element ranges, and '$original' maps a byte back to what
1133         # it was without being transformed.  Thus we use '$range_test' and the
1134         # transormed bytes on multi-element ranges, and plain '$test' and
1135         # '$original' on single ones.  In the first pass these are effectively
1136         # no-ops.
1137         my $range_test = $test;
1138         my $original = \@identity;
1139
1140         if ($i) {   # 2nd pass
1141             # The second pass is only for non-ascii character sets, to see if
1142             # a transform to Unicode/ASCII saves anything.
1143             last if $charset =~ /ascii/i;
1144
1145             # If the first pass came up with a single range, we won't be able
1146             # to do better than that, so don't try.
1147             last if @ranges == 1;
1148
1149             # We calculated the native values the first iteration
1150             @native_ranges = @ranges;
1151             @native_conds = @cond;
1152
1153             # Start fresh
1154             undef @ranges;
1155             undef @cond;
1156
1157             # Determine the translation function, to/from UTF-8 or Latin1, and
1158             # the corresponding transform of the condition to match
1159             my $lookup;
1160             if ($opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi) {
1161                 $lookup = $utf_2_I8{$charset};
1162                 $original = $I8_2_utf{$charset};
1163                 $range_test = "NATIVE_UTF8_TO_I8($test)";
1164             }
1165             else {
1166                 $lookup = $n2a{$charset};
1167                 $original = $a2n{$charset};
1168                 $range_test = "NATIVE_TO_LATIN1($test)";
1169             }
1170
1171             # Translate the native conditions (bytes) into the Unicode ones
1172             for my $condition (@native_conds) {
1173                 push @cond, $lookup->[$condition];
1174             }
1175
1176             # 'f' won't be the expected 'f' on this box
1177             $always_hex = 1;
1178         }
1179
1180         # Go through the code points (@cond) and collapse them as much as
1181         # possible into ranges
1182         for my $condition ( @cond ) {
1183             if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
1184                 # Not adjacent to the existing range.  Remove that from being a
1185                 # range if only a single value;
1186                 $Update->();
1187                 push @ranges, [ $condition, $condition ];
1188             } else {    # Adjacent to the existing range; add to the range
1189                 $ranges[-1][1]++;
1190             }
1191         }
1192         $Update->();
1193
1194         # _combine is used for cp type matching.  By having it here return, no
1195         # second pass is done.  It could conceivably be restructured to have a
1196         # second pass, but no current uses of script would actually gain any
1197         # advantage by doing so, so the work hasn't been further considered.
1198         return $self->_combine( $test, @ranges ) if $combine;
1199
1200         # If the input set has certain characteristics, we can optimize tests
1201         # for it.
1202
1203         # If all bytes match, is trivially true; we don't need a 2nd pass
1204         return 1 if @cond == 256;
1205
1206         # If this is a single UTF-8 range which includes all possible
1207         # continuation bytes, and we aren't checking for well-formedness, this
1208         # is trivially true.
1209         #
1210         # (In EBCDIC, this won't happen until the 2nd pass transforms the
1211         # disjoint continuation byte ranges into a single I8 one.)
1212         if (     @ranges == 1
1213             && ! $opts_ref->{safe}
1214             && ! $opts_ref->{no_length_checks}
1215             &&   $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi
1216             &&   $ranges[0]->[1] == 0xBF
1217             &&   $ranges[0]->[0] == (($charset =~ /ascii/i)
1218                                         ? 0x80 : 0xA0))
1219         {
1220             return 1;
1221         }
1222
1223         my $loop_start = 0;
1224         if (ref $ranges[0] && $ranges[0]->[0] == 0) {
1225
1226             # If the first range matches all 256 possible bytes, it is
1227             # trivially true.
1228             if ($ranges[0]->[1] == 0xFF) {
1229                 die "Range spanning all bytes must be the only one"
1230                                                                 if @ranges > 1;
1231                 return 1;
1232             }
1233
1234             # Here, the first range starts at 0, but doesn't match everything.
1235             # But the condition doesn't have to worry about being < 0
1236             $ranges[0] = "( $test <= "
1237                         . $self->val_fmt($ranges[0]->[1], $always_hex) . " )";
1238             $loop_start++;
1239         }
1240
1241         my $loop_end = @ranges;
1242         if (   @ranges
1243             && ref $ranges[-1]
1244             && $ranges[-1]->[1] == 0xFF
1245             && $ranges[-1]->[0] != 0xFF)
1246         {
1247             # If the final range consists of more than one byte ending with
1248             # the highest possible one, the condition doesn't have to worry
1249             # about being > FF
1250             $ranges[-1] = "( $test >= "
1251                         . $self->val_fmt($ranges[-1]->[0], $always_hex) . " )";
1252             $loop_end--;
1253         }
1254
1255         # Look at each range to see if there any optimizations.  The
1256         # formatting may be thrown away, so might be wasted effort; and khw
1257         # supposes this could be restructured to delay that until the final
1258         # method is chosen.  But that would be more coding work than
1259         # warranted, as this is executed not that many times during a
1260         # development cycle.
1261         for (my $i = $loop_start; $i < $loop_end; $i++) {
1262             if (! ref $ranges[$i]) {    # Trivial case: no range
1263                 $ranges[$i] =
1264                     $self->val_fmt($original->[$ranges[$i]], $always_hex)
1265                   . " == $test";
1266             }
1267             elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
1268                 $ranges[$i] =           # Trivial case: single element range
1269                      $self->val_fmt($original->[$ranges[$i]->[0]], $always_hex)
1270                    . " == $test";
1271             }
1272             else {
1273                 $ranges[$i] = "inRANGE_helper_(U8, $range_test, "
1274                         . $self->val_fmt($ranges[$i]->[0], $always_hex) .", "
1275                         . $self->val_fmt($ranges[$i]->[1], $always_hex) . ")";
1276             }
1277         }
1278
1279         # Here, have collapsed the matched code points into ranges.  This code
1280         # also sees if some of those different ranges have bit patterns which
1281         # causes them to be combinable by ANDing with a mask.  There's no need
1282         # to do this if we are already down to a single range.
1283         next unless @ranges > 1;
1284
1285         my @masks = calculate_mask(@cond);
1286
1287         # Stringify the output of calculate_mask()
1288         if (@masks) {
1289             my @masked;
1290             foreach my $mask_ref (@masks) {
1291                 if (defined $mask_ref->[1]) {
1292                     push @masked, "( ( $range_test & "
1293                         . $self->val_fmt($mask_ref->[1], $always_hex) . " ) == "
1294                         . $self->val_fmt($mask_ref->[0], $always_hex) . " )";
1295                 }
1296                 else {  # An undefined mask means to use the value as-is
1297                     push @masked, "$test == "
1298                     . $self->val_fmt($original->[$mask_ref->[0]], $always_hex);
1299                 }
1300             }
1301
1302             # The best possible case below for specifying this set of values via
1303             # ranges is 1 branch per range.  If our mask method yielded better
1304             # results, there is no sense trying something that is bound to be
1305             # worse.
1306             if (@masked < @ranges) {
1307                 @ranges = @masked;
1308                 next;
1309             }
1310
1311             @masks = @masked;
1312         }
1313
1314         # If we found some mask possibilities, and they have fewer
1315         # conditionals in them than the plain range method, convert to use the
1316         # masks.
1317         @ranges = @masks if @masks && @masks < @ranges;
1318     }  # End of both passes
1319
1320     # If the two passes came up with two sets, use the one with the fewest
1321     # conditionals (the number of ranges is a proxy for that).  If both have
1322     # the same number, prefer the native, as that omits transformations.
1323     if (@native_ranges && @native_ranges <= @ranges) {
1324         @ranges = @native_ranges;
1325         @cond = @native_conds;
1326     }
1327
1328     return "( " . join( " || ", @ranges) . " )";
1329 }
1330
1331 # _combine
1332 # recursively turn a list of conditions into a fast break-out condition
1333 # used by _cond_as_str() for 'cp' type macros.
1334 sub _combine {
1335     my ( $self, $test, @cond )= @_;
1336     return if !@cond;
1337     my $item= shift @cond;
1338     my ( $cstr, $gtv );
1339     if ( ref $item ) {  # @item should be a 2-element array giving range start
1340                         # and end
1341         if ($item->[0] == 0) {  # UV's are never negative, so skip "0 <= "
1342                                 # test which could generate a compiler warning
1343                                 # that test is always true
1344             $cstr= "$test <= " . $self->val_fmt($item->[1]);
1345         }
1346         else {
1347             $cstr = "inRANGE_helper_(UV, $test, "
1348                   . $self->val_fmt($item->[0]) . ", "
1349                   . $self->val_fmt($item->[1]) . ")";
1350         }
1351         $gtv= $self->val_fmt($item->[1]);
1352     } else {
1353         $cstr= $self->val_fmt($item) . " == $test";
1354         $gtv= $self->val_fmt($item)
1355     }
1356     if ( @cond ) {
1357         my $combine= $self->_combine( $test, @cond );
1358         if (@cond >1) {
1359             return "( $cstr || ( $gtv < $test &&\n"
1360                    . $combine . " ) )";
1361         } else {
1362             return "( $cstr || $combine )";
1363         }
1364     } else {
1365         return $cstr;
1366     }
1367 }
1368
1369 # _render()
1370 # recursively convert an optree to text with reasonably neat formatting
1371 sub _render {
1372     my ( $self, $op, $combine, $brace, $opts_ref, $def, $submacros )= @_;
1373     return 0 if ! defined $op;  # The set is empty
1374     if ( !ref $op ) {
1375         return $op;
1376     }
1377     my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
1378     #no warnings 'recursion';   # This would allow really really inefficient
1379                                 # code to be generated.  See pod
1380     my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref, $def,
1381                                                                     $submacros);
1382     return $yes if $cond eq '1';
1383
1384     my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref, $def,
1385                                                                     $submacros);
1386     return "( $cond )" if $yes eq '1' and $no eq '0';
1387     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
1388     return "$lb$cond ? $yes : $no$rb"
1389       if !ref( $op->{yes} ) && !ref( $op->{no} );
1390     my $ind1= " " x 4;
1391     my $ind= "\n" . ( $ind1 x $op->{depth} );
1392
1393     if ( ref $op->{yes} ) {
1394         $yes= $ind . $ind1 . $yes;
1395     } else {
1396         $yes= " " . $yes;
1397     }
1398
1399     my $str= "$lb$cond ?$yes$ind: $no$rb";
1400     if (length $str > 6000) {
1401         push @$submacros, sprintf "#define $def\n( %s )", "_part"
1402                                   . (my $yes_idx= 0+@$submacros) . "_", $yes;
1403         push @$submacros, sprintf "#define $def\n( %s )", "_part"
1404                                   . (my $no_idx= 0+@$submacros) . "_", $no;
1405         return sprintf "%s%s ? $def : $def%s", $lb, $cond,
1406                                     "_part${yes_idx}_", "_part${no_idx}_", $rb;
1407     }
1408     return $str;
1409 }
1410
1411 # $expr=render($op,$combine)
1412 #
1413 # convert an optree to text with reasonably neat formatting. If $combine
1414 # is true then the condition is created using "fast breakouts" which
1415 # produce uglier expressions that are more efficient for common case,
1416 # longer lists such as that resulting from type 'cp' output.
1417 # Currently only used for type 'cp' macros.
1418 sub render {
1419     my ( $self, $op, $combine, $opts_ref, $def_fmt )= @_;
1420
1421     my @submacros;
1422     my $macro= sprintf "#define $def_fmt\n( %s )", "",
1423                        $self->_render( $op, $combine, 0, $opts_ref, $def_fmt,
1424                                                                  \@submacros);
1425
1426     return join "\n\n",
1427             map { "/*** GENERATED CODE ***/\n" . __macro( __clean( $_ ) ) }
1428                                                             @submacros, $macro;
1429 }
1430
1431 # make_macro
1432 # make a macro of a given type.
1433 # calls into make_trie and (generic_|length_)optree as needed
1434 # Opts are:
1435 # type             : 'cp', 'cp_high', 'generic', 'high', 'low', 'latin1',
1436 #                    'utf8', 'LATIN1', 'UTF8' 'backwards_UTF8'
1437 # ret_type         : 'cp' or 'len'
1438 # safe             : don't assume is well-formed UTF-8, so don't skip any range
1439 #                    checks, and add length guards to macro
1440 # no_length_checks : like safe, but don't add length guards.
1441 #
1442 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
1443 # in which case it defaults to 'cp' as well.
1444 #
1445 # It is illegal to do a type 'cp' macro on a pattern with multi-codepoint
1446 # sequences in it, as the generated macro will accept only a single codepoint
1447 # as an argument.
1448 #
1449 # It is also illegal to do a non-safe macro on a pattern with multi-codepoint
1450 # sequences in it, as even if it is known to be well-formed, we need to not
1451 # run off the end of the buffer when, say, the buffer ends with the first two
1452 # characters, but three are looked at by the macro.
1453 #
1454 # returns the macro.
1455
1456
1457 sub make_macro {
1458     my $self= shift;
1459     my %opts= @_;
1460     my $type= $opts{type} || 'generic';
1461     if ($self->{has_multi}) {
1462         if ($type =~ /^cp/) {
1463             die "Can't do a 'cp' on multi-codepoint character class"
1464               . " '$self->{op}'"
1465         }
1466         elsif (! $opts{safe}) {
1467             die "'safe' is required on multi-codepoint character class"
1468                ." '$self->{op}'"
1469         }
1470     }
1471     my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' );
1472     my $method;
1473     if ( $opts{safe} ) {
1474         $method= 'length_optree';
1475     } elsif ( $type =~ /generic/ ) {
1476         $method= 'generic_optree';
1477     } else {
1478         $method= 'optree';
1479     }
1480     my @args= $type =~ /^cp/ ? 'cp' : 's';
1481     push @args, "e" if $opts{safe};
1482     push @args, "is_utf8" if $type =~ /generic/;
1483     push @args, "len" if $ret_type eq 'both';
1484     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
1485              $ret_type eq 'cp'      ? 'what_'     : 'is_';
1486     my $ext= $type     =~ /generic/ ? ''          : '_' . lc( $type );
1487     $ext .= '_non_low' if $type eq 'generic_non_low';
1488     $ext .= "_safe" if $opts{safe};
1489     $ext .= "_no_length_checks" if $opts{no_length_checks};
1490     $ext .= "_backwards" if $opts{backwards};
1491     my $argstr= join ",", @args;
1492     my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
1493     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
1494     return $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts, $def_fmt );
1495 }
1496
1497 # if we aren't being used as a module (highly likely) then process
1498 # the __DATA__ below and produce macros in regcharclass.h
1499 # if an argument is provided to the script then it is assumed to
1500 # be the path of the file to output to, if the arg is '-' outputs
1501 # to STDOUT.
1502 if ( !caller ) {
1503     $|++;
1504     my $path= shift @ARGV || "regcharclass.h";
1505     my $out_fh;
1506     if ( $path eq '-' ) {
1507         $out_fh= \*STDOUT;
1508     } else {
1509         $out_fh = open_new( $path );
1510     }
1511     print $out_fh read_only_top( lang => 'C', by => $0,
1512                                  file => 'regcharclass.h', style => '*',
1513                                  copyright => [2007, 2011],
1514                                  final => <<EOF,
1515 WARNING: These macros are for internal Perl core use only, and may be
1516 changed or removed without notice.
1517 EOF
1518     );
1519     print $out_fh "\n#ifndef PERL_REGCHARCLASS_H_ /* Guard against nested",
1520                   " #includes */\n#define PERL_REGCHARCLASS_H_\n";
1521
1522     my ( $op, $title, @txt, @types, %mods );
1523     my $doit= sub ($) {
1524         return unless $op;
1525
1526         my $charset = shift;
1527
1528         # Skip if to compile on a different platform.
1529         return if delete $mods{only_ascii_platform} && $charset !~ /ascii/i;
1530         return if delete $mods{only_ebcdic_platform} && $charset !~ /ebcdic/i;
1531
1532         print $out_fh "/*\n\t$op: $title\n\n";
1533         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1534         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt,
1535                                                         charset => $charset);
1536
1537         #die Dumper(\@types,\%mods);
1538
1539         my @mods;
1540         push @mods, 'safe' if delete $mods{safe};
1541         push @mods, 'no_length_checks' if delete $mods{no_length_checks};
1542
1543         # Default to 'fast' do this one first, as traditional
1544         unshift @mods, 'fast' if delete $mods{fast} || ! @mods;
1545         if (%mods) {
1546             die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods;
1547         }
1548
1549         foreach my $type_spec ( @types ) {
1550             my ( $type, $ret )= split /-/, $type_spec;
1551             $ret ||= 'len';
1552
1553             my $backwards = 0;
1554             if ($type eq 'backwards_UTF8') {
1555                 $type = 'UTF8';
1556                 $backwards = 1;
1557             }
1558
1559             foreach my $mod ( @mods ) {
1560
1561                 # 'safe' is irrelevant with code point macros, so skip if
1562                 # there is also a 'fast', but don't skip if this is the only
1563                 # way a cp macro will get generated.  Below we convert 'safe'
1564                 # to 'fast' in this instance
1565                 next if $type =~ /^cp/
1566                         && ($mod eq 'safe' || $mod eq 'no_length_checks')
1567                         && grep { 'fast' =~ $_ } @mods;
1568                 delete $mods{$mod};
1569                 my $macro= $obj->make_macro(
1570                     type     => $type,
1571                     ret_type => $ret,
1572                     safe     => $mod eq 'safe' && $type !~ /^cp/,
1573                     charset  => $charset,
1574                     no_length_checks => $mod eq 'no_length_checks'
1575                                      && $type !~ /^cp/,
1576                     backwards => $backwards,
1577                 );
1578                 print $out_fh $macro, "\n";
1579             }
1580         }
1581     };
1582
1583     my @data = <DATA>;
1584     foreach my $charset (get_supported_code_pages()) {
1585         my $first_time = 1;
1586         undef $op;
1587         undef $title;
1588         undef @txt;
1589         undef @types;
1590         undef %mods;
1591         print $out_fh "\n", get_conditional_compile_line_start($charset);
1592         my @data_copy = @data;
1593         for (@data_copy) {
1594             s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
1595             next unless /\S/;
1596             chomp;
1597             if ( /^[A-Z]/ ) {
1598                 $doit->($charset) unless $first_time;  # This starts a new
1599                                                        # definition; do the
1600                                                        # previous one
1601                 $first_time = 0;
1602                 ( $op, $title )= split /\s*:\s*/, $_, 2;
1603                 @txt= ();
1604             } elsif ( s/^=>// ) {
1605                 my ( $type, $modifier )= split /:/, $_;
1606                 @types= split ' ', $type;
1607                 undef %mods;
1608                 map { $mods{$_} = 1 } split ' ',  $modifier;
1609             } else {
1610                 push @txt, "$_";
1611             }
1612         }
1613         $doit->($charset);
1614         print $out_fh get_conditional_compile_line_end();
1615     }
1616
1617     print $out_fh "\n#endif /* PERL_REGCHARCLASS_H_ */\n";
1618
1619     if($path eq '-') {
1620         print $out_fh "/* ex: set ro: */\n";
1621     } else {
1622         # Some of the sources for these macros come from Unicode tables
1623         my $sources_list = "lib/unicore/mktables.lst";
1624         my @sources = ($0, qw(lib/unicore/mktables
1625                               lib/Unicode/UCD.pm
1626                               regen/regcharclass_multi_char_folds.pl
1627                               regen/charset_translations.pl
1628                              ));
1629         {
1630             # Depend on mktables’ own sources.  It’s a shorter list of files than
1631             # those that Unicode::UCD uses.
1632             if (! open my $mktables_list, '<', $sources_list) {
1633
1634                 # This should force a rebuild once $sources_list exists
1635                 push @sources, $sources_list;
1636             }
1637             else {
1638                 while(<$mktables_list>) {
1639                     last if /===/;
1640                     chomp;
1641                     push @sources, "lib/unicore/$_" if /^[^#]/;
1642                 }
1643             }
1644         }
1645         read_only_bottom_close_and_rename($out_fh, \@sources)
1646     }
1647 }
1648
1649 # The form of the input is a series of definitions to make macros for.
1650 # The first line gives the base name of the macro, followed by a colon, and
1651 # then text to be used in comments associated with the macro that are its
1652 # title or description.  In all cases the first (perhaps only) parameter to
1653 # the macro is a pointer to the first byte of the code point it is to test to
1654 # see if it is in the class determined by the macro.  In the case of non-UTF8,
1655 # the code point consists only of a single byte.
1656 #
1657 # The second line must begin with a '=>' and be followed by the types of
1658 # macro(s) to be generated; these are specified below.  A colon follows the
1659 # types, followed by the modifiers, also specified below.  At least one
1660 # modifier is required.
1661 #
1662 # The subsequent lines give what code points go into the class defined by the
1663 # macro.  Multiple characters may be specified via a string like "\x0D\x0A",
1664 # enclosed in quotes.  Otherwise the lines consist of one of:
1665 #   1)  a single Unicode code point, prefaced by 0x
1666 #   2)  a single range of Unicode code points separated by a minus (and
1667 #       optional space)
1668 #   3)  a single Unicode property specified in the standard Perl form
1669 #       "\p{...}"
1670 #   4)  a line like 'do path'.  This will do a 'do' on the file given by
1671 #       'path'.  It is assumed that this does nothing but load subroutines
1672 #       (See item 5 below).  The reason 'require path' is not used instead is
1673 #       because 'do' doesn't assume that path is in @INC.
1674 #   5)  a subroutine call
1675 #           &pkg::foo(arg1, ...)
1676 #       where pkg::foo was loaded by a 'do' line (item 4).  The subroutine
1677 #       returns an array of entries of forms like items 1-3 above.  This
1678 #       allows more complex inputs than achievable from the other input types.
1679 #
1680 # A blank line or one whose first non-blank character is '#' is a comment.
1681 # The definition of the macro is terminated by a line unlike those described.
1682 #
1683 # Valid types:
1684 #   low         generate a macro whose name is 'is_BASE_low' and defines a
1685 #               class that includes only ASCII-range chars.  (BASE is the
1686 #               input macro base name.)
1687 #   latin1      generate a macro whose name is 'is_BASE_latin1' and defines a
1688 #               class that includes only upper-Latin1-range chars.  It is not
1689 #               designed to take a UTF-8 input parameter.
1690 #   high        generate a macro whose name is 'is_BASE_high' and defines a
1691 #               class that includes all relevant code points that are above
1692 #               the Latin1 range.  This is for very specialized uses only.
1693 #               It is designed to take only an input UTF-8 parameter.
1694 #   utf8        generate a macro whose name is 'is_BASE_utf8' and defines a
1695 #               class that includes all relevant characters that aren't ASCII.
1696 #               It is designed to take only an input UTF-8 parameter.
1697 #   LATIN1      generate a macro whose name is 'is_BASE_latin1' and defines a
1698 #               class that includes both ASCII and upper-Latin1-range chars.
1699 #               It is not designed to take a UTF-8 input parameter.
1700 #   UTF8        generate a macro whose name is 'is_BASE_utf8' and defines a
1701 #               class that can include any code point, adding the 'low' ones
1702 #               to what 'utf8' works on.  It is designed to take only an input
1703 #               UTF-8 parameter.
1704 #   backwards_UTF8  like 'UTF8', but designed to match backwards, so that the
1705 #               second parameter to the function is earlier in the string than
1706 #               the first.
1707 #   generic     generate a macro whose name is 'is_BASE".  It has a 2nd,
1708 #               boolean, parameter which indicates if the first one points to
1709 #               a UTF-8 string or not.  Thus it works in all circumstances.
1710 #   generic_non_low generate a macro whose name is 'is_BASE_non_low".  It has
1711 #               a 2nd, boolean, parameter which indicates if the first one
1712 #               points to a UTF-8 string or not.  It excludes any ASCII-range
1713 #               matches, but otherwise it works in all circumstances.
1714 #   cp          generate a macro whose name is 'is_BASE_cp' and defines a
1715 #               class that returns true if the UV parameter is a member of the
1716 #               class; false if not.
1717 #   cp_high     like cp, but it is assumed that it is known that the UV
1718 #               parameter is above Latin1.  The name of the generated macro is
1719 #               'is_BASE_cp_high'.  This is different from high-cp, derived
1720 #               below.
1721 # A macro of the given type is generated for each type listed in the input.
1722 # The default return value is the number of octets read to generate the match.
1723 # Append "-cp" to the type to have it instead return the matched codepoint.
1724 #               The macro name is changed to 'what_BASE...'.  See pod for
1725 #               caveats
1726 # Appending '-both" instead adds an extra parameter to the end of the argument
1727 #               list, which is a pointer as to where to store the number of
1728 #               bytes matched, while also returning the code point.  The macro
1729 #               name is changed to 'what_len_BASE...'.  See pod for caveats
1730 #
1731 # Valid modifiers:
1732 #   safe        The input string is not necessarily valid UTF-8.  In
1733 #               particular an extra parameter (always the 2nd) to the macro is
1734 #               required, which points to one beyond the end of the string.
1735 #               The macro will make sure not to read off the end of the
1736 #               string.  In the case of non-UTF8, it makes sure that the
1737 #               string has at least one byte in it.  The macro name has
1738 #               '_safe' appended to it.
1739 #   no_length_checks  The input string is not necessarily valid UTF-8, but it
1740 #               is to be assumed that the length has already been checked and
1741 #               found to be valid
1742 #   fast        The input string is valid UTF-8.  No bounds checking is done,
1743 #               and the macro can make assumptions that lead to faster
1744 #               execution.
1745 #   only_ascii_platform   Skip this definition if the character set is for
1746 #               a non-ASCII platform.
1747 #   only_ebcdic_platform  Skip this definition if the character set is for
1748 #               a non-EBCDIC platform.
1749 # No modifier need be specified; fast is assumed for this case.  If both
1750 # 'fast', and 'safe' are specified, two macros will be created for each
1751 # 'type'.
1752 #
1753 # If run on a non-ASCII platform will automatically convert the Unicode input
1754 # to native.  The documentation above is slightly wrong in this case.  'low'
1755 # actually refers to code points whose UTF-8 representation is the same as the
1756 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1757 # code points less than 256.
1758
1759 1; # in the unlikely case we are being used as a module
1760
1761 __DATA__
1762 # This is no longer used, but retained in case it is needed some day.
1763 # TRICKYFOLD: Problematic fold case letters.  When adding to this list, also should add them to regcomp.c and fold_grind.t
1764 # => generic cp generic-cp generic-both :fast safe
1765 # 0x00DF        # LATIN SMALL LETTER SHARP S
1766 # 0x0390        # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1767 # 0x03B0        # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1768 # 0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1769 # 0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1770 # 0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1771
1772 LNBREAK: Line Break: \R
1773 => generic UTF8 LATIN1 : safe
1774 "\x0D\x0A"      # CRLF - Network (Windows) line ending
1775 \p{VertSpace}
1776
1777 HORIZWS: Horizontal Whitespace: \h \H
1778 => high cp_high : fast
1779 \p{HorizSpace}
1780
1781 VERTWS: Vertical Whitespace: \v \V
1782 => high cp_high : fast
1783 \p{VertSpace}
1784
1785 XDIGIT: Hexadecimal digits
1786 => high cp_high : fast
1787 \p{XDigit}
1788
1789 XPERLSPACE: \p{XPerlSpace}
1790 => high cp_high : fast
1791 \p{XPerlSpace}
1792
1793 SPACE: Backwards \p{XPerlSpace}
1794 => backwards_UTF8 : safe
1795 \p{XPerlSpace}
1796
1797 NONCHAR: Non character code points
1798 => UTF8 :safe
1799 \p{_Perl_Nchar}
1800
1801 SHORTER_NON_CHARS:  # 3 bytes
1802 => UTF8 :only_ascii_platform fast
1803 0xFDD0 - 0xFDEF
1804 0xFFFE - 0xFFFF
1805
1806 LARGER_NON_CHARS:   # 4 bytes
1807 => UTF8 :only_ascii_platform fast
1808 0x1FFFE - 0x1FFFF
1809 0x2FFFE - 0x2FFFF
1810 0x3FFFE - 0x3FFFF
1811 0x4FFFE - 0x4FFFF
1812 0x5FFFE - 0x5FFFF
1813 0x6FFFE - 0x6FFFF
1814 0x7FFFE - 0x7FFFF
1815 0x8FFFE - 0x8FFFF
1816 0x9FFFE - 0x9FFFF
1817 0xAFFFE - 0xAFFFF
1818 0xBFFFE - 0xBFFFF
1819 0xCFFFE - 0xCFFFF
1820 0xDFFFE - 0xDFFFF
1821 0xEFFFE - 0xEFFFF
1822 0xFFFFE - 0xFFFFF
1823 0x10FFFE - 0x10FFFF
1824
1825 SHORTER_NON_CHARS:  # 4 bytes
1826 => UTF8 :only_ebcdic_platform fast
1827 0xFDD0 - 0xFDEF
1828 0xFFFE - 0xFFFF
1829 0x1FFFE - 0x1FFFF
1830 0x2FFFE - 0x2FFFF
1831 0x3FFFE - 0x3FFFF
1832
1833 LARGER_NON_CHARS:   # 5 bytes
1834 => UTF8 :only_ebcdic_platform fast
1835 0x4FFFE - 0x4FFFF
1836 0x5FFFE - 0x5FFFF
1837 0x6FFFE - 0x6FFFF
1838 0x7FFFE - 0x7FFFF
1839 0x8FFFE - 0x8FFFF
1840 0x9FFFE - 0x9FFFF
1841 0xAFFFE - 0xAFFFF
1842 0xBFFFE - 0xBFFFF
1843 0xCFFFE - 0xCFFFF
1844 0xDFFFE - 0xDFFFF
1845 0xEFFFE - 0xEFFFF
1846 0xFFFFE - 0xFFFFF
1847 0x10FFFE - 0x10FFFF
1848
1849 # Note that code in utf8.c is counting on the 'fast' version to look at no
1850 # more than two bytes
1851 SURROGATE: Surrogate code points
1852 => UTF8 :safe fast
1853 \p{_Perl_Surrogate}
1854
1855 QUOTEMETA: Meta-characters that \Q should quote
1856 => high :fast
1857 \p{_Perl_Quotemeta}
1858
1859 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1860 => UTF8 UTF8-cp :safe
1861 %regcharclass_multi_char_folds::multi_char_folds('u', 'a')
1862
1863 MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1864 => LATIN1 LATIN1-cp : safe
1865 %regcharclass_multi_char_folds::multi_char_folds('l', 'a')
1866
1867 THREE_CHAR_FOLD: A three-character multi-char fold
1868 => UTF8 :safe
1869 %regcharclass_multi_char_folds::multi_char_folds('u', '3')
1870
1871 THREE_CHAR_FOLD: A three-character multi-char fold
1872 => LATIN1 :safe
1873 %regcharclass_multi_char_folds::multi_char_folds('l', '3')
1874
1875 THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
1876 => UTF8 :safe
1877 %regcharclass_multi_char_folds::multi_char_folds('u', 'h')
1878
1879 THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
1880 => LATIN1 :safe
1881 %regcharclass_multi_char_folds::multi_char_folds('l', 'h')
1882 #
1883 #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
1884 #=> UTF8 :safe
1885 #%regcharclass_multi_char_folds::multi_char_folds('u', 'fm')
1886 #
1887 #THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
1888 #=> LATIN1 :safe
1889 #%regcharclass_multi_char_folds::multi_char_folds('l', 'fm')
1890
1891 FOLDS_TO_MULTI: characters that fold to multi-char strings
1892 => UTF8 :fast
1893 \p{_Perl_Folds_To_Multi_Char}
1894
1895 PROBLEMATIC_LOCALE_FOLD : characters whose fold is problematic under locale
1896 => UTF8 cp :fast
1897 \p{_Perl_Problematic_Locale_Folds}
1898
1899 PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are problematic under locale
1900 => UTF8 cp :fast
1901 \p{_Perl_Problematic_Locale_Foldeds_Start}
1902
1903 PATWS: pattern white space
1904 => generic : safe
1905 \p{_Perl_PatWS}
1906
1907 HANGUL_ED: Hangul syllables whose first UTF-8 byte is \xED
1908 => UTF8 :only_ascii_platform safe
1909 0xD000 - 0xD7FF
1910
1911 HANGUL_ED: Hangul syllables whose first UTF-8 byte is \xED
1912 => UTF8 :only_ebcdic_platform safe
1913 0x1 - 0x0
1914 # Alows fails on EBCDIC; there are no ED Hanguls there