regen/regcharclass.pl

   1 #!perl
   2 package CharClass::Matcher;
   3 use strict;
   4 use 5.008;
   5 use warnings;
   6 use warnings FATAL => 'all';
   7 use Text::Wrap qw(wrap);
   8 use Data::Dumper;
   9 $Data::Dumper::Useqq= 1;
  10 our $hex_fmt= "0x%02X";
  11
  12 sub ASCII_PLATFORM { (ord('A') == 65) }
  13
  14 require 'regen/regen_lib.pl';
  15
  16 =head1 NAME
  17
  18 CharClass::Matcher -- Generate C macros that match character classes efficiently
  19
  20 =head1 SYNOPSIS
  21
  22     perl Porting/regcharclass.pl
  23
  24 =head1 DESCRIPTION
  25
  26 Dynamically generates macros for detecting special charclasses
  27 in latin-1, utf8, and codepoint forms. Macros can be set to return
  28 the length (in bytes) of the matched codepoint, and/or the codepoint itself.
  29
  30 To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
  31 are necessary.
  32
  33 Using WHATEVER as an example the following macros can be produced, depending
  34 on the input parameters (how to get each is described by internal comments at
  35 the C<__DATA__> line):
  36
  37 =over 4
  38
  39 =item C<is_WHATEVER(s,is_utf8)>
  40
  41 =item C<is_WHATEVER_safe(s,e,is_utf8)>
  42
  43 Do a lookup as appropriate based on the C<is_utf8> flag. When possible
  44 comparisons involving octect<128 are done before checking the C<is_utf8>
  45 flag, hopefully saving time.
  46
  47 The version without the C<_safe> suffix should be used only when the input is
  48 known to be well-formed.
  49
  50 =item C<is_WHATEVER_utf8(s)>
  51
  52 =item C<is_WHATEVER_utf8_safe(s,e)>
  53
  54 Do a lookup assuming the string is encoded in (normalized) UTF8.
  55
  56 The version without the C<_safe> suffix should be used only when the input is
  57 known to be well-formed.
  58
  59 =item C<is_WHATEVER_latin1(s)>
  60
  61 =item C<is_WHATEVER_latin1_safe(s,e)>
  62
  63 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  64
  65 The version without the C<_safe> suffix should be used only when it is known
  66 that C<s> contains at least one character.
  67
  68 =item C<is_WHATEVER_cp(cp)>
  69
  70 Check to see if the string matches a given codepoint (hypothetically a
  71 U32). The condition is constructed as as to "break out" as early as
  72 possible if the codepoint is out of range of the condition.
  73
  74 IOW:
  75
  76   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  77
  78 Thus if the character is X+1 only two comparisons will be done. Making
  79 matching lookups slower, but non-matching faster.
  80
  81 =item C<what_len_WHATEVER_FOO(arg1, ..., len)>
  82
  83 A variant form of each of the macro types described above can be generated, in
  84 which the code point is returned by the macro, and an extra parameter (in the
  85 final position) is added, which is a pointer for the macro to set the byte
  86 length of the returned code point.
  87
  88 These forms all have a C<what_len> prefix instead of the C<is_>, for example
  89 C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
  90 C<what_len_WHATEVER_utf8(s,len)>.
  91
  92 These forms should not be used I<except> on small sets of mostly widely
  93 separated code points; otherwise the code generated is inefficient.  For these
  94 cases, it is best to use the C<is_> forms, and then find the code point with
  95 C<utf8_to_uvchr_buf>().  This program can fail with a "deep recursion"
  96 message on the worst of the inappropriate sets.  Examine the generated macro
  97 to see if it is acceptable.
  98
  99 =item C<what_WHATEVER_FOO(arg1, ...)>
 100
 101 A variant form of each of the C<is_> macro types described above can be generated, in
 102 which the code point and not the length is returned by the macro.  These have
 103 the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
 104 not be used where the set contains a NULL, as 0 is returned for two different
 105 cases: a) the set doesn't include the input code point; b) the set does
 106 include it, and it is a NULL.
 107
 108 =back
 109
 110 =head2 CODE FORMAT
 111
 112 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
 113
 114
 115 =head1 AUTHOR
 116
 117 Author: Yves Orton (demerphq) 2007.  Maintained by Perl5 Porters.
 118
 119 =head1 BUGS
 120
 121 No tests directly here (although the regex engine will fail tests
 122 if this code is broken). Insufficient documentation and no Getopts
 123 handler for using the module as a script.
 124
 125 =head1 LICENSE
 126
 127 You may distribute under the terms of either the GNU General Public
 128 License or the Artistic License, as specified in the README file.
 129
 130 =cut
 131
 132 # Sub naming convention:
 133 # __func : private subroutine, can not be called as a method
 134 # _func  : private method, not meant for external use
 135 # func   : public method.
 136
 137 # private subs
 138 #-------------------------------------------------------------------------------
 139 #
 140 # ($cp,$n,$l,$u)=__uni_latin($str);
 141 #
 142 # Return a list of arrays, each of which when interpreted correctly
 143 # represent the string in some given encoding with specific conditions.
 144 #
 145 # $cp - list of codepoints that make up the string.
 146 # $n  - list of octets that make up the string if all codepoints are invariant
 147 #       regardless of if the string is in UTF-8 or not.
 148 # $l  - list of octets that make up the string in latin1 encoding if all
 149 #       codepoints < 256, and at least one codepoint is UTF-8 variant.
 150 # $u  - list of octets that make up the string in utf8 if any codepoint is
 151 #       UTF-8 variant
 152 #
 153 #   High CP | Defined
 154 #-----------+----------
 155 #   0 - 127 : $n            (127/128 are the values for ASCII platforms)
 156 # 128 - 255 : $l, $u
 157 # 256 - ... : $u
 158 #
 159
 160 sub __uni_latin1 {
 161     my $str= shift;
 162     my $max= 0;
 163     my @cp;
 164     my $only_has_invariants = 1;
 165     for my $ch ( split //, $str ) {
 166         my $cp= ord $ch;
 167         push @cp, $cp;
 168         $max= $cp if $max < $cp;
 169         if (! ASCII_PLATFORM && $only_has_invariants) {
 170             if ($cp > 255) {
 171                 $only_has_invariants = 0;
 172             }
 173             else {
 174                 my $temp = chr($cp);
 175                 utf8::upgrade($temp);
 176                 my @utf8 = unpack "U0C*", $temp;
 177                 $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
 178             }
 179         }
 180     }
 181     my ( $n, $l, $u );
 182     $only_has_invariants = $max < 128 if ASCII_PLATFORM;
 183     if ($only_has_invariants) {
 184         $n= [@cp];
 185     } else {
 186         $l= [@cp] if $max && $max < 256;
 187
 188         $u= $str;
 189         utf8::upgrade($u);
 190         $u= [ unpack "U0C*", $u ] if defined $u;
 191     }
 192     return ( \@cp, $n, $l, $u );
 193 }
 194
 195 #
 196 # $clean= __clean($expr);
 197 #
 198 # Cleanup a ternary expression, removing unnecessary parens and apply some
 199 # simplifications using regexes.
 200 #
 201
 202 sub __clean {
 203     my ( $expr )= @_;
 204
 205     #return $expr;
 206
 207     our $parens;
 208     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 209
 210     ## remove redundant parens
 211     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 212
 213
 214     # repeatedly simplify conditions like
 215     #       ( (cond1) ? ( (cond2) ? X : Y ) : Y )
 216     # into
 217     #       ( ( (cond1) && (cond2) ) ? X : Y )
 218     # Also similarly handles expressions like:
 219     #       : (cond1) ? ( (cond2) ? X : Y ) : Y )
 220     # Note the inclusion of the close paren in ([:()]) and the open paren in ([()]) is
 221     # purely to ensure we have a balanced set of parens in the expression which makes
 222     # it easier to understand the pattern in an editor that understands paren's, we do
 223     # not expect either of these cases to actually fire. - Yves
 224     1 while $expr =~ s/
 225         ([:()])  \s*
 226             ($parens) \s*
 227             \? \s*
 228                 \( \s* ($parens) \s*
 229                     \? \s* ($parens|[^()?:\s]+?) \s*
 230                     :  \s* ($parens|[^()?:\s]+?) \s*
 231                 \) \s*
 232             : \s* \5 \s*
 233         ([()])
 234     /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
 235
 236     return $expr;
 237 }
 238
 239 #
 240 # $text= __macro(@args);
 241 # Join args together by newlines, and then neatly add backslashes to the end
 242 # of every  line as expected by the C pre-processor for #define's.
 243 #
 244
 245 sub __macro {
 246     my $str= join "\n", @_;
 247     $str =~ s/\s*$//;
 248     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 249     my $last= pop @lines;
 250     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 251     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 252     return $str . "\n";
 253 }
 254
 255 #
 256 # my $op=__incrdepth($op);
 257 #
 258 # take an 'op' hashref and add one to it and all its childrens depths.
 259 #
 260
 261 sub __incrdepth {
 262     my $op= shift;
 263     return unless ref $op;
 264     $op->{depth} += 1;
 265     __incrdepth( $op->{yes} );
 266     __incrdepth( $op->{no} );
 267     return $op;
 268 }
 269
 270 # join two branches of an opcode together with a condition, incrementing
 271 # the depth on the yes branch when we do so.
 272 # returns the new root opcode of the tree.
 273 sub __cond_join {
 274     my ( $cond, $yes, $no )= @_;
 275     return {
 276         test  => $cond,
 277         yes   => __incrdepth( $yes ),
 278         no    => $no,
 279         depth => 0,
 280     };
 281 }
 282
 283 # Methods
 284
 285 # constructor
 286 #
 287 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 288 #
 289 # Create a new CharClass::Matcher object by parsing the text in
 290 # the txt array. Currently applies the following rules:
 291 #
 292 # Element starts with C<0x>, line is evaled the result treated as
 293 # a number which is passed to chr().
 294 #
 295 # Element starts with C<">, line is evaled and the result treated
 296 # as a string.
 297 #
 298 # Each string is then stored in the 'strs' subhash as a hash record
 299 # made up of the results of __uni_latin1, using the keynames
 300 # 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
 301 # 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
 302 #
 303 # Size data is tracked per type in the 'size' subhash.
 304 #
 305 # Return an object
 306 #
 307 sub new {
 308     my $class= shift;
 309     my %opt= @_;
 310     for ( qw(op txt) ) {
 311         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 312           if !exists $opt{$_};
 313     }
 314
 315     my $self= bless {
 316         op    => $opt{op},
 317         title => $opt{title} || '',
 318     }, $class;
 319     foreach my $txt ( @{ $opt{txt} } ) {
 320         my $str= $txt;
 321         if ( $str =~ /^[""]/ ) {
 322             $str= eval $str;
 323         } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
 324                                     # list with its expansion
 325             my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
 326             die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
 327             foreach my $cp (hex $lower .. hex $upper) {
 328                 push @{$opt{txt}}, sprintf "0x%X", $cp;
 329             }
 330             next;
 331         } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
 332             # Otherwise undocumented, a leading N means is already in the
 333             # native character set; don't convert.
 334             $str= chr eval $str;
 335         } elsif ( $str =~ /^0x/ ) {
 336             $str= eval $str;
 337
 338             # Convert from Unicode/ASCII to native, if necessary
 339             $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
 340                                                     && $str <= 0xFF;
 341             $str = chr $str;
 342         } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
 343             my $property = $1;
 344             use Unicode::UCD qw(prop_invlist);
 345
 346             my @invlist = prop_invlist($property, '_perl_core_internal_ok');
 347             if (! @invlist) {
 348
 349                 # An empty return could mean an unknown property, or merely
 350                 # that it is empty.  Call in scalar context to differentiate
 351                 my $count = prop_invlist($property, '_perl_core_internal_ok');
 352                 die "$property not found" unless defined $count;
 353             }
 354
 355             # Replace this element on the list with the property's expansion
 356             for (my $i = 0; $i < @invlist; $i += 2) {
 357                 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
 358
 359                     # prop_invlist() returns native values; add leading 'N'
 360                     # to indicate that.
 361                     push @{$opt{txt}}, sprintf "N0x%X", $cp;
 362                 }
 363             }
 364             next;
 365         } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
 366             die "do '$1' failed: $!$@" if ! do $1 or $@;
 367             next;
 368         } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
 369             my @results = eval "$1";
 370             die "eval '$1' failed: $@" if $@;
 371             push @{$opt{txt}}, @results;
 372             next;
 373         } else {
 374             die "Unparsable line: $txt\n";
 375         }
 376         my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
 377         my $UTF8= $low   || $utf8;
 378         my $LATIN1= $low || $latin1;
 379         my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
 380         #die Dumper($txt,$cp,$low,$latin1,$utf8)
 381         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 382
 383         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp UTF8 LATIN1 )}=
 384           ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $UTF8, $LATIN1 );
 385         my $rec= $self->{strs}{$str};
 386         foreach my $key ( qw(low utf8 latin1 high cp UTF8 LATIN1) ) {
 387             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 388               if $self->{strs}{$str}{$key};
 389         }
 390         $self->{has_multi} ||= @$cp > 1;
 391         $self->{has_ascii} ||= $latin1 && @$latin1;
 392         $self->{has_low}   ||= $low && @$low;
 393         $self->{has_high}  ||= !$low && !$latin1;
 394     }
 395     $self->{val_fmt}= $hex_fmt;
 396     $self->{count}= 0 + keys %{ $self->{strs} };
 397     return $self;
 398 }
 399
 400 # my $trie = make_trie($type,$maxlen);
 401 #
 402 # using the data stored in the object build a trie of a specific type,
 403 # and with specific maximum depth. The trie is made up the elements of
 404 # the given types array for each string in the object (assuming it is
 405 # not too long.)
 406 #
 407 # returns the trie, or undef if there was no relevant data in the object.
 408 #
 409
 410 sub make_trie {
 411     my ( $self, $type, $maxlen )= @_;
 412
 413     my $strs= $self->{strs};
 414     my %trie;
 415     foreach my $rec ( values %$strs ) {
 416         die "panic: unknown type '$type'"
 417           if !exists $rec->{$type};
 418         my $dat= $rec->{$type};
 419         next unless $dat;
 420         next if $maxlen && @$dat > $maxlen;
 421         my $node= \%trie;
 422         foreach my $elem ( @$dat ) {
 423             $node->{$elem} ||= {};
 424             $node= $node->{$elem};
 425         }
 426         $node->{''}= $rec->{str};
 427     }
 428     return 0 + keys( %trie ) ? \%trie : undef;
 429 }
 430
 431 sub pop_count ($) {
 432     my $word = shift;
 433
 434     # This returns a list of the positions of the bits in the input word that
 435     # are 1.
 436
 437     my @positions;
 438     my $position = 0;
 439     while ($word) {
 440         push @positions, $position if $word & 1;
 441         $position++;
 442         $word >>= 1;
 443     }
 444     return @positions;
 445 }
 446
 447 # my $optree= _optree()
 448 #
 449 # recursively convert a trie to an optree where every node represents
 450 # an if else branch.
 451 #
 452 #
 453
 454 sub _optree {
 455     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
 456     return unless defined $trie;
 457     if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
 458         die "Can't do 'cp' optree from multi-codepoint strings";
 459     }
 460     $ret_type ||= 'len';
 461     $else= 0  unless defined $else;
 462     $depth= 0 unless defined $depth;
 463
 464     # if we have an emptry string as a key it means we are in an
 465     # accepting state and unless we can match further on should
 466     # return the value of the '' key.
 467     if (exists $trie->{''} ) {
 468         # we can now update the "else" value, anything failing to match
 469         # after this point should return the value from this.
 470         if ( $ret_type eq 'cp' ) {
 471             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 472             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 473         } elsif ( $ret_type eq 'len' ) {
 474             $else= $depth;
 475         } elsif ( $ret_type eq 'both') {
 476             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 477             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 478             $else= "len=$depth, $else";
 479         }
 480     }
 481     # extract the meaningful keys from the trie, filter out '' as
 482     # it means we are an accepting state (end of sequence).
 483     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 484
 485     # if we havent any keys there is no further we can match and we
 486     # can return the "else" value.
 487     return $else if !@conds;
 488
 489
 490     my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
 491     # first we loop over the possible keys/conditions and find out what they look like
 492     # we group conditions with the same optree together.
 493     my %dmp_res;
 494     my @res_order;
 495     local $Data::Dumper::Sortkeys=1;
 496     foreach my $cond ( @conds ) {
 497
 498         # get the optree for this child/condition
 499         my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else, $depth + 1 );
 500         # convert it to a string with Dumper
 501         my $res_code= Dumper( $res );
 502
 503         push @{$dmp_res{$res_code}{vals}}, $cond;
 504         if (!$dmp_res{$res_code}{optree}) {
 505             $dmp_res{$res_code}{optree}= $res;
 506             push @res_order, $res_code;
 507         }
 508     }
 509
 510     # now that we have deduped the optrees we construct a new optree containing the merged
 511     # results.
 512     my %root;
 513     my $node= \%root;
 514     foreach my $res_code_idx (0 .. $#res_order) {
 515         my $res_code= $res_order[$res_code_idx];
 516         $node->{vals}= $dmp_res{$res_code}{vals};
 517         $node->{test}= $test;
 518         $node->{yes}= $dmp_res{$res_code}{optree};
 519         $node->{depth}= $depth;
 520         if ($res_code_idx < $#res_order) {
 521             $node= $node->{no}= {};
 522         } else {
 523             $node->{no}= $else;
 524         }
 525     }
 526
 527     # return the optree.
 528     return \%root;
 529 }
 530
 531 # my $optree= optree(%opts);
 532 #
 533 # Convert a trie to an optree, wrapper for _optree
 534
 535 sub optree {
 536     my $self= shift;
 537     my %opt= @_;
 538     my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
 539     $opt{ret_type} ||= 'len';
 540     my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
 541     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
 542 }
 543
 544 # my $optree= generic_optree(%opts);
 545 #
 546 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 547 # sets of strings, including a branch for handling the string type check.
 548 #
 549
 550 sub generic_optree {
 551     my $self= shift;
 552     my %opt= @_;
 553
 554     $opt{ret_type} ||= 'len';
 555     my $test_type= 'depth';
 556     my $else= $opt{else} || 0;
 557
 558     my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
 559     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
 560
 561     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
 562       for $latin1, $utf8;
 563
 564     if ( $utf8 ) {
 565         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 566     } elsif ( $latin1 ) {
 567         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 568     }
 569     my $low= $self->make_trie( 'low', $opt{max_depth} );
 570     if ( $low ) {
 571         $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
 572     }
 573
 574     return $else;
 575 }
 576
 577 # length_optree()
 578 #
 579 # create a string length guarded optree.
 580 #
 581
 582 sub length_optree {
 583     my $self= shift;
 584     my %opt= @_;
 585     my $type= $opt{type};
 586
 587     die "Can't do a length_optree on type 'cp', makes no sense."
 588       if $type eq 'cp';
 589
 590     my ( @size, $method );
 591
 592     if ( $type eq 'generic' ) {
 593         $method= 'generic_optree';
 594         my %sizes= (
 595             %{ $self->{size}{low}    || {} },
 596             %{ $self->{size}{latin1} || {} },
 597             %{ $self->{size}{utf8}   || {} }
 598         );
 599         @size= sort { $a <=> $b } keys %sizes;
 600     } else {
 601         $method= 'optree';
 602         @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 603     }
 604
 605     my $else= ( $opt{else} ||= 0 );
 606     for my $size ( @size ) {
 607         my $optree= $self->$method( %opt, type => $type, max_depth => $size );
 608         my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 609         $else= __cond_join( $cond, $optree, $else );
 610     }
 611     return $else;
 612 }
 613
 614 sub calculate_mask(@) {
 615     my @list = @_;
 616     my $list_count = @list;
 617
 618     # Look at the input list of byte values.  This routine sees if the set
 619     # consisting of those bytes is exactly determinable by using a
 620     # mask/compare operation.  If not, it returns an empty list; if so, it
 621     # returns a list consisting of (mask, compare).  For example, consider a
 622     # set consisting of the numbers 0xF0, 0xF1, 0xF2, and 0xF3.  If we want to
 623     # know if a number 'c' is in the set, we could write:
 624     #   0xF0 <= c && c <= 0xF4
 625     # But the following mask/compare also works, and has just one test:
 626     #   c & 0xFC == 0xF0
 627     # The reason it works is that the set consists of exactly those numbers
 628     # whose first 4 bits are 1, and the next two are 0.  (The value of the
 629     # other 2 bits is immaterial in determining if a number is in the set or
 630     # not.)  The mask masks out those 2 irrelevant bits, and the comparison
 631     # makes sure that the result matches all bytes that which match those 6
 632     # material bits exactly.  In other words, the set of numbers contains
 633     # exactly those whose bottom two bit positions are either 0 or 1.  The
 634     # same principle applies to bit positions that are not necessarily
 635     # adjacent.  And it can be applied to bytes that differ in 1 through all 8
 636     # bit positions.  In order to be a candidate for this optimization, the
 637     # number of numbers in the test must be a power of 2.  Based on this
 638     # count, we know the number of bit positions that must differ.
 639     my $bit_diff_count = 0;
 640     my $compare = $list[0];
 641     if ($list_count == 2) {
 642         $bit_diff_count = 1;
 643     }
 644     elsif ($list_count == 4) {
 645         $bit_diff_count = 2;
 646     }
 647     elsif ($list_count == 8) {
 648         $bit_diff_count = 3;
 649     }
 650     elsif ($list_count == 16) {
 651         $bit_diff_count = 4;
 652     }
 653     elsif ($list_count == 32) {
 654         $bit_diff_count = 5;
 655     }
 656     elsif ($list_count == 64) {
 657         $bit_diff_count = 6;
 658     }
 659     elsif ($list_count == 128) {
 660         $bit_diff_count = 7;
 661     }
 662     elsif ($list_count == 256) {
 663         return (0, 0);
 664     }
 665
 666     # If the count wasn't a power of 2, we can't apply this optimization
 667     return if ! $bit_diff_count;
 668
 669     my %bit_map;
 670
 671     # For each byte in the list, find the bit positions in it whose value
 672     # differs from the first byte in the set.
 673     for (my $i = 1; $i < @list; $i++) {
 674         my @positions = pop_count($list[0] ^ $list[$i]);
 675
 676         # If the number of differing bits is greater than those permitted by
 677         # the set size, this optimization doesn't apply.
 678         return if @positions > $bit_diff_count;
 679
 680         # Save the bit positions that differ.
 681         foreach my $bit (@positions) {
 682             $bit_map{$bit} = 1;
 683         }
 684
 685         # If the total so far is greater than those permitted by the set size,
 686         # this optimization doesn't apply.
 687         return if keys %bit_map > $bit_diff_count;
 688
 689
 690         # The value to compare against is the AND of all the members of the
 691         # set.  The bit positions that are the same in all will be correct in
 692         # the AND, and the bit positions that differ will be 0.
 693         $compare &= $list[$i];
 694     }
 695
 696     # To get to here, we have gone through all bytes in the set,
 697     # and determined that they all differ from each other in at most
 698     # the number of bits allowed for the set's quantity.  And since we have
 699     # tested all 2**N possibilities, we know that the set includes no fewer
 700     # elements than we need,, so the optimization applies.
 701     die "panic: internal logic error" if keys %bit_map != $bit_diff_count;
 702
 703     # The mask is the bit positions where things differ, complemented.
 704     my $mask = 0;
 705     foreach my $position (keys %bit_map) {
 706         $mask |= 1 << $position;
 707     }
 708     $mask = ~$mask & 0xFF;
 709
 710     return ($mask, $compare);
 711 }
 712
 713 # _cond_as_str
 714 # turn a list of conditions into a text expression
 715 # - merges ranges of conditions, and joins the result with ||
 716 sub _cond_as_str {
 717     my ( $self, $op, $combine, $opts_ref )= @_;
 718     my $cond= $op->{vals};
 719     my $test= $op->{test};
 720     my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
 721     return "( $test )" if !defined $cond;
 722
 723     # rangify the list.
 724     my @ranges;
 725     my $Update= sub {
 726         # We skip this if there are optimizations that
 727         # we can apply (below) to the individual ranges
 728         if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
 729             if ( $ranges[-1][0] == $ranges[-1][1] ) {
 730                 $ranges[-1]= $ranges[-1][0];
 731             } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
 732                 $ranges[-1]= $ranges[-1][0];
 733                 push @ranges, $ranges[-1] + 1;
 734             }
 735         }
 736     };
 737     for my $condition ( @$cond ) {
 738         if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
 739             $Update->();
 740             push @ranges, [ $condition, $condition ];
 741         } else {
 742             $ranges[-1][1]++;
 743         }
 744     }
 745     $Update->();
 746
 747     return $self->_combine( $test, @ranges )
 748       if $combine;
 749
 750     if ($is_cp_ret) {
 751         @ranges= map {
 752             ref $_
 753             ? sprintf(
 754                 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
 755                 @$_ )
 756             : sprintf( "$self->{val_fmt} == $test", $_ );
 757         } @ranges;
 758     }
 759     else {
 760         # If the input set has certain characteristics, we can optimize tests
 761         # for it.  This doesn't apply if returning the code point, as we want
 762         # each element of the set individually.  The code above is for this
 763         # simpler case.
 764
 765         return 1 if @$cond == 256;  # If all bytes match, is trivially true
 766
 767         if (@ranges > 1) {
 768             # See if the entire set shares optimizable characterstics, and if
 769             # so, return the optimization.  We delay checking for this on sets
 770             # with just a single range, as there may be better optimizations
 771             # available in that case.
 772             my ($mask, $base) = calculate_mask(@$cond);
 773             if (defined $mask && defined $base) {
 774                 return sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask, $base;
 775             }
 776         }
 777
 778         # Here, there was no entire-class optimization.  Look at each range.
 779         for (my $i = 0; $i < @ranges; $i++) {
 780             if (! ref $ranges[$i]) {    # Trivial case: no range
 781                 $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
 782             }
 783             elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
 784                 $ranges[$i] =           # Trivial case: single element range
 785                         sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
 786             }
 787             else {
 788                 my $output = "";
 789
 790                 # Well-formed UTF-8 continuation bytes on ascii platforms must
 791                 # be in the range 0x80 .. 0xBF.  If we know that the input is
 792                 # well-formed (indicated by not trying to be 'safe'), we can
 793                 # omit tests that verify that the input is within either of
 794                 # these bounds.  (No legal UTF-8 character can begin with
 795                 # anything in this range, so we don't have to worry about this
 796                 # being a continuation byte or not.)
 797                 if (ASCII_PLATFORM
 798                     && ! $opts_ref->{safe}
 799                     && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
 800                 {
 801                     my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
 802                     my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
 803
 804                     # If the range is the entire legal range, it matches any
 805                     # legal byte, so we can omit both tests.  (This should
 806                     # happen only if the number of ranges is 1.)
 807                     if ($lower_limit_is_80 && $upper_limit_is_BF) {
 808                         return 1;
 809                     }
 810                     elsif ($lower_limit_is_80) { # Just use the upper limit test
 811                         $output = sprintf("( $test <= $self->{val_fmt} )",
 812                                             $ranges[$i]->[1]);
 813                     }
 814                     elsif ($upper_limit_is_BF) { # Just use the lower limit test
 815                         $output = sprintf("( $test >= $self->{val_fmt} )",
 816                                         $ranges[$i]->[0]);
 817                     }
 818                 }
 819
 820                 # If we didn't change to omit a test above, see if the number
 821                 # of elements is a power of 2 (only a single bit in the
 822                 # representation of its count will be set) and if so, it may
 823                 # be that a mask/compare optimization is possible.
 824                 if ($output eq ""
 825                     && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
 826                 {
 827                     my @list;
 828                     push @list, $_  for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
 829                     my ($mask, $base) = calculate_mask(@list);
 830                     if (defined $mask && defined $base) {
 831                         $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $mask, $base;
 832                     }
 833                 }
 834
 835                 if ($output ne "") {  # Prefer any optimization
 836                     $ranges[$i] = $output;
 837                 }
 838                 elsif ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
 839                     # No optimization happened.  We need a test that the code
 840                     # point is within both bounds.  But, if the bounds are
 841                     # adjacent code points, it is cleaner to say
 842                     # 'first == test || second == test'
 843                     # than it is to say
 844                     # 'first <= test && test <= second'
 845                     $ranges[$i] = "( "
 846                                 .  join( " || ", ( map
 847                                     { sprintf "$self->{val_fmt} == $test", $_ }
 848                                     @{$ranges[$i]} ) )
 849                                 . " )";
 850                 }
 851                 else {  # Full bounds checking
 852                     $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
 853                 }
 854             }
 855         }
 856     }
 857
 858     return "( " . join( " || ", @ranges ) . " )";
 859
 860 }
 861
 862 # _combine
 863 # recursively turn a list of conditions into a fast break-out condition
 864 # used by _cond_as_str() for 'cp' type macros.
 865 sub _combine {
 866     my ( $self, $test, @cond )= @_;
 867     return if !@cond;
 868     my $item= shift @cond;
 869     my ( $cstr, $gtv );
 870     if ( ref $item ) {
 871         $cstr=
 872           sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
 873             @$item );
 874         $gtv= sprintf "$self->{val_fmt}", $item->[1];
 875     } else {
 876         $cstr= sprintf( "$self->{val_fmt} == $test", $item );
 877         $gtv= sprintf "$self->{val_fmt}", $item;
 878     }
 879     if ( @cond ) {
 880         return "( $cstr || ( $gtv < $test &&\n"
 881           . $self->_combine( $test, @cond ) . " ) )";
 882     } else {
 883         return $cstr;
 884     }
 885 }
 886
 887 # _render()
 888 # recursively convert an optree to text with reasonably neat formatting
 889 sub _render {
 890     my ( $self, $op, $combine, $brace, $opts_ref )= @_;
 891     return 0 if ! defined $op;  # The set is empty
 892     if ( !ref $op ) {
 893         return $op;
 894     }
 895     my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
 896     #no warnings 'recursion';   # This would allow really really inefficient
 897                                 # code to be generated.  See pod
 898     my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref );
 899     return $yes if $cond eq '1';
 900
 901     my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref );
 902     return "( $cond )" if $yes eq '1' and $no eq '0';
 903     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
 904     return "$lb$cond ? $yes : $no$rb"
 905       if !ref( $op->{yes} ) && !ref( $op->{no} );
 906     my $ind1= " " x 4;
 907     my $ind= "\n" . ( $ind1 x $op->{depth} );
 908
 909     if ( ref $op->{yes} ) {
 910         $yes= $ind . $ind1 . $yes;
 911     } else {
 912         $yes= " " . $yes;
 913     }
 914
 915     return "$lb$cond ?$yes$ind: $no$rb";
 916 }
 917
 918 # $expr=render($op,$combine)
 919 #
 920 # convert an optree to text with reasonably neat formatting. If $combine
 921 # is true then the condition is created using "fast breakouts" which
 922 # produce uglier expressions that are more efficient for common case,
 923 # longer lists such as that resulting from type 'cp' output.
 924 # Currently only used for type 'cp' macros.
 925 sub render {
 926     my ( $self, $op, $combine, $opts_ref )= @_;
 927     my $str= "( " . $self->_render( $op, $combine, 0, $opts_ref ) . " )";
 928     return __clean( $str );
 929 }
 930
 931 # make_macro
 932 # make a macro of a given type.
 933 # calls into make_trie and (generic_|length_)optree as needed
 934 # Opts are:
 935 # type     : 'cp','generic','high','low','latin1','utf8','LATIN1','UTF8'
 936 # ret_type : 'cp' or 'len'
 937 # safe     : add length guards to macro
 938 #
 939 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
 940 # in which case it defaults to 'cp' as well.
 941 #
 942 # it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
 943 # sequences in it, as the generated macro will accept only a single codepoint
 944 # as an argument.
 945 #
 946 # returns the macro.
 947
 948
 949 sub make_macro {
 950     my $self= shift;
 951     my %opts= @_;
 952     my $type= $opts{type} || 'generic';
 953     die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
 954       if $type eq 'cp'
 955       and $self->{has_multi};
 956     my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
 957     my $method;
 958     if ( $opts{safe} ) {
 959         $method= 'length_optree';
 960     } elsif ( $type eq 'generic' ) {
 961         $method= 'generic_optree';
 962     } else {
 963         $method= 'optree';
 964     }
 965     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
 966     my $text= $self->render( $optree, $type eq 'cp', \%opts );
 967     my @args= $type eq 'cp' ? 'cp' : 's';
 968     push @args, "e" if $opts{safe};
 969     push @args, "is_utf8" if $type eq 'generic';
 970     push @args, "len" if $ret_type eq 'both';
 971     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
 972              $ret_type eq 'cp'      ? 'what_'     : 'is_';
 973     my $ext= $type     eq 'generic' ? ''          : '_' . lc( $type );
 974     $ext .= "_safe" if $opts{safe};
 975     my $argstr= join ",", @args;
 976     return "/*** GENERATED CODE ***/\n"
 977       . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
 978 }
 979
 980 # if we arent being used as a module (highly likely) then process
 981 # the __DATA__ below and produce macros in regcharclass.h
 982 # if an argument is provided to the script then it is assumed to
 983 # be the path of the file to output to, if the arg is '-' outputs
 984 # to STDOUT.
 985 if ( !caller ) {
 986     $|++;
 987     my $path= shift @ARGV || "regcharclass.h";
 988     my $out_fh;
 989     if ( $path eq '-' ) {
 990         $out_fh= \*STDOUT;
 991     } else {
 992         $out_fh = open_new( $path );
 993     }
 994     print $out_fh read_only_top( lang => 'C', by => $0,
 995                                  file => 'regcharclass.h', style => '*',
 996                                  copyright => [2007, 2011] );
 997     print $out_fh "\n#ifndef H_REGCHARCLASS   /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
 998
 999     my ( $op, $title, @txt, @types, %mods );
1000     my $doit= sub {
1001         return unless $op;
1002
1003         # Skip if to compile on a different platform.
1004         return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
1005         return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
1006
1007         print $out_fh "/*\n\t$op: $title\n\n";
1008         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1009         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
1010
1011         #die Dumper(\@types,\%mods);
1012
1013         my @mods;
1014         push @mods, 'safe' if delete $mods{safe};
1015         unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
1016                                                                 # do this one
1017                                                                 # first, as
1018                                                                 # traditional
1019         if (%mods) {
1020             die "Unknown modifiers: ", join ", ", map { "'$_'" } keys %mods;
1021         }
1022
1023         foreach my $type_spec ( @types ) {
1024             my ( $type, $ret )= split /-/, $type_spec;
1025             $ret ||= 'len';
1026             foreach my $mod ( @mods ) {
1027                 next if $mod eq 'safe' and $type eq 'cp';
1028                 delete $mods{$mod};
1029                 my $macro= $obj->make_macro(
1030                     type     => $type,
1031                     ret_type => $ret,
1032                     safe     => $mod eq 'safe'
1033                 );
1034                 print $out_fh $macro, "\n";
1035             }
1036         }
1037     };
1038
1039     while ( <DATA> ) {
1040         s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
1041         next unless /\S/;
1042         chomp;
1043         if ( /^[A-Z]/ ) {
1044             $doit->();  # This starts a new definition; do the previous one
1045             ( $op, $title )= split /\s*:\s*/, $_, 2;
1046             @txt= ();
1047         } elsif ( s/^=>// ) {
1048             my ( $type, $modifier )= split /:/, $_;
1049             @types= split ' ', $type;
1050             undef %mods;
1051             map { $mods{$_} = 1 } split ' ',  $modifier;
1052         } else {
1053             push @txt, "$_";
1054         }
1055     }
1056     $doit->();
1057
1058     print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
1059
1060     if($path eq '-') {
1061         print $out_fh "/* ex: set ro: */\n";
1062     } else {
1063         read_only_bottom_close_and_rename($out_fh)
1064     }
1065 }
1066
1067 # The form of the input is a series of definitions to make macros for.
1068 # The first line gives the base name of the macro, followed by a colon, and
1069 # then text to be used in comments associated with the macro that are its
1070 # title or description.  In all cases the first (perhaps only) parameter to
1071 # the macro is a pointer to the first byte of the code point it is to test to
1072 # see if it is in the class determined by the macro.  In the case of non-UTF8,
1073 # the code point consists only of a single byte.
1074 #
1075 # The second line must begin with a '=>' and be followed by the types of
1076 # macro(s) to be generated; these are specified below.  A colon follows the
1077 # types, followed by the modifiers, also specified below.  At least one
1078 # modifier is required.
1079 #
1080 # The subsequent lines give what code points go into the class defined by the
1081 # macro.  Multiple characters may be specified via a string like "\x0D\x0A",
1082 # enclosed in quotes.  Otherwise the lines consist of one of:
1083 #   1)  a single Unicode code point, prefaced by 0x
1084 #   2)  a single range of Unicode code points separated by a minus (and
1085 #       optional space)
1086 #   3)  a single Unicode property specified in the standard Perl form
1087 #       "\p{...}"
1088 #   4)  a line like 'do path'.  This will do a 'do' on the file given by
1089 #       'path'.  It is assumed that this does nothing but load subroutines
1090 #       (See item 5 below).  The reason 'require path' is not used instead is
1091 #       because 'do' doesn't assume that path is in @INC.
1092 #   5)  a subroutine call
1093 #           &pkg::foo(arg1, ...)
1094 #       where pkg::foo was loaded by a 'do' line (item 4).  The subroutine
1095 #       returns an array of entries of forms like items 1-3 above.  This
1096 #       allows more complex inputs than achievable from the other input types.
1097 #
1098 # A blank line or one whose first non-blank character is '#' is a comment.
1099 # The definition of the macro is terminated by a line unlike those described.
1100 #
1101 # Valid types:
1102 #   low         generate a macro whose name is 'is_BASE_low' and defines a
1103 #               class that includes only ASCII-range chars.  (BASE is the
1104 #               input macro base name.)
1105 #   latin1      generate a macro whose name is 'is_BASE_latin1' and defines a
1106 #               class that includes only upper-Latin1-range chars.  It is not
1107 #               designed to take a UTF-8 input parameter.
1108 #   high        generate a macro whose name is 'is_BASE_high' and defines a
1109 #               class that includes all relevant code points that are above
1110 #               the Latin1 range.  This is for very specialized uses only.
1111 #               It is designed to take only an input UTF-8 parameter.
1112 #   utf8        generate a macro whose name is 'is_BASE_utf8' and defines a
1113 #               class that includes all relevant characters that aren't ASCII.
1114 #               It is designed to take only an input UTF-8 parameter.
1115 #   LATIN1      generate a macro whose name is 'is_BASE_latin1' and defines a
1116 #               class that includes both ASCII and upper-Latin1-range chars.
1117 #               It is not designed to take a UTF-8 input parameter.
1118 #   UTF8        generate a macro whose name is 'is_BASE_utf8' and defines a
1119 #               class that can include any code point, adding the 'low' ones
1120 #               to what 'utf8' works on.  It is designed to take only an input
1121 #               UTF-8 parameter.
1122 #   generic     generate a macro whose name is 'is_BASE".  It has a 2nd,
1123 #               boolean, parameter which indicates if the first one points to
1124 #               a UTF-8 string or not.  Thus it works in all circumstances.
1125 #   cp          generate a macro whose name is 'is_BASE_cp' and defines a
1126 #               class that returns true if the UV parameter is a member of the
1127 #               class; false if not.
1128 # A macro of the given type is generated for each type listed in the input.
1129 # The default return value is the number of octets read to generate the match.
1130 # Append "-cp" to the type to have it instead return the matched codepoint.
1131 #               The macro name is changed to 'what_BASE...'.  See pod for
1132 #               caveats
1133 # Appending '-both" instead adds an extra parameter to the end of the argument
1134 #               list, which is a pointer as to where to store the number of
1135 #               bytes matched, while also returning the code point.  The macro
1136 #               name is changed to 'what_len_BASE...'.  See pod for caveats
1137 #
1138 # Valid modifiers:
1139 #   safe        The input string is not necessarily valid UTF-8.  In
1140 #               particular an extra parameter (always the 2nd) to the macro is
1141 #               required, which points to one beyond the end of the string.
1142 #               The macro will make sure not to read off the end of the
1143 #               string.  In the case of non-UTF8, it makes sure that the
1144 #               string has at least one byte in it.  The macro name has
1145 #               '_safe' appended to it.
1146 #   fast        The input string is valid UTF-8.  No bounds checking is done,
1147 #               and the macro can make assumptions that lead to faster
1148 #               execution.
1149 #   only_ascii_platform   Skip this definition if this program is being run on
1150 #               a non-ASCII platform.
1151 #   only_ebcdic_platform  Skip this definition if this program is being run on
1152 #               a non-EBCDIC platform.
1153 # No modifier need be specified; fast is assumed for this case.  If both
1154 # 'fast', and 'safe' are specified, two macros will be created for each
1155 # 'type'.
1156 #
1157 # If run on a non-ASCII platform will automatically convert the Unicode input
1158 # to native.  The documentation above is slightly wrong in this case.  'low'
1159 # actually refers to code points whose UTF-8 representation is the same as the
1160 # non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1161 # code points less than 256.
1162
1163 1; # in the unlikely case we are being used as a module
1164
1165 __DATA__
1166 # This is no longer used, but retained in case it is needed some day.
1167 # TRICKYFOLD: Problematic fold case letters.  When adding to this list, also should add them to regcomp.c and fold_grind.t
1168 # => generic cp generic-cp generic-both :fast safe
1169 # 0x00DF        # LATIN SMALL LETTER SHARP S
1170 # 0x0390        # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1171 # 0x03B0        # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1172 # 0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1173 # 0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1174 # 0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1175
1176 LNBREAK: Line Break: \R
1177 => generic UTF8 LATIN1 :fast safe
1178 "\x0D\x0A"      # CRLF - Network (Windows) line ending
1179 \p{VertSpace}
1180
1181 HORIZWS: Horizontal Whitespace: \h \H
1182 => generic UTF8 LATIN1 cp :fast safe
1183 \p{HorizSpace}
1184
1185 VERTWS: Vertical Whitespace: \v \V
1186 => generic UTF8 LATIN1 cp :fast safe
1187 \p{VertSpace}
1188
1189 REPLACEMENT: Unicode REPLACEMENT CHARACTER
1190 => UTF8 :safe
1191 0xFFFD
1192
1193 NONCHAR: Non character code points
1194 => UTF8 :fast
1195 \p{Nchar}
1196
1197 SURROGATE: Surrogate characters
1198 => UTF8 :fast
1199 \p{Gc=Cs}
1200
1201 GCB_L: Grapheme_Cluster_Break=L
1202 => UTF8 :fast
1203 \p{_X_GCB_L}
1204
1205 GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
1206 => UTF8 :fast
1207 \p{_X_LV_LVT_V}
1208
1209 GCB_Prepend: Grapheme_Cluster_Break=Prepend
1210 => UTF8 :fast
1211 \p{_X_GCB_Prepend}
1212
1213 GCB_RI: Grapheme_Cluster_Break=RI
1214 => UTF8 :fast
1215 \p{_X_RI}
1216
1217 GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins
1218 => UTF8 :fast
1219 \p{_X_Special_Begin}
1220
1221 GCB_T: Grapheme_Cluster_Break=T
1222 => UTF8 :fast
1223 \p{_X_GCB_T}
1224
1225 GCB_V: Grapheme_Cluster_Break=V
1226 => UTF8 :fast
1227 \p{_X_GCB_V}
1228
1229 # This program was run with this enabled, and the results copied to utf8.h;
1230 # then this was commented out because it takes so long to figure out these 2
1231 # million code points.  The results would not change unless utf8.h decides it
1232 # wants a maximum other than 4 bytes, or this program creates better
1233 # optimizations
1234 #UTF8_CHAR: Matches utf8 from 1 to 4 bytes
1235 #=> UTF8 :safe only_ascii_platform
1236 #0x0 - 0x1FFFFF
1237
1238 # This hasn't been commented out, because we haven't an EBCDIC platform to run
1239 # it on, and the 3 types of EBCDIC allegedly supported by Perl would have
1240 # different results
1241 UTF8_CHAR: Matches utf8 from 1 to 5 bytes
1242 => UTF8 :safe only_ebcdic_platform
1243 0x0 - 0x3FFFFF:
1244
1245 QUOTEMETA: Meta-characters that \Q should quote
1246 => high :fast
1247 \p{_Perl_Quotemeta}