utf8.h: Use machine generated IS_UTF8_CHAR()

[perl5.git] / regen / regcharclass.pl
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl

index 1d4a921..7d12642 100755 (executable)
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -395,6 +395,22 @@ sub make_trie {
      return 0 + keys( %trie ) ? \%trie : undef;
  }
  
+sub pop_count ($) {
+    my $word = shift;
+
+    # This returns a list of the positions of the bits in the input word that
+    # are 1.
+
+    my @positions;
+    my $position = 0;
+    while ($word) {
+        push @positions, $position if $word & 1;
+        $position++;
+        $word >>= 1;
+    }
+    return @positions;
+}
+
  # my $optree= _optree()
  #
  # recursively convert a trie to an optree where every node represents
@@ -541,19 +557,121 @@ sub length_optree {
      return $else;
  }
  
+sub calculate_mask(@) {
+    my @list = @_;
+    my $list_count = @list;
+
+    # Look at the input list of byte values.  This routine sees if the set
+    # consisting of those bytes is exactly determinable by using a
+    # mask/compare operation.  If not, it returns an empty list; if so, it
+    # returns a list consisting of (mask, compare).  For example, consider a
+    # set consisting of the numbers 0xF0, 0xF1, 0xF2, and 0xF3.  If we want to
+    # know if a number 'c' is in the set, we could write:
+    #   0xF0 <= c && c <= 0xF4
+    # But the following mask/compare also works, and has just one test:
+    #   c & 0xFC == 0xF0
+    # The reason it works is that the set consists of exactly those numbers
+    # whose first 4 bits are 1, and the next two are 0.  (The value of the
+    # other 2 bits is immaterial in determining if a number is in the set or
+    # not.)  The mask masks out those 2 irrelevant bits, and the comparison
+    # makes sure that the result matches all bytes that which match those 6
+    # material bits exactly.  In other words, the set of numbers contains
+    # exactly those whose bottom two bit positions are either 0 or 1.  The
+    # same principle applies to bit positions that are not necessarily
+    # adjacent.  And it can be applied to bytes that differ in 1 through all 8
+    # bit positions.  In order to be a candidate for this optimization, the
+    # number of numbers in the test must be a power of 2.  Based on this
+    # count, we know the number of bit positions that must differ.
+    my $bit_diff_count = 0;
+    my $compare = $list[0];
+    if ($list_count == 2) {
+        $bit_diff_count = 1;
+    }
+    elsif ($list_count == 4) {
+        $bit_diff_count = 2;
+    }
+    elsif ($list_count == 8) {
+        $bit_diff_count = 3;
+    }
+    elsif ($list_count == 16) {
+        $bit_diff_count = 4;
+    }
+    elsif ($list_count == 32) {
+        $bit_diff_count = 5;
+    }
+    elsif ($list_count == 64) {
+        $bit_diff_count = 6;
+    }
+    elsif ($list_count == 128) {
+        $bit_diff_count = 7;
+    }
+    elsif ($list_count == 256) {
+        return (0, 0);
+    }
+
+    # If the count wasn't a power of 2, we can't apply this optimization
+    return if ! $bit_diff_count;
+
+    my %bit_map;
+
+    # For each byte in the list, find the bit positions in it whose value
+    # differs from the first byte in the set.
+    for (my $i = 1; $i < @list; $i++) {
+        my @positions = pop_count($list[0] ^ $list[$i]);
+
+        # If the number of differing bits is greater than those permitted by
+        # the set size, this optimization doesn't apply.
+        return if @positions > $bit_diff_count;
+
+        # Save the bit positions that differ.
+        foreach my $bit (@positions) {
+            $bit_map{$bit} = 1;
+        }
+
+        # If the total so far is greater than those permitted by the set size,
+        # this optimization doesn't apply.
+        return if keys %bit_map > $bit_diff_count;
+
+
+        # The value to compare against is the AND of all the members of the
+        # set.  The bit positions that are the same in all will be correct in
+        # the AND, and the bit positions that differ will be 0.
+        $compare &= $list[$i];
+    }
+
+    # To get to here, we have gone through all bytes in the set,
+    # and determined that they all differ from each other in at most
+    # the number of bits allowed for the set's quantity.  And since we have
+    # tested all 2**N possibilities, we know that the set includes no fewer
+    # elements than we need,, so the optimization applies.
+    die "panic: internal logic error" if keys %bit_map != $bit_diff_count;
+
+    # The mask is the bit positions where things differ, complemented.
+    my $mask = 0;
+    foreach my $position (keys %bit_map) {
+        $mask |= 1 << $position;
+    }
+    $mask = ~$mask & 0xFF;
+
+    return ($mask, $compare);
+}
+
  # _cond_as_str
  # turn a list of conditions into a text expression
  # - merges ranges of conditions, and joins the result with ||
  sub _cond_as_str {
-    my ( $self, $op, $combine )= @_;
+    my ( $self, $op, $combine, $opts_ref )= @_;
      my $cond= $op->{vals};
      my $test= $op->{test};
+    my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
      return "( $test )" if !defined $cond;
  
-    # rangify the list
+    # rangify the list.
      my @ranges;
      my $Update= sub {
-        if ( @ranges ) {
+        # We skip this if there are optimizations that
+        # we can apply (below) to the individual ranges
+        if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
              if ( $ranges[-1][0] == $ranges[-1][1] ) {
                  $ranges[-1]= $ranges[-1][0];
              } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
@@ -562,25 +680,129 @@ sub _cond_as_str {
              }
          }
      };
-    for my $cond ( @$cond ) {
-        if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
+    for my $condition ( @$cond ) {
+        if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
              $Update->();
-            push @ranges, [ $cond, $cond ];
+            push @ranges, [ $condition, $condition ];
          } else {
              $ranges[-1][1]++;
          }
      }
      $Update->();
+
      return $self->_combine( $test, @ranges )
        if $combine;
-    @ranges= map {
-        ref $_
-          ? sprintf(
-            "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
-            @$_ )
-          : sprintf( "$self->{val_fmt} == $test", $_ );
-    } @ranges;
+
+    if ($is_cp_ret) {
+        @ranges= map {
+            ref $_
+            ? sprintf(
+                "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
+                @$_ )
+            : sprintf( "$self->{val_fmt} == $test", $_ );
+        } @ranges;
+    }
+    else {
+        # If the input set has certain characteristics, we can optimize tests
+        # for it.  This doesn't apply if returning the code point, as we want
+        # each element of the set individually.  The code above is for this
+        # simpler case.
+
+        return 1 if @$cond == 256;  # If all bytes match, is trivially true
+
+        if (@ranges > 1) {
+            # See if the entire set shares optimizable characterstics, and if
+            # so, return the optimization.  We delay checking for this on sets
+            # with just a single range, as there may be better optimizations
+            # available in that case.
+            my ($mask, $base) = calculate_mask(@$cond);
+            if (defined $mask && defined $base) {
+                return sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask, $base;
+            }
+        }
+
+        # Here, there was no entire-class optimization.  Look at each range.
+        for (my $i = 0; $i < @ranges; $i++) {
+            if (! ref $ranges[$i]) {    # Trivial case: no range
+                $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
+            }
+            elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
+                $ranges[$i] =           # Trivial case: single element range
+                        sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
+            }
+            else {
+                my $output = "";
+
+                # Well-formed UTF-8 continuation bytes on ascii platforms must
+                # be in the range 0x80 .. 0xBF.  If we know that the input is
+                # well-formed (indicated by not trying to be 'safe'), we can
+                # omit tests that verify that the input is within either of
+                # these bounds.  (No legal UTF-8 character can begin with
+                # anything in this range, so we don't have to worry about this
+                # being a continuation byte or not.)
+                if (ASCII_PLATFORM
+                    && ! $opts_ref->{safe}
+                    && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
+                {
+                    my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
+                    my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
+
+                    # If the range is the entire legal range, it matches any
+                    # legal byte, so we can omit both tests.  (This should
+                    # happen only if the number of ranges is 1.)
+                    if ($lower_limit_is_80 && $upper_limit_is_BF) {
+                        return 1;
+                    }
+                    elsif ($lower_limit_is_80) { # Just use the upper limit test
+                        $output = sprintf("( $test <= $self->{val_fmt} )",
+                                            $ranges[$i]->[1]);
+                    }
+                    elsif ($upper_limit_is_BF) { # Just use the lower limit test
+                        $output = sprintf("( $test >= $self->{val_fmt} )",
+                                        $ranges[$i]->[0]);
+                    }
+                }
+
+                # If we didn't change to omit a test above, see if the number
+                # of elements is a power of 2 (only a single bit in the
+                # representation of its count will be set) and if so, it may
+                # be that a mask/compare optimization is possible.
+                if ($output eq ""
+                    && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
+                {
+                    my @list;
+                    push @list, $_  for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
+                    my ($mask, $base) = calculate_mask(@list);
+                    if (defined $mask && defined $base) {
+                        $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $mask, $base;
+                    }
+                }
+
+                if ($output ne "") {  # Prefer any optimization
+                    $ranges[$i] = $output;
+                }
+                elsif ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
+                    # No optimization happened.  We need a test that the code
+                    # point is within both bounds.  But, if the bounds are
+                    # adjacent code points, it is cleaner to say
+                    # 'first == test || second == test'
+                    # than it is to say
+                    # 'first <= test && test <= second'
+                    $ranges[$i] = "( "
+                                .  join( " || ", ( map
+                                    { sprintf "$self->{val_fmt} == $test", $_ }
+                                    @{$ranges[$i]} ) )
+                                . " )";
+                }
+                else {  # Full bounds checking
+                    $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
+                }
+            }
+        }
+    }
+
      return "( " . join( " || ", @ranges ) . " )";
+
  }
  
  # _combine
@@ -611,16 +833,18 @@ sub _combine {
  # _render()
  # recursively convert an optree to text with reasonably neat formatting
  sub _render {
-    my ( $self, $op, $combine, $brace )= @_;
+    my ( $self, $op, $combine, $brace, $opts_ref )= @_;
      return 0 if ! defined $op;  # The set is empty
      if ( !ref $op ) {
          return $op;
      }
-    my $cond= $self->_cond_as_str( $op, $combine );
+    my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
      #no warnings 'recursion';   # This would allow really really inefficient
                                  # code to be generated.  See pod
-    my $yes= $self->_render( $op->{yes}, $combine, 1 );
-    my $no= $self->_render( $op->{no},   $combine, 0 );
+    my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref );
+    return $yes if $cond eq '1';
+
+    my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref );
      return "( $cond )" if $yes eq '1' and $no eq '0';
      my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
      return "$lb$cond ? $yes : $no$rb"
@@ -645,8 +869,8 @@ sub _render {
  # longer lists such as that resulting from type 'cp' output.
  # Currently only used for type 'cp' macros.
  sub render {
-    my ( $self, $op, $combine )= @_;
-    my $str= "( " . $self->_render( $op, $combine ) . " )";
+    my ( $self, $op, $combine, $opts_ref )= @_;
+    my $str= "( " . $self->_render( $op, $combine, 0, $opts_ref ) . " )";
      return __clean( $str );
  }
  
@@ -685,7 +909,7 @@ sub make_macro {
          $method= 'optree';
      }
      my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
-    my $text= $self->render( $optree, $type eq 'cp' );
+    my $text= $self->render( $optree, $type eq 'cp', \%opts );
      my @args= $type eq 'cp' ? 'cp' : 's';
      push @args, "e" if $opts{safe};
      push @args, "is_utf8" if $type eq 'generic';
@@ -721,6 +945,11 @@ if ( !caller ) {
      my ( $op, $title, @txt, @types, %mods );
      my $doit= sub {
          return unless $op;
+
+        # Skip if to compile on a different platform.
+        return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
+        return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
+
          print $out_fh "/*\n\t$op: $title\n\n";
          print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
          my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
@@ -852,6 +1081,10 @@ if ( !caller ) {
  #   fast        The input string is valid UTF-8.  No bounds checking is done,
  #               and the macro can make assumptions that lead to faster
  #               execution.
+#   only_ascii_platform   Skip this definition if this program is being run on
+#               a non-ASCII platform.
+#   only_ebcdic_platform  Skip this definition if this program is being run on
+#               a non-EBCDIC platform.
  # No modifier need be specified; fast is assumed for this case.  If both
  # 'fast', and 'safe' are specified, two macros will be created for each
  # 'type'.
@@ -888,6 +1121,18 @@ VERTWS: Vertical Whitespace: \v \V
  => generic UTF8 LATIN1 cp :fast safe
  \p{VertSpace}
  
+REPLACEMENT: Unicode REPLACEMENT CHARACTER
+=> UTF8 :safe
+0xFFFD
+
+NONCHAR: Non character code points
+=> UTF8 :fast
+\p{Nchar}
+
+SURROGATE: Surrogate characters
+=> UTF8 :fast
+\p{Gc=Cs}
+
  GCB_L: Grapheme_Cluster_Break=L
  => UTF8 :fast
  \p{_X_GCB_L}
@@ -915,3 +1160,23 @@ GCB_T: Grapheme_Cluster_Break=T
  GCB_V: Grapheme_Cluster_Break=V
  => UTF8 :fast
  \p{_X_GCB_V}
+
+# This program was run with this enabled, and the results copied to utf8.h;
+# then this was commented out because it takes so long to figure out these 2
+# million code points.  The results would not change unless utf8.h decides it
+# wants a maximum other than 4 bytes, or this program creates better
+# optimizations
+#UTF8_CHAR: Matches utf8 from 1 to 4 bytes
+#=> UTF8 :safe only_ascii_platform
+#0x0 - 0x1FFFFF
+
+# This hasn't been commented out, because we haven't an EBCDIC platform to run
+# it on, and the 3 types of EBCDIC allegedly supported by Perl would have
+# different results
+UTF8_CHAR: Matches utf8 from 1 to 5 bytes
+=> UTF8 :safe only_ebcdic_platform
+0x0 - 0x3FFFFF:
+
+QUOTEMETA: Meta-characters that \Q should quote
+=> high :fast
+\p{_Perl_Quotemeta}