pat_advanced.t: Update test

[perl5.git] / regen / regcharclass.pl
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl

index 1f780a4..8e3f06d 100755 (executable)
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -4,7 +4,6 @@ use strict;
  use 5.008;
  use warnings;
  use warnings FATAL => 'all';
-no warnings 'experimental::autoderef';
  use Data::Dumper;
  $Data::Dumper::Useqq= 1;
  our $hex_fmt= "0x%02X";
@@ -12,9 +11,9 @@ our $hex_fmt= "0x%02X";
  sub DEBUG () { 0 }
  $|=1 if DEBUG;
  
-sub ASCII_PLATFORM { (ord('A') == 65) }
-
-require 'regen/regen_lib.pl';
+require './regen/regen_lib.pl';
+require './regen/charset_translations.pl';
+require "./regen/regcharclass_multi_char_folds.pl";
  
  =head1 NAME
  
@@ -22,7 +21,7 @@ CharClass::Matcher -- Generate C macros that match character classes efficiently
  
  =head1 SYNOPSIS
  
-    perl Porting/regcharclass.pl
+    perl regen/regcharclass.pl
  
  =head1 DESCRIPTION
  
@@ -44,7 +43,7 @@ the C<__DATA__> line):
  =item C<is_WHATEVER_safe(s,e,is_utf8)>
  
  Do a lookup as appropriate based on the C<is_utf8> flag. When possible
-comparisons involving octect<128 are done before checking the C<is_utf8>
+comparisons involving octet<128 are done before checking the C<is_utf8>
  flag, hopefully saving time.
  
  The version without the C<_safe> suffix should be used only when the input is
@@ -110,6 +109,13 @@ include it, and it is a NULL.
  
  =back
  
+The above isn't quite complete, as for specialized purposes one can get a
+macro like C<is_WHATEVER_utf8_no_length_checks(s)>, which assumes that it is
+already known that there is enough space to hold the character starting at
+C<s>, but otherwise checks that it is well-formed.  In other words, this is
+intermediate in the amount of checking between C<is_WHATEVER_utf8(s)> and
+C<is_WHATEVER_utf8_safe(s,e)>.
+
  =head2 CODE FORMAT
  
  perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
@@ -161,38 +167,36 @@ License or the Artistic License, as specified in the README file.
  #
  
  sub __uni_latin1 {
+    my $charset= shift;
      my $str= shift;
      my $max= 0;
      my @cp;
      my @cp_high;
      my $only_has_invariants = 1;
+    my $a2n = get_a2n($charset);
      for my $ch ( split //, $str ) {
          my $cp= ord $ch;
-        push @cp, $cp;
-        push @cp_high, $cp if $cp > 255;
          $max= $cp if $max < $cp;
-        if (! ASCII_PLATFORM && $only_has_invariants) {
-            if ($cp > 255) {
-                $only_has_invariants = 0;
-            }
-            else {
-                my $temp = chr($cp);
-                utf8::upgrade($temp);
-                my @utf8 = unpack "U0C*", $temp;
-                $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
-            }
+        if ($cp > 255) {
+            push @cp, $cp;
+            push @cp_high, $cp;
+        }
+        else {
+            push @cp, $a2n->[$cp];
          }
      }
      my ( $n, $l, $u );
-    $only_has_invariants = $max < 128 if ASCII_PLATFORM;
+    $only_has_invariants = ($charset =~ /ascii/i) ? $max < 128 : $max < 160;
      if ($only_has_invariants) {
          $n= [@cp];
      } else {
          $l= [@cp] if $max && $max < 256;
  
-        $u= $str;
-        utf8::upgrade($u);
-        $u= [ unpack "U0C*", $u ] if defined $u;
+        my @u;
+        for my $ch ( split //, $str ) {
+            push @u, map { ord } split //, cp_2_utfbytes(ord $ch, $charset);
+        }
+        $u = \@u;
      }
      return ( \@cp, \@cp_high, $n, $l, $u );
  }
@@ -353,10 +357,6 @@ sub new {
              $str= chr eval $str;
          } elsif ( $str =~ /^0x/ ) {
              $str= eval $str;
-
-            # Convert from Unicode/ASCII to native, if necessary
-            $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
-                                                    && $str <= 0xFF;
              $str = chr $str;
          } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
              my $property = $1;
@@ -392,7 +392,7 @@ sub new {
          } else {
              die "Unparsable line: $txt\n";
          }
-        my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $str );
+        my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $opt{charset}, $str );
          my $UTF8= $low   || $utf8;
          my $LATIN1= $low || $latin1;
          my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
@@ -505,25 +505,9 @@ sub _optree {
      # can return the "else" value.
      return $else if !@conds;
  
-    # Assuming Perl is being released from an ASCII platform, the below makes
-    # it work for non-UTF-8 out-of-the box when porting to non-ASCII, by
-    # adding a translation back to ASCII.  This is the wrong thing to do for
-    # UTF-EBCDIC, as that is different from UTF-8.  But the intent here is
-    # that this regen should be run on the target system, which will omit the
-    # translation, and generate the correct UTF-EBCDIC.  On ASCII systems, the
-    # translation macros expand to just their argument, so there is no harm
-    # done nor performance penalty by including them.
-    my $test;
-    if ($test_type =~ /^cp/) {
-        $test = "cp";
-        $test = "NATIVE_TO_UNI($test)" if ASCII_PLATFORM;
-    }
-    else {
-        $test = "((U8*)s)[$depth]";
-        $test = "NATIVE_TO_LATIN1($test)" if ASCII_PLATFORM;
-    }
+    my $test = $test_type =~ /^cp/ ? "cp" : "((const U8*)s)[$depth]";
  
-    # first we loop over the possible keys/conditions and find out what they
+    # First we loop over the possible keys/conditions and find out what they
      # look like; we group conditions with the same optree together.
      my %dmp_res;
      my @res_order;
@@ -626,6 +610,8 @@ sub length_optree {
  
      my $else= ( $opt{else} ||= 0 );
  
+    return $else if $self->{count} == 0;
+
      my $method = $type =~ /generic/ ? 'generic_optree' : 'optree';
      if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) {
  
@@ -686,8 +672,9 @@ sub length_optree {
              # have only a few things that can match past this, so I (khw)
              # don't think it is worth it.  (Even better would be to use
              # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it
-            # if it saves a bunch.
-            my $cond = "(((e) - (s)) >= UTF8SKIP(s))";
+            # if it saves a bunch.)  We assume that input text is likely to
+            # be well-formed.
+            my $cond = "LIKELY(((e) - (s)) >= UTF8SKIP(s))";
              $else = __cond_join($cond, $utf8, $else);
  
              # For 'generic', we also will want the latin1 UTF-8 variants for
@@ -730,7 +717,7 @@ sub length_optree {
              }
  
              # We need at least one byte available to start off the tests
-            $else = __cond_join("((e) > (s))", $else, 0);
+            $else = __cond_join("LIKELY((e) > (s))", $else, 0);
          }
          else {  # Here, we don't want or there aren't any variants.  A single
                  # byte available is enough.
@@ -752,44 +739,32 @@ sub calculate_mask(@) {
  
      # Consider a set of byte values, A, B, C ....  If we want to determine if
      # <c> is one of them, we can write c==A || c==B || c==C ....  If the
-    # values are consecutive, we can shorten that to A<=c && c<=Z, which uses
-    # far fewer branches.  If only some of them are consecutive we can still
-    # save some branches by creating range tests for just those that are
-    # consecutive. _cond_as_str() does this work for looking for ranges.
+    # values are consecutive, we can shorten that to inRANGE(c, 'A', 'Z'),
+    # which uses far fewer branches.  If only some of them are consecutive we
+    # can still save some branches by creating range tests for just those that
+    # are consecutive. _cond_as_str() does this work for looking for ranges.
      #
      # Another approach is to look at the bit patterns for A, B, C .... and see
      # if they have some commonalities.  That's what this function does.  For
      # example, consider a set consisting of the bytes
-    # 0xF0, 0xF1, 0xF2, and 0xF3.  We could write:
-    #   0xF0 <= c && c <= 0xF4
-    # But the following mask/compare also works, and has just one test:
-    #   (c & 0xFC) == 0xF0
-    # The reason it works is that the set consists of exactly those bytes
-    # whose first 4 bits are 1, and the next two are 0.  (The value of the
-    # other 2 bits is immaterial in determining if a byte is in the set or
-    # not.)  The mask masks out those 2 irrelevant bits, and the comparison
-    # makes sure that the result matches all bytes which match those 6
-    # material bits exactly.  In other words, the set of bytes contains
-    # exactly those whose bottom two bit positions are either 0 or 1.  The
-    # same principle applies to bit positions that are not necessarily
-    # adjacent.  And it can be applied to bytes that differ in 1 through all 8
-    # bit positions.  In order to be a candidate for this optimization, the
-    # number of bytes in the set must be a power of 2.
-    #
-    # Consider a different example, the set 0x53, 0x54, 0x73, and 0x74.  That
-    # requires 4 tests using either ranges or individual values, and even
-    # though the number in the set is a power of 2, it doesn't qualify for the
-    # mask optimization described above because the number of bits that are
-    # different is too large for that.  However, the set can be expressed as
-    # two branches with masks thusly:
-    #   (c & 0xDF) == 0x53 || (c & 0xDF) == 0x54
-    # a branch savings of 50%.  This is done by splitting the set into two
-    # subsets each of which has 2 elements, and within each set the values
-    # differ by 1 byte.
+    # 0x42, 0x43, 0x62, and 0x63.  We could write:
+    #   inRANGE(c, 0x42, 0x43) || inRANGE(c, 0x62, 0x63)
+    # which through the magic of casting has not 4, but 2 tests.  But the
+    # following mask/compare also works, and has just one test:
+    #   (c & 0xDE) == 0x42
+    # The reason it works is that the set consists of exactly the 4 bit
+    # patterns which have either 0 or 1 in the two bit positions that are 0 in
+    # the mask.  They have the same value in each bit position where the mask
+    # is 1.  The comparison makes sure that the result matches all bytes which
+    # match those six 1 bits exactly.  This can be applied to bytes that
+    # differ in 1 through all 8 bit positions.  In order to be a candidate for
+    # this optimization, the number of bytes in the set must be a power of 2.
      #
-    # This function attempts to find some way to save some branches using the
-    # mask technique.  If not, it returns an empty list; if so, it
-    # returns a list consisting of
+    # It may be that the bytes needing to be matched can't be done with a
+    # single mask.  But it may be possible to have two (or more) sets, each
+    # with a separate mask.  This function attempts to find some way to save
+    # some branches using the mask technique.  If not, it returns an empty
+    # list; if so, it returns a list consisting of
      #   [ [compare1, mask1], [compare2, mask2], ...
      #     [compare_n, undef], [compare_m, undef], ...
      #   ]
@@ -887,7 +862,7 @@ sub calculate_mask(@) {
      my @final_results;
      foreach my $count (reverse sort { $a <=> $b } keys %hash) {
          my $need = 2 ** $count;     # Need 8 values for 3 differing bits, etc
-        foreach my $bits (sort keys $hash{$count}) {
+        foreach my $bits (sort keys $hash{$count}->%*) {
  
              print STDERR __LINE__, ": For $count bit(s) difference ($bits), need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
  
@@ -975,7 +950,7 @@ sub calculate_mask(@) {
      # individually.
      my @individuals;
      foreach my $count (reverse sort { $a <=> $b } keys %hash) {
-        foreach my $bits (sort keys $hash{$count}) {
+        foreach my $bits (sort keys $hash{$count}->%*) {
              foreach my $remaining (@{$hash{$count}{$bits}}) {
  
                  # If we already know about this value, just ignore it.
@@ -1038,7 +1013,7 @@ sub _cond_as_str {
          @ranges= map {
              ref $_
              ? sprintf(
-                "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
+                "isRANGE( $test, $self->{val_fmt}, $self->{val_fmt} )",
                  @$_ )
              : sprintf( "$self->{val_fmt} == $test", $_ );
          } @ranges;
@@ -1057,9 +1032,9 @@ sub _cond_as_str {
      if (@ranges > 1) {
  
          # See if the entire set shares optimizable characteristics, and if so,
-        # return the optimization.  We delay checking for this on sets with
-        # just a single range, as there may be better optimizations available
-        # in that case.
+        # return the optimization.  There is no need to do this on sets with
+        # just a single range, as that can be expressed with a single
+        # conditional.
          @masks = calculate_mask(@$cond);
  
          # Stringify the output of calculate_mask()
@@ -1122,72 +1097,26 @@ sub _cond_as_str {
              # bounds.  (No legal UTF-8 character can begin with anything in
              # this range, so we don't have to worry about this being a
              # continuation byte or not.)
-            if (ASCII_PLATFORM
-                && ! $opts_ref->{safe}
+            if ($opts_ref->{charset} =~ /ascii/i
+                && (! $opts_ref->{safe} && ! $opts_ref->{no_length_checks})
                  && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
              {
-                my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
-                my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
-
                  # If the range is the entire legal range, it matches any legal
                  # byte, so we can omit both tests.  (This should happen only
                  # if the number of ranges is 1.)
-                if ($lower_limit_is_80 && $upper_limit_is_BF) {
+                if ($ranges[$i]->[0] == 0x80 && $ranges[$i]->[1] == 0xBF) {
                      return 1;
                  }
-                elsif ($lower_limit_is_80) { # Just use the upper limit test
-                    $output = sprintf("( $test <= $self->{val_fmt} )",
-                                        $ranges[$i]->[1]);
-                }
-                elsif ($upper_limit_is_BF) { # Just use the lower limit test
-                    $output = sprintf("( $test >= $self->{val_fmt} )",
-                                    $ranges[$i]->[0]);
-                }
-            }
-
-            # If we didn't change to omit a test above, see if the number of
-            # elements is a power of 2 (only a single bit in the
-            # representation of its count will be set) and if so, it may be
-            # that a mask/compare optimization is possible.
-            if ($output eq ""
-                && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
-            {
-                my @list;
-                push @list, $_  for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
-                my @this_masks = calculate_mask(@list);
-
-                # Use the mask if there is just one for the whole range.
-                # Otherwise there is no savings over the two branches that can
-                # define the range.
-                if (@this_masks == 1 && defined $this_masks[0][1]) {
-                    $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $this_masks[0][1], $this_masks[0][0];
-                }
              }
  
-            if ($output ne "") {  # Prefer any optimization
-                $ranges[$i] = $output;
-            }
-            else {
-                # No optimization happened.  We need a test that the code
-                # point is within both bounds.  But, if the bounds are
-                # adjacent code points, it is cleaner to say
-                # 'first == test || second == test'
-                # than it is to say
-                # 'first <= test && test <= second'
-
-                $range_count_extra++;   # This range requires 2 branches to
-                                        # represent
-                if ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
-                    $ranges[$i] = "( "
-                                .  join( " || ", ( map
-                                    { sprintf "$self->{val_fmt} == $test", $_ }
-                                    @{$ranges[$i]} ) )
-                                . " )";
-                }
-                else {  # Full bounds checking
-                    $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
-                }
-            }
+            # Here, it isn't the full range of legal continuation bytes.  We
+            # could just assume that there's nothing outside of the legal
+            # bounds.  But inRANGE() allows us to have a single conditional,
+            # so the only cost of making sure it's a legal UTF-8 continuation
+            # byte is an extra subtraction instruction, a trivial expense.
+            $ranges[$i] = sprintf("inRANGE($test, $self->{val_fmt},"
+                                                . " $self->{val_fmt} )",
+                                        $ranges[$i]->[0], $ranges[$i]->[1]);
          }
      }
  
@@ -1219,7 +1148,7 @@ sub _combine {
          }
          else {
              $cstr=
-          sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
+          sprintf( "inRANGE($test, $self->{val_fmt}, $self->{val_fmt})",
                     @$item );
          }
          $gtv= sprintf "$self->{val_fmt}", $item->[1];
@@ -1297,9 +1226,11 @@ sub render {
  # make a macro of a given type.
  # calls into make_trie and (generic_|length_)optree as needed
  # Opts are:
-# type     : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
-# ret_type : 'cp' or 'len'
-# safe     : add length guards to macro
+# type             : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
+# ret_type         : 'cp' or 'len'
+# safe             : don't assume the input is well-formed UTF-8, so don't skip
+#                    any range checks, and add length guards to the macro
+# no_length_checks : like safe, but don't add length guards.
  #
  # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
  # in which case it defaults to 'cp' as well.
@@ -1310,7 +1241,7 @@ sub render {
  #
  # It is also illegal to do a non-safe macro on a pattern with multi-codepoint
  # sequences in it, as even if it is known to be well-formed, we need to not
-# run off the end of the buffer when say the buffer ends with the first two
+# run off the end of the buffer when, say, the buffer ends with the first two
  # characters, but three are looked at by the macro.
  #
  # returns the macro.
@@ -1346,6 +1277,7 @@ sub make_macro {
      my $ext= $type     =~ /generic/ ? ''          : '_' . lc( $type );
      $ext .= '_non_low' if $type eq 'generic_non_low';
      $ext .= "_safe" if $opts{safe};
+    $ext .= "_no_length_checks" if $opts{no_length_checks};
      my $argstr= join ",", @args;
      my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
      my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
@@ -1374,24 +1306,27 @@ WARNING: These macros are for internal Perl core use only, and may be
  changed or removed without notice.
  EOF
      );
-    print $out_fh "\n#ifndef H_REGCHARCLASS   /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
+    print $out_fh "\n#ifndef PERL_REGCHARCLASS_H_ /* Guard against nested #includes */\n#define PERL_REGCHARCLASS_H_\n";
  
      my ( $op, $title, @txt, @types, %mods );
-    my $doit= sub {
+    my $doit= sub ($) {
          return unless $op;
  
+        my $charset = shift;
+
          # Skip if to compile on a different platform.
-        return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
-        return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
+        return if delete $mods{only_ascii_platform} && $charset !~ /ascii/i;
+        return if delete $mods{only_ebcdic_platform} && $charset !~ /ebcdic/i;
  
          print $out_fh "/*\n\t$op: $title\n\n";
          print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
-        my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
+        my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt, charset => $charset);
  
          #die Dumper(\@types,\%mods);
  
          my @mods;
          push @mods, 'safe' if delete $mods{safe};
+        push @mods, 'no_length_checks' if delete $mods{no_length_checks};
          unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
                                                                  # do this one
                                                                  # first, as
@@ -1410,44 +1345,84 @@ EOF
                  # way a cp macro will get generated.  Below we convert 'safe'
                  # to 'fast' in this instance
                  next if $type =~ /^cp/
-                        && $mod eq 'safe'
-                        && grep { 'fast' eq $_ } @mods;
+                        && ($mod eq 'safe' || $mod eq 'no_length_checks')
+                        && grep { 'fast' =~ $_ } @mods;
                  delete $mods{$mod};
                  my $macro= $obj->make_macro(
                      type     => $type,
                      ret_type => $ret,
                      safe     => $mod eq 'safe' && $type !~ /^cp/,
+                    charset  => $charset,
+                    no_length_checks => $mod eq 'no_length_checks' && $type !~ /^cp/,
                  );
                  print $out_fh $macro, "\n";
              }
          }
      };
  
-    while ( <DATA> ) {
-        s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
-        next unless /\S/;
-        chomp;
-        if ( /^[A-Z]/ ) {
-            $doit->();  # This starts a new definition; do the previous one
-            ( $op, $title )= split /\s*:\s*/, $_, 2;
-            @txt= ();
-        } elsif ( s/^=>// ) {
-            my ( $type, $modifier )= split /:/, $_;
-            @types= split ' ', $type;
-            undef %mods;
-            map { $mods{$_} = 1 } split ' ',  $modifier;
-        } else {
-            push @txt, "$_";
+    my @data = <DATA>;
+    foreach my $charset (get_supported_code_pages()) {
+        my $first_time = 1;
+        undef $op;
+        undef $title;
+        undef @txt;
+        undef @types;
+        undef %mods;
+        print $out_fh "\n", get_conditional_compile_line_start($charset);
+        my @data_copy = @data;
+        for (@data_copy) {
+            s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
+            next unless /\S/;
+            chomp;
+            if ( /^[A-Z]/ ) {
+                $doit->($charset) unless $first_time;  # This starts a new
+                                                       # definition; do the
+                                                       # previous one
+                $first_time = 0;
+                ( $op, $title )= split /\s*:\s*/, $_, 2;
+                @txt= ();
+            } elsif ( s/^=>// ) {
+                my ( $type, $modifier )= split /:/, $_;
+                @types= split ' ', $type;
+                undef %mods;
+                map { $mods{$_} = 1 } split ' ',  $modifier;
+            } else {
+                push @txt, "$_";
+            }
          }
+        $doit->($charset);
+        print $out_fh get_conditional_compile_line_end();
      }
-    $doit->();
  
-    print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
+    print $out_fh "\n#endif /* PERL_REGCHARCLASS_H_ */\n";
  
      if($path eq '-') {
         print $out_fh "/* ex: set ro: */\n";
      } else {
-       read_only_bottom_close_and_rename($out_fh)
+        # Some of the sources for these macros come from Unicode tables
+        my $sources_list = "lib/unicore/mktables.lst";
+        my @sources = ($0, qw(lib/unicore/mktables
+                              lib/Unicode/UCD.pm
+                              regen/regcharclass_multi_char_folds.pl
+                              regen/charset_translations.pl
+                             ));
+        {
+            # Depend on mktables’ own sources.  It’s a shorter list of files than
+            # those that Unicode::UCD uses.
+            if (! open my $mktables_list, '<', $sources_list) {
+
+                # This should force a rebuild once $sources_list exists
+                push @sources, $sources_list;
+            }
+            else {
+                while(<$mktables_list>) {
+                    last if /===/;
+                    chomp;
+                    push @sources, "lib/unicore/$_" if /^[^#]/;
+                }
+            }
+        }
+        read_only_bottom_close_and_rename($out_fh, \@sources)
      }
  }
  
@@ -1538,12 +1513,15 @@ EOF
  #               string.  In the case of non-UTF8, it makes sure that the
  #               string has at least one byte in it.  The macro name has
  #               '_safe' appended to it.
+#   no_length_checks  The input string is not necessarily valid UTF-8, but it
+#               is to be assumed that the length has already been checked and
+#               found to be valid.
  #   fast        The input string is valid UTF-8.  No bounds checking is done,
  #               and the macro can make assumptions that lead to faster
  #               execution.
-#   only_ascii_platform   Skip this definition if this program is being run on
+#   only_ascii_platform   Skip this definition if the character set is for
  #               a non-ASCII platform.
-#   only_ebcdic_platform  Skip this definition if this program is being run on
+#   only_ebcdic_platform  Skip this definition if the character set is for
  #               a non-EBCDIC platform.
  # No modifier need be specified; fast is assumed for this case.  If both
  # 'fast', and 'safe' are specified, two macros will be created for each
@@ -1589,61 +1567,13 @@ XPERLSPACE: \p{XPerlSpace}
  => high cp_high : fast
  \p{XPerlSpace}
  
-REPLACEMENT: Unicode REPLACEMENT CHARACTER
-=> UTF8 :safe
-0xFFFD
-
  NONCHAR: Non character code points
-=> UTF8 :fast
-\p{Nchar}
-
-SURROGATE: Surrogate characters
-=> UTF8 :fast
-\p{Gc=Cs}
-
-GCB_L: Grapheme_Cluster_Break=L
-=> UTF8 :fast
-\p{_X_GCB_L}
-
-GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
-=> UTF8 :fast
-\p{_X_LV_LVT_V}
-
-GCB_Prepend: Grapheme_Cluster_Break=Prepend
-=> UTF8 :fast
-\p{_X_GCB_Prepend}
-
-GCB_RI: Grapheme_Cluster_Break=RI
-=> UTF8 :fast
-\p{_X_RI}
-
-GCB_SPECIAL_BEGIN_START: Grapheme_Cluster_Break=special_begin_starts
-=> UTF8 :fast
-\p{_X_Special_Begin_Start}
-
-GCB_T: Grapheme_Cluster_Break=T
-=> UTF8 :fast
-\p{_X_GCB_T}
+=> UTF8 :safe
+\p{_Perl_Nchar}
  
-GCB_V: Grapheme_Cluster_Break=V
-=> UTF8 :fast
-\p{_X_GCB_V}
-
-# This program was run with this enabled, and the results copied to utf8.h;
-# then this was commented out because it takes so long to figure out these 2
-# million code points.  The results would not change unless utf8.h decides it
-# wants a maximum other than 4 bytes, or this program creates better
-# optimizations
-#UTF8_CHAR: Matches utf8 from 1 to 4 bytes
-#=> UTF8 :safe only_ascii_platform
-#0x0 - 0x1FFFFF
-
-# This hasn't been commented out, because we haven't an EBCDIC platform to run
-# it on, and the 3 types of EBCDIC allegedly supported by Perl would have
-# different results
-UTF8_CHAR: Matches utf8 from 1 to 5 bytes
-=> UTF8 :safe only_ebcdic_platform
-0x0 - 0x3FFFFF:
+SURROGATE: Surrogate code points
+=> UTF8 :safe
+\p{_Perl_Surrogate}
  
  QUOTEMETA: Meta-characters that \Q should quote
  => high :fast
@@ -1651,7 +1581,6 @@ QUOTEMETA: Meta-characters that \Q should quote
  
  MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
  => UTF8 :safe
-do regen/regcharclass_multi_char_folds.pl
  
  # 1 => All folds
  &regcharclass_multi_char_folds::multi_char_folds(1)
@@ -1676,4 +1605,8 @@ PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are
  
  PATWS: pattern white space
  => generic cp : safe
-\p{PatWS}
+\p{_Perl_PatWS}
+
+HANGUL_ED: Hangul syllables whose first character is \xED
+=> UTF8 :only_ascii_platform safe
+0xD000 - 0xD7FF