X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/b4069bca6054692e4fffa8e9e04572511e910fbd..31c7f561ae1fcf5096c82b0ce7d0ab0dc6899204:/regen/mk_invlists.pl diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index 64e3d43..e5083c0 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -2,7 +2,7 @@ use 5.015; use strict; use warnings; -use Unicode::UCD "prop_invlist"; +use Unicode::UCD qw(prop_invlist prop_invmap); require 'regen/regen_lib.pl'; # This program outputs charclass_invlists.h, which contains various inversion @@ -15,7 +15,7 @@ require 'regen/regen_lib.pl'; # in the headers is used to minimize the possibility of things getting # out-of-sync, or the wrong data structure being passed. Currently that # random number is: -my $VERSION_DATA_STRUCTURE_TYPE = 1064334010; +my $VERSION_DATA_STRUCTURE_TYPE = 290655244; my $out_fh = open_new('charclass_invlists.h', '>', {style => '*', by => $0, @@ -23,10 +23,16 @@ my $out_fh = open_new('charclass_invlists.h', '>', print $out_fh "/* See the generating file for comments */\n\n"; +my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 ); + sub output_invlist ($$) { my $name = shift; my $invlist = shift; # Reference to inversion list array + die "No inversion list for $name" unless defined $invlist + && ref $invlist eq 'ARRAY' + && @$invlist; + # Output the inversion list $invlist using the name $name for it. # It is output in the exact internal form for inversion lists. @@ -47,10 +53,12 @@ sub output_invlist ($$) { $zero_or_one = 1; } - print $out_fh "\nUV ${name}_invlist[] = {\n"; + print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name}; + print $out_fh "\nstatic UV ${name}_invlist[] = {\n"; print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n"; print $out_fh "\t0,\t/* Current iteration position */\n"; + print $out_fh "\t0,\t/* Cache of previous search index result */\n"; print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n"; print $out_fh "\t", $zero_or_one, ",\t/* 0 if this is the first element of the list proper;", @@ -66,6 +74,62 @@ sub output_invlist ($$) { print $out_fh "\t$invlist->[-1]\n"; print $out_fh "};\n"; + print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name}; + +} + +sub mk_invlist_from_cp_list { + + # Returns an inversion list constructed from the sorted input array of + # code points + + my $list_ref = shift; + + # Initialize to just the first element + my @invlist = ( $list_ref->[0], $list_ref->[0] + 1); + + # For each succeeding element, if it extends the previous range, adjust + # up, otherwise add it. + for my $i (1 .. @$list_ref - 1) { + if ($invlist[-1] == $list_ref->[$i]) { + $invlist[-1]++; + } + else { + push @invlist, $list_ref->[$i], $list_ref->[$i] + 1; + } + } + return @invlist; +} + +# Read in the Case Folding rules, and construct arrays of code points for the +# properties we need. +my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding"); +die "Could not find inversion map for Case_Folding" unless defined $format; +die "Incorrect format '$format' for Case_Folding inversion map" + unless $format eq 'al'; +my @has_multi_char_fold; +my @is_non_final_fold; + +for my $i (0 .. @$folds_ref - 1) { + next unless ref $folds_ref->[$i]; # Skip single-char folds + push @has_multi_char_fold, $cp_ref->[$i]; + + # Add to the the non-finals list each code point that is in a non-final + # position + for my $j (0 .. @{$folds_ref->[$i]} - 2) { + push @is_non_final_fold, $folds_ref->[$i][$j] + unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold; + } +} + +sub _Perl_Multi_Char_Folds { + @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold; + return mk_invlist_from_cp_list(\@has_multi_char_fold); +} + +sub _Perl_Non_Final_Folds { + @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold; + return mk_invlist_from_cp_list(\@is_non_final_fold); } output_invlist("Latin1", [ 0, 256 ]); @@ -90,6 +154,9 @@ output_invlist("AboveLatin1", [ 256 ]); # In the list of properties below that get generated, the L1 prefix is a fake # property that means just the Latin1 range of the full property (whose name # has an X prefix instead of L1). +# +# An initial & means to use the subroutine from this file instead of an +# official inversion list. for my $prop (qw( ASCII @@ -122,6 +189,8 @@ for my $prop (qw( L1PosixWord PosixXDigit XPosixXDigit + &NonL1_Perl_Non_Final_Folds + &_Perl_Multi_Char_Folds ) ) { @@ -138,11 +207,23 @@ for my $prop (qw( # start a new range above 255, as that could be construed as going to # infinity. For example, the Upper property doesn't include the character # at 255, but does include the one at 256. We don't include the 256 one. - my $lookup_prop = $prop; - $lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//; - my @invlist = prop_invlist($lookup_prop); + my $prop_name = $prop; + my $is_local_sub = $prop_name =~ s/^&//; + my $lookup_prop = $prop_name; + my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//); + my $nonl1_only = 0; + $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only; - if ($lookup_prop ne $prop) { + my @invlist; + if ($is_local_sub) { + @invlist = eval $lookup_prop; + } + else { + @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); + } + die "Could not find inversion list for '$lookup_prop'" unless @invlist; + + if ($l1_only) { for my $i (0 .. @invlist - 1 - 1) { if ($invlist[$i] > 255) { @@ -165,8 +246,26 @@ for my $prop (qw( } } } + elsif ($nonl1_only) { + my $found_nonl1 = 0; + for my $i (0 .. @invlist - 1 - 1) { + next if $invlist[$i] < 256; + + # Here, we have the first element in the array that indicates an + # element above Latin1. Get rid of all previous ones. + splice @invlist, 0, $i; + + # If this one's index is not divisible by 2, it means that this + # element is inverting away from being in the list, which means + # all code points from 256 to this one are in this list. + unshift @invlist, 256 if $i % 2 != 0; + $found_nonl1 = 1; + last; + } + die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1; + } - output_invlist($prop, \@invlist); + output_invlist($prop_name, \@invlist); } read_only_bottom_close_and_rename($out_fh)