X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/c30a0cf2fdbe81e28e7b5cd87a818bee7ac46dc8..d43328d502ac91c4d98e218d0721cd5f3bcd3950:/regen/mk_PL_charclass.pl diff --git a/regen/mk_PL_charclass.pl b/regen/mk_PL_charclass.pl index aaefb46..8b217b3 100644 --- a/regen/mk_PL_charclass.pl +++ b/regen/mk_PL_charclass.pl @@ -2,8 +2,8 @@ use v5.15.8; use strict; use warnings; -require 'regen/regen_lib.pl'; -require 'regen/charset_translations.pl'; +require './regen/regen_lib.pl'; +require './regen/charset_translations.pl'; # This program outputs l1_charclass_tab.h, which defines the guts of the # PL_charclass table. Each line is a bit map of properties that the Unicode @@ -38,7 +38,6 @@ my @properties = qw( LOWER NON_FINAL_FOLD PRINT - PSXSPC PUNCT QUOTEMETA SPACE @@ -47,10 +46,12 @@ my @properties = qw( XDIGIT VERTSPACE IS_IN_SOME_FOLD + MNEMONIC_CNTRL ); # Read in the case fold mappings. my %folded_closure; +my %simple_folded_closure; my @hex_non_final_folds; my @non_latin1_simple_folds; my @folds; @@ -118,8 +119,14 @@ BEGIN { # Have to do this at compile time because using user-defined \p{property for my $i (0 .. @folded - 1) { my $hex_fold = $folded[$i]; my $fold = hex $hex_fold; - push @{$folded_closure{$fold}}, $from if $fold < 256; - push @{$folded_closure{$from}}, $fold if $from < 256; + if ($fold < 256) { + push @{$folded_closure{$fold}}, $from; + push @{$simple_folded_closure{$fold}}, $from if $fold_type ne 'F'; + } + if ($from < 256) { + push @{$folded_closure{$from}}, $fold; + push @{$simple_folded_closure{$from}}, $fold if $fold_type ne 'F'; + } if (($fold_type eq 'C' || $fold_type eq 'S') && ($fold < 256 != $from < 256)) @@ -153,11 +160,16 @@ BEGIN { # Have to do this at compile time because using user-defined \p{property push @{$folded_closure{$from}}, @{$folded_closure{$folded}}; } } + foreach my $folded (keys %simple_folded_closure) { + foreach my $from (grep { $_ < 256 } @{$simple_folded_closure{$folded}}) { + push @{$simple_folded_closure{$from}}, @{$simple_folded_closure{$folded}}; + } + } # We have the single-character folds that cross the 255/256, like KELVIN # SIGN => 'k', but we need the closure, so add like 'K' to it foreach my $folded (@non_latin1_simple_folds) { - foreach my $fold (@{$folded_closure{$folded}}) { + foreach my $fold (@{$simple_folded_closure{$folded}}) { if ($fold < 256 && ! grep { $fold == $_ } @non_latin1_simple_folds) { push @non_latin1_simple_folds, $fold; } @@ -217,14 +229,18 @@ for my $ord (0..255) { } elsif ($name eq 'SPACE') {; $re = qr/\p{XPerlSpace}/; } elsif ($name eq 'IDFIRST') { - $re = qr/[_\p{Alpha}]/; - } elsif ($name eq 'PSXSPC') { - $re = qr/[\v\p{Space}]/; + $re = qr/[_\p{XPosixAlpha}]/; } elsif ($name eq 'WORDCHAR') { $re = qr/\p{XPosixWord}/; + } elsif ($name eq 'LOWER') { + $re = qr/\p{XPosixLower}/; + } elsif ($name eq 'UPPER') { + $re = qr/\p{XPosixUpper}/; } elsif ($name eq 'ALPHANUMERIC') { # Like \w, but no underscore $re = qr/\p{Alnum}/; + } elsif ($name eq 'ALPHA') { + $re = qr/\p{XPosixAlpha}/; } elsif ($name eq 'QUOTEMETA') { $re = qr/\p{_Perl_Quotemeta}/; } elsif ($name eq 'NONLATIN1_FOLD') { @@ -235,12 +251,15 @@ for my $ord (0..255) { $re = qr/\p{Is_Non_Final_Fold}/; } elsif ($name eq 'IS_IN_SOME_FOLD') { $re = qr/\p{_Perl_Any_Folds}/; + } elsif ($name eq 'MNEMONIC_CNTRL') { + # These are the control characters that there are mnemonics for + $re = qr/[\a\b\e\f\n\r\t]/; } else { # The remainder have the same name and values as Unicode $re = eval "qr/\\p{$name}/"; use Carp; carp $@ if ! defined $re; } - #print "$ord, $name $property, $re\n"; + #print STDERR __LINE__, ": $ord, $name $property, $re\n"; if ($char =~ $re) { # Add this property if matches $bits[$ord] .= '|' if $bits[$ord]; $bits[$ord] .= "(1U<<_CC_$property)"; @@ -259,9 +278,22 @@ print $out_fh <[$i]] = $i; + } + } print $out_fh "\n" . get_conditional_compile_line_start($charset); for my $ord (0..255) { @@ -294,10 +326,11 @@ foreach my $charset (get_supported_code_pages()) { } else { use Unicode::UCD qw(prop_invmap); - my ($list_ref, $map_ref, $format) = prop_invmap("Name_Alias"); + my ($list_ref, $map_ref, $format) + = prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); if ($format !~ /^s/) { use Carp; - carp "Unexpected format '$format' for 'Name_Alias"; + carp "Unexpected format '$format' for '_Perl_Name_Alias"; last; } my $which = Unicode::UCD::search_invlist($list_ref, $ord); @@ -318,11 +351,41 @@ foreach my $charset (get_supported_code_pages()) { $name =~ s/:.*//; } } + my $index = $a2n[$ord]; - $out[$index] = ($ord == $index) - ? sprintf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord] - : sprintf "/* 0x%02X U+%02X %s */ %s,\n", $index, $ord, $name, $bits[$ord]; + my $i8; + $i8 = $utf_to_i8[$index] if @utf_to_i8; + + $out[$index] = "/* "; + $out[$index] .= sprintf "0x%02X ", $index if $ord != $index; + $out[$index] .= sprintf "U+%02X ", $ord; + $out[$index] .= sprintf "I8=%02X ", $i8 if defined $i8 && $i8 != $ord; + $out[$index] .= "$name */ "; + $out[$index] .= $bits[$ord]; + + # For EBCDIC character sets, we also add some data for when the bytes + # are in UTF-EBCDIC; these are based on the fundamental + # characteristics of UTF-EBCDIC. + if (@utf_to_i8) { + if ($i8 >= 0xC5 && $i8 != 0xE0) { + $out[$index] .= '|(1U<<_CC_UTF8_IS_START)'; + if ($i8 <= 0xC7) { + $out[$index] .= '|(1U<<_CC_UTF8_IS_DOWNGRADEABLE_START)'; + } + } + if (($i8 & 0xE0) == 0xA0) { + $out[$index] .= '|(1U<<_CC_UTF8_IS_CONTINUATION)'; + } + if ($i8 >= 0xF1) { + $out[$index] .= + '|(1U<<_CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)'; + } + } + + $out[$index] .= ",\n"; } + $out[-1] =~ s/,$//; # No trailing comma in the final entry + print $out_fh join "", @out; print $out_fh "\n" . get_conditional_compile_line_end(); }