X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/850b7ec98b08ce3e7330267f4331f3006ea95551..216b41c2e7ca756cac276d5de6435f6dac31b86f:/regen/mk_PL_charclass.pl diff --git a/regen/mk_PL_charclass.pl b/regen/mk_PL_charclass.pl index 918bb4d..4b46bd0 100644 --- a/regen/mk_PL_charclass.pl +++ b/regen/mk_PL_charclass.pl @@ -3,6 +3,7 @@ use v5.15.8; use strict; use warnings; require 'regen/regen_lib.pl'; +require 'regen/charset_translations.pl'; # This program outputs l1_charclass_tab.h, which defines the guts of the # PL_charclass table. Each line is a bit map of properties that the Unicode @@ -22,6 +23,7 @@ require 'regen/regen_lib.pl'; # new Unicode release, to make sure things haven't been changed by it. my @properties = qw( + NONLATIN1_SIMPLE_FOLD NONLATIN1_FOLD ALPHANUMERIC ALPHA @@ -36,7 +38,6 @@ my @properties = qw( LOWER NON_FINAL_FOLD PRINT - PSXSPC PUNCT QUOTEMETA SPACE @@ -45,12 +46,13 @@ my @properties = qw( XDIGIT VERTSPACE IS_IN_SOME_FOLD - BACKSLASH_FOO_LBRACE_IS_META + MNEMONIC_CNTRL ); # Read in the case fold mappings. my %folded_closure; my @hex_non_final_folds; +my @non_latin1_simple_folds; my @folds; use Unicode::UCD; @@ -107,8 +109,8 @@ BEGIN { # Have to do this at compile time because using user-defined \p{property my $from = hex $hex_from; - # Perl only deals with C and F folds - next if $fold_type ne 'C' and $fold_type ne 'F'; + # Perl only deals with S, C, and F folds + next if $fold_type ne 'C' and $fold_type ne 'F' and $fold_type ne 'S'; # Get each code point in the range that participates in this line's fold. # The hash has keys of each code point in the range, and values of what it @@ -119,9 +121,20 @@ BEGIN { # Have to do this at compile time because using user-defined \p{property push @{$folded_closure{$fold}}, $from if $fold < 256; push @{$folded_closure{$from}}, $fold if $from < 256; - if ($i < @folded-1 - && $fold < 256 - && ! grep { $_ eq $hex_fold } @hex_non_final_folds) + if (($fold_type eq 'C' || $fold_type eq 'S') + && ($fold < 256 != $from < 256)) + { + # Fold is simple (hence can't be a non-final fold, so the 'if' + # above is mutualy exclusive from the 'if below) and crosses + # 255/256 boundary. We keep track of the Latin1 code points + # in such folds. + push @non_latin1_simple_folds, ($fold < 256) + ? $fold + : $from; + } + elsif ($i < @folded-1 + && $fold < 256 + && ! grep { $_ eq $hex_fold } @hex_non_final_folds) { push @hex_non_final_folds, $hex_fold; @@ -140,6 +153,16 @@ BEGIN { # Have to do this at compile time because using user-defined \p{property push @{$folded_closure{$from}}, @{$folded_closure{$folded}}; } } + + # We have the single-character folds that cross the 255/256, like KELVIN + # SIGN => 'k', but we need the closure, so add like 'K' to it + foreach my $folded (@non_latin1_simple_folds) { + foreach my $fold (@{$folded_closure{$folded}}) { + if ($fold < 256 && ! grep { $fold == $_ } @non_latin1_simple_folds) { + push @non_latin1_simple_folds, $fold; + } + } + } } sub Is_Non_Latin1_Fold { @@ -152,6 +175,12 @@ sub Is_Non_Latin1_Fold { return join("\n", @return) . "\n"; } +sub Is_Non_Latin1_Simple_Fold { # Latin1 code points that are folded to by + # non-Latin1 code points as single character + # folds + return join("\n", map { sprintf "%X", $_ } @non_latin1_simple_folds) . "\n"; +} + sub Is_Non_Final_Fold { return join("\n", @hex_non_final_folds) . "\n"; } @@ -175,8 +204,7 @@ for my $ord (0..255) { # Here, isn't an _L1. If its _A, it's automatically false for # non-ascii. The only current ones (besides ASCII) without a # suffix are valid over the whole range. - next if $name =~ s/_A$// && $ord >= 128; - + next if $name =~ s/_A$// && $char !~ /\p{ASCII}/; } my $re; if ($name eq 'PUNCT') {; @@ -190,8 +218,6 @@ for my $ord (0..255) { $re = qr/\p{XPerlSpace}/; } elsif ($name eq 'IDFIRST') { $re = qr/[_\p{Alpha}]/; - } elsif ($name eq 'PSXSPC') { - $re = qr/[\v\p{Space}]/; } elsif ($name eq 'WORDCHAR') { $re = qr/\p{XPosixWord}/; } elsif ($name eq 'ALPHANUMERIC') { @@ -201,17 +227,15 @@ for my $ord (0..255) { $re = qr/\p{_Perl_Quotemeta}/; } elsif ($name eq 'NONLATIN1_FOLD') { $re = qr/\p{Is_Non_Latin1_Fold}/; + } elsif ($name eq 'NONLATIN1_SIMPLE_FOLD') { + $re = qr/\p{Is_Non_Latin1_Simple_Fold}/; } elsif ($name eq 'NON_FINAL_FOLD') { $re = qr/\p{Is_Non_Final_Fold}/; } elsif ($name eq 'IS_IN_SOME_FOLD') { $re = qr/\p{_Perl_Any_Folds}/; - } elsif ($name eq 'BACKSLASH_FOO_LBRACE_IS_META') { - - # This is true for FOO where FOO is the varying character in: - # \a{, \b{, \c{, ... - # and the sequence has non-literal meaning to Perl; so it is true - # for 'x' because \x{ is special, but not 'a' because \a{ isn't. - $re = qr/[gkNopPx]/; + } elsif ($name eq 'MNEMONIC_CNTRL') { + # These are the control characters that there are mnemonics for + $re = qr/[\a\b\e\f\n\r\t]/; } else { # The remainder have the same name and values as Unicode $re = eval "qr/\\p{$name}/"; use Carp; @@ -226,99 +250,82 @@ for my $ord (0..255) { #print __LINE__, " $ord $char $bits[$ord]\n"; } -# Names of C0 controls -my @C0 = qw ( - NUL - SOH - STX - ETX - EOT - ENQ - ACK - BEL - BS - HT - LF - VT - FF - CR - SO - SI - DLE - DC1 - DC2 - DC3 - DC4 - NAK - SYN - ETB - CAN - EOM - SUB - ESC - FS - GS - RS - US - ); - -# Names of C1 controls, plus the adjacent DEL -my @C1 = qw( - DEL - PAD - HOP - BPH - NBH - IND - NEL - SSA - ESA - HTS - HTJ - VTS - PLD - PLU - RI - SS2 - SS3 - DCS - PU1 - PU2 - STS - CCH - MW - SPA - EPA - SOS - SGC - SCI - CSI - ST - OSC - PM - APC - ); - my $out_fh = open_new('l1_char_class_tab.h', '>', {style => '*', by => $0, from => "property definitions"}); +print $out_fh < 32 && $ord < 127) { # Graphic - $name = "'" . chr($ord) . "'"; - } elsif ($ord >= 127 && $ord <= 0x9f) { - $name = $C1[$ord - 127]; # A C1 control + DEL - } else { # SPACE, or, if Latin1, shorten the name */ - use charnames(); - $name = charnames::viacode($ord); - $name =~ s/LATIN CAPITAL LETTER // - || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/; +foreach my $charset (get_supported_code_pages()) { + my @a2n = @{get_a2n($charset)}; + my @out; + + print $out_fh "\n" . get_conditional_compile_line_start($charset); + for my $ord (0..255) { + my $name; + my $char = chr $ord; + if ($char =~ /\p{PosixGraph}/) { + my $quote = $char eq "'" ? '"' : "'"; + $name = $quote . chr($ord) . $quote; + } + elsif ($char =~ /\p{XPosixGraph}/) { + use charnames(); + $name = charnames::viacode($ord); + $name =~ s/LATIN CAPITAL LETTER // + or $name =~ s/LATIN SMALL LETTER (.*)/\L$1/ + or $name =~ s/ SIGN\b// + or $name =~ s/EXCLAMATION MARK/'!'/ + or $name =~ s/QUESTION MARK/'?'/ + or $name =~ s/QUOTATION MARK/QUOTE/ + or $name =~ s/ INDICATOR//; + $name =~ s/\bWITH\b/\L$&/; + $name =~ s/\bONE\b/1/; + $name =~ s/\b(TWO|HALF)\b/2/; + $name =~ s/\bTHREE\b/3/; + $name =~ s/\b QUARTER S? \b/4/x; + $name =~ s/VULGAR FRACTION (.) (.)/$1\/$2/; + $name =~ s/\bTILDE\b/'~'/i + or $name =~ s/\bCIRCUMFLEX\b/'^'/i + or $name =~ s/\bSTROKE\b/'\/'/i + or $name =~ s/ ABOVE\b//i; + } + else { + use Unicode::UCD qw(prop_invmap); + my ($list_ref, $map_ref, $format) = prop_invmap("Name_Alias"); + if ($format !~ /^s/) { + use Carp; + carp "Unexpected format '$format' for 'Name_Alias"; + last; + } + my $which = Unicode::UCD::search_invlist($list_ref, $ord); + if (! defined $which) { + use Carp; + carp "No name found for code pont $ord"; + } + else { + my $map = $map_ref->[$which]; + if (! ref $map) { + $name = $map; + } + else { + # Just pick the first abbreviation if more than one + my @names = grep { $_ =~ /abbreviation/ } @$map; + $name = $names[0]; + } + $name =~ s/:.*//; + } + } + my $index = $a2n[$ord]; + $out[$index] = ($ord == $index) + ? sprintf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord] + : sprintf "/* 0x%02X U+%02X %s */ %s,\n", $index, $ord, $name, $bits[$ord]; } - printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord]; + print $out_fh join "", @out; + print $out_fh "\n" . get_conditional_compile_line_end(); } read_only_bottom_close_and_rename($out_fh)