X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/377a58578acd4376703a673bb8df01ce912cce1d..d43328d502ac91c4d98e218d0721cd5f3bcd3950:/regen/mk_PL_charclass.pl diff --git a/regen/mk_PL_charclass.pl b/regen/mk_PL_charclass.pl index 17280fd..8b217b3 100644 --- a/regen/mk_PL_charclass.pl +++ b/regen/mk_PL_charclass.pl @@ -2,8 +2,8 @@ use v5.15.8; use strict; use warnings; -require 'regen/regen_lib.pl'; -require 'regen/charset_translations.pl'; +require './regen/regen_lib.pl'; +require './regen/charset_translations.pl'; # This program outputs l1_charclass_tab.h, which defines the guts of the # PL_charclass table. Each line is a bit map of properties that the Unicode @@ -229,12 +229,18 @@ for my $ord (0..255) { } elsif ($name eq 'SPACE') {; $re = qr/\p{XPerlSpace}/; } elsif ($name eq 'IDFIRST') { - $re = qr/[_\p{Alpha}]/; + $re = qr/[_\p{XPosixAlpha}]/; } elsif ($name eq 'WORDCHAR') { $re = qr/\p{XPosixWord}/; + } elsif ($name eq 'LOWER') { + $re = qr/\p{XPosixLower}/; + } elsif ($name eq 'UPPER') { + $re = qr/\p{XPosixUpper}/; } elsif ($name eq 'ALPHANUMERIC') { # Like \w, but no underscore $re = qr/\p{Alnum}/; + } elsif ($name eq 'ALPHA') { + $re = qr/\p{XPosixAlpha}/; } elsif ($name eq 'QUOTEMETA') { $re = qr/\p{_Perl_Quotemeta}/; } elsif ($name eq 'NONLATIN1_FOLD') { @@ -272,9 +278,22 @@ print $out_fh <[$i]] = $i; + } + } print $out_fh "\n" . get_conditional_compile_line_start($charset); for my $ord (0..255) { @@ -307,10 +326,11 @@ foreach my $charset (get_supported_code_pages()) { } else { use Unicode::UCD qw(prop_invmap); - my ($list_ref, $map_ref, $format) = prop_invmap("Name_Alias"); + my ($list_ref, $map_ref, $format) + = prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); if ($format !~ /^s/) { use Carp; - carp "Unexpected format '$format' for 'Name_Alias"; + carp "Unexpected format '$format' for '_Perl_Name_Alias"; last; } my $which = Unicode::UCD::search_invlist($list_ref, $ord); @@ -331,11 +351,41 @@ foreach my $charset (get_supported_code_pages()) { $name =~ s/:.*//; } } + my $index = $a2n[$ord]; - $out[$index] = ($ord == $index) - ? sprintf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord] - : sprintf "/* 0x%02X U+%02X %s */ %s,\n", $index, $ord, $name, $bits[$ord]; + my $i8; + $i8 = $utf_to_i8[$index] if @utf_to_i8; + + $out[$index] = "/* "; + $out[$index] .= sprintf "0x%02X ", $index if $ord != $index; + $out[$index] .= sprintf "U+%02X ", $ord; + $out[$index] .= sprintf "I8=%02X ", $i8 if defined $i8 && $i8 != $ord; + $out[$index] .= "$name */ "; + $out[$index] .= $bits[$ord]; + + # For EBCDIC character sets, we also add some data for when the bytes + # are in UTF-EBCDIC; these are based on the fundamental + # characteristics of UTF-EBCDIC. + if (@utf_to_i8) { + if ($i8 >= 0xC5 && $i8 != 0xE0) { + $out[$index] .= '|(1U<<_CC_UTF8_IS_START)'; + if ($i8 <= 0xC7) { + $out[$index] .= '|(1U<<_CC_UTF8_IS_DOWNGRADEABLE_START)'; + } + } + if (($i8 & 0xE0) == 0xA0) { + $out[$index] .= '|(1U<<_CC_UTF8_IS_CONTINUATION)'; + } + if ($i8 >= 0xF1) { + $out[$index] .= + '|(1U<<_CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)'; + } + } + + $out[$index] .= ",\n"; } + $out[-1] =~ s/,$//; # No trailing comma in the final entry + print $out_fh join "", @out; print $out_fh "\n" . get_conditional_compile_line_end(); }