This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Define PatWS for Unicode versions missing it
[perl5.git] / lib / unicore / mktables
index ff30405..be7413a 100644 (file)
@@ -918,7 +918,7 @@ my @unimplemented_properties;
 
 # With this release, it is automatically handled if the Unihan db is
 # downloaded
-push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
+push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version lt v5.2.0;
 
 # There are several types of obsolete properties defined by Unicode.  These
 # must be hand-edited for every new Unicode release.
@@ -1494,6 +1494,7 @@ my $block;
 my $perl_charname;
 my $print;
 my $All;
+my $Assigned;   # All assigned characters in this Unicode release
 my $script;
 
 # Are there conflicting names because of beginning with 'In_', or 'Is_'
@@ -2461,7 +2462,7 @@ sub trace { return main::trace(@_); }
                 if (! $optional{$addr}  # File could be optional
                     && $v_version ge $first_released{$addr})
                 {
-                    print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
+                    print STDERR "Skipping processing input file '$file' because not found\n";
                 }
                 return;
             }
@@ -9710,14 +9711,14 @@ sub finish_property_setup {
                                               'AL');
             $lb->set_default_map($default);
         }
+    }
 
-        # If has the URS property, make sure that the standard aliases are in
-        # it, since not in the input tables in some versions.
-        my $urs = property_ref('Unicode_Radical_Stroke');
-        if (defined $urs) {
-            $urs->add_alias('cjkRSUnicode');
-            $urs->add_alias('kRSUnicode');
-        }
+    # If has the URS property, make sure that the standard aliases are in
+    # it, since not in the input tables in some versions.
+    my $urs = property_ref('Unicode_Radical_Stroke');
+    if (defined $urs) {
+        $urs->add_alias('cjkRSUnicode');
+        $urs->add_alias('kRSUnicode');
     }
 
     # For backwards compatibility with applications that may read the mapping
@@ -12029,10 +12030,8 @@ sub filter_old_style_case_folding {
     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
 
     my @fields = split /\s*;\s*/;
-    if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
-        $fields[1] = 'I';
-    }
-    elsif ($fields[1] eq 'L') {
+
+    if ($fields[1] eq 'L') {
         $fields[1] = 'C';             # L => C always
     }
     elsif ($fields[1] eq 'E') {
@@ -12440,7 +12439,7 @@ sub filter_blocks_lines {
 
     # Change hyphens and blanks in the block name field only
     $fields[1] =~ s/[ -]/_/g;
-    $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
+    $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg;   # Capitalize first letter of word
 
     $_ = join("; ", @fields);
     return;
@@ -13039,6 +13038,22 @@ sub filter_early_version_name_alias_line {
     return;
 }
 
+sub filter_all_caps_script_names {
+
+    # Some early Unicode releases had the script names in all CAPS.  This
+    # converts them to just the first letter of each word being capital.
+
+    my ($range, $script, @remainder)
+        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+    my @words = split "_", $script;
+    for my $word (@words) {
+        $word =
+            ucfirst(lc($word)) if $word ne 'CJK';
+    }
+    $script = join "_", @words;
+    $_ = join ";", $range, $script, @remainder;
+}
+
 sub finish_Unicode() {
     # This routine should be called after all the Unicode files have been read
     # in.  It:
@@ -13407,6 +13422,26 @@ sub pre_3_dot_1_Nl () {
     return $Nl;
 }
 
+sub calculate_Assigned() {  # Calculate the gc != Cn code points; may be
+                            # called before the Cn's are completely filled.
+                            # Works on Unicodes earlier than ones that
+                            # explicitly specify Cn.
+    return if defined $Assigned;
+
+    if (! defined $gc || $gc->is_empty()) {
+        Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
+    }
+
+    $Assigned = $perl->add_match_table('Assigned',
+                                Description  => "All assigned code points",
+                                );
+    while (defined (my $range = $gc->each_range())) {
+        my $standard_value = standardize($range->value);
+        next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
+        $Assigned->add_range($range->start, $range->end);
+    }
+}
+
 sub compile_perl() {
     # Create perl-defined tables.  Almost all are part of the pseudo-property
     # named 'perl' internally to this program.  Many of these are recommended
@@ -13450,11 +13485,7 @@ sub compile_perl() {
     $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
     $Any->add_alias('Unicode');
 
-    # Assigned is the opposite of gc=unassigned
-    my $Assigned = $perl->add_match_table('Assigned',
-                                Description  => "All assigned code points",
-                                Initialize => ~ $gc->table('Unassigned'),
-                                );
+    calculate_Assigned();
 
     # Our internal-only property should be treated as more than just a
     # synonym; grandfather it in to the pod.
@@ -13500,32 +13531,27 @@ sub compile_perl() {
 
         # There are quite a few code points in Lower, that aren't in gc=lc,
         # and not all are in all releases.
-        foreach my $code_point (    utf8::unicode_to_native(0xAA),
-                                    utf8::unicode_to_native(0xBA),
-                                    0x02B0 .. 0x02B8,
-                                    0x02C0 .. 0x02C1,
-                                    0x02E0 .. 0x02E4,
-                                    0x0345,
-                                    0x037A,
-                                    0x1D2C .. 0x1D6A,
-                                    0x1D78,
-                                    0x1D9B .. 0x1DBF,
-                                    0x2071,
-                                    0x207F,
-                                    0x2090 .. 0x209C,
-                                    0x2170 .. 0x217F,
-                                    0x24D0 .. 0x24E9,
-                                    0x2C7C .. 0x2C7D,
-                                    0xA770,
-                                    0xA7F8 .. 0xA7F9,
-        ) {
-            # Don't include the code point unless it is assigned in this
-            # release
-            my $category = $gc->value_of($code_point);
-            next if ! defined $category || $category eq 'Cn';
-
-            $Lower += $code_point;
-        }
+        my $temp = Range_List->new(Initialize => [
+                                                utf8::unicode_to_native(0xAA),
+                                                utf8::unicode_to_native(0xBA),
+                                                0x02B0 .. 0x02B8,
+                                                0x02C0 .. 0x02C1,
+                                                0x02E0 .. 0x02E4,
+                                                0x0345,
+                                                0x037A,
+                                                0x1D2C .. 0x1D6A,
+                                                0x1D78,
+                                                0x1D9B .. 0x1DBF,
+                                                0x2071,
+                                                0x207F,
+                                                0x2090 .. 0x209C,
+                                                0x2170 .. 0x217F,
+                                                0x24D0 .. 0x24E9,
+                                                0x2C7C .. 0x2C7D,
+                                                0xA770,
+                                                0xA7F8 .. 0xA7F9,
+                                ]);
+        $Lower += $temp & $Assigned;
     }
     my $Posix_Lower = $perl->add_match_table("PosixLower",
                             Description => "[a-z]",
@@ -13977,6 +14003,9 @@ sub compile_perl() {
          Description =>
               "Code points whose fold is a string of more than one character",
     );
+    if ($v_version lt v3.0.1) {
+        push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
+    }
 
     # Look through all the known folds to populate these tables.
     foreach my $range ($cf->ranges) {
@@ -14397,6 +14426,25 @@ END
         $unassigned->set_equivalent_to($age_default, Related => 1);
     }
 
+    my $patws = $perl->add_match_table('_Perl_PatWS',
+                                       Perl_Extension => 1,
+                                       Fate => $INTERNAL_ONLY);
+    if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
+        $patws->initialize($off_patws->table('Y'));
+    }
+    else {
+        $patws->initialize([ ord("\t"),
+                             ord("\n"),
+                             utf8::unicode_to_native(0x0B), # VT
+                             ord("\f"),
+                             ord("\r"),
+                             ord(" "),
+                             utf8::unicode_to_native(0x85), # NEL
+                             0x200E..0x200F,             # Left, Right marks
+                             0x2028..0x2029              # Line, Paragraph seps
+                           ] );
+    }
+
     # See L<perlfunc/quotemeta>
     my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
                                            Perl_Extension => 1,
@@ -15613,9 +15661,9 @@ sub pod_alphanumeric_sort {
     # The first few character columns are filler, plus the '\p{'; and get rid
     # of all the trailing stuff, starting with the trailing '}', so as to sort
     # on just 'Name=Value'
-    (my $a = lc $a) =~ s/^ .*? { //x;
+    (my $a = lc $a) =~ s/^ .*? \{ //x;
     $a =~ s/}.*//;
-    (my $b = lc $b) =~ s/^ .*? { //x;
+    (my $b = lc $b) =~ s/^ .*? \{ //x;
     $b =~ s/}.*//;
 
     # Determine if the two operands are both internal only or both not.
@@ -18137,6 +18185,9 @@ my @input_file_objects = (
                     ),
     Input_file->new('Scripts.txt', v3.1.0,
                     Property => 'Script',
+                    Each_Line_Handler => (($v_version le v4.0.0)
+                                          ? \&filter_all_caps_script_names
+                                          : undef),
                     Has_Missings_Defaults => $NOT_IGNORED,
                     ),
     Input_file->new('DNormalizationProps.txt', v3.1.0,