This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Unicode::UCD::casefold(): Don't use .txt file for source
authorKarl Williamson <public@khwilliamson.com>
Mon, 26 Mar 2012 18:31:20 +0000 (12:31 -0600)
committerKarl Williamson <public@khwilliamson.com>
Sat, 2 Jun 2012 14:29:13 +0000 (08:29 -0600)
This converts this function to using the outputs of prop_invmap() to get
its casefolding definitions.  This allows it to work on versions of
Unicode which don't have this file, allows the file to not have to be
installed, and removes this function from having to be different on
EBCDIC platforms (which wasn't coded anyway).

lib/Unicode/UCD.pm

index cdc08a0..9915220 100644 (file)
@@ -1028,54 +1028,88 @@ L<http://www.unicode.org/unicode/reports/tr21>
 my %CASEFOLD;
 
 sub _casefold {
-    unless (%CASEFOLD) {
-       if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
-           local $_;
-           local $/ = "\n";
-           while (<$CASEFOLDFH>) {
-               if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
-                   my $code = hex($1);
-                   $CASEFOLD{$code}{'code'} = $1;
-                   $CASEFOLD{$code}{'turkic'} = "" unless
-                                           defined $CASEFOLD{$code}{'turkic'};
-                   if ($2 eq 'C' || $2 eq 'I') {       # 'I' is only on 3.1 and
-                                                       # earlier Unicodes
-                                                       # Both entries there (I
-                                                       # only checked 3.1) are
-                                                       # the same as C, and
-                                                       # there are no other
-                                                       # entries for those
-                                                       # codepoints, so treat
-                                                       # as if C, but override
-                                                       # the turkic one for
-                                                       # 'I'.
-                       $CASEFOLD{$code}{'status'} = $2;
-                       $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
-                       $CASEFOLD{$code}{'mapping'} = $3;
-                       $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
-                   } elsif ($2 eq 'F') {
-                       $CASEFOLD{$code}{'full'} = $3;
-                       unless (defined $CASEFOLD{$code}{'simple'}) {
-                               $CASEFOLD{$code}{'simple'} = "";
-                               $CASEFOLD{$code}{'mapping'} = $3;
-                               $CASEFOLD{$code}{'status'} = $2;
-                       }
-                   } elsif ($2 eq 'S') {
+    unless (%CASEFOLD) {   # Populate the hash
+        my ($full_invlist_ref, $full_invmap_ref, undef, $default)
+                                                = prop_invmap('Case_Folding');
+
+        # Use the recipe given in the prop_invmap() pod to convert the
+        # inversion map into the hash.
+        for my $i (0 .. @$full_invlist_ref - 1 - 1) {
+            next if $full_invmap_ref->[$i] == $default;
+            my $adjust = -1;
+            for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) {
+                $adjust++;
+                if (! ref $full_invmap_ref->[$i]) {
+
+                    # This is a single character mapping
+                    $CASEFOLD{$j}{'status'} = 'C';
+                    $CASEFOLD{$j}{'simple'}
+                        = $CASEFOLD{$j}{'full'}
+                        = $CASEFOLD{$j}{'mapping'}
+                        = sprintf("%04X", $full_invmap_ref->[$i] + $adjust);
+                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+                    $CASEFOLD{$j}{'turkic'} = "";
+                }
+                else {  # prop_invmap ensures that $adjust is 0 for a ref
+                    $CASEFOLD{$j}{'status'} = 'F';
+                    $CASEFOLD{$j}{'full'}
+                    = $CASEFOLD{$j}{'mapping'}
+                    = join " ", map { sprintf "%04X", $_ }
+                                                    @{$full_invmap_ref->[$i]};
+                    $CASEFOLD{$j}{'simple'} = "";
+                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+                    $CASEFOLD{$j}{'turkic'} = "";
+                }
+            }
+        }
 
+        # We have filled in the full mappings above, assuming there were no
+        # simple ones for the ones with multi-character maps.  Now, we find
+        # and fix the cases where that assumption was false.
+        (my ($simple_invlist_ref, $simple_invmap_ref, undef), $default)
+                                        = prop_invmap('Simple_Case_Folding');
+        for my $i (0 .. @$simple_invlist_ref - 1 - 1) {
+            next if $simple_invmap_ref->[$i] == $default;
+            my $adjust = -1;
+            for my $j ($simple_invlist_ref->[$i]
+                       .. $simple_invlist_ref->[$i+1] -1)
+            {
+                $adjust++;
+                next if $CASEFOLD{$j}{'status'} eq 'C';
+                $CASEFOLD{$j}{'status'} = 'S';
+                $CASEFOLD{$j}{'simple'}
+                    = $CASEFOLD{$j}{'mapping'}
+                    = sprintf("%04X", $simple_invmap_ref->[$i] + $adjust);
+                $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+                $CASEFOLD{$j}{'turkic'} = "";
+            }
+        }
 
-                       # There can't be a simple without a full, and simple
-                       # overrides all but full
+        # We hard-code in the turkish rules
+        UnicodeVersion() unless defined $v_unicode_version;
+        if ($v_unicode_version ge v3.2.0) {
 
-                       $CASEFOLD{$code}{'simple'} = $3;
-                       $CASEFOLD{$code}{'mapping'} = $3;
-                       $CASEFOLD{$code}{'status'} = $2;
-                   } elsif ($2 eq 'T') {
-                       $CASEFOLD{$code}{'turkic'} = $3;
-                   } # else can't happen because only [CIFST] are possible
-               }
-           }
-           close($CASEFOLDFH);
-       }
+            # These two code points should already have regular entries, so
+            # just fill in the turkish fields
+            $CASEFOLD{ord('I')}{'turkic'} = '0131';
+            $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i');
+        }
+        elsif ($v_unicode_version ge v3.1.0) {
+
+            # These two code points don't have entries otherwise.
+            $CASEFOLD{0x130}{'code'} = '0130';
+            $CASEFOLD{0x131}{'code'} = '0131';
+            $CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I';
+            $CASEFOLD{0x130}{'turkic'}
+                = $CASEFOLD{0x130}{'mapping'}
+                = $CASEFOLD{0x130}{'full'}
+                = $CASEFOLD{0x130}{'simple'}
+                = $CASEFOLD{0x131}{'turkic'}
+                = $CASEFOLD{0x131}{'mapping'}
+                = $CASEFOLD{0x131}{'full'}
+                = $CASEFOLD{0x131}{'simple'}
+                = sprintf "%04X", ord('i');
+        }
     }
 }