This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The funky final sigma casefolding.
authorJarkko Hietaniemi <jhi@iki.fi>
Sun, 23 Dec 2001 16:43:29 +0000 (16:43 +0000)
committerJarkko Hietaniemi <jhi@iki.fi>
Sun, 23 Dec 2001 16:43:29 +0000 (16:43 +0000)
p4raw-id: //depot/perl@13866

regcomp.c
regexec.c
t/op/pat.t
utf8.h

index 2e0943e..ea98177 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3981,9 +3981,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                if (prevvalue < value)
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
                                   (UV)prevvalue, (UV)value);
-               else if (prevvalue == value)
+               else if (prevvalue == value) {
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
                                   (UV)value);
+                   if (FOLD) {
+                        if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            UNICODE_GREEK_SMALL_LETTER_SIGMA);
+                        }
+                        else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            UNICODE_GREEK_SMALL_LETTER_SIGMA);
+                   }
+               }
            }
         }
 
index d239a70..7a00dfd 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -975,7 +975,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                 * Fortunately, not getting this right is allowed
                 * for Unicode Regular Expression Support level 1,
                 * only one-to-one matching is required. --jhi */
-               if (c1 == c2)
+               if (c1 == c2) {
                    while (s <= e) {
                        if ( utf8_to_uvchr((U8*)s, &len) == c1
                             && (ln == len ||
@@ -985,9 +985,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                            goto got_it;
                        s += len;
                    }
-               else
+               }
+               else {
                    while (s <= e) {
                        UV c = utf8_to_uvchr((U8*)s, &len);
+                       if (c == UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
+                           c == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
+                           c = UNICODE_GREEK_SMALL_LETTER_SIGMA;
                        if ( (c == c1 || c == c2)
                             && (ln == len ||
                                 ibcmp_utf8(s, do_utf8, strend - s,
@@ -996,6 +1000,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                            goto got_it;
                        s += len;
                    }
+               }
            }
            else {
                if (c1 == c2)
index 30fc1a9..a8742f8 100755 (executable)
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..794\n";
+print "1..812\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -2418,3 +2418,35 @@ print "# some Unicode properties\n";
     print "!abc!" =~ /a\Cc/ ? "ok 793\n" : "not ok 793\n";
     print "!abc!" =~ /a\Xc/ ? "ok 794\n" : "not ok 794\n";
 }
+
+{
+    print "# FINAL SIGMA\n";
+
+    my $SIGMA = "\x{03A3}"; # CAPITAL
+    my $Sigma = "\x{03C2}"; # SMALL FINAL
+    my $sigma = "\x{03C3}"; # SMALL
+
+    print $SIGMA =~ /$SIGMA/i ? "ok 795\n" : "not ok 795\n";
+    print $SIGMA =~ /$Sigma/i ? "ok 796\n" : "not ok 796\n";
+    print $SIGMA =~ /$sigma/i ? "ok 797\n" : "not ok 797\n";
+
+    print $Sigma =~ /$SIGMA/i ? "ok 798\n" : "not ok 798\n";
+    print $Sigma =~ /$Sigma/i ? "ok 799\n" : "not ok 799\n";
+    print $Sigma =~ /$sigma/i ? "ok 800\n" : "not ok 800\n";
+
+    print $sigma =~ /$SIGMA/i ? "ok 801\n" : "not ok 801\n";
+    print $sigma =~ /$Sigma/i ? "ok 802\n" : "not ok 802\n";
+    print $sigma =~ /$sigma/i ? "ok 803\n" : "not ok 803\n";
+    
+    print $SIGMA =~ /[$SIGMA]/i ? "ok 804\n" : "not ok 804\n";
+    print $SIGMA =~ /[$Sigma]/i ? "ok 805\n" : "not ok 805\n";
+    print $SIGMA =~ /[$sigma]/i ? "ok 806\n" : "not ok 806\n";
+
+    print $Sigma =~ /[$SIGMA]/i ? "ok 807\n" : "not ok 807\n";
+    print $Sigma =~ /[$Sigma]/i ? "ok 808\n" : "not ok 808\n";
+    print $Sigma =~ /[$sigma]/i ? "ok 809\n" : "not ok 809\n";
+
+    print $sigma =~ /[$SIGMA]/i ? "ok 810\n" : "not ok 810\n";
+    print $sigma =~ /[$Sigma]/i ? "ok 811\n" : "not ok 811\n";
+    print $sigma =~ /[$sigma]/i ? "ok 812\n" : "not ok 812\n";
+}
diff --git a/utf8.h b/utf8.h
index b35cfeb..d907d26 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -188,3 +188,8 @@ END_EXTERN_C
 #endif
 
 #define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)
+
+#define UNICODE_GREEK_CAPITAL_LETTER_SIGMA     0x03A3
+#define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
+#define UNICODE_GREEK_SMALL_LETTER_SIGMA       0x03C3
+