This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.c: Faster latin1 folding
author Karl Williamson <public@khwilliamson.com>
Wed, 9 Nov 2011 05:16:39 +0000 (22:16 -0700)
committer Karl Williamson <public@khwilliamson.com>
Wed, 9 Nov 2011 05:38:39 +0000 (22:38 -0700)
This adds a function, similar to the ones for the other three
case-changing operations, that works on Latin-1 characters only and
avoids having to go out to swashes.  It changes to_uni_fold() and
to_utf8_fold() to call it on the appropriate input.

embed.fnc
embed.h
proto.h
utf8.c

index e2911dd..446faf5 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -603,6 +603,7 @@ Ap  |UV     |to_uni_title   |UV c|NN U8 *p|NN STRLEN *lenp
 #ifdef PERL_IN_UTF8_C
 sR     |U8     |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
 p      |UV     |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
+p      |UV     |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const U8 flags
 #endif
 Ap     |UV     |to_uni_lower   |UV c|NN U8 *p|NN STRLEN *lenp
 Amp    |UV     |to_uni_fold    |UV c|NN U8 *p|NN STRLEN *lenp
diff --git a/embed.h b/embed.h
index 3d985b5..8540031 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define isa_lookup(a,b,c,d)    S_isa_lookup(aTHX_ a,b,c,d)
 #  endif
 #  if defined(PERL_IN_UTF8_C)
+#define _to_fold_latin1(a,b,c,d)       Perl__to_fold_latin1(aTHX_ a,b,c,d)
 #define _to_upper_title_latin1(a,b,c,d)        Perl__to_upper_title_latin1(aTHX_ a,b,c,d)
 #define is_utf8_char_slow      S_is_utf8_char_slow
 #define is_utf8_common(a,b,c)  S_is_utf8_common(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index 7f9621a..534fab8 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6983,6 +6983,12 @@ STATIC bool      S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
 
 #endif
 #if defined(PERL_IN_UTF8_C)
+PERL_CALLCONV UV       Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const U8 flags)
+                       __attribute__nonnull__(pTHX_2)
+                       __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1       \
+       assert(p); assert(lenp)
+
 PERL_CALLCONV UV       Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s)
                        __attribute__nonnull__(pTHX_2)
                        __attribute__nonnull__(pTHX_3);
diff --git a/utf8.c b/utf8.c
index 38f5c6c..9c55d10 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1459,12 +1459,50 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 }
 
 UV
+Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const U8 flags)
+{
+    UV converted; /* code point that c folds to (unused on the "ss" path) */
+    /* Fold code point c: store the fold as UTF-8 at p, its byte count in *lenp, and return its first code point */
+    PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
+    /* p needs room for 2 bytes: no path below writes more than that */
+    if (c == MICRO_SIGN) { /* special-cased ahead of the generic lower-case path */
+       converted = GREEK_SMALL_LETTER_MU;
+    }
+    else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) { /* any non-zero flags selects the two-character fold "ss" */
+       *(p)++ = 's';
+       *p = 's';
+       *lenp = 2;
+       return 's'; /* only the first character of the two-character fold is returned */
+    }
+    else { /* In this range the fold of all other characters is their lower
+              case */
+       converted = toLOWER_LATIN1(c);
+    }
+
+    if (UNI_IS_INVARIANT(converted)) { /* stored as the single byte itself */
+       *p = (U8) converted;
+       *lenp = 1;
+    }
+    else { /* otherwise stored as a two-byte sequence */
+       *(p)++ = UTF8_TWO_BYTE_HI(converted);
+       *p = UTF8_TWO_BYTE_LO(converted);
+       *lenp = 2;
+    }
+
+    return converted;
+}
+
+UV
 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
 {
     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
 
+    if (c < 256) { /* fits in a U8: fold directly, without going out to the swashes */
+       return _to_fold_latin1((U8) c, p, lenp, flags);
+    }
+    /* c >= 256: encode c into p, then fold that UTF-8 in place (source and destination are both p) */
     uvchr_to_utf8(p, c);
-    return _to_utf8_fold_flags(p, p, lenp, flags);
+    return CALL_FOLD_CASE(p, p, lenp, flags); /* NOTE(review): presumably called directly because the Latin-1 tests in _to_utf8_fold_flags() cannot match here -- confirm */
 }
 
 /* for now these all assume no locale info available for Unicode > 255 */
@@ -2180,6 +2218,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
 
     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
 
+    if (UTF8_IS_INVARIANT(*p)) { /* first byte passed as the code point itself */
+       return _to_fold_latin1(*p, ustrp, lenp, flags);
+    }
+    else if (UTF8_IS_DOWNGRADEABLE_START(*p)) { /* explicit parens: do not rely on the macro expansion supplying them */
+       return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), /* combine the two bytes into one code point */
+                                                   ustrp, lenp, flags);
+    }
+
     return CALL_FOLD_CASE(p, ustrp, lenp, flags);
 }