This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add macro for converting Latin1 to UTF-8, and use it
authorKarl Williamson <khw@cpan.org>
Fri, 15 May 2015 16:59:54 +0000 (10:59 -0600)
committerKarl Williamson <khw@cpan.org>
Fri, 4 Sep 2015 16:21:17 +0000 (10:21 -0600)
This adds a macro that converts a code point in the ASCII 128-255 range
to UTF-8, and changes existing code to use it when the range is known to
be restricted to this one, rather than the previous macro which accepted
a wider range (any code point representable by 2 bytes), but had an
extra test on EBCDIC platforms, hence was larger than necessary and
slightly slower.

handy.h
hv.c
pp.c
regcomp.c
regexec.c
toke.c
utf8.c
utf8.h

diff --git a/handy.h b/handy.h
index 76dddc0..c3fd77d 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -1579,7 +1579,7 @@ END_EXTERN_C
                                          ? _generic_isCC(*(p), classnum)       \
                                          : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \
                                            ? _generic_isCC(                    \
-                                                TWO_BYTE_UTF8_TO_NATIVE(*(p),  \
+                                                EIGHT_BIT_UTF8_TO_NATIVE(*(p), \
                                                                    *((p)+1 )), \
                                                 classnum)                      \
                                            : utf8)
@@ -1665,7 +1665,7 @@ END_EXTERN_C
                          (UTF8_IS_INVARIANT(*(p))                           \
                          ? macro(*(p))                                      \
                          : (UTF8_IS_DOWNGRADEABLE_START(*(p)))              \
-                           ? macro(TWO_BYTE_UTF8_TO_NATIVE(*(p), *((p)+1))) \
+                           ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1)))\
                            : utf8)
 
 #define _generic_LC_swash_utf8(macro, classnum, p)                         \
diff --git a/hv.c b/hv.c
index 9271901..253cad9 100644 (file)
--- a/hv.c
+++ b/hv.c
@@ -3223,7 +3223,7 @@ Perl_refcounted_he_fetch_pvn(pTHX_ const struct refcounted_he *chain,
                 }
                 else {
                     p++;
-                    *q = (char) TWO_BYTE_UTF8_TO_NATIVE(c, *p);
+                    *q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p);
                 }
            }
        }
@@ -3399,7 +3399,7 @@ Perl_refcounted_he_new_pvn(pTHX_ struct refcounted_he *parent,
                 }
                 else {
                     p++;
-                    *q = (char) TWO_BYTE_UTF8_TO_NATIVE(c, *p);
+                    *q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p);
                 }
            }
        }
diff --git a/pp.c b/pp.c
index 7e5dce1..9dd3048 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -4349,7 +4349,7 @@ PP(pp_quotemeta)
                    IN_LC_RUNTIME(LC_CTYPE)
                        ||
 #endif
-                       _isQUOTEMETA(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s + 1))))
+                       _isQUOTEMETA(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s + 1))))
                    {
                        to_quote = TRUE;
                    }
index d4c51e9..ccbccf8 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -12854,7 +12854,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             }
                         }
                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                            if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+                            if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
                                                                   *s, *(s+1))))
                             {
                                 break;
index 7d323d5..c88f467 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -492,7 +492,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
     }
     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
         return isFOO_lc(classnum,
-                        TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
+                        EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
     }
 
     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
@@ -2329,7 +2329,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                                                 classnum)))
                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
                             && to_complement ^ cBOOL(
-                                _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
+                                _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
                                                                       *(s + 1)),
                                               classnum))))
                     {
@@ -5386,7 +5386,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                             l++;
                         }
                         else {
-                            if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
+                            if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
                             {
                                 sayNO;
                             }
@@ -5410,7 +5410,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                             s++;
                         }
                         else {
-                            if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
+                            if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
                             {
                                 sayNO;
                             }
@@ -5783,7 +5783,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             }
             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
-                                           (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+                                           (U8) EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
                                                             *(locinput + 1))))))
                 {
                     sayNO;
@@ -5864,7 +5864,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             }
             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
                 if (! (to_complement
-                       ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+                       ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
                                                                *(locinput + 1)),
                                              FLAGS(scan)))))
                 {
@@ -8141,7 +8141,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
 
                 /* Target isn't utf8; convert the character in the UTF-8
                  * pattern to non-UTF8, and do a simple loop */
-                c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
+                c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
                 while (scan < loceol && UCHARAT(scan) == c) {
                     scan++;
                 }
@@ -8385,7 +8385,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                     }
                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
                         if (! (to_complement
-                              ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
+                              ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
                                                                      *(scan + 1)),
                                                     classnum))))
                         {
diff --git a/toke.c b/toke.c
index ef208f2..70318a7 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1033,7 +1033,7 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
                }
                else {
                     assert(p < e -1 );
-                   *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+                   *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
                    p += 2;
                 }
            }
@@ -2603,7 +2603,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
             }
             s++;
         } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-            if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
+            if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
                 goto bad_charname;
             }
             s += 2;
@@ -2633,7 +2633,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
                 s++;
             }
             else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
+                if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
                 {
                     goto bad_charname;
                 }
diff --git a/utf8.c b/utf8.c
index 5d4a7ce..2a9d20e 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1057,7 +1057,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
                if (u < uend) {
                    U8 c1 = *u++;
                    if (UTF8_IS_CONTINUATION(c1)) {
-                       c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
+                       c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
                    } else {
                        Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
                                         "Malformed UTF-8 character "
@@ -1133,7 +1133,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
        U8 c = *s++;
        if (! UTF8_IS_INVARIANT(c)) {
            /* Then it is two-byte encoded */
-           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+           c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
             s++;
        }
        *d++ = c;
@@ -1190,7 +1190,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
        U8 c = *s++;
        if (! UTF8_IS_INVARIANT(c)) {
            /* Then it is two-byte encoded */
-           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+           c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
             s++;
        }
        *d++ = c;
@@ -1971,11 +1971,11 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
     }
     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
        if (flags) {
-            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+            U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
            result = toUPPER_LC(c);
        }
        else {
-           return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+           return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
                                          ustrp, lenp, 'S');
        }
     }
@@ -2042,11 +2042,11 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
     }
     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
        if (flags) {
-            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+            U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
            result = toUPPER_LC(c);
        }
        else {
-           return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+           return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
                                          ustrp, lenp, 's');
        }
     }
@@ -2112,11 +2112,11 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
     }
     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
        if (flags) {
-            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+            U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
            result = toLOWER_LC(c);
        }
        else {
-           return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+           return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
                                   ustrp, lenp);
        }
     }
@@ -2194,11 +2194,11 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
     }
     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
        if (flags & FOLD_FLAGS_LOCALE) {
-            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+            U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
            result = toFOLD_LC(c);
        }
        else {
-           return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+           return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
                             ustrp, lenp,
                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
        }
@@ -2723,7 +2723,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
     else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
         klen = 0;
        needents = 256;
-        off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
+        off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
     }
     else {
         klen = UTF8SKIP(ptr) - 1;
diff --git a/utf8.h b/utf8.h
index 271796b..85bf590 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -350,6 +350,21 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
 /* Longer, but more accurate name */
 #define UTF8_IS_ABOVE_LATIN1_START(c)     UTF8_IS_ABOVE_LATIN1(c)
 
+/* Convert a UTF-8 variant Latin1 character to a native code point value.
+ * Needs just one iteration of accumulate.  Should be used only if it is known
+ * that the code point is < 256, and is not UTF-8 invariant.  Use the slower
+ * but more general TWO_BYTE_UTF8_TO_NATIVE() which handles any code point
+ * representable by two bytes (which turns out to be up through
+ * MAX_PORTABLE_UTF8_TWO_BYTE).  The two parameters are:
+ *  HI: a downgradable start byte;
+ *  LO: continuation.
+ * */
+#define EIGHT_BIT_UTF8_TO_NATIVE(HI, LO)                                        \
+    ( __ASSERT_(UTF8_IS_DOWNGRADEABLE_START(HI))                                \
+      __ASSERT_(UTF8_IS_CONTINUATION(LO))                                       \
+     LATIN1_TO_NATIVE(UTF8_ACCUMULATE((                                         \
+                            NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), (LO))))
+
 /* Convert a two (not one) byte utf8 character to a native code point value.
  * Needs just one iteration of accumulate.  Should not be used unless it is
  * known that the two bytes are legal: 1) two-byte start, and 2) continuation.