This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Use byte domain EBCDIC/LATIN1 macro where appropriate
authorKarl Williamson <public@khwilliamson.com>
Fri, 8 Feb 2013 02:53:38 +0000 (19:53 -0700)
committerKarl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:55:49 +0000 (09:55 -0600)
The macros like NATIVE_TO_UNI will work on EBCDIC, but operate on the
whole Unicode range.  In the locations affected by this commit, it is
known that the domain is limited to a single byte, so the simpler ones
whose names contain LATIN1 may be used.

On ASCII platforms, all the macros are null, so there is no effective
change.

handy.h
regcomp.c
utf8.c

diff --git a/handy.h b/handy.h
index bc92ad9..144d2a1 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -850,7 +850,8 @@ patched there.  The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
 #define FITS_IN_8_BITS(c) ((sizeof(c) == 1) || !(((WIDEST_UTYPE)(c)) & ~0xFF))
 
 #ifdef EBCDIC
-#   define isASCII(c)    (FITS_IN_8_BITS(c) && (NATIVE_TO_UNI((U8) (c)) < 128))
+#   define isASCII(c)    (FITS_IN_8_BITS(c)                      \
+                         && (NATIVE_TO_LATIN1((U8) (c)) < 128))
 #else
 #   define isASCII(c)    ((WIDEST_UTYPE)(c) < 128)
 #endif
@@ -989,7 +990,7 @@ EXTCONST U32 PL_charclass[];
     /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */
 #   define _CC_mask(classnum) (1U << (classnum))
 #   define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \
-                && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(classnum)))
+                && (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(classnum)))
 
     /* The mask for the _A versions of the macros; it just adds in the bit for
      * ASCII. */
@@ -998,7 +999,7 @@ EXTCONST U32 PL_charclass[];
     /* The _A version makes sure that both the desired bit and the ASCII bit
      * are present */
 #   define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \
-        && ((PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask_A(classnum)) \
+        && ((PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask_A(classnum)) \
                                 == _CC_mask_A(classnum)))
 
 #   define isALPHA_A(c)  _generic_isCC_A(c, _CC_ALPHA)
@@ -1019,7 +1020,7 @@ EXTCONST U32 PL_charclass[];
 
     /* Either participates in a fold with a character above 255, or is a
      * multi-char fold */
-#   define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
+#   define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
 
 #   define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA)
 #   define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \
@@ -1094,19 +1095,19 @@ EXTCONST U32 PL_charclass[];
        * for backwards compatibility */
     /* ALPHAU includes Unicode semantics for latin1 characters.  It has an extra
      * >= AA test to speed up ASCII-only tests at the expense of the others */
-#   define isALPHA_L1(c) (isALPHA(c) || (NATIVE_TO_UNI((U8) c) >= 0xAA \
-       && ((NATIVE_TO_UNI((U8) c) >= 0xC0 \
-             && NATIVE_TO_UNI((U8) c) != 0xD7 && NATIVE_TO_UNI((U8) c) != 0xF7) \
-           || NATIVE_TO_UNI((U8) c) == 0xAA \
-           || NATIVE_TO_UNI((U8) c) == 0xB5 \
-           || NATIVE_TO_UNI((U8) c) == 0xBA)))
+#   define isALPHA_L1(c) (isALPHA(c) || (NATIVE_TO_LATIN1((U8) c) >= 0xAA \
+       && ((NATIVE_TO_LATIN1((U8) c) >= 0xC0 \
+             && NATIVE_TO_LATIN1((U8) c) != 0xD7 && NATIVE_TO_LATIN1((U8) c) != 0xF7) \
+           || NATIVE_TO_LATIN1((U8) c) == 0xAA \
+           || NATIVE_TO_LATIN1((U8) c) == 0xB5 \
+           || NATIVE_TO_LATIN1((U8) c) == 0xBA)))
 #   define isCHARNAME_CONT(c) (isWORDCHAR_L1(c)                         \
                                || (c) == ' '                            \
                                || (c) == '-'                            \
                                || (c) == '('                            \
                                || (c) == ')'                            \
                                || (c) == ':'                            \
-                               || NATIVE_TO_UNI((U8) c) == 0xA0)
+                               || NATIVE_TO_LATIN1((U8) c) == 0xA0)
 #endif
 
 /* Macros that differ between EBCDIC and ASCII.  Where C89 defines a function,
@@ -1118,7 +1119,7 @@ EXTCONST U32 PL_charclass[];
 #ifdef EBCDIC
 #   define isALPHA(c)  isalpha(c)
 #   define isALPHANUMERIC(c)   isalnum(c)
-#   define isBLANK(c)  ((c) == ' ' || (c) == '\t' || NATIVE_TO_UNI(c) == 0xA0)
+#   define isBLANK(c)  ((c) == ' ' || (c) == '\t' || NATIVE_TO_LATIN1(c) == 0xA0)
 #   define isCNTRL(c)  iscntrl(c)
 #   define isDIGIT(c)  isdigit(c)
 #   define isGRAPH(c)  isgraph(c)
@@ -1175,19 +1176,18 @@ EXTCONST U32 PL_charclass[];
 /* Use table lookup for speed; return error character for input
  * out-of-range */
 #define toLOWER_LATIN1(c)    ((! FITS_IN_8_BITS(c))                        \
-                             ? (c)                                           \
-                             : UNI_TO_NATIVE(PL_latin1_lc[                 \
-                                               NATIVE_TO_UNI( (U8) (c)) ]))
+                             ? (c)                                         \
+                             : LATIN1_TO_NATIVE(PL_latin1_lc[              \
+                                            NATIVE_TO_LATIN1( (U8) (c)) ]))
 #define toLOWER_L1(c)    toLOWER_LATIN1(c)  /* Synonym for consistency */
 
 /* Modified uc.  Is correct uc except for three non-ascii chars which are
  * all mapped to one of them, and these need special handling; error
  * character for input out-of-range */
 #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c))                       \
-                               ? (c)                                         \
-                               : UNI_TO_NATIVE(PL_mod_latin1_uc[           \
-                                               NATIVE_TO_UNI( (U8) (c)) ]))
-
+                               ? (c)                                       \
+                               : LATIN1_TO_NATIVE(PL_mod_latin1_uc[        \
+                                            NATIVE_TO_LATIN1( (U8) (c)) ]))
 #ifdef USE_NEXT_CTYPE
 
 #  define isALPHANUMERIC_LC(c) NXIsAlNum((unsigned int)(c))
@@ -1498,7 +1498,7 @@ EXTCONST U32 PL_charclass[];
  * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't
  * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but
  * that is EOT on ASCII;  ST on EBCDIC */
-#  define toCTRL(c)    (toUPPER(NATIVE_TO_UNI(c)) ^ 64)
+#  define toCTRL(c)    (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64)
 
 /* Line numbers are unsigned, 32 bits. */
 typedef U32 line_t;
index 5a1e234..440d26a 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4276,8 +4276,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
                             for (value = 0; value < loop_max; value++) {
-                                if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+                                if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
                                 }
                             }
                        }
@@ -4292,8 +4292,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                         * in case it isn't a true locale-node.  This will
                         * create false positives if it truly is locale */
                         for (value = 0; value < loop_max; value++) {
-                            if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+                            if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
                             }
                         }
                         }
@@ -4310,8 +4310,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
                             for (value = 0; value < loop_max; value++) {
-                                if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+                                if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
                                 }
                             }
                        }
@@ -4326,8 +4326,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                         * case it isn't a true locale-node.  This will create
                         * false positives if it truly is locale */
                         for (value = 0; value < loop_max; value++) {
-                            if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+                            if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
                             }
                         }
                         if (PL_regkind[OP(scan)] == NPOSIXD) {
diff --git a/utf8.c b/utf8.c
index d33ff47..1bdad1b 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1413,7 +1413,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
        p += 2;
        if (uv < 0x80) {
 #ifdef EBCDIC
-           *d++ = UNI_TO_NATIVE(uv);
+           *d++ = LATIN1_TO_NATIVE(uv);
 #else
            *d++ = (U8)uv;
 #endif
@@ -4610,7 +4610,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
                    to_utf8_fold(p1, foldbuf1, &n1);
                }
                else {  /* Not utf8, get utf8 fold */
-                   to_uni_fold(NATIVE_TO_UNI(*p1), foldbuf1, &n1);
+                   to_uni_fold(NATIVE_TO_LATIN1(*p1), foldbuf1, &n1);
                }
                f1 = foldbuf1;
            }
@@ -4655,7 +4655,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
                    to_utf8_fold(p2, foldbuf2, &n2);
                }
                else {
-                   to_uni_fold(NATIVE_TO_UNI(*p2), foldbuf2, &n2);
+                   to_uni_fold(NATIVE_TO_LATIN1(*p2), foldbuf2, &n2);
                }
                f2 = foldbuf2;
            }