UINT64_C/INT64_C logic shuffling.

[perl5.git] / handy.h
diff --git a/handy.h b/handy.h

index f98c8fb..8a7bf46 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -193,30 +193,47 @@ typedef U64TYPE U64;
  #   endif
  #endif /* PERL_CORE */
  
+/* INT64_C/UINT64_C are C99 from <stdint.h> (so they will not be
+ * available in strict C89 mode), but they are nice, so let's define
+ * them if necessary. */
  #if defined(HAS_QUAD) && defined(USE_64_BIT_INT)
-#   if defined(HAS_LONG_LONG) && QUADKIND == QUAD_IS_LONG_LONG
-#       define PeRl_INT64_C(c) CAT2(c,LL)
-#       define PeRl_UINT64_C(c)        CAT2(c,ULL)
-#   else
-#       if QUADKIND == QUAD_IS___INT64
-#           define PeRl_INT64_C(c)     CAT2(c,I64)
-#           define PeRl_UINT64_C(c)    CAT2(c,UI64)
-#       else
-#           if LONGSIZE == 8 && QUADKIND == QUAD_IS_LONG
-#               define PeRl_INT64_C(c) CAT2(c,L)
-#               define PeRl_UINT64_C(c)        CAT2(c,UL)
-#           else
-#               define PeRl_INT64_C(c) ((I64TYPE)(c))
-#               define PeRl_UINT64_C(c)        ((U64TYPE)(c))
-#           endif
-#       endif
-#   endif
-#   ifndef UINT64_C
-#   define UINT64_C(c) PeRl_UINT64_C(c)
-#   endif
-#   ifndef INT64_C
-#   define INT64_C(c) PeRl_INT64_C(c)
-#   endif
+#  undef PeRl_INT64_C
+#  undef PeRl_UINT64_C
+/* Prefer the native integer types (int and long) over long long
+ * (which is not C89) and Win32-specific __int64. */
+#  if QUADKIND == QUAD_IS_INT && INTSIZE == 8
+#    define PeRl_INT64_C(c)    (c)
+#    define PeRl_UINT64_C(c)   CAT2(c,U)
+#  endif
+#  if QUADKIND == QUAD_IS_LONG && LONGSIZE == 8
+#    define PeRl_INT64_C(c)    CAT2(c,L)
+#    define PeRl_UINT64_C(c)   CAT2(c,UL)
+#  endif
+#  if QUADKIND == QUAD_IS_LONG_LONG && defined(HAS_LONG_LONG)
+#    define PeRl_INT64_C(c)    CAT2(c,LL)
+#    define PeRl_UINT64_C(c)   CAT2(c,ULL)
+#  endif
+#  if QUADKIND == QUAD_IS___INT64
+#    define PeRl_INT64_C(c)    CAT2(c,I64)
+#    define PeRl_UINT64_C(c)   CAT2(c,UI64)
+#  endif
+#  ifndef PeRl_INT64_C
+#    define PeRl_INT64_C(c)    ((I64TYPE)(c)) /* last resort */
+#    define PeRl_UINT64_C(c)   ((U64TYPE)(c))
+#  endif
+/* In OS X the INT64_C/UINT64_C are defined with LL/ULL, which will
+ * not fly with C89-pedantic gcc, so let's undefine them first so that
+ * we can redefine them with our native integer preferring versions. */
+#  if defined(__APPLE__) && defined(PERL_GCC_PEDANTIC)
+#    undef INT64_C
+#    undef UINT64_C
+#  endif
+#  ifndef INT64_C
+#    define INT64_C(c) PeRl_INT64_C(c)
+#  endif
+#  ifndef UINT64_C
+#    define UINT64_C(c) PeRl_UINT64_C(c)
+#  endif
  #endif
  
  #if defined(UINT8_MAX) && defined(INT16_MAX) && defined(INT32_MAX)
@@ -523,12 +540,13 @@ Variant C<isFOO_utf8> is like C<isFOO_uni>, but the input is a pointer to a
  classification of just the first (possibly multi-byte) character in the string
  is tested.
  
-Variant C<isFOO_LC> is like the C<isFOO_A> and C<isFOO_L1> variants, but uses
-the C library function that gives the named classification instead of
-hard-coded rules.  For example, C<isDIGIT_LC()> returns the result of calling
-C<isdigit()>.  This means that the result is based on the current locale, which
-is what C<LC> in the name stands for.  FALSE is always returned if the input
-won't fit into an octet.
+Variant C<isFOO_LC> is like the C<isFOO_A> and C<isFOO_L1> variants, but the
+result is based on the current locale, which is what C<LC> in the name stands
+for.  If Perl can determine that the current locale is a UTF-8 locale, it uses
+the published Unicode rules; otherwise, it uses the C library function that
+gives the named classification.  For example, C<isDIGIT_LC()> when not in a
+UTF-8 locale returns the result of calling C<isdigit()>.  FALSE is always
+returned if the input won't fit into an octet.
  
  Variant C<isFOO_LC_uvchr> is like C<isFOO_LC>, but is defined on any UV.  It
  returns the same as C<isFOO_LC> for input code points less than 256, and
@@ -633,7 +651,7 @@ Returns a boolean indicating whether the specified character is a
  whitespace character.  This is analogous
  to what C<m/\s/> matches in a regular expression.  Starting in Perl 5.18
  (experimentally), this also matches what C<m/[[:space:]]/> does.
-("Experimentally" means that this change may be backed out in 5.20 or 5.22 if
+("Experimentally" means that this change may be backed out in 5.22 if
  field experience indicates that it was unwise.)  Prior to 5.18, only the
  locale forms of this macro (the ones with C<LC> in their names) matched
  precisely what C<m/[[:space:]]/> does.  In those releases, the only difference,
@@ -647,7 +665,7 @@ C<isSPACE_LC_uvchr>, and C<isSPACE_LC_utf8>.
  (short for Posix Space)
  Starting in 5.18, this is identical (experimentally) in all its forms to the
  corresponding C<isSPACE()> macros.  ("Experimentally" means that this change
-may be backed out in 5.20 or 5.22 if field experience indicates that it
+may be backed out in 5.22 if field experience indicates that it
  was unwise.)
  The locale forms of this macro are identical to their corresponding
  C<isSPACE()> forms in all Perl releases.  In releases prior to 5.18, the
@@ -873,7 +891,7 @@ patched there.  The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
  
      /* We could be called without perl.h, in which case NATIVE_TO_ASCII() is
       * likely not defined, and so we use the native function */
-#   define isASCII(c)    isascii(c)
+#   define isASCII(c)    cBOOL(isascii(c))
  #else
  #   define isASCII(c)    ((WIDEST_UTYPE)(c) < 128)
  #endif
@@ -1241,18 +1259,24 @@ EXTCONST U32 PL_charclass[];
  #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c))                       \
                                 ? (c)                                       \
                                 : PL_mod_latin1_uc[ (U8) (c) ])
+#define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale
  
  /* Use foo_LC_uvchr() instead  of these for beyond the Latin1 range */
  
  /* For internal core Perl use only: the base macro for defining macros like
   * isALPHA_LC, which uses the current LC_CTYPE locale.  'c' is the code point
- * (0-255) to check.  'utf8_locale_classnum' is currently unused.  The code to
- * actually do the test this is passed in 'non_utf8'.  If 'c' is above 255, 0
- * is returned.  For accessing the full range of possible code points under
- * locale rules, use the macros based on _generic_LC_uvchr instead of this. */
+ * (0-255) to check.  In a UTF-8 locale, the result is the same as calling
+ * isFOO_L1(); the 'utf8_locale_classnum' parameter is something like
+ * _CC_UPPER, which gives the class number for doing this.  For non-UTF-8
+ * locales, the code to actually do the test this is passed in 'non_utf8'.  If
+ * 'c' is above 255, 0 is returned.  For accessing the full range of possible
+ * code points under locale rules, use the macros based on _generic_LC_uvchr
+ * instead of this. */
  #define _generic_LC_base(c, utf8_locale_classnum, non_utf8)                    \
             (! FITS_IN_8_BITS(c)                                                \
             ? 0                                                                 \
+           : IN_UTF8_CTYPE_LOCALE                                              \
+             ? cBOOL(PL_charclass[(U8) (c)] & _CC_mask(utf8_locale_classnum))  \
               : cBOOL(non_utf8))
  
  /* For internal core Perl use only: a helper macro for defining macros like
@@ -1275,15 +1299,39 @@ EXTCONST U32 PL_charclass[];
   * helper macros */
  #define _generic_toLOWER_LC(c, function, cast)  (! FITS_IN_8_BITS(c)           \
                                                  ? (c)                          \
+                                                : (IN_UTF8_CTYPE_LOCALE)       \
+                                                  ? PL_latin1_lc[ (U8) (c) ]   \
                                                  : function((cast)(c)))
  
+/* Note that the result can be larger than a byte in a UTF-8 locale.  It
+ * returns a single value, so can't adequately return the upper case of LATIN
+ * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two
+ * values "SS");  instead it asserts against that under DEBUGGING, and
+ * otherwise returns its input */
  #define _generic_toUPPER_LC(c, function, cast)                                 \
                      (! FITS_IN_8_BITS(c)                                       \
                      ? (c)                                                      \
-                      : function((cast)(c)))
-
+                    : ((! IN_UTF8_CTYPE_LOCALE)                                \
+                      ? function((cast)(c))                                    \
+                      : ((((U8)(c)) == MICRO_SIGN)                             \
+                        ? GREEK_CAPITAL_LETTER_MU                              \
+                        : ((((U8)(c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)  \
+                          ? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS              \
+                          : ((((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S)         \
+                            ? (__ASSERT_(0) (c))                               \
+                            : PL_mod_latin1_uc[ (U8) (c) ])))))
+
+/* Note that the result can be larger than a byte in a UTF-8 locale.  It
+ * returns a single value, so can't adequately return the fold case of LATIN
+ * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two
+ * values "ss"); instead it asserts against that under DEBUGGING, and
+ * otherwise returns its input */
  #define _generic_toFOLD_LC(c, function, cast)                                  \
-                      _generic_toLOWER_LC(c, function, cast)
+                    ((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE)     \
+                      ? GREEK_SMALL_LETTER_MU                                  \
+                      : (__ASSERT_(! IN_UTF8_CTYPE_LOCALE                      \
+                                   || (c) != LATIN_SMALL_LETTER_SHARP_S)       \
+                         _generic_toLOWER_LC(c, function, cast)))
  
  /* Use the libc versions for these if available. */
  #if defined(HAS_ISASCII) && ! defined(USE_NEXT_CTYPE)
@@ -1294,8 +1342,8 @@ EXTCONST U32 PL_charclass[];
  
  #if defined(HAS_ISBLANK) && ! defined(USE_NEXT_CTYPE)
  #   define isBLANK_LC(c) _generic_LC(c, _CC_BLANK, isblank)
-#else
-#   define isBLANK_LC(c) isBLANK(c)
+#else /* Unlike isASCII, varies if in a UTF-8 locale */
+#   define isBLANK_LC(c) (IN_UTF8_CTYPE_LOCALE) ? isBLANK_L1(c) : isBLANK(c)
  #endif
  
  #ifdef USE_NEXT_CTYPE   /* NeXT computers */
@@ -1323,11 +1371,11 @@ EXTCONST U32 PL_charclass[];
  
  #else /* !USE_NEXT_CTYPE */
  
+#  define _LC_CAST U8
+
  #  if defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII))
      /* For most other platforms */
  
-#    define _LC_CAST U8
-
  #    define isALPHA_LC(c)   _generic_LC(c, _CC_ALPHA, isalpha)
  #    define isALPHANUMERIC_LC(c)  _generic_LC(c, _CC_ALPHANUMERIC, isalnum)
  #    define isCNTRL_LC(c)    _generic_LC(c, _CC_CNTRL, iscntrl)
@@ -1602,11 +1650,23 @@ EXTCONST U32 PL_charclass[];
  #define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p)
  #define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p)
  
-/* This conversion works both ways, strangely enough. On EBCDIC platforms,
- * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't
- * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but
- * that is EOT on ASCII;  ST on EBCDIC */
-#  define toCTRL(c)    (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64)
+/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,
+ * except that they don't necessarily mean the same characters, e.g. CTRL-D is
+ * 4 on both systems, but that is EOT on ASCII;  ST on EBCDIC.
+ * '?' is special-cased on EBCDIC to APC, which is the control there that is
+ * the outlier from the block that contains the other controls, just like
+ * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0
+ * block.  If it weren't special cased, it would yield a non-control.
+ * The conversion works both ways, so CTRL('D') is 4, and CTRL(4) is D, etc. */
+#ifndef EBCDIC
+#  define toCTRL(c)    (toUPPER(c) ^ 64)
+#else
+#  define toCTRL(c)    ((c) == '?'                               \
+                        ? LATIN1_TO_NATIVE(0x9F)                 \
+                        : (c) == LATIN1_TO_NATIVE(0x9F)          \
+                          ? '?'                                  \
+                          : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64))
+#endif
  
  /* Line numbers are unsigned, 32 bits. */
  typedef U32 line_t;
@@ -1750,9 +1810,9 @@ PoisonWith(0xEF) for catching access to freed memory.
   * (U16)n > (size_t)~0/sizeof(U16) always being false. */
  #ifdef PERL_MALLOC_WRAP
  #define MEM_WRAP_CHECK(n,t) \
-       (void)(sizeof(t) > 1 && ((MEM_SIZE)(n)+0.0) > MEM_SIZE_MAX/sizeof(t) && (croak_memory_wrap(),0))
+       (void)(UNLIKELY(sizeof(t) > 1 && ((MEM_SIZE)(n)+0.0) > MEM_SIZE_MAX/sizeof(t)) && (croak_memory_wrap(),0))
  #define MEM_WRAP_CHECK_1(n,t,a) \
-       (void)(sizeof(t) > 1 && ((MEM_SIZE)(n)+0.0) > MEM_SIZE_MAX/sizeof(t) && (Perl_croak_nocontext("%s",(a)),0))
+       (void)(UNLIKELY(sizeof(t) > 1 && ((MEM_SIZE)(n)+0.0) > MEM_SIZE_MAX/sizeof(t)) && (Perl_croak_nocontext("%s",(a)),0))
  #define MEM_WRAP_CHECK_(n,t) MEM_WRAP_CHECK(n,t),
  
  #define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (croak_memory_wrap(),0):0),((n-1+PERL_STRLEN_ROUNDUP_QUANTUM)&~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM-1)))
@@ -1893,8 +1953,13 @@ void Perl_mem_log_del_sv(const SV *sv, const char *filename, const int linenumbe
  #define StructCopy(s,d,t) Copy(s,d,1,t)
  #endif
  
+/* C_ARRAY_LENGTH is the number of elements in the C array (so you
+ * want your zero-based indices to be less than but not equal to).
+ *
+ * C_ARRAY_END is one past the last: half-open/half-closed range,
+ * not last-inclusive range. */
  #define C_ARRAY_LENGTH(a)      (sizeof(a)/sizeof((a)[0]))
-#define C_ARRAY_END(a)         (a) + (sizeof(a)/sizeof((a)[0]))
+#define C_ARRAY_END(a)         ((a) + C_ARRAY_LENGTH(a))
  
  #ifdef NEED_VA_COPY
  # ifdef va_copy