classification of just the first (possibly multi-byte) character in the string
is tested.
-Variant C<isFOO_LC> is like the C<isFOO_A> and C<isFOO_L1> variants, but uses
-the C library function that gives the named classification instead of
-hard-coded rules. For example, C<isDIGIT_LC()> returns the result of calling
-C<isdigit()>. This means that the result is based on the current locale, which
-is what C<LC> in the name stands for. FALSE is always returned if the input
-won't fit into an octet.
+Variant C<isFOO_LC> is like the C<isFOO_A> and C<isFOO_L1> variants, but the
+result is based on the current locale, which is what C<LC> in the name stands
+for. If Perl can determine that the current locale is a UTF-8 locale, it uses
+the published Unicode rules; otherwise, it uses the C library function that
+gives the named classification. For example, C<isDIGIT_LC()> when not in a
+UTF-8 locale returns the result of calling C<isdigit()>. FALSE is always
+returned if the input won't fit into an octet.
Variant C<isFOO_LC_uvchr> is like C<isFOO_LC>, but is defined on any UV. It
returns the same as C<isFOO_LC> for input code points less than 256, and
/* We could be called without perl.h, in which case NATIVE_TO_ASCII() is
* likely not defined, and so we use the native function */
-# define isASCII(c) isascii(c)
+# define isASCII(c) cBOOL(isascii(c))
#else
# define isASCII(c) ((WIDEST_UTYPE)(c) < 128)
#endif
* But by creating these definitions, other code doesn't have to be aware of
* this detail */
#define toFOLD(c) toLOWER(c)
-#define toFOLD_LC(c) toLOWER_LC(c)
#define toTITLE(c) toUPPER(c)
#define toLOWER_A(c) toLOWER(c)
#define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \
? (c) \
: PL_mod_latin1_uc[ (U8) (c) ])
+#define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale
+
+/* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */
+
+/* For internal core Perl use only: the base macro for defining macros like
+ * isALPHA_LC, which uses the current LC_CTYPE locale. 'c' is the code point
+ * (0-255) to check. In a UTF-8 locale, the result is the same as calling
+ * isFOO_L1(); the 'utf8_locale_classnum' parameter is something like
+ * _CC_UPPER, which gives the class number for doing this. For non-UTF-8
+ * locales, the code to actually do this test is passed in 'non_utf8'. If
+ * 'c' is above 255, 0 is returned. For accessing the full range of possible
+ * code points under locale rules, use the macros based on _generic_LC_uvchr
+ * instead of this. */
+#define _generic_LC_base(c, utf8_locale_classnum, non_utf8) \
+ (! FITS_IN_8_BITS(c) \
+ ? 0 \
+ : IN_UTF8_CTYPE_LOCALE \
+ ? cBOOL(PL_charclass[(U8) (c)] & _CC_mask(utf8_locale_classnum)) \
+ : cBOOL(non_utf8))
+
+/* For internal core Perl use only: a helper macro for defining macros like
+ * isALPHA_LC. 'c' is the code point (0-255) to check. The function name to
+ * actually do this test is passed in 'non_utf8_func', which is called on 'c',
+ * casting 'c' to the macro _LC_CAST, which should not be parenthesized. See
+ * _generic_LC_base for more info */
+#define _generic_LC(c, utf8_locale_classnum, non_utf8_func) \
+ _generic_LC_base(c,utf8_locale_classnum, \
+ non_utf8_func( (_LC_CAST) (c)))
+
+/* For internal core Perl use only: like _generic_LC, but also returns TRUE if
+ * 'c' is the platform's native underscore character */
+#define _generic_LC_underscore(c,utf8_locale_classnum,non_utf8_func) \
+ _generic_LC_base(c, utf8_locale_classnum, \
+ (non_utf8_func( (_LC_CAST) (c)) \
+ || (char)(c) == '_'))
+
+/* These next three are also for internal core Perl use only: case-change
+ * helper macros */
+#define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \
+ ? (c) \
+ : (IN_UTF8_CTYPE_LOCALE) \
+ ? PL_latin1_lc[ (U8) (c) ] \
+ : function((cast)(c)))
+
+/* Note that the result can be larger than a byte in a UTF-8 locale. It
+ * returns a single value, so can't adequately return the upper case of LATIN
+ * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two
+ * values "SS"); instead it asserts against that under DEBUGGING, and
+ * otherwise returns its input */
+#define _generic_toUPPER_LC(c, function, cast) \
+ (! FITS_IN_8_BITS(c) \
+ ? (c) \
+ : ((! IN_UTF8_CTYPE_LOCALE) \
+ ? function((cast)(c)) \
+ : ((((U8)(c)) == MICRO_SIGN) \
+ ? GREEK_CAPITAL_LETTER_MU \
+ : ((((U8)(c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \
+ ? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS \
+ : ((((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S) \
+ ? (__ASSERT_(0) (c)) \
+ : PL_mod_latin1_uc[ (U8) (c) ])))))
+
+/* Note that the result can be larger than a byte in a UTF-8 locale. It
+ * returns a single value, so can't adequately return the fold case of LATIN
+ * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two
+ * values "ss"); instead it asserts against that under DEBUGGING, and
+ * otherwise returns its input */
+#define _generic_toFOLD_LC(c, function, cast) \
+ ((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \
+ ? GREEK_SMALL_LETTER_MU \
+ : (__ASSERT_(! IN_UTF8_CTYPE_LOCALE \
+ || (c) != LATIN_SMALL_LETTER_SHARP_S) \
+ _generic_toLOWER_LC(c, function, cast)))
/* Use the libc versions for these if available. */
#if defined(HAS_ISASCII) && ! defined(USE_NEXT_CTYPE)
#if defined(HAS_ISBLANK) && ! defined(USE_NEXT_CTYPE)
# define isBLANK_LC(c) _generic_LC(c, _CC_BLANK, isblank)
-#else
-# define isBLANK_LC(c) isBLANK(c)
+#else /* Unlike isASCII, varies if in a UTF-8 locale */
+/* Used when HAS_ISBLANK is not defined or USE_NEXT_CTYPE is: yields
+ * isBLANK_L1(c) in a UTF-8 LC_CTYPE locale and isBLANK(c) otherwise. The
+ * whole expansion is parenthesized so that it remains a single operand in
+ * contexts such as '! isBLANK_LC(c)' or 'isBLANK_LC(c) && x'. */
+# define isBLANK_LC(c) ((IN_UTF8_CTYPE_LOCALE) ? isBLANK_L1(c) : isBLANK(c))
#endif
#ifdef USE_NEXT_CTYPE /* NeXT computers */
-# define isALPHANUMERIC_LC(c) NXIsAlNum((unsigned int)(c))
-# define isALPHA_LC(c) NXIsAlpha((unsigned int)(c))
-# define isCNTRL_LC(c) NXIsCntrl((unsigned int)(c))
-# define isDIGIT_LC(c) NXIsDigit((unsigned int)(c))
-# define isGRAPH_LC(c) NXIsGraph((unsigned int)(c))
-# define isIDFIRST_LC(c) (NXIsAlpha((unsigned int)(c)) || (char)(c) == '_')
-# define isLOWER_LC(c) NXIsLower((unsigned int)(c))
-# define isPRINT_LC(c) NXIsPrint((unsigned int)(c))
-# define isPUNCT_LC(c) NXIsPunct((unsigned int)(c))
-# define isSPACE_LC(c) NXIsSpace((unsigned int)(c))
-# define isUPPER_LC(c) NXIsUpper((unsigned int)(c))
-# define isWORDCHAR_LC(c) (NXIsAlNum((unsigned int)(c)) || (char)(c) == '_')
-# define isXDIGIT_LC(c) NXIsXDigit((unsigned int)(c))
-# define toLOWER_LC(c) NXToLower((unsigned int)(c))
-# define toUPPER_LC(c) NXToUpper((unsigned int)(c))
+# define _LC_CAST unsigned int /* Needed by _generic_LC. NeXT functions
+ use this as their input type */
+
+# define isALPHA_LC(c) _generic_LC(c, _CC_ALPHA, NXIsAlpha)
+# define isALPHANUMERIC_LC(c) _generic_LC(c, _CC_ALPHANUMERIC, NXIsAlNum)
+# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, NXIsCntrl)
+# define isDIGIT_LC(c) _generic_LC(c, _CC_DIGIT, NXIsDigit)
+# define isGRAPH_LC(c) _generic_LC(c, _CC_GRAPH, NXIsGraph)
+# define isIDFIRST_LC(c) _generic_LC_underscore(c, _CC_IDFIRST, NXIsAlpha)
+# define isLOWER_LC(c) _generic_LC(c, _CC_LOWER, NXIsLower)
+# define isPRINT_LC(c) _generic_LC(c, _CC_PRINT, NXIsPrint)
+# define isPUNCT_LC(c) _generic_LC(c, _CC_PUNCT, NXIsPunct)
+# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, NXIsSpace)
+# define isUPPER_LC(c) _generic_LC(c, _CC_UPPER, NXIsUpper)
+# define isWORDCHAR_LC(c) _generic_LC_underscore(c, _CC_WORDCHAR, NXIsAlNum)
+/* Note the capital 'D': the NeXT function is NXIsXDigit, not NXIsXdigit */
+# define isXDIGIT_LC(c) _generic_LC(c, _CC_XDIGIT, NXIsXDigit)
+
+# define toLOWER_LC(c) _generic_toLOWER_LC((c), NXToLower, unsigned int)
+# define toUPPER_LC(c) _generic_toUPPER_LC((c), NXToUpper, unsigned int)
+# define toFOLD_LC(c) _generic_toFOLD_LC((c), NXToLower, unsigned int)
#else /* !USE_NEXT_CTYPE */
+# define _LC_CAST U8
+
# if defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII))
/* For most other platforms */
-# define isALPHA_LC(c) (FITS_IN_8_BITS(c) && isalpha((unsigned char)(c)))
-# define isALPHANUMERIC_LC(c) (FITS_IN_8_BITS(c) \
- && isalnum((unsigned char)(c)))
-# define isCNTRL_LC(c) (FITS_IN_8_BITS(c) && iscntrl((unsigned char)(c)))
-# define isDIGIT_LC(c) (FITS_IN_8_BITS(c) && isdigit((unsigned char)(c)))
-# define isGRAPH_LC(c) (FITS_IN_8_BITS(c) && isgraph((unsigned char)(c)))
-# define isIDFIRST_LC(c) (FITS_IN_8_BITS(c) \
- && (isalpha((unsigned char)(c)) || (char)(c) == '_'))
-# define isLOWER_LC(c) (FITS_IN_8_BITS(c) && islower((unsigned char)(c)))
-# define isPRINT_LC(c) (FITS_IN_8_BITS(c) && isprint((unsigned char)(c)))
-# define isPUNCT_LC(c) (FITS_IN_8_BITS(c) && ispunct((unsigned char)(c)))
-# define isSPACE_LC(c) (FITS_IN_8_BITS(c) && isspace((unsigned char)(c)))
-# define isUPPER_LC(c) (FITS_IN_8_BITS(c) && isupper((unsigned char)(c)))
-# define isWORDCHAR_LC(c) (FITS_IN_8_BITS(c) \
- && (isalnum((unsigned char)(c)) || (char)(c) == '_'))
-# define isXDIGIT_LC(c) (FITS_IN_8_BITS(c) && isxdigit((unsigned char)(c)))
-# define toLOWER_LC(c) (FITS_IN_8_BITS(c) ? (UV)tolower((unsigned char)(c)) : (c))
-# define toUPPER_LC(c) (FITS_IN_8_BITS(c) ? (UV)toupper((unsigned char)(c)) : (c))
+
+# define isALPHA_LC(c) _generic_LC(c, _CC_ALPHA, isalpha)
+# define isALPHANUMERIC_LC(c) _generic_LC(c, _CC_ALPHANUMERIC, isalnum)
+# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl)
+# define isDIGIT_LC(c) _generic_LC(c, _CC_DIGIT, isdigit)
+# define isGRAPH_LC(c) _generic_LC(c, _CC_GRAPH, isgraph)
+# define isIDFIRST_LC(c) _generic_LC_underscore(c, _CC_IDFIRST, isalpha)
+# define isLOWER_LC(c) _generic_LC(c, _CC_LOWER, islower)
+# define isPRINT_LC(c) _generic_LC(c, _CC_PRINT, isprint)
+# define isPUNCT_LC(c) _generic_LC(c, _CC_PUNCT, ispunct)
+# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace)
+# define isUPPER_LC(c) _generic_LC(c, _CC_UPPER, isupper)
+# define isWORDCHAR_LC(c) _generic_LC_underscore(c, _CC_WORDCHAR, isalnum)
+# define isXDIGIT_LC(c) _generic_LC(c, _CC_XDIGIT, isxdigit)
+
+
+# define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8)
+# define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8)
+# define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8)
# else /* The final fallback position */
# define toLOWER_LC(c) (isascii(c) ? tolower(c) : (c))
# define toUPPER_LC(c) (isascii(c) ? toupper(c) : (c))
+# define toFOLD_LC(c) (isascii(c) ? tolower(c) : (c))
# endif
#endif /* USE_NEXT_CTYPE */
#define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p)
#define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p)
-/* This conversion works both ways, strangely enough. On EBCDIC platforms,
- * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't
- * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but
- * that is EOT on ASCII; ST on EBCDIC */
-# define toCTRL(c) (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64)
+/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,
+ * except that they don't necessarily mean the same characters, e.g. CTRL-D is
+ * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC.
+ * '?' is special-cased on EBCDIC to APC, which is the control there that is
+ * the outlier from the block that contains the other controls, just like
+ * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0
+ * block. If it weren't special cased, it would yield a non-control.
+ * The conversion works both ways, so toCTRL('D') is 4, and toCTRL(4) is 'D',
+ * etc. */
+#ifndef EBCDIC
+# define toCTRL(c) (toUPPER(c) ^ 64)
+#else
+# define toCTRL(c) ((c) == '?' \
+ ? LATIN1_TO_NATIVE(0x9F) \
+ : (c) == LATIN1_TO_NATIVE(0x9F) \
+ ? '?' \
+ : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64))
+#endif
/* Line numbers are unsigned, 32 bits. */
typedef U32 line_t;