X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/d10b49655c586fb88be1ada909b15594b88f38ea..08ada29e09c8de732363dff63daf29c269ac11b3:/handy.h diff --git a/handy.h b/handy.h index 18e2d1c..e793039 100644 --- a/handy.h +++ b/handy.h @@ -1,7 +1,7 @@ /* handy.h * * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000, - * 2001, 2002, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others + * 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others * * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. @@ -70,10 +70,13 @@ Null SV pointer. (No longer available when C is defined.) #define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p)) #define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p)) -/* XXX Configure ought to have a test for a boolean type, if I can - just figure out all the headers such a test needs. - Andy Dougherty August 1996 -*/ +#ifdef I_STDBOOL +# include +# ifndef HAS_BOOL +# define HAS_BOOL 1 +# endif +#endif + /* bool is built-in for g++-2.6.3 and later, which might be used for extensions. <_G_config.h> defines _G_HAVE_BOOL, but we can't be sure _G_config.h will be included before this file. _G_config.h @@ -112,9 +115,10 @@ Null SV pointer. (No longer available when C is defined.) /* a simple (bool) cast may not do the right thing: if bool is defined * as char for example, then the cast from int is implementation-defined + * (bool)!!(cbool) in a ternary triggers a bug in xlc on AIX */ -#define cBOOL(cbool) ((bool)!!(cbool)) +#define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0) /* Try to figure out __func__ or __FUNCTION__ equivalent, if any. * XXX Should really be a Configure probe, with HAS__FUNCTION__ @@ -187,25 +191,29 @@ typedef U64TYPE U64; #endif /* PERL_CORE */ #if defined(HAS_QUAD) && defined(USE_64_BIT_INT) -# ifndef UINT64_C /* usually from */ -# if defined(HAS_LONG_LONG) && QUADKIND == QUAD_IS_LONG_LONG -# define INT64_C(c) CAT2(c,LL) -# define UINT64_C(c) CAT2(c,ULL) +# if defined(HAS_LONG_LONG) && QUADKIND == QUAD_IS_LONG_LONG +# define PeRl_INT64_C(c) CAT2(c,LL) +# define PeRl_UINT64_C(c) CAT2(c,ULL) +# else +# if LONGSIZE == 8 && QUADKIND == QUAD_IS_LONG +# define PeRl_INT64_C(c) CAT2(c,L) +# define PeRl_UINT64_C(c) CAT2(c,UL) # else -# if LONGSIZE == 8 && QUADKIND == QUAD_IS_LONG -# define INT64_C(c) CAT2(c,L) -# define UINT64_C(c) CAT2(c,UL) +# if defined(_WIN64) && defined(_MSC_VER) +# define PeRl_INT64_C(c) CAT2(c,I64) +# define PeRl_UINT64_C(c) CAT2(c,UI64) # else -# if defined(_WIN64) && defined(_MSC_VER) -# define INT64_C(c) CAT2(c,I64) -# define UINT64_C(c) CAT2(c,UI64) -# else -# define INT64_C(c) ((I64TYPE)(c)) -# define UINT64_C(c) ((U64TYPE)(c)) -# endif +# define PeRl_INT64_C(c) ((I64TYPE)(c)) +# define PeRl_UINT64_C(c) ((U64TYPE)(c)) # endif # endif # endif +# ifndef UINT64_C +# define UINT64_C(c) PeRl_UINT64_C(c) +# endif +# ifndef INT64_C +# define INT64_C(c) PeRl_INT64_C(c) +# endif #endif #if defined(UINT8_MAX) && defined(INT16_MAX) && defined(INT32_MAX) @@ -470,7 +478,7 @@ There are three variants for all the functions in this section. The base ones operate using the character set of the platform Perl is running on. The ones with an C<_A> suffix operate on the ASCII character set, and the ones with an C<_L1> suffix operate on the full Latin1 character set. All are unaffected by -locale +locale and by C. For ASCII platforms, the base function with no suffix and the one with the C<_A> suffix are identical. The function with the C<_L1> suffix imposes the @@ -559,61 +567,71 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc */ -/* FITS_IN_8_BITS(c) returns true if c occupies no more than 8 bits. It is - * designed to be hopefully bomb-proof, making sure that no bits of - * information are lost even on a 64-bit machine, but to get the compiler to - * optimize it out if possible. This is because Configure makes sure that the - * machine has an 8-bit byte, so if c is stored in a byte, the sizeof() - * guarantees that this evaluates to a constant true at compile time. The use - * of the mask instead of '< 256' keeps gcc from complaining that it is alway - * true, when c's storage class is a byte. Use U64TYPE because U64 is known - * only in the perl core, and this macro can be called from outside that */ +/* Specify the widest unsigned type on the platform. Use U64TYPE because U64 + * is known only in the perl core, and this macro can be called from outside + * that */ #ifdef HAS_QUAD -# define FITS_IN_8_BITS(c) ((sizeof(c) == 1) || (((U64TYPE)(c) & 0xFF) == (U64TYPE)(c))) +# define WIDEST_UTYPE U64TYPE #else -# define FITS_IN_8_BITS(c) ((sizeof(c) == 1) || (((U32)(c) & 0xFF) == (U32)(c))) +# define WIDEST_UTYPE U32 +#endif + +/* FITS_IN_8_BITS(c) returns true if c doesn't have a bit set other than in + * the lower 8. It is designed to be hopefully bomb-proof, making sure that no + * bits of information are lost even on a 64-bit machine, but to get the + * compiler to optimize it out if possible. This is because Configure makes + * sure that the machine has an 8-bit byte, so if c is stored in a byte, the + * sizeof() guarantees that this evaluates to a constant true at compile time. + */ +#define FITS_IN_8_BITS(c) ((sizeof(c) == 1) || !(((WIDEST_UTYPE)(c)) & ~0xFF)) + +#ifdef EBCDIC +# define isASCII(c) (FITS_IN_8_BITS(c) && (NATIVE_TO_UNI((U8) (c)) < 128)) +#else +# define isASCII(c) ((WIDEST_UTYPE)(c) < 128) #endif -#define isASCII(c) (FITS_IN_8_BITS(c) ? NATIVE_TO_UNI((U8) c) <= 127 : 0) #define isASCII_A(c) isASCII(c) +#define isASCII_L1(c) isASCII(c) + +/* The lower 3 bits in both the ASCII and EBCDIC representations of '0' are 0, + * and the 8 possible permutations of those bits exactly comprise the 8 octal + * digits */ +#define isOCTAL_A(c) cBOOL(FITS_IN_8_BITS(c) && (0xF8 & (c)) == '0') /* ASCII range only */ #ifdef H_PERL /* If have access to perl.h, lookup in its table */ -/* Bits for PL_charclass[] */ -# define _CC_ALNUMC_A (1<<0) -# define _CC_ALNUMC_L1 (1<<1) -# define _CC_ALPHA_A (1<<2) -# define _CC_ALPHA_L1 (1<<3) -# define _CC_BLANK_A (1<<4) -# define _CC_BLANK_L1 (1<<5) -# define _CC_CHARNAME_CONT (1<<6) -# define _CC_CNTRL_A (1<<7) -# define _CC_CNTRL_L1 (1<<8) -# define _CC_DIGIT_A (1<<9) -# define _CC_GRAPH_A (1<<10) -# define _CC_GRAPH_L1 (1<<11) -# define _CC_IDFIRST_A (1<<12) -# define _CC_IDFIRST_L1 (1<<13) -# define _CC_LOWER_A (1<<14) -# define _CC_LOWER_L1 (1<<15) -# define _CC_OCTAL_A (1<<16) -# define _CC_PRINT_A (1<<17) -# define _CC_PRINT_L1 (1<<18) -# define _CC_PSXSPC_A (1<<19) -# define _CC_PSXSPC_L1 (1<<20) -# define _CC_PUNCT_A (1<<21) -# define _CC_PUNCT_L1 (1<<22) -# define _CC_SPACE_A (1<<23) -# define _CC_SPACE_L1 (1<<24) -# define _CC_UPPER_A (1<<25) -# define _CC_UPPER_L1 (1<<26) -# define _CC_WORDCHAR_A (1<<27) -# define _CC_WORDCHAR_L1 (1<<28) -# define _CC_XDIGIT_A (1<<29) -# define _CC_NONLATIN1_FOLD (1<<30) -/* Unused - * (1<<31) - */ + +/* Character class numbers. These are used in PL_charclass[] and the ones + * up through the one that corresponds to <_HIGHEST_REGCOMP_DOT_H_SYNC> are + * used by regcomp.h. These use names used in l1_char_class_tab.h but their + * actual definitions are here. If that has a name not used here, it won't + * compile. */ +# define _CC_WORDCHAR 0 +# define _CC_SPACE 1 +# define _CC_DIGIT 2 +# define _CC_ALNUMC 3 +# define _CC_ALPHA 4 +# define _CC_ASCII 5 +# define _CC_CNTRL 6 +# define _CC_GRAPH 7 +# define _CC_LOWER 8 +# define _CC_PRINT 9 +# define _CC_PUNCT 10 +# define _CC_UPPER 11 +# define _CC_XDIGIT 12 +# define _CC_PSXSPC 13 +# define _CC_BLANK 14 +# define _HIGHEST_REGCOMP_DOT_H_SYNC _CC_BLANK + +# define _CC_IDFIRST 15 +# define _CC_CHARNAME_CONT 16 +# define _CC_NONLATIN1_FOLD 17 +# define _CC_QUOTEMETA 18 +/* Unused: 19-31 + * If more bits are needed, one could add a second word for non-64bit + * QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd + * word or not. */ # ifdef DOINIT EXTCONST U32 PL_charclass[] = { @@ -624,27 +642,43 @@ EXTCONST U32 PL_charclass[] = { EXTCONST U32 PL_charclass[]; # endif -# define isALNUMC_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALNUMC_A)) -# define isALPHA_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALPHA_A)) -# define isBLANK_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_BLANK_A)) -# define isCNTRL_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CNTRL_A)) -# define isDIGIT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_DIGIT_A)) -# define isGRAPH_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_GRAPH_A)) -# define isIDFIRST_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_IDFIRST_A)) -# define isLOWER_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_LOWER_A)) -# define isOCTAL_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_OCTAL_A)) -# define isPRINT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PRINT_A)) -# define isPSXSPC_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PSXSPC_A)) -# define isPUNCT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PUNCT_A)) -# define isSPACE_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_SPACE_A)) -# define isUPPER_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_UPPER_A)) -# define isWORDCHAR_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_WORDCHAR_A)) -# define isXDIGIT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_XDIGIT_A)) + /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */ +# define _CC_mask(classnum) (1U << (classnum)) +# define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \ + && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(classnum))) + + /* The mask for the _A versions of the macros; it just adds in the bit for + * ASCII. */ +# define _CC_mask_A(classnum) (_CC_mask(classnum) | _CC_mask(_CC_ASCII)) + + /* The _A version makes sure that both the desired bit and the ASCII bit + * are present */ +# define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \ + && ((PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask_A(classnum)) \ + == _CC_mask_A(classnum))) + +# define isALNUMC_A(c) _generic_isCC_A(c, _CC_ALNUMC) +# define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA) +# define isBLANK_A(c) _generic_isCC_A(c, _CC_BLANK) +# define isCNTRL_A(c) _generic_isCC_A(c, _CC_CNTRL) +# define isDIGIT_A(c) _generic_isCC(c, _CC_DIGIT) +# define isGRAPH_A(c) _generic_isCC_A(c, _CC_GRAPH) +# define isLOWER_A(c) _generic_isCC_A(c, _CC_LOWER) +# define isPRINT_A(c) _generic_isCC_A(c, _CC_PRINT) +# define isPSXSPC_A(c) _generic_isCC_A(c, _CC_PSXSPC) +# define isPUNCT_A(c) _generic_isCC_A(c, _CC_PUNCT) +# define isSPACE_A(c) _generic_isCC_A(c, _CC_SPACE) +# define isUPPER_A(c) _generic_isCC_A(c, _CC_UPPER) +# define isWORDCHAR_A(c) _generic_isCC_A(c, _CC_WORDCHAR) +# define isXDIGIT_A(c) _generic_isCC(c, _CC_XDIGIT) +# define isIDFIRST_A(c) _generic_isCC_A(c, ( _CC_IDFIRST)) + /* Either participates in a fold with a character above 255, or is a * multi-char fold */ -# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_NONLATIN1_FOLD)) +# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(_CC_NONLATIN1_FOLD))) + +# define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA) #else /* No perl.h. */ -# define isOCTAL_A(c) ((c) >= '0' && (c) <= '9') # ifdef EBCDIC # define isALNUMC_A(c) (isASCII(c) && isALNUMC(c)) # define isALPHA_A(c) (isASCII(c) && isALPHA(c)) @@ -665,8 +699,8 @@ EXTCONST U32 PL_charclass[]; # define isALNUMC_A(c) (isALPHA_A(c) || isDIGIT_A(c)) # define isALPHA_A(c) (isUPPER_A(c) || isLOWER_A(c)) # define isBLANK_A(c) ((c) == ' ' || (c) == '\t') -# define isCNTRL_A(c) (FITS_IN_8_BITS(c) ? ((U8) (c) < ' ' || (c) == 127) : 0) -# define isDIGIT_A(c) ((c) >= '0' && (c) <= '9') +# define isCNTRL_A(c) (FITS_IN_8_BITS(c) && ((U8) (c) < ' ' || (c) == 127)) +# define isDIGIT_A(c) ((c) <= '9' && (c) >= '0') # define isGRAPH_A(c) (isWORDCHAR_A(c) || isPUNCT_A(c)) # define isIDFIRST_A(c) (isALPHA_A(c) || (c) == '_') # define isLOWER_A(c) ((c) >= 'a' && (c) <= 'z') @@ -674,29 +708,29 @@ EXTCONST U32 PL_charclass[]; # define isPSXSPC_A(c) (isSPACE_A(c) || (c) == '\v') # define isPUNCT_A(c) (((c) >= 33 && (c) <= 47) || ((c) >= 58 && (c) <= 64) || ((c) >= 91 && (c) <= 96) || ((c) >= 123 && (c) <= 126)) # define isSPACE_A(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) =='\r' || (c) == '\f') -# define isUPPER_A(c) ((c) >= 'A' && (c) <= 'Z') +# define isUPPER_A(c) ((c) <= 'Z' && (c) >= 'A') # define isWORDCHAR_A(c) (isALPHA_A(c) || isDIGIT_A(c) || (c) == '_') -# define isXDIGIT_A(c) (isDIGIT_A(c) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F')) +# define isXDIGIT_A(c) (isDIGIT_A(c) || ((c) >= 'a' && (c) <= 'f') || ((c) <= 'F' && (c) >= 'A')) # endif #endif /* ASCII range definitions */ /* Latin1 definitions */ #ifdef H_PERL -# define isALNUMC_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALNUMC_L1)) -# define isALPHA_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALPHA_L1)) -# define isBLANK_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_BLANK_L1)) +# define isALNUMC_L1(c) _generic_isCC(c, _CC_ALNUMC) +# define isALPHA_L1(c) _generic_isCC(c, _CC_ALPHA) +# define isBLANK_L1(c) _generic_isCC(c, _CC_BLANK) /* continuation character for legal NAME in \N{NAME} */ -# define isCHARNAME_CONT(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CHARNAME_CONT)) -# define isCNTRL_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CNTRL_L1)) -# define isGRAPH_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_GRAPH_L1)) -# define isIDFIRST_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_IDFIRST_L1)) -# define isLOWER_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_LOWER_L1)) -# define isPRINT_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PRINT_L1)) -# define isPSXSPC_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PSXSPC_L1)) -# define isPUNCT_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PUNCT_L1)) -# define isSPACE_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_SPACE_L1)) -# define isUPPER_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_UPPER_L1)) -# define isWORDCHAR_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_WORDCHAR_L1)) +# define isCHARNAME_CONT(c) _generic_isCC(c, _CC_CHARNAME_CONT) +# define isCNTRL_L1(c) _generic_isCC(c, _CC_CNTRL) +# define isGRAPH_L1(c) _generic_isCC(c, _CC_GRAPH) +# define isLOWER_L1(c) _generic_isCC(c, _CC_LOWER) +# define isPRINT_L1(c) _generic_isCC(c, _CC_PRINT) +# define isPSXSPC_L1(c) _generic_isCC(c, _CC_PSXSPC) +# define isPUNCT_L1(c) _generic_isCC(c, _CC_PUNCT) +# define isSPACE_L1(c) _generic_isCC(c, _CC_SPACE) +# define isUPPER_L1(c) _generic_isCC(c, _CC_UPPER) +# define isWORDCHAR_L1(c) _generic_isCC(c, _CC_WORDCHAR) +# define isIDFIRST_L1(c) _generic_isCC(c, _CC_IDFIRST) #else /* No access to perl.h. Only a few provided here, just in case needed * for backwards compatibility */ /* ALPHAU includes Unicode semantics for latin1 characters. It has an extra @@ -745,7 +779,7 @@ EXTCONST U32 PL_charclass[]; # define toLOWER(c) tolower(c) # define toUPPER(c) toupper(c) #else /* Not EBCDIC: ASCII-only matching */ -# define isALNUMC(c) isALNUMC_A(c) +# define isALNUMC(c) isALNUMC_A(c) /* Mnemonic: "C's alnum" = alpha + digit */ # define isALPHA(c) isALPHA_A(c) # define isBLANK(c) isBLANK_A(c) # define isCNTRL(c) isCNTRL_A(c) @@ -792,6 +826,8 @@ EXTCONST U32 PL_charclass[]; # define isIDFIRST_LC(c) \ (NXIsAlpha((unsigned int)(c)) || (char)(c) == '_') # define isALPHA_LC(c) NXIsAlpha((unsigned int)(c)) +# define isASCII_LC(c) isASCII((unsigned int)(c)) +# define isBLANK_LC(c) isBLANK((unsigned int)(c)) # define isSPACE_LC(c) NXIsSpace((unsigned int)(c)) # define isDIGIT_LC(c) NXIsDigit((unsigned int)(c)) # define isUPPER_LC(c) NXIsUpper((unsigned int)(c)) @@ -808,9 +844,23 @@ EXTCONST U32 PL_charclass[]; # if defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII)) +/* Note that the foo_LC() macros in this case generally are defined only on + * code points 0-256, and give undefined, unwarned results if called with + * values outside that range */ + # define isALNUM_LC(c) (isalnum((unsigned char)(c)) || (char)(c) == '_') # define isIDFIRST_LC(c) (isalpha((unsigned char)(c)) || (char)(c) == '_') # define isALPHA_LC(c) isalpha((unsigned char)(c)) +# ifdef HAS_ISASCII +# define isASCII_LC(c) isascii((unsigned char)(c)) +# else +# define isASCII_LC(c) isASCII((unsigned char)(c)) +# endif +# ifdef HAS_ISBLANK +# define isBLANK_LC(c) isblank((unsigned char)(c)) +# else +# define isBLANK_LC(c) isBLANK((unsigned char)(c)) +# endif # define isSPACE_LC(c) isspace((unsigned char)(c)) # define isDIGIT_LC(c) isdigit((unsigned char)(c)) # define isUPPER_LC(c) isupper((unsigned char)(c)) @@ -828,6 +878,12 @@ EXTCONST U32 PL_charclass[]; # define isALNUM_LC(c) (isascii(c) && (isalnum(c) || (c) == '_')) # define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_')) # define isALPHA_LC(c) (isascii(c) && isalpha(c)) +# define isASCII_LC(c) isascii(c) +# ifdef HAS_ISBLANK +# define isBLANK_LC(c) (isascii(c) && isblank(c)) +# else +# define isBLANK_LC(c) isBLANK(c) +# endif # define isSPACE_LC(c) (isascii(c) && isspace(c)) # define isDIGIT_LC(c) (isascii(c) && isdigit(c)) # define isUPPER_LC(c) (isascii(c) && isupper(c)) @@ -844,29 +900,42 @@ EXTCONST U32 PL_charclass[]; #endif /* USE_NEXT_CTYPE */ #define isPSXSPC_LC(c) (isSPACE_LC(c) || (c) == '\v') -#define isBLANK_LC(c) isBLANK(c) /* could be wrong */ - -#define isALNUM_uni(c) is_uni_alnum(c) -#define isIDFIRST_uni(c) is_uni_idfirst(c) -#define isALPHA_uni(c) is_uni_alpha(c) -#define isSPACE_uni(c) is_uni_space(c) -#define isDIGIT_uni(c) is_uni_digit(c) -#define isUPPER_uni(c) is_uni_upper(c) -#define isLOWER_uni(c) is_uni_lower(c) -#define isASCII_uni(c) is_uni_ascii(c) -#define isCNTRL_uni(c) is_uni_cntrl(c) -#define isGRAPH_uni(c) is_uni_graph(c) -#define isPRINT_uni(c) is_uni_print(c) -#define isPUNCT_uni(c) is_uni_punct(c) -#define isXDIGIT_uni(c) is_uni_xdigit(c) + +/* For use in the macros just below. If the input is Latin1, use the Latin1 + * (_L1) version of the macro; otherwise use the function. Won't compile if + * 'c' isn't unsigned, as won't match function prototype. The macros do bounds + * checking, so have duplicate checks here, so could create versions of the + * macros that don't, but experiments show that gcc optimizes them out anyway. + */ +#define generic_uni(macro, function, c) ((c) < 256 \ + ? CAT2(macro, _L1)(c) \ + : function(c)) +/* Note that all ignore 'use bytes' */ + +#define isALNUM_uni(c) generic_uni(isWORDCHAR, is_uni_alnum, c) +#define isBLANK_uni(c) generic_uni(isBLANK, is_uni_blank, c) +#define isIDFIRST_uni(c) generic_uni(isIDFIRST, is_uni_idfirst, c) +#define isALPHA_uni(c) generic_uni(isALPHA, is_uni_alpha, c) +#define isSPACE_uni(c) generic_uni(isSPACE, is_uni_space, c) +#define isDIGIT_uni(c) generic_uni(isDIGIT, is_uni_digit, c) +#define isUPPER_uni(c) generic_uni(isUPPER, is_uni_upper, c) +#define isLOWER_uni(c) generic_uni(isLOWER, is_uni_lower, c) +#define isASCII_uni(c) isASCII(c) +/* All controls are in Latin1 */ +#define isCNTRL_uni(c) isCNTRL_L1(c) +#define isGRAPH_uni(c) generic_uni(isGRAPH, is_uni_graph, c) +#define isPRINT_uni(c) generic_uni(isPRINT, is_uni_print, c) +#define isPUNCT_uni(c) generic_uni(isPUNCT, is_uni_punct, c) +#define isXDIGIT_uni(c) generic_uni(isXDIGIT, is_uni_xdigit, c) + +/* Posix and regular space differ only in U+000B, which is in Latin1 */ +#define isPSXSPC_uni(c) ((c) < 256 ? isPSXSPC_L1(c) : isSPACE_uni(c)) + #define toUPPER_uni(c,s,l) to_uni_upper(c,s,l) #define toTITLE_uni(c,s,l) to_uni_title(c,s,l) #define toLOWER_uni(c,s,l) to_uni_lower(c,s,l) #define toFOLD_uni(c,s,l) to_uni_fold(c,s,l) -#define isPSXSPC_uni(c) (isSPACE_uni(c) ||(c) == '\f') -#define isBLANK_uni(c) isBLANK(c) /* could be wrong */ - #define isALNUM_LC_uvchr(c) (c < 256 ? isALNUM_LC(c) : is_uni_alnum_lc(c)) #define isIDFIRST_LC_uvchr(c) (c < 256 ? isIDFIRST_LC(c) : is_uni_idfirst_lc(c)) #define isALPHA_LC_uvchr(c) (c < 256 ? isALPHA_LC(c) : is_uni_alpha_lc(c)) @@ -882,47 +951,77 @@ EXTCONST U32 PL_charclass[]; #define isPSXSPC_LC_uni(c) (isSPACE_LC_uni(c) ||(c) == '\f') #define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */ -#define isALNUM_utf8(p) is_utf8_alnum(p) +/* For use in the macros just below. If the input is ASCII, use the ASCII (_A) + * version of the macro; if the input is in the upper Latin1 range, use the + * Latin1 (_L1) version of the macro, after converting from utf8; otherwise use + * the function. This relies on the fact that ASCII characters have the same + * representation whether utf8 or not */ +#define generic_utf8(macro, function, p) (isASCII(*(p)) \ + ? CAT2(CAT2(macro,_),A)(*(p)) \ + : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ + ? CAT2(macro, _L1) \ + (TWO_BYTE_UTF8_TO_UNI(*(p), \ + *((p)+1))) \ + : function(p)) + +/* Note that all assume that the utf8 has been validated, and ignore 'use + * bytes' */ + +#define isALNUM_utf8(p) generic_utf8(isWORDCHAR, is_utf8_alnum, p) /* To prevent S_scan_word in toke.c from hanging, we have to make sure that * IDFIRST is an alnum. See - * http://rt.perl.org/rt3/Ticket/Display.html?id=74022 - * for more detail than you ever wanted to know about. This used to be not the - * XID version, but we decided to go with the more modern Unicode definition */ -#define isIDFIRST_utf8(p) (is_utf8_xidfirst(p) && is_utf8_alnum(p)) -#define isIDCONT_utf8(p) is_utf8_xidcont(p) -#define isALPHA_utf8(p) is_utf8_alpha(p) -#define isSPACE_utf8(p) is_utf8_space(p) -#define isDIGIT_utf8(p) is_utf8_digit(p) -#define isUPPER_utf8(p) is_utf8_upper(p) -#define isLOWER_utf8(p) is_utf8_lower(p) -#define isASCII_utf8(p) is_utf8_ascii(p) -#define isCNTRL_utf8(p) is_utf8_cntrl(p) -#define isGRAPH_utf8(p) is_utf8_graph(p) -#define isPRINT_utf8(p) is_utf8_print(p) -#define isPUNCT_utf8(p) is_utf8_punct(p) -#define isXDIGIT_utf8(p) is_utf8_xdigit(p) + * http://rt.perl.org/rt3/Ticket/Display.html?id=74022 for more detail than you + * ever wanted to know about. XXX It is unclear if this should extend to + * isIDFIRST_uni() which it hasn't so far. (In the ASCII range, there isn't a + * difference.) This used to be not the XID version, but we decided to go with + * the more modern Unicode definition */ +#define isIDFIRST_utf8(p) (isASCII(*(p)) \ + ? isIDFIRST_A(*(p)) \ + : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ + ? isIDFIRST_L1(TWO_BYTE_UTF8_TO_UNI(*(p), \ + *((p)+1)))\ + : Perl__is_utf8__perl_idstart(aTHX_ p)) +#define isIDCONT_utf8(p) generic_utf8(isWORDCHAR, is_utf8_xidcont, p) +#define isALPHA_utf8(p) generic_utf8(isALPHA, is_utf8_alpha, p) +#define isBLANK_utf8(p) generic_utf8(isBLANK, is_utf8_blank, p) +#define isSPACE_utf8(p) generic_utf8(isSPACE, is_utf8_space, p) +#define isDIGIT_utf8(p) generic_utf8(isDIGIT, is_utf8_digit, p) +#define isUPPER_utf8(p) generic_utf8(isUPPER, is_utf8_upper, p) +#define isLOWER_utf8(p) generic_utf8(isLOWER, is_utf8_lower, p) +/* Because ASCII is invariant under utf8, the non-utf8 macro works */ +#define isASCII_utf8(p) isASCII(*p) +#define isCNTRL_utf8(p) generic_utf8(isCNTRL, is_utf8_cntrl, p) +#define isGRAPH_utf8(p) generic_utf8(isGRAPH, is_utf8_graph, p) +#define isPRINT_utf8(p) generic_utf8(isPRINT, is_utf8_print, p) +#define isPUNCT_utf8(p) generic_utf8(isPUNCT, is_utf8_punct, p) +#define isXDIGIT_utf8(p) generic_utf8(isXDIGIT, is_utf8_xdigit, p) #define toUPPER_utf8(p,s,l) to_utf8_upper(p,s,l) #define toTITLE_utf8(p,s,l) to_utf8_title(p,s,l) #define toLOWER_utf8(p,s,l) to_utf8_lower(p,s,l) -#define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f') -#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */ - -#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(utf8_to_uvchr(p, 0)) +/* Posix and regular space differ only in U+000B, which is in ASCII (and hence + * Latin1 */ +#define isPSXSPC_utf8(p) ((isASCII(*(p))) \ + ? isPSXSPC_A(*(p)) \ + : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \ + ? isPSXSPC_L1(TWO_BYTE_UTF8_TO_UNI(*(p), \ + *((p)+1)))\ + : isSPACE_utf8(p))) +#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isBLANK_LC_utf8(p) isBLANK_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(valid_utf8_to_uvchr(p, 0)) #define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f') -#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */ /* This conversion works both ways, strangely enough. On EBCDIC platforms, * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII */ @@ -1004,7 +1103,7 @@ destination, C is the number of items, and C is the type. Like C but returns dest. Useful for encouraging compilers to tail-call optimise. -=for apidoc Am|void|StructCopy|type src|type dest|type +=for apidoc Am|void|StructCopy|type *src|type *dest|type This is an architecture-independent macro to copy one structure to another. =for apidoc Am|void|PoisonWith|void* dest|int nitems|type|U8 byte @@ -1218,8 +1317,8 @@ void Perl_mem_log_del_sv(const SV *sv, const char *filename, const int linenumbe * Local variables: * c-indentation-style: bsd * c-basic-offset: 4 - * indent-tabs-mode: t + * indent-tabs-mode: nil * End: * - * ex: set ts=8 sts=4 sw=4 noet: + * ex: set ts=8 sts=4 sw=4 et: */