X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/9ea4080170688502a51eb7f66ecfafe8f08c4cd8..de29d7249495cfea30847f736f3c068ce6844596:/handy.h diff --git a/handy.h b/handy.h index f80ba2c..2f0132f 100644 --- a/handy.h +++ b/handy.h @@ -8,6 +8,9 @@ * */ +/* IMPORTANT NOTE: Everything whose name begins with an underscore is for + * internal core Perl use only. */ + #ifndef HANDY_H /* Guard against nested #inclusion */ #define HANDY_H @@ -25,10 +28,11 @@ =head1 Handy Values =for apidoc AmU||Nullch -Null character pointer. (No longer available when C is defined.) +Null character pointer. (No longer available when C is +defined.) =for apidoc AmU||Nullsv -Null SV pointer. (No longer available when C is defined.) +Null SV pointer. (No longer available when C is defined.) =cut */ @@ -69,7 +73,7 @@ Null SV pointer. (No longer available when C is defined.) #define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p)) #define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p)) -#ifdef I_STDBOOL +#if defined(I_STDBOOL) && !defined(PERL_BOOL_AS_CHAR) # include # ifndef HAS_BOOL # define HAS_BOOL 1 @@ -85,9 +89,11 @@ Null SV pointer. (No longer available when C is defined.) Andy Dougherty February 2000 */ #ifdef __GNUG__ /* GNU g++ has bool built-in */ +# ifndef PERL_BOOL_AS_CHAR # ifndef HAS_BOOL # define HAS_BOOL 1 # endif +# endif #endif /* The NeXT dynamic loader headers will not build with the bool macro @@ -104,11 +110,10 @@ Null SV pointer. (No longer available when C is defined.) #endif /* NeXT || __NeXT__ */ #ifndef HAS_BOOL -# if defined(VMS) -# define bool int -# else -# define bool char +# ifdef bool +# undef bool # endif +# define bool char # define HAS_BOOL 1 #endif @@ -268,6 +273,16 @@ typedef U64TYPE U64; #define Ctl(ch) ((ch) & 037) +/* This is a helper macro to avoid preprocessor issues, replaced by nothing + * unless under DEBUGGING, where it expands to an assert of its argument, + * followed by a comma (hence the comma operator). If we just used a straight + * assert(), we would get a comma with nothing before it when not DEBUGGING */ +#ifdef DEBUGGING +# define __ASSERT_(statement) assert(statement), +#else +# define __ASSERT_(statement) +#endif + /* =head1 SV-Body Allocation @@ -419,12 +434,12 @@ the second, C. Returns true or false. =for apidoc Am|bool|strnNE|char* s1|char* s2|STRLEN len Test two strings to see if they are different. The C parameter -indicates the number of bytes to compare. Returns true or false. (A +indicates the number of bytes to compare. Returns true or false. (A wrapper for C). =for apidoc Am|bool|strnEQ|char* s1|char* s2|STRLEN len Test two strings to see if they are equal. The C parameter indicates -the number of bytes to compare. Returns true or false. (A wrapper for +the number of bytes to compare. Returns true or false. (A wrapper for C). =cut @@ -495,7 +510,7 @@ onto the platform. That is, the code points that are ASCII are unaffected, since ASCII is a subset of Latin-1. But the non-ASCII code points are treated as if they are Latin-1 characters. For example, C will return true when called with the code point 0xDF, which is a word character in both -ASCII and EBCDIC (though it represent different characters in each). +ASCII and EBCDIC (though it represents different characters in each). Variant C is like the C variant, but accepts any UV code point as input. If the code point is larger than 255, Unicode rules are used @@ -508,12 +523,13 @@ Variant C is like C, but the input is a pointer to a classification of just the first (possibly multi-byte) character in the string is tested. -Variant C is like the C and C variants, but uses -the C library function that gives the named classification instead of -hard-coded rules. For example, C returns the result of calling -C. This means that the result is based on the current locale, which -is what C in the name stands for. FALSE is always returned if the input -won't fit into an octet. +Variant C is like the C and C variants, but the +result is based on the current locale, which is what C in the name stands +for. If Perl can determine that the current locale is a UTF-8 locale, it uses +the published Unicode rules; otherwise, it uses the C library function that +gives the named classification. For example, C when not in a +UTF-8 locale returns the result of calling C. FALSE is always +returned if the input won't fit into an octet. Variant C is like C, but is defined on any UV. It returns the same as C for input code points less than 256, and @@ -858,7 +874,7 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc /* We could be called without perl.h, in which case NATIVE_TO_ASCII() is * likely not defined, and so we use the native function */ -# define isASCII(c) isascii(c) +# define isASCII(c) cBOOL(isascii(c)) #else # define isASCII(c) ((WIDEST_UTYPE)(c) < 128) #endif @@ -1000,6 +1016,9 @@ EXTCONST U32 PL_charclass[]; /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */ # define _CC_mask(classnum) (1U << (classnum)) + + /* For internal core Perl use only: the base macro for defining macros like + * isALPHA */ # define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \ && (PL_charclass[(U8) (c)] & _CC_mask(classnum))) @@ -1007,8 +1026,9 @@ EXTCONST U32 PL_charclass[]; * ASCII. */ # define _CC_mask_A(classnum) (_CC_mask(classnum) | _CC_mask(_CC_ASCII)) - /* The _A version makes sure that both the desired bit and the ASCII bit - * are present */ + /* For internal core Perl use only: the base macro for defining macros like + * isALPHA_A. The foo_A version makes sure that both the desired bit and + * the ASCII bit are present */ # define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \ && ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \ == _CC_mask_A(classnum))) @@ -1070,7 +1090,6 @@ EXTCONST U32 PL_charclass[]; /* Use the native functions. They likely will return false for all * non-ASCII values, but this makes sure */ # define isALPHA_A(c) (isASCII(c) && isalpha(c)) -# define isALPHA_A(c) (isASCII(c) && isalpha(c)) # define isALPHANUMERIC_A(c) (isASCII(c) && isalnum(c)) # define isCNTRL_A(c) (isASCII(c) && iscntrl(c)) # define isDIGIT_A(c) (isASCII(c) && isdigit(c)) @@ -1197,14 +1216,13 @@ EXTCONST U32 PL_charclass[]; #define toUPPER(c) (isASCII(c) ? toUPPER_LATIN1_MOD(c) : (c)) which uses table lookup and mask instead of subtraction. (This would work because the _MOD does not apply in the ASCII range) */ -#define toLOWER(c) (isUPPER(c) ? (c) + ('a' - 'A') : (c)) -#define toUPPER(c) (isLOWER(c) ? (c) - ('a' - 'A') : (c)) +#define toLOWER(c) (isUPPER(c) ? (U8)((c) + ('a' - 'A')) : (c)) +#define toUPPER(c) (isLOWER(c) ? (U8)((c) - ('a' - 'A')) : (c)) /* In the ASCII range, these are equivalent to what they're here defined to be. * But by creating these definitions, other code doesn't have to be aware of * this detail */ #define toFOLD(c) toLOWER(c) -#define toFOLD_LC(c) toLOWER_LC(c) #define toTITLE(c) toUPPER(c) #define toLOWER_A(c) toLOWER(c) @@ -1212,84 +1230,158 @@ EXTCONST U32 PL_charclass[]; #define toFOLD_A(c) toFOLD(c) #define toTITLE_A(c) toTITLE(c) -/* Use table lookup for speed; return error character for input - * out-of-range */ +/* Use table lookup for speed; returns the input itself if is out-of-range */ #define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \ ? (c) \ : PL_latin1_lc[ (U8) (c) ]) #define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */ /* Modified uc. Is correct uc except for three non-ascii chars which are - * all mapped to one of them, and these need special handling; error - * character for input out-of-range */ + * all mapped to one of them, and these need special handling; returns the + * input itself if is out-of-range */ #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \ ? (c) \ : PL_mod_latin1_uc[ (U8) (c) ]) -#ifdef USE_NEXT_CTYPE - -# define isALPHANUMERIC_LC(c) NXIsAlNum((unsigned int)(c)) -# define isALPHA_LC(c) NXIsAlpha((unsigned int)(c)) -# define isASCII_LC(c) isASCII((unsigned int)(c)) -# define isBLANK_LC(c) isBLANK((unsigned int)(c)) -# define isCNTRL_LC(c) NXIsCntrl((unsigned int)(c)) -# define isDIGIT_LC(c) NXIsDigit((unsigned int)(c)) -# define isGRAPH_LC(c) NXIsGraph((unsigned int)(c)) -# define isIDFIRST_LC(c) (NXIsAlpha((unsigned int)(c)) || (char)(c) == '_') -# define isLOWER_LC(c) NXIsLower((unsigned int)(c)) -# define isPRINT_LC(c) NXIsPrint((unsigned int)(c)) -# define isPUNCT_LC(c) NXIsPunct((unsigned int)(c)) -# define isSPACE_LC(c) NXIsSpace((unsigned int)(c)) -# define isUPPER_LC(c) NXIsUpper((unsigned int)(c)) -# define isWORDCHAR_LC(c) (NXIsAlNum((unsigned int)(c)) || (char)(c) == '_') -# define isXDIGIT_LC(c) NXIsXDigit((unsigned int)(c)) -# define toLOWER_LC(c) NXToLower((unsigned int)(c)) -# define toUPPER_LC(c) NXToUpper((unsigned int)(c)) +#define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale + +/* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */ + +/* For internal core Perl use only: the base macro for defining macros like + * isALPHA_LC, which uses the current LC_CTYPE locale. 'c' is the code point + * (0-255) to check. In a UTF-8 locale, the result is the same as calling + * isFOO_L1(); the 'utf8_locale_classnum' parameter is something like + * _CC_UPPER, which gives the class number for doing this. For non-UTF-8 + * locales, the code to actually do the test this is passed in 'non_utf8'. If + * 'c' is above 255, 0 is returned. For accessing the full range of possible + * code points under locale rules, use the macros based on _generic_LC_uvchr + * instead of this. */ +#define _generic_LC_base(c, utf8_locale_classnum, non_utf8) \ + (! FITS_IN_8_BITS(c) \ + ? 0 \ + : IN_UTF8_CTYPE_LOCALE \ + ? cBOOL(PL_charclass[(U8) (c)] & _CC_mask(utf8_locale_classnum)) \ + : cBOOL(non_utf8)) + +/* For internal core Perl use only: a helper macro for defining macros like + * isALPHA_LC. 'c' is the code point (0-255) to check. The function name to + * actually do this test is passed in 'non_utf8_func', which is called on 'c', + * casting 'c' to the macro _LC_CAST, which should not be parenthesized. See + * _generic_LC_base for more info */ +#define _generic_LC(c, utf8_locale_classnum, non_utf8_func) \ + _generic_LC_base(c,utf8_locale_classnum, \ + non_utf8_func( (_LC_CAST) (c))) + +/* For internal core Perl use only: like _generic_LC, but also returns TRUE if + * 'c' is the platform's native underscore character */ +#define _generic_LC_underscore(c,utf8_locale_classnum,non_utf8_func) \ + _generic_LC_base(c, utf8_locale_classnum, \ + (non_utf8_func( (_LC_CAST) (c)) \ + || (char)(c) == '_')) + +/* These next three are also for internal core Perl use only: case-change + * helper macros */ +#define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \ + ? (c) \ + : (IN_UTF8_CTYPE_LOCALE) \ + ? PL_latin1_lc[ (U8) (c) ] \ + : function((cast)(c))) + +/* Note that the result can be larger than a byte in a UTF-8 locale. It + * returns a single value, so can't adequately return the upper case of LATIN + * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two + * values "SS"); instead it asserts against that under DEBUGGING, and + * otherwise returns its input */ +#define _generic_toUPPER_LC(c, function, cast) \ + (! FITS_IN_8_BITS(c) \ + ? (c) \ + : ((! IN_UTF8_CTYPE_LOCALE) \ + ? function((cast)(c)) \ + : ((((U8)(c)) == MICRO_SIGN) \ + ? GREEK_CAPITAL_LETTER_MU \ + : ((((U8)(c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \ + ? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS \ + : ((((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S) \ + ? (__ASSERT_(0) (c)) \ + : PL_mod_latin1_uc[ (U8) (c) ]))))) + +/* Note that the result can be larger than a byte in a UTF-8 locale. It + * returns a single value, so can't adequately return the fold case of LATIN + * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two + * values "ss"); instead it asserts against that under DEBUGGING, and + * otherwise returns its input */ +#define _generic_toFOLD_LC(c, function, cast) \ + ((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \ + ? GREEK_SMALL_LETTER_MU \ + : (__ASSERT_(! IN_UTF8_CTYPE_LOCALE \ + || (c) != LATIN_SMALL_LETTER_SHARP_S) \ + _generic_toLOWER_LC(c, function, cast))) + +/* Use the libc versions for these if available. */ +#if defined(HAS_ISASCII) && ! defined(USE_NEXT_CTYPE) +# define isASCII_LC(c) (FITS_IN_8_BITS(c) && isascii( (U8) (c))) +#else +# define isASCII_LC(c) isASCII(c) +#endif + +#if defined(HAS_ISBLANK) && ! defined(USE_NEXT_CTYPE) +# define isBLANK_LC(c) _generic_LC(c, _CC_BLANK, isblank) +#else /* Unlike isASCII, varies if in a UTF-8 locale */ +# define isBLANK_LC(c) (IN_UTF8_CTYPE_LOCALE) ? isBLANK_L1(c) : isBLANK(c) +#endif + +#ifdef USE_NEXT_CTYPE /* NeXT computers */ + +# define _LC_CAST unsigned int /* Needed by _generic_LC. NeXT functions + use this as their input type */ + +# define isALPHA_LC(c) _generic_LC(c, _CC_ALPHA, NXIsAlpha) +# define isALPHANUMERIC_LC(c) _generic_LC(c, _CC_ALPHANUMERIC, NXIsAlNum) +# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, NXIsCntrl) +# define isDIGIT_LC(c) _generic_LC(c, _CC_DIGIT, NXIsDigit) +# define isGRAPH_LC(c) _generic_LC(c, _CC_GRAPH, NXIsGraph) +# define isIDFIRST_LC(c) _generic_LC_underscore(c, _CC_IDFIRST, NXIsAlpha) +# define isLOWER_LC(c) _generic_LC(c, _CC_LOWER, NXIsLower) +# define isPRINT_LC(c) _generic_LC(c, _CC_PRINT, NXIsPrint) +# define isPUNCT_LC(c) _generic_LC(c, _CC_PUNCT, NXIsPunct) +# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, NXIsSpace) +# define isUPPER_LC(c) _generic_LC(c, _CC_UPPER, NXIsUpper) +# define isWORDCHAR_LC(c) _generic_LC_underscore(c, _CC_WORDCHAR, NXIsAlNum) +# define isXDIGIT_LC(c) _generic_LC(c, _CC_XDIGIT, NXIsXdigit) + +# define toLOWER_LC(c) _generic_toLOWER_LC((c), NXToLower, unsigned int) +# define toUPPER_LC(c) _generic_toUPPER_LC((c), NXToUpper, unsigned int) +# define toFOLD_LC(c) _generic_toFOLD_LC((c), NXToLower, unsigned int) #else /* !USE_NEXT_CTYPE */ +# define _LC_CAST U8 + # if defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII)) + /* For most other platforms */ -/* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */ +# define isALPHA_LC(c) _generic_LC(c, _CC_ALPHA, isalpha) +# define isALPHANUMERIC_LC(c) _generic_LC(c, _CC_ALPHANUMERIC, isalnum) +# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl) +# define isDIGIT_LC(c) _generic_LC(c, _CC_DIGIT, isdigit) +# define isGRAPH_LC(c) _generic_LC(c, _CC_GRAPH, isgraph) +# define isIDFIRST_LC(c) _generic_LC_underscore(c, _CC_IDFIRST, isalpha) +# define isLOWER_LC(c) _generic_LC(c, _CC_LOWER, islower) +# define isPRINT_LC(c) _generic_LC(c, _CC_PRINT, isprint) +# define isPUNCT_LC(c) _generic_LC(c, _CC_PUNCT, ispunct) +# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace) +# define isUPPER_LC(c) _generic_LC(c, _CC_UPPER, isupper) +# define isWORDCHAR_LC(c) _generic_LC_underscore(c, _CC_WORDCHAR, isalnum) +# define isXDIGIT_LC(c) _generic_LC(c, _CC_XDIGIT, isxdigit) -# define isALPHA_LC(c) (FITS_IN_8_BITS(c) && isalpha((unsigned char)(c))) -# define isALPHANUMERIC_LC(c) (FITS_IN_8_BITS(c) \ - && isalnum((unsigned char)(c))) -# ifdef HAS_ISASCII -# define isASCII_LC(c) (FITS_IN_8_BITS(c) && isascii((unsigned char)(c))) -# else -# define isASCII_LC(c) (FITS_IN_8_BITS(c) && isASCII((unsigned char)(c))) -# endif -# ifdef HAS_ISBLANK -# define isBLANK_LC(c) (FITS_IN_8_BITS(c) && isblank((unsigned char)(c))) -# else -# define isBLANK_LC(c) (FITS_IN_8_BITS(c) && isBLANK((unsigned char)(c))) -# endif -# define isCNTRL_LC(c) (FITS_IN_8_BITS(c) && iscntrl((unsigned char)(c))) -# define isDIGIT_LC(c) (FITS_IN_8_BITS(c) && isdigit((unsigned char)(c))) -# define isGRAPH_LC(c) (FITS_IN_8_BITS(c) && isgraph((unsigned char)(c))) -# define isIDFIRST_LC(c) (FITS_IN_8_BITS(c) \ - && (isalpha((unsigned char)(c)) || (char)(c) == '_')) -# define isLOWER_LC(c) (FITS_IN_8_BITS(c) && islower((unsigned char)(c))) -# define isPRINT_LC(c) (FITS_IN_8_BITS(c) && isprint((unsigned char)(c))) -# define isPUNCT_LC(c) (FITS_IN_8_BITS(c) && ispunct((unsigned char)(c))) -# define isSPACE_LC(c) (FITS_IN_8_BITS(c) && isspace((unsigned char)(c))) -# define isUPPER_LC(c) (FITS_IN_8_BITS(c) && isupper((unsigned char)(c))) -# define isWORDCHAR_LC(c) (FITS_IN_8_BITS(c) \ - && (isalnum((unsigned char)(c)) || (char)(c) == '_')) -# define isXDIGIT_LC(c) (FITS_IN_8_BITS(c) && isxdigit((unsigned char)(c))) -# define toLOWER_LC(c) (FITS_IN_8_BITS(c) ? tolower((unsigned char)(c)) : (c)) -# define toUPPER_LC(c) (FITS_IN_8_BITS(c) ? toupper((unsigned char)(c)) : (c)) -# else +# define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8) +# define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8) +# define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8) + +# else /* The final fallback position */ # define isALPHA_LC(c) (isascii(c) && isalpha(c)) # define isALPHANUMERIC_LC(c) (isascii(c) && isalnum(c)) -# define isASCII_LC(c) isascii(c) -# ifdef HAS_ISBLANK -# define isBLANK_LC(c) (isascii(c) && isblank(c)) -# else -# define isBLANK_LC(c) isBLANK_A(c) -# endif # define isCNTRL_LC(c) (isascii(c) && iscntrl(c)) # define isDIGIT_LC(c) (isascii(c) && isdigit(c)) # define isGRAPH_LC(c) (isascii(c) && isgraph(c)) @@ -1301,8 +1393,10 @@ EXTCONST U32 PL_charclass[]; # define isUPPER_LC(c) (isascii(c) && isupper(c)) # define isWORDCHAR_LC(c) (isascii(c) && (isalnum(c) || (c) == '_')) # define isXDIGIT_LC(c) (isascii(c) && isxdigit(c)) + # define toLOWER_LC(c) (isascii(c) ? tolower(c) : (c)) # define toUPPER_LC(c) (isascii(c) ? toupper(c) : (c)) +# define toFOLD_LC(c) (isascii(c) ? tolower(c) : (c)) # endif #endif /* USE_NEXT_CTYPE */ @@ -1313,11 +1407,18 @@ EXTCONST U32 PL_charclass[]; #define isIDCONT_LC(c) isWORDCHAR_LC(c) #define isPSXSPC_LC(c) isSPACE_LC(c) -/* For internal core Perl use only. If the input is Latin1, use the Latin1 - * macro; otherwise use the function 'above_latin1'. Won't compile if 'c' isn't unsigned, as - * won't match above_latin1 prototype. The macros do bounds checking, so have - * duplicate checks here, so could create versions of the macros that don't, - * but experiments show that gcc optimizes them out anyway. */ +/* For internal core Perl use only: the base macros for defining macros like + * isALPHA_uni. 'c' is the code point to check. 'classnum' is the POSIX class + * number defined earlier in this file. _generic_uni() is used for POSIX + * classes where there is a macro or function 'above_latin1' that takes the + * single argument 'c' and returns the desired value. These exist for those + * classes which have simple definitions, avoiding the overhead of a hash + * lookup or inversion list binary search. _generic_swash_uni() can be used + * for classes where that overhead is faster than a direct lookup. + * _generic_uni() won't compile if 'c' isn't unsigned, as it won't match the + * 'above_latin1' prototype. _generic_isCC() macro does bounds checking, so + * have duplicate checks here, so could create versions of the macros that + * don't, but experiments show that gcc optimizes them out anyway. */ /* Note that all ignore 'use bytes' */ #define _generic_uni(classnum, above_latin1, c) ((c) < 256 \ @@ -1353,6 +1454,11 @@ EXTCONST U32 PL_charclass[]; #define toTITLE_uni(c,s,l) to_uni_title(c,s,l) #define toUPPER_uni(c,s,l) to_uni_upper(c,s,l) +/* For internal core Perl use only: the base macros for defining macros like + * isALPHA_LC_uvchr. These are like isALPHA_LC, but the input can be any code + * point, not just 0-255. Like _generic_uni, there are two versions, one for + * simple class definitions; the other for more complex. These are like + * _generic_uni, so see it for more info. */ #define _generic_LC_uvchr(latin1, above_latin1, c) \ (c < 256 ? latin1(c) : above_latin1(c)) #define _generic_LC_swash_uvchr(latin1, classnum, c) \ @@ -1384,15 +1490,13 @@ EXTCONST U32 PL_charclass[]; #define isBLANK_LC_uni(c) isBLANK_LC_uvchr(UNI_TO_NATIVE(c)) -/* Everything whose name begins with an underscore is for internal core Perl - * use only. */ - -/* If the input is in the Latin1 range, use - * the Latin1 macro 'classnum' on 'p' which is a pointer to a UTF-8 string. - * Otherwise use the value given by the 'utf8' parameter. This relies on the - * fact that ASCII characters have the same representation whether utf8 or not. - * Note that it assumes that the utf8 has been validated, and ignores 'use - * bytes' */ +/* For internal core Perl use only: the base macros for defining macros like + * isALPHA_utf8. These are like the earlier defined macros, but take an input + * UTF-8 encoded string 'p'. If the input is in the Latin1 range, use + * the Latin1 macro 'classnum' on 'p'. Otherwise use the value given by the + * 'utf8' parameter. This relies on the fact that ASCII characters have the + * same representation whether utf8 or not. Note that it assumes that the utf8 + * has been validated, and ignores 'use bytes' */ #define _generic_utf8(classnum, p, utf8) (UTF8_IS_INVARIANT(*(p)) \ ? _generic_isCC(*(p), classnum) \ : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ @@ -1405,7 +1509,7 @@ EXTCONST U32 PL_charclass[]; * can be a macro */ #define _generic_func_utf8(classnum, above_latin1, p) \ _generic_utf8(classnum, p, above_latin1(p)) -/* Like the above, but passes classnum to _isFOO_utf8(), instead of having a +/* Like the above, but passes classnum to _isFOO_utf8(), instead of having an * 'above_latin1' parameter */ #define _generic_swash_utf8(classnum, p) \ _generic_utf8(classnum, p, _is_utf8_FOO(classnum, p)) @@ -1469,11 +1573,10 @@ EXTCONST U32 PL_charclass[]; #define toTITLE_utf8(p,s,l) to_utf8_title(p,s,l) #define toUPPER_utf8(p,s,l) to_utf8_upper(p,s,l) -/* For internal core Perl use only. If the input is in the Latin1 range, use - * the macro 'macro' on 'p' which is a pointer to a UTF-8 string. Otherwise - * use the value given by the 'utf8' parameter. This relies on the fact that - * ASCII characters have the same representation whether utf8 or not. Note - * that it assumes that the utf8 has been validated, and ignores 'use bytes' */ +/* For internal core Perl use only: the base macros for defining macros like + * isALPHA_LC_utf8. These are like _generic_utf8, but if the first code point + * in 'p' is within the 0-255 range, it uses locale rules from the passed-in + * 'macro' parameter */ #define _generic_LC_utf8(macro, p, utf8) \ (UTF8_IS_INVARIANT(*(p)) \ ? macro(*(p)) \ @@ -1484,7 +1587,7 @@ EXTCONST U32 PL_charclass[]; #define _generic_LC_swash_utf8(macro, classnum, p) \ _generic_LC_utf8(macro, p, _is_utf8_FOO(classnum, p)) #define _generic_LC_func_utf8(macro, above_latin1, p) \ - _generic_LC_utf8(macro, p, above_latin1(p)) + _generic_LC_utf8(macro, p, above_latin1(p)) #define isALPHANUMERIC_LC_utf8(p) _generic_LC_swash_utf8(isALPHANUMERIC_LC, \ _CC_ALPHANUMERIC, p) @@ -1530,11 +1633,23 @@ EXTCONST U32 PL_charclass[]; #define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p) #define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p) -/* This conversion works both ways, strangely enough. On EBCDIC platforms, - * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't - * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but - * that is EOT on ASCII; ST on EBCDIC */ -# define toCTRL(c) (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64) +/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, + * except that they don't necessarily mean the same characters, e.g. CTRL-D is + * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC. + * '?' is special-cased on EBCDIC to APC, which is the control there that is + * the outlier from the block that contains the other controls, just like + * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0 + * block. If it weren't special cased, it would yield a non-control. + * The conversion works both ways, so CTRL('D') is 4, and CTRL(4) is D, etc. */ +#ifndef EBCDIC +# define toCTRL(c) (toUPPER(c) ^ 64) +#else +# define toCTRL(c) ((c) == '?' \ + ? LATIN1_TO_NATIVE(0x9F) \ + : (c) == LATIN1_TO_NATIVE(0x9F) \ + ? '?' \ + : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64)) +#endif /* Line numbers are unsigned, 32 bits. */ typedef U32 line_t; @@ -1553,7 +1668,22 @@ typedef U32 line_t; } \ return a; -#define READ_XDIGIT(s) (isALPHA(*(s)) ? ((*(s)++ + 9) & 0xf) : (*(s)++ & 0xf)) +/* Converts a character known to represent a hexadecimal digit (0-9, A-F, or + * a-f) to its numeric value. READ_XDIGIT's argument is a string pointer, + * which is advanced. The input is validated only by an assert() in DEBUGGING + * builds. In both ASCII and EBCDIC the last 4 bits of the digits are 0-9; and + * the last 4 bits of A-F and a-f are 1-6, so adding 9 yields 10-15 */ +#define XDIGIT_VALUE(c) (__ASSERT_(isXDIGIT(c)) (0xf & (isDIGIT(c) \ + ? (c) \ + : ((c) + 9)))) +#define READ_XDIGIT(s) (__ASSERT_(isXDIGIT(*s)) (0xf & (isDIGIT(*(s)) \ + ? (*(s)++) \ + : (*(s)++ + 9)))) + +/* Converts a character known to represent an octal digit (0-7) to its numeric + * value. The input is validated only by an assert() in DEBUGGING builds. In + * both ASCII and EBCDIC the last 3 bits of the octal digits range from 0-7. */ +#define OCTAL_VALUE(c) (__ASSERT_(isOCTAL(c)) (7 & (c))) /* =head1 Memory Management @@ -1561,6 +1691,8 @@ typedef U32 line_t; =for apidoc Am|void|Newx|void* ptr|int nitems|type The XSUB-writer's interface to the C C function. +Memory obtained by this should B be freed with L<"Safefree">. + In 5.9.3, Newx() and friends replace the older New() API, and drops the first parameter, I, a debug aid which allowed callers to identify themselves. This aid has been superseded by a new build option, @@ -1571,27 +1703,38 @@ there for use in XS modules supporting older perls. The XSUB-writer's interface to the C C function, with cast. See also C. +Memory obtained by this should B be freed with L<"Safefree">. + =for apidoc Am|void|Newxz|void* ptr|int nitems|type The XSUB-writer's interface to the C C function. The allocated memory is zeroed with C. See also C. +Memory obtained by this should B be freed with L<"Safefree">. + =for apidoc Am|void|Renew|void* ptr|int nitems|type The XSUB-writer's interface to the C C function. +Memory obtained by this should B be freed with L<"Safefree">. + =for apidoc Am|void|Renewc|void* ptr|int nitems|type|cast The XSUB-writer's interface to the C C function, with cast. +Memory obtained by this should B be freed with L<"Safefree">. + =for apidoc Am|void|Safefree|void* ptr The XSUB-writer's interface to the C C function. +This should B be used on memory obtained using L<"Newx"> and friends. + =for apidoc Am|void|Move|void* src|void* dest|int nitems|type The XSUB-writer's interface to the C C function. The C is the source, C is the destination, C is the number of items, and C is the type. Can do overlapping moves. See also C. =for apidoc Am|void *|MoveD|void* src|void* dest|int nitems|type -Like C but returns dest. Useful for encouraging compilers to tail-call +Like C but returns dest. Useful +for encouraging compilers to tail-call optimise. =for apidoc Am|void|Copy|void* src|void* dest|int nitems|type @@ -1601,7 +1744,8 @@ C is the type. May fail on overlapping copies. See also C. =for apidoc Am|void *|CopyD|void* src|void* dest|int nitems|type -Like C but returns dest. Useful for encouraging compilers to tail-call +Like C but returns dest. Useful +for encouraging compilers to tail-call optimise. =for apidoc Am|void|Zero|void* dest|int nitems|type @@ -1611,7 +1755,8 @@ destination, C is the number of items, and C is the type. =for apidoc Am|void *|ZeroD|void* dest|int nitems|type -Like C but returns dest. Useful for encouraging compilers to tail-call +Like C but returns dest. Useful +for encouraging compilers to tail-call optimise. =for apidoc Am|void|StructCopy|type *src|type *dest|type @@ -1779,6 +1924,12 @@ void Perl_mem_log_del_sv(const SV *sv, const char *filename, const int linenumbe #define PoisonFree(d,n,t) PoisonWith(d,n,t,0xEF) #define Poison(d,n,t) PoisonFree(d,n,t) +#ifdef PERL_POISON +# define PERL_POISON_EXPR(x) x +#else +# define PERL_POISON_EXPR(x) +#endif + #ifdef USE_STRUCT_COPY #define StructCopy(s,d,t) (*((t*)(d)) = *((t*)(s))) #else