X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/eb0052d12453d188c7a1927092ec86ea704a0e60..31c7f561ae1fcf5096c82b0ce7d0ab0dc6899204:/handy.h?ds=sidebyside diff --git a/handy.h b/handy.h index 9261a22..9f67d44 100644 --- a/handy.h +++ b/handy.h @@ -1,7 +1,7 @@ /* handy.h * * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000, - * 2001, 2002, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others + * 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others * * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. @@ -12,11 +12,7 @@ #ifdef NULL #undef NULL #endif -#ifndef I286 # define NULL 0 -#else -# define NULL 0L -#endif #endif #ifndef PERL_CORE @@ -105,7 +101,7 @@ Null SV pointer. (No longer available when C is defined.) #endif /* NeXT || __NeXT__ */ #ifndef HAS_BOOL -# if defined(UTS) || defined(VMS) +# if defined(VMS) # define bool int # else # define bool char @@ -115,9 +111,10 @@ Null SV pointer. (No longer available when C is defined.) /* a simple (bool) cast may not do the right thing: if bool is defined * as char for example, then the cast from int is implementation-defined + * (bool)!!(cbool) in a ternary triggers a bug in xlc on AIX */ -#define cBOOL(cbool) ((bool)!!(cbool)) +#define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0) /* Try to figure out __func__ or __FUNCTION__ equivalent, if any. * XXX Should really be a Configure probe, with HAS__FUNCTION__ @@ -126,7 +123,7 @@ Null SV pointer. (No longer available when C is defined.) #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C)) /* C99 or close enough. */ # define FUNCTION__ __func__ #else -# if (defined(_MSC_VER) && _MSC_VER < 1300) || /* Pre-MSVC 7.0 has neither __func__ nor __FUNCTION and no good workarounds, either. 
*/ \ +# if (defined(_MSC_VER) && _MSC_VER < 1300) || /* MSVC6 has neither __func__ nor __FUNCTION__ and no good workarounds, either. */ \ (defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tru64, -c99 not known, only -std1). */ # define FUNCTION__ "" # else @@ -190,25 +187,29 @@ typedef U64TYPE U64; #endif /* PERL_CORE */ #if defined(HAS_QUAD) && defined(USE_64_BIT_INT) -# ifndef UINT64_C /* usually from <inttypes.h> */ -# if defined(HAS_LONG_LONG) && QUADKIND == QUAD_IS_LONG_LONG -# define INT64_C(c) CAT2(c,LL) -# define UINT64_C(c) CAT2(c,ULL) +# if defined(HAS_LONG_LONG) && QUADKIND == QUAD_IS_LONG_LONG +# define PeRl_INT64_C(c) CAT2(c,LL) +# define PeRl_UINT64_C(c) CAT2(c,ULL) +# else +# if QUADKIND == QUAD_IS___INT64 +# define PeRl_INT64_C(c) CAT2(c,I64) +# define PeRl_UINT64_C(c) CAT2(c,UI64) # else # if LONGSIZE == 8 && QUADKIND == QUAD_IS_LONG -# define INT64_C(c) CAT2(c,L) -# define UINT64_C(c) CAT2(c,UL) +# define PeRl_INT64_C(c) CAT2(c,L) +# define PeRl_UINT64_C(c) CAT2(c,UL) # else -# if defined(_WIN64) && defined(_MSC_VER) -# define INT64_C(c) CAT2(c,I64) -# define UINT64_C(c) CAT2(c,UI64) -# else -# define INT64_C(c) ((I64TYPE)(c)) -# define UINT64_C(c) ((U64TYPE)(c)) -# endif +# define PeRl_INT64_C(c) ((I64TYPE)(c)) +# define PeRl_UINT64_C(c) ((U64TYPE)(c)) # endif # endif # endif +# ifndef UINT64_C +# define UINT64_C(c) PeRl_UINT64_C(c) +# endif +# ifndef INT64_C +# define INT64_C(c) PeRl_INT64_C(c) +# endif #endif #if defined(UINT8_MAX) && defined(INT16_MAX) && defined(INT32_MAX) @@ -338,12 +339,13 @@ string/length pair. 
=cut */ -/* concatenating with "" ensures that only literal strings are accepted as argument */ +/* concatenating with "" ensures that only literal strings are accepted as + * argument */ #define STR_WITH_LEN(s) ("" s ""), (sizeof(s)-1) -/* note that STR_WITH_LEN() can't be used as argument to macros or functions that - * under some configurations might be macros, which means that it requires the full - * Perl_xxx(aTHX_ ...) form for any API calls where it's used. +/* note that STR_WITH_LEN() can't be used as argument to macros or functions + * that under some configurations might be macros, which means that it requires + * the full Perl_xxx(aTHX_ ...) form for any API calls where it's used. */ /* STR_WITH_LEN() shortcuts */ @@ -469,81 +471,189 @@ C). /* =head1 Character classes -There are three variants for all the functions in this section. The base ones -operate using the character set of the platform Perl is running on. The ones -with an C<_A> suffix operate on the ASCII character set, and the ones with an -C<_L1> suffix operate on the full Latin1 character set. All are unaffected by -locale. - -For ASCII platforms, the base function with no suffix and the one with the -C<_A> suffix are identical. The function with the C<_L1> suffix imposes the -Latin-1 character set onto the platform. That is, the code points that are -ASCII are unaffected, since ASCII is a subset of Latin-1. But the non-ASCII -code points are treated as if they are Latin-1 characters. For example, -C will return true when called with the code point 0xA0, which is -the Latin-1 NO-BREAK SPACE. - -For EBCDIC platforms, the base function with no suffix and the one with the -C<_L1> suffix should be identical, since, as of this writing, the EBCDIC code -pages that Perl knows about all are equivalent to Latin-1. The function that -ends in an C<_A> suffix will not return true unless the specified character also -has an ASCII equivalent. 
+This section is about functions (really macros) that classify characters +into types, such as punctuation versus alphabetic, etc. Most of these are +analogous to regular expression character classes. (See +L<perlrecharclass/POSIX Character Classes>.) There are several variants for +each class. (Not all macros have all variants; each item below lists the +ones valid for it.) None are affected by C<use bytes>, and only the ones +with C<LC> in the name are affected by the current locale. + +The base function, e.g., C<isALPHA()>, takes an octet (either a C<char> or a +C<U8>) as input and returns a boolean as to whether or not the character +represented by that octet is in the named class based on platform, Unicode, and +Perl rules. If the input is a number that doesn't fit in an octet, FALSE is +always returned. + +Variant C<isFOO_A> (e.g., C<isALPHA_A()>) will return TRUE only if the input is +also in the ASCII character set. For ASCII platforms, the base function with +no suffix and the one with the C<_A> suffix are identical. On EBCDIC +platforms, the C<_A> suffix function will not return true unless the specified +character also has an ASCII equivalent. + +Variant C<isFOO_L1> operates on the full Latin1 character set. For EBCDIC +platforms, the base function with no suffix and the one with the C<_L1> suffix +are identical. For ASCII platforms, the C<_L1> suffix imposes the Latin-1 +character set onto the platform. That is, the code points that are ASCII are +unaffected, since ASCII is a subset of Latin-1. But the non-ASCII code points +are treated as if they are Latin-1 characters. For example, C<isSPACE_L1()> +will return true when called with the code point 0xA0, which is the Latin-1 +NO-BREAK SPACE. + +Variant C<isFOO_uni> is like the C<isFOO_L1> variant, but accepts any UV code +point as input. If the code point is larger than 255, Unicode rules are used +to determine if it is in the character class. For example, +C<isWORDCHAR_uni(0x100)> returns TRUE, since 0x100 is LATIN CAPITAL LETTER A WITH +MACRON in Unicode, and is a word character. 
+ +Variant C<isFOO_utf8> is like C<isFOO_uni>, but the input is a pointer to a +(known to be well-formed) UTF-8 encoded string (C<U8*> or C<char*>). The +classification of just the first character in the string is tested. + +Variant C<isFOO_LC> is like the C<isFOO_A> and C<isFOO_L1> variants, but uses +the C library function that gives the named classification instead of +hard-coded rules. For example, C<isDIGIT_LC()> returns the result of calling +C<isdigit()>. This means that the result is based on the current locale, which +is what C<LC> in the name stands for. FALSE is always returned if the input +won't fit into an octet. + +Variant C<isFOO_LC_uvchr> is like C<isFOO_LC>, but is defined on any UV. It +returns the same as C<isFOO_LC> for input code points less than 256, and +returns the hard-coded, not-affected-by-locale, Unicode results for larger ones. + +Variant C<isFOO_LC_utf8> is like C<isFOO_LC_uvchr>, but the input is a pointer to a +(known to be well-formed) UTF-8 encoded string (C<U8*> or C<char*>). The +classification of just the first character in the string is tested. =for apidoc Am|bool|isALPHA|char ch Returns a boolean indicating whether the specified character is an -alphabetic character in the platform's native character set. +alphabetic character in the platform's native character set, analogous to +C<m/[[:alpha:]]/>. See the L<top of this section|/Character classes> for an explanation of variants -C<isALPHA_A> and C<isALPHA_L1>. +C<isALPHA_A>, C<isALPHA_L1>, C<isALPHA_uni>, C<isALPHA_utf8>, C<isALPHA_LC>, +C<isALPHA_LC_uvchr>, and C<isALPHA_LC_utf8>. =for apidoc Am|bool|isASCII|char ch Returns a boolean indicating whether the specified character is one of the 128 -characters in the ASCII character set. On non-ASCII platforms, it is if this +characters in the ASCII character set, analogous to C<m/[[:ascii:]]/>. +On non-ASCII platforms, it is true if this character corresponds to an ASCII character. Variants C<isASCII_A()> and C<isASCII_L1()> are identical to C<isASCII()>. +See the L<top of this section|/Character classes> for an explanation of variants +C<isASCII_uni>, C<isASCII_utf8>, C<isASCII_LC>, C<isASCII_LC_uvchr>, and +C<isASCII_LC_utf8>. Note, however, that some platforms do not have the C +library routine C<isascii()>. In these cases, the variants whose names contain +C<LC> are the same as the corresponding ones without. 
+ +=for apidoc Am|bool|isBLANK|char ch +Returns a boolean indicating whether the specified character is a +character considered to be a blank in the platform's native character set, +analogous to C<m/[[:blank:]]/>. +See the L<top of this section|/Character classes> for an explanation of variants +C<isBLANK_A>, C<isBLANK_L1>, C<isBLANK_uni>, C<isBLANK_utf8>, C<isBLANK_LC>, +C<isBLANK_LC_uvchr>, and C<isBLANK_LC_utf8>. Note, however, that some +platforms do not have the C library routine C<isblank()>. In these cases, the +variants whose names contain C<LC> are the same as the corresponding ones +without. + +=for apidoc Am|bool|isCNTRL|char ch +Returns a boolean indicating whether the specified character is a +control character in the platform's native character set, +analogous to C<m/[[:cntrl:]]/>. +See the L<top of this section|/Character classes> for an explanation of variants +C<isCNTRL_A>, C<isCNTRL_L1>, C<isCNTRL_uni>, C<isCNTRL_utf8>, C<isCNTRL_LC>, +C<isCNTRL_LC_uvchr>, and C<isCNTRL_LC_utf8>. =for apidoc Am|bool|isDIGIT|char ch Returns a boolean indicating whether the specified character is a -digit in the platform's native character set. +digit in the platform's native character set, analogous to C<m/[[:digit:]]/>. Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>. +See the L<top of this section|/Character classes> for an explanation of variants +C<isDIGIT_uni>, C<isDIGIT_utf8>, C<isDIGIT_LC>, C<isDIGIT_LC_uvchr>, and +C<isDIGIT_LC_utf8>. + +=for apidoc Am|bool|isGRAPH|char ch +Returns a boolean indicating whether the specified character is a +graphic character in the platform's native character set, analogous to +C<m/[[:graph:]]/>. +See the L<top of this section|/Character classes> for an explanation of variants +C<isGRAPH_A>, C<isGRAPH_L1>, C<isGRAPH_uni>, C<isGRAPH_utf8>, C<isGRAPH_LC>, +C<isGRAPH_LC_uvchr>, and C<isGRAPH_LC_utf8>. =for apidoc Am|bool|isLOWER|char ch Returns a boolean indicating whether the specified character is a -lowercase character in the platform's native character set. +lowercase character in the platform's native character set, analogous to +C<m/[[:lower:]]/>. See the L<top of this section|/Character classes> for an explanation of variants -C<isLOWER_A> and C<isLOWER_L1>. +C<isLOWER_A>, C<isLOWER_L1>, C<isLOWER_uni>, C<isLOWER_utf8>, C<isLOWER_LC>, +C<isLOWER_LC_uvchr>, and C<isLOWER_LC_utf8>. =for apidoc Am|bool|isOCTAL|char ch Returns a boolean indicating whether the specified character is an octal digit, [0-7] in the platform's native character set. -Variants C<isOCTAL_A> and C<isOCTAL_L1> are identical to C<isOCTAL>. +The only two variants are C<isOCTAL_A> and C<isOCTAL_L1>; each is identical to +C<isOCTAL>. 
+ +=for apidoc Am|bool|isPUNCT|char ch +Returns a boolean indicating whether the specified character is a +punctuation character in the platform's native character set, analogous to +C<m/[[:punct:]]/>. Note that the definition of what is punctuation isn't as +straightforward as one might desire. See L<perlrecharclass/POSIX Character Classes> for details. +See the L<top of this section|/Character classes> for an explanation of variants +C<isPUNCT_A>, C<isPUNCT_L1>, C<isPUNCT_uni>, C<isPUNCT_utf8>, C<isPUNCT_LC>, +C<isPUNCT_LC_uvchr>, and C<isPUNCT_LC_utf8>. =for apidoc Am|bool|isSPACE|char ch Returns a boolean indicating whether the specified character is a -whitespace character in the platform's native character set. This is the same -as what C<\s> matches in a regular expression. +whitespace character in the platform's native character set. This is analogous +to what C<m/\s/> and C<m/[[:space:]]/> match in a regular expression. See the L<top of this section|/Character classes> for an explanation of variants -C<isSPACE_A> and C<isSPACE_L1>. +C<isSPACE_A>, C<isSPACE_L1>, C<isSPACE_uni>, C<isSPACE_utf8>, C<isSPACE_LC>, +C<isSPACE_LC_uvchr>, and C<isSPACE_LC_utf8>. =for apidoc Am|bool|isUPPER|char ch Returns a boolean indicating whether the specified character is an -uppercase character in the platform's native character set. +uppercase character in the platform's native character set, analogous to +C<m/[[:upper:]]/>. See the L<top of this section|/Character classes> for an explanation of variants -C<isUPPER_A> and C<isUPPER_L1>. +C<isUPPER_A>, C<isUPPER_L1>, C<isUPPER_uni>, C<isUPPER_utf8>, C<isUPPER_LC>, +C<isUPPER_LC_uvchr>, and C<isUPPER_LC_utf8>. -=for apidoc Am|bool|isWORDCHAR|char ch +=for apidoc Am|bool|isPRINT|char ch Returns a boolean indicating whether the specified character is a -character that is any of: alphabetic, numeric, or an underscore. This is the -same as what C<\w> matches in a regular expression. -C<isALNUM()> is a synonym provided for backward compatibility. Note that it -does not have the standard C language meaning of alphanumeric, since it matches -an underscore and the standard meaning does not. +printable character in the platform's native character set, analogous to +C<m/[[:print:]]/>. See the L<top of this section|/Character classes> for an explanation of variants -C<isWORDCHAR_A> and C<isWORDCHAR_L1>. +C<isPRINT_A>, C<isPRINT_L1>, C<isPRINT_uni>, C<isPRINT_utf8>, C<isPRINT_LC>, +C<isPRINT_LC_uvchr>, and C<isPRINT_LC_utf8>. + +=for apidoc Am|bool|isWORDCHAR|char ch +Returns a boolean indicating whether the specified character is a character +that is a word character, analogous to what C<m/\w/> and C<m/[[:word:]]/> match +in a regular expression. 
A word character is an alphabetic character, a +decimal digit, a connecting punctuation character (such as an underscore), or +a "mark" character that attaches to one of those (like some sort of accent). +C<isALNUM()> is a synonym provided for backward compatibility, even though a +word character includes more than the standard C language meaning of +alphanumeric. +See the L<top of this section|/Character classes> for an explanation of variants +C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uni>, C<isWORDCHAR_utf8>, +C<isWORDCHAR_LC>, C<isWORDCHAR_LC_uvchr>, and C<isWORDCHAR_LC_utf8>. =for apidoc Am|bool|isXDIGIT|char ch Returns a boolean indicating whether the specified character is a hexadecimal -digit, [0-9A-Fa-f]. Variants C<isXDIGIT_A()> and C<isXDIGIT_L1()> are -identical to C<isXDIGIT()>. +digit. In the ASCII range these are C<[0-9A-Fa-f]>. Variants C<isXDIGIT_A()> +and C<isXDIGIT_L1()> are identical to C<isXDIGIT()>. +See the L<top of this section|/Character classes> for an explanation of variants +C<isXDIGIT_uni>, C<isXDIGIT_utf8>, C<isXDIGIT_LC>, C<isXDIGIT_LC_uvchr>, and +C<isXDIGIT_LC_utf8>. + +=head1 Miscellaneous Functions + +=for apidoc Am|U8|READ_XDIGIT|char str* +Returns the value of an ASCII-range hex digit and advances the string pointer. +Behaviour is only well defined when isXDIGIT(*str) is true. =head1 Character case changing @@ -557,6 +667,9 @@ character set, if possible; otherwise returns the input character itself. =cut +Still undocumented are ALNUMC, PSXSPC, VERTSPACE, and IDFIRST, and the other +toUPPER etc functions + Note that these macros are repeated in Devel::PPPort, so should also be patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc @@ -577,56 +690,103 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc * compiler to optimize it out if possible. This is because Configure makes * sure that the machine has an 8-bit byte, so if c is stored in a byte, the * sizeof() guarantees that this evaluates to a constant true at compile time. - * The use of the mask instead of '< 256' keeps gcc from complaining that it is - * always true, when c's storage class is a byte. 
*/ -#define FITS_IN_8_BITS(c) ((sizeof(c) == 1) \ - || (((WIDEST_UTYPE)(c) & 0xFF) == (WIDEST_UTYPE)(c))) + */ +#define FITS_IN_8_BITS(c) ((sizeof(c) == 1) || !(((WIDEST_UTYPE)(c)) & ~0xFF)) #ifdef EBCDIC -# define isASCII(c) (FITS_IN_8_BITS(c) ? NATIVE_TO_UNI((U8) (c)) < 128 : 0) +# define isASCII(c) (FITS_IN_8_BITS(c) && (NATIVE_TO_UNI((U8) (c)) < 128)) #else # define isASCII(c) ((WIDEST_UTYPE)(c) < 128) #endif #define isASCII_A(c) isASCII(c) +#define isASCII_L1(c) isASCII(c) + +/* The lower 3 bits in both the ASCII and EBCDIC representations of '0' are 0, + * and the 8 possible permutations of those bits exactly comprise the 8 octal + * digits */ +#define isOCTAL_A(c) cBOOL(FITS_IN_8_BITS(c) && (0xF8 & (c)) == '0') /* ASCII range only */ #ifdef H_PERL /* If have access to perl.h, lookup in its table */ -/* Bits for PL_charclass[] */ -# define _CC_ALNUMC_A (1<<0) -# define _CC_ALNUMC_L1 (1<<1) -# define _CC_ALPHA_A (1<<2) -# define _CC_ALPHA_L1 (1<<3) -# define _CC_BLANK_A (1<<4) -# define _CC_BLANK_L1 (1<<5) -# define _CC_CHARNAME_CONT (1<<6) -# define _CC_CNTRL_A (1<<7) -# define _CC_CNTRL_L1 (1<<8) -# define _CC_DIGIT_A (1<<9) -# define _CC_GRAPH_A (1<<10) -# define _CC_GRAPH_L1 (1<<11) -# define _CC_IDFIRST_A (1<<12) -# define _CC_IDFIRST_L1 (1<<13) -# define _CC_LOWER_A (1<<14) -# define _CC_LOWER_L1 (1<<15) -# define _CC_OCTAL_A (1<<16) -# define _CC_PRINT_A (1<<17) -# define _CC_PRINT_L1 (1<<18) -# define _CC_PSXSPC_A (1<<19) -# define _CC_PSXSPC_L1 (1<<20) -# define _CC_PUNCT_A (1<<21) -# define _CC_PUNCT_L1 (1<<22) -# define _CC_SPACE_A (1<<23) -# define _CC_SPACE_L1 (1<<24) -# define _CC_UPPER_A (1<<25) -# define _CC_UPPER_L1 (1<<26) -# define _CC_WORDCHAR_A (1<<27) -# define _CC_WORDCHAR_L1 (1<<28) -# define _CC_XDIGIT_A (1<<29) -# define _CC_NONLATIN1_FOLD (1<<30) -/* Unused - * (1<<31) - */ + +/* Character class numbers. For internal core Perl use only. 
These are used + * in PL_charclass[] and the ones up through the one that corresponds to + * <_HIGHEST_REGCOMP_DOT_H_SYNC> are used by regcomp.h. These use names used + * in l1_char_class_tab.h but their actual definitions are here. If that file + * has a name not used here, it won't compile. + * + * The first group of these is ordered in what I (khw) estimate to be the + * frequency of their use. This gives a slight edge to exiting a loop earlier + * (in reginclass() in regexec.c) */ +# define _CC_WORDCHAR 0 /* \w and [:word:] */ +# define _CC_DIGIT 1 /* \d and [:digit:] */ +# define _CC_ALPHA 2 /* [:alpha:] */ +# define _CC_LOWER 3 /* [:lower:] */ +# define _CC_UPPER 4 /* [:upper:] */ +# define _CC_PUNCT 5 /* [:punct:] */ +# define _CC_PRINT 6 /* [:print:] */ +# define _CC_ALNUMC 7 /* [:alnum:] */ +# define _CC_GRAPH 8 /* [:graph:] */ + +#define _FIRST_NON_SWASH_CC 9 +/* The character classes above are implemented with swashes. The second group + * (just below) contains the ones implemented without. These are also sorted + * in rough order of the frequency of their use, except that \v should be last, + * as it isn't a real Posix character class, and some (small) inefficiencies in + * regular expression handling would be introduced by putting it in the middle + * of those that are. 
Also, cntrl and ascii come after the others as it may be + * useful to group these which have no members that match above Latin1, (or + * above ASCII in the latter case) */ + +# define _CC_SPACE 9 /* \s */ +# define _CC_BLANK 10 /* [:blank:] */ +# define _CC_XDIGIT 11 /* [:xdigit:] */ +# define _CC_PSXSPC 12 /* [:space:] */ +# define _CC_CNTRL 13 /* [:cntrl:] */ +# define _CC_ASCII 14 /* [:ascii:] */ +# define _CC_VERTSPACE 15 /* \v */ + +# define _HIGHEST_REGCOMP_DOT_H_SYNC _CC_VERTSPACE + +/* The members of the third group below do not need to be coordinated with data + * structures in regcomp.[ch] and regexec.c */ +# define _CC_IDFIRST 16 +# define _CC_CHARNAME_CONT 17 +# define _CC_NONLATIN1_FOLD 18 +# define _CC_QUOTEMETA 19 +# define _CC_NON_FINAL_FOLD 20 +# define _CC_IS_IN_SOME_FOLD 21 +/* Unused: 22-31 + * If more bits are needed, one could add a second word for non-64bit + * QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd + * word or not. The IS_IN_SOME_FOLD bit is the most easily expendable, as it + * is used only for optimization (as of this writing), and differs in the + * Latin1 range from the ALPHA bit only in two relatively unimportant + * characters: the masculine and feminine ordinal indicators */ + +#if defined(PERL_CORE) || defined(PERL_EXT) +/* An enum version of the character class numbers, to help compilers + * optimize */ +typedef enum { + _CC_ENUM_ALNUMC = _CC_ALNUMC, + _CC_ENUM_ALPHA = _CC_ALPHA, + _CC_ENUM_DIGIT = _CC_DIGIT, + _CC_ENUM_GRAPH = _CC_GRAPH, + _CC_ENUM_LOWER = _CC_LOWER, + _CC_ENUM_PRINT = _CC_PRINT, + _CC_ENUM_PUNCT = _CC_PUNCT, + _CC_ENUM_UPPER = _CC_UPPER, + _CC_ENUM_WORDCHAR = _CC_WORDCHAR, + _CC_ENUM_SPACE = _CC_SPACE, + _CC_ENUM_BLANK = _CC_BLANK, + _CC_ENUM_XDIGIT = _CC_XDIGIT, + _CC_ENUM_CNTRL = _CC_CNTRL, + _CC_ENUM_PSXSPC = _CC_PSXSPC, + _CC_ENUM_ASCII = _CC_ASCII, + _CC_ENUM_VERTSPACE = _CC_VERTSPACE +} _char_class_number; +#endif # ifdef DOINIT EXTCONST U32 PL_charclass[] = { @@ -637,27 
+797,47 @@ EXTCONST U32 PL_charclass[] = { EXTCONST U32 PL_charclass[]; # endif -# define isALNUMC_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALNUMC_A)) -# define isALPHA_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALPHA_A)) -# define isBLANK_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_BLANK_A)) -# define isCNTRL_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CNTRL_A)) -# define isDIGIT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_DIGIT_A)) -# define isGRAPH_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_GRAPH_A)) -# define isIDFIRST_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_IDFIRST_A)) -# define isLOWER_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_LOWER_A)) -# define isOCTAL_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_OCTAL_A)) -# define isPRINT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PRINT_A)) -# define isPSXSPC_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PSXSPC_A)) -# define isPUNCT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PUNCT_A)) -# define isSPACE_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_SPACE_A)) -# define isUPPER_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_UPPER_A)) -# define isWORDCHAR_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_WORDCHAR_A)) -# define isXDIGIT_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_XDIGIT_A)) + /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */ +# define _CC_mask(classnum) (1U << (classnum)) +# define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \ + && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(classnum))) 
+ + /* The mask for the _A versions of the macros; it just adds in the bit for + * ASCII. */ +# define _CC_mask_A(classnum) (_CC_mask(classnum) | _CC_mask(_CC_ASCII)) + + /* The _A version makes sure that both the desired bit and the ASCII bit + * are present */ +# define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \ + && ((PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask_A(classnum)) \ + == _CC_mask_A(classnum))) + +# define isALNUMC_A(c) _generic_isCC_A(c, _CC_ALNUMC) +# define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA) +# define isBLANK_A(c) _generic_isCC_A(c, _CC_BLANK) +# define isCNTRL_A(c) _generic_isCC_A(c, _CC_CNTRL) +# define isDIGIT_A(c) _generic_isCC(c, _CC_DIGIT) +# define isGRAPH_A(c) _generic_isCC_A(c, _CC_GRAPH) +# define isLOWER_A(c) _generic_isCC_A(c, _CC_LOWER) +# define isPRINT_A(c) _generic_isCC_A(c, _CC_PRINT) +# define isPSXSPC_A(c) _generic_isCC_A(c, _CC_PSXSPC) +# define isPUNCT_A(c) _generic_isCC_A(c, _CC_PUNCT) +# define isSPACE_A(c) _generic_isCC_A(c, _CC_SPACE) +# define isUPPER_A(c) _generic_isCC_A(c, _CC_UPPER) +# define isWORDCHAR_A(c) _generic_isCC_A(c, _CC_WORDCHAR) +# define isXDIGIT_A(c) _generic_isCC(c, _CC_XDIGIT) +# define isIDFIRST_A(c) _generic_isCC_A(c, ( _CC_IDFIRST)) + /* Either participates in a fold with a character above 255, or is a * multi-char fold */ -# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_NONLATIN1_FOLD)) +# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(_CC_NONLATIN1_FOLD))) + +# define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA) +# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \ + _generic_isCC(c, _CC_NON_FINAL_FOLD) +# define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \ + _generic_isCC(c, _CC_IS_IN_SOME_FOLD) #else /* No perl.h. 
*/ -# define isOCTAL_A(c) ((c) >= '0' && (c) <= '9') # ifdef EBCDIC # define isALNUMC_A(c) (isASCII(c) && isALNUMC(c)) # define isALPHA_A(c) (isASCII(c) && isALPHA(c)) @@ -678,49 +858,66 @@ EXTCONST U32 PL_charclass[]; # define isALNUMC_A(c) (isALPHA_A(c) || isDIGIT_A(c)) # define isALPHA_A(c) (isUPPER_A(c) || isLOWER_A(c)) # define isBLANK_A(c) ((c) == ' ' || (c) == '\t') -# define isCNTRL_A(c) (FITS_IN_8_BITS(c) ? ((U8) (c) < ' ' || (c) == 127) : 0) -# define isDIGIT_A(c) ((c) >= '0' && (c) <= '9') +# define isCNTRL_A(c) (FITS_IN_8_BITS(c) && ((U8) (c) < ' ' || (c) == 127)) +# define isDIGIT_A(c) ((c) <= '9' && (c) >= '0') # define isGRAPH_A(c) (isWORDCHAR_A(c) || isPUNCT_A(c)) # define isIDFIRST_A(c) (isALPHA_A(c) || (c) == '_') # define isLOWER_A(c) ((c) >= 'a' && (c) <= 'z') # define isPRINT_A(c) (((c) >= 32 && (c) < 127)) # define isPSXSPC_A(c) (isSPACE_A(c) || (c) == '\v') -# define isPUNCT_A(c) (((c) >= 33 && (c) <= 47) || ((c) >= 58 && (c) <= 64) || ((c) >= 91 && (c) <= 96) || ((c) >= 123 && (c) <= 126)) -# define isSPACE_A(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) =='\r' || (c) == '\f') -# define isUPPER_A(c) ((c) >= 'A' && (c) <= 'Z') +# define isPUNCT_A(c) (((c) >= 33 && (c) <= 47) \ + || ((c) >= 58 && (c) <= 64) \ + || ((c) >= 91 && (c) <= 96) \ + || ((c) >= 123 && (c) <= 126)) +# define isSPACE_A(c) ((c) == ' ' \ + || (c) == '\t' \ + || (c) == '\n' \ + || (c) =='\r' \ + || (c) == '\f') +# define isUPPER_A(c) ((c) <= 'Z' && (c) >= 'A') # define isWORDCHAR_A(c) (isALPHA_A(c) || isDIGIT_A(c) || (c) == '_') -# define isXDIGIT_A(c) (isDIGIT_A(c) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F')) +# define isXDIGIT_A(c) (isDIGIT_A(c) \ + || ((c) >= 'a' && (c) <= 'f') \ + || ((c) <= 'F' && (c) >= 'A')) # endif #endif /* ASCII range definitions */ /* Latin1 definitions */ #ifdef H_PERL -# define isALNUMC_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALNUMC_L1)) -# define isALPHA_L1(c) cBOOL(FITS_IN_8_BITS(c) 
&& (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_ALPHA_L1)) -# define isBLANK_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_BLANK_L1)) +# define isALNUMC_L1(c) _generic_isCC(c, _CC_ALNUMC) +# define isALPHA_L1(c) _generic_isCC(c, _CC_ALPHA) +# define isBLANK_L1(c) _generic_isCC(c, _CC_BLANK) + /* continuation character for legal NAME in \N{NAME} */ -# define isCHARNAME_CONT(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CHARNAME_CONT)) -# define isCNTRL_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_CNTRL_L1)) -# define isGRAPH_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_GRAPH_L1)) -# define isIDFIRST_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_IDFIRST_L1)) -# define isLOWER_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_LOWER_L1)) -# define isPRINT_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PRINT_L1)) -# define isPSXSPC_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PSXSPC_L1)) -# define isPUNCT_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_PUNCT_L1)) -# define isSPACE_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_SPACE_L1)) -# define isUPPER_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_UPPER_L1)) -# define isWORDCHAR_L1(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_WORDCHAR_L1)) +# define isCHARNAME_CONT(c) _generic_isCC(c, _CC_CHARNAME_CONT) + +# define isCNTRL_L1(c) _generic_isCC(c, _CC_CNTRL) +# define isGRAPH_L1(c) _generic_isCC(c, _CC_GRAPH) +# define isLOWER_L1(c) _generic_isCC(c, _CC_LOWER) +# define isPRINT_L1(c) _generic_isCC(c, _CC_PRINT) +# define isPSXSPC_L1(c) _generic_isCC(c, _CC_PSXSPC) +# define isPUNCT_L1(c) _generic_isCC(c, _CC_PUNCT) +# define isSPACE_L1(c) _generic_isCC(c, _CC_SPACE) +# define 
isUPPER_L1(c) _generic_isCC(c, _CC_UPPER) +# define isWORDCHAR_L1(c) _generic_isCC(c, _CC_WORDCHAR) +# define isIDFIRST_L1(c) _generic_isCC(c, _CC_IDFIRST) #else /* No access to perl.h. Only a few provided here, just in case needed * for backwards compatibility */ /* ALPHAU includes Unicode semantics for latin1 characters. It has an extra * >= AA test to speed up ASCII-only tests at the expense of the others */ # define isALPHA_L1(c) (isALPHA(c) || (NATIVE_TO_UNI((U8) c) >= 0xAA \ && ((NATIVE_TO_UNI((U8) c) >= 0xC0 \ - && NATIVE_TO_UNI((U8) c) != 0xD7 && NATIVE_TO_UNI((U8) c) != 0xF7) \ + && NATIVE_TO_UNI((U8) c) != 0xD7 && NATIVE_TO_UNI((U8) c) != 0xF7) \ || NATIVE_TO_UNI((U8) c) == 0xAA \ || NATIVE_TO_UNI((U8) c) == 0xB5 \ || NATIVE_TO_UNI((U8) c) == 0xBA))) -# define isCHARNAME_CONT(c) (isALNUM_L1(c) || (c) == ' ' || (c) == '-' || (c) == '(' || (c) == ')' || (c) == ':' || NATIVE_TO_UNI((U8) c) == 0xA0) +# define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) \ + || (c) == ' ' \ + || (c) == '-' \ + || (c) == '(' \ + || (c) == ')' \ + || (c) == ':' \ + || NATIVE_TO_UNI((U8) c) == 0xA0) #endif /* Macros for backwards compatibility and for completeness when the ASCII and @@ -751,14 +948,15 @@ EXTCONST U32 PL_charclass[]; # define isPRINT(c) isprint(c) # define isPSXSPC(c) isspace(c) # define isPUNCT(c) ispunct(c) -# define isSPACE(c) (isPSXSPC(c) && (c) != '\v') +# define isSPACE(c) (isPSXSPC(c) /* && (c) != '\v' (Experimentally making + these macros identical) */) # define isUPPER(c) isupper(c) # define isXDIGIT(c) isxdigit(c) # define isWORDCHAR(c) (isalnum(c) || (c) == '_') # define toLOWER(c) tolower(c) # define toUPPER(c) toupper(c) #else /* Not EBCDIC: ASCII-only matching */ -# define isALNUMC(c) isALNUMC_A(c) +# define isALNUMC(c) isALNUMC_A(c) /* Mnemonic: "C's alnum" = alpha + digit */ # define isALPHA(c) isALPHA_A(c) # define isBLANK(c) isBLANK_A(c) # define isCNTRL(c) isCNTRL_A(c) @@ -800,163 +998,274 @@ EXTCONST U32 PL_charclass[]; #ifdef USE_NEXT_CTYPE -# define 
isALNUM_LC(c) \ - (NXIsAlNum((unsigned int)(c)) || (char)(c) == '_') -# define isIDFIRST_LC(c) \ - (NXIsAlpha((unsigned int)(c)) || (char)(c) == '_') -# define isALPHA_LC(c) NXIsAlpha((unsigned int)(c)) -# define isSPACE_LC(c) NXIsSpace((unsigned int)(c)) -# define isDIGIT_LC(c) NXIsDigit((unsigned int)(c)) -# define isUPPER_LC(c) NXIsUpper((unsigned int)(c)) -# define isLOWER_LC(c) NXIsLower((unsigned int)(c)) # define isALNUMC_LC(c) NXIsAlNum((unsigned int)(c)) +# define isALNUM_LC(c) isWORDCHAR_LC(c) +# define isALPHA_LC(c) NXIsAlpha((unsigned int)(c)) +# define isASCII_LC(c) isASCII((unsigned int)(c)) +# define isBLANK_LC(c) isBLANK((unsigned int)(c)) # define isCNTRL_LC(c) NXIsCntrl((unsigned int)(c)) +# define isDIGIT_LC(c) NXIsDigit((unsigned int)(c)) # define isGRAPH_LC(c) NXIsGraph((unsigned int)(c)) +# define isIDFIRST_LC(c) (NXIsAlpha((unsigned int)(c)) || (char)(c) == '_') +# define isLOWER_LC(c) NXIsLower((unsigned int)(c)) # define isPRINT_LC(c) NXIsPrint((unsigned int)(c)) # define isPUNCT_LC(c) NXIsPunct((unsigned int)(c)) -# define toUPPER_LC(c) NXToUpper((unsigned int)(c)) +# define isSPACE_LC(c) NXIsSpace((unsigned int)(c)) +# define isUPPER_LC(c) NXIsUpper((unsigned int)(c)) +# define isWORDCHAR_LC(c) (NXIsAlNum((unsigned int)(c)) || (char)(c) == '_') +# define isXDIGIT_LC(c) NXIsXDigit((unsigned int)(c)) # define toLOWER_LC(c) NXToLower((unsigned int)(c)) +# define toUPPER_LC(c) NXToUpper((unsigned int)(c)) #else /* !USE_NEXT_CTYPE */ # if defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII)) -# define isALNUM_LC(c) (isalnum((unsigned char)(c)) || (char)(c) == '_') -# define isIDFIRST_LC(c) (isalpha((unsigned char)(c)) || (char)(c) == '_') -# define isALPHA_LC(c) isalpha((unsigned char)(c)) -# define isSPACE_LC(c) isspace((unsigned char)(c)) -# define isDIGIT_LC(c) isdigit((unsigned char)(c)) -# define isUPPER_LC(c) isupper((unsigned char)(c)) -# define isLOWER_LC(c) islower((unsigned char)(c)) -# define isALNUMC_LC(c) 
isalnum((unsigned char)(c)) -# define isCNTRL_LC(c) iscntrl((unsigned char)(c)) -# define isGRAPH_LC(c) isgraph((unsigned char)(c)) -# define isPRINT_LC(c) isprint((unsigned char)(c)) -# define isPUNCT_LC(c) ispunct((unsigned char)(c)) -# define toUPPER_LC(c) toupper((unsigned char)(c)) -# define toLOWER_LC(c) tolower((unsigned char)(c)) +/* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */ + +# define isALNUMC_LC(c) (FITS_IN_8_BITS(c) && isalnum((unsigned char)(c))) +# define isALNUM_LC(c) isWORDCHAR_LC(c) +# define isALPHA_LC(c) (FITS_IN_8_BITS(c) && isalpha((unsigned char)(c))) +# ifdef HAS_ISASCII +# define isASCII_LC(c) (FITS_IN_8_BITS(c) && isascii((unsigned char)(c))) +# else +# define isASCII_LC(c) (FITS_IN_8_BITS(c) && isASCII((unsigned char)(c))) +# endif +# ifdef HAS_ISBLANK +# define isBLANK_LC(c) (FITS_IN_8_BITS(c) && isblank((unsigned char)(c))) +# else +# define isBLANK_LC(c) (FITS_IN_8_BITS(c) && isBLANK((unsigned char)(c))) +# endif +# define isCNTRL_LC(c) (FITS_IN_8_BITS(c) && iscntrl((unsigned char)(c))) +# define isDIGIT_LC(c) (FITS_IN_8_BITS(c) && isdigit((unsigned char)(c))) +# define isGRAPH_LC(c) (FITS_IN_8_BITS(c) && isgraph((unsigned char)(c))) +# define isIDFIRST_LC(c) (FITS_IN_8_BITS(c) \ + && (isalpha((unsigned char)(c)) || (char)(c) == '_')) +# define isLOWER_LC(c) (FITS_IN_8_BITS(c) && islower((unsigned char)(c))) +# define isPRINT_LC(c) (FITS_IN_8_BITS(c) && isprint((unsigned char)(c))) +# define isPUNCT_LC(c) (FITS_IN_8_BITS(c) && ispunct((unsigned char)(c))) +# define isSPACE_LC(c) (FITS_IN_8_BITS(c) && isspace((unsigned char)(c))) +# define isUPPER_LC(c) (FITS_IN_8_BITS(c) && isupper((unsigned char)(c))) +# define isWORDCHAR_LC(c) (FITS_IN_8_BITS(c) \ + && (isalnum((unsigned char)(c)) || (char)(c) == '_')) +# define isXDIGIT_LC(c) (FITS_IN_8_BITS(c) && isxdigit((unsigned char)(c))) +# define toLOWER_LC(c) (FITS_IN_8_BITS(c) ? tolower((unsigned char)(c)) : (c)) +# define toUPPER_LC(c) (FITS_IN_8_BITS(c) ? 
toupper((unsigned char)(c)) : (c)) # else -# define isALNUM_LC(c) (isascii(c) && (isalnum(c) || (c) == '_')) -# define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_')) -# define isALPHA_LC(c) (isascii(c) && isalpha(c)) -# define isSPACE_LC(c) (isascii(c) && isspace(c)) -# define isDIGIT_LC(c) (isascii(c) && isdigit(c)) -# define isUPPER_LC(c) (isascii(c) && isupper(c)) -# define isLOWER_LC(c) (isascii(c) && islower(c)) # define isALNUMC_LC(c) (isascii(c) && isalnum(c)) +# define isALNUM_LC(c) isWORDCHAR_LC(c) +# define isALPHA_LC(c) (isascii(c) && isalpha(c)) +# define isASCII_LC(c) isascii(c) +# ifdef HAS_ISBLANK +# define isBLANK_LC(c) (isascii(c) && isblank(c)) +# else +# define isBLANK_LC(c) isBLANK_A(c) +# endif # define isCNTRL_LC(c) (isascii(c) && iscntrl(c)) +# define isDIGIT_LC(c) (isascii(c) && isdigit(c)) # define isGRAPH_LC(c) (isascii(c) && isgraph(c)) +# define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_')) +# define isLOWER_LC(c) (isascii(c) && islower(c)) # define isPRINT_LC(c) (isascii(c) && isprint(c)) # define isPUNCT_LC(c) (isascii(c) && ispunct(c)) -# define toUPPER_LC(c) toupper(c) -# define toLOWER_LC(c) tolower(c) +# define isSPACE_LC(c) (isascii(c) && isspace(c)) +# define isUPPER_LC(c) (isascii(c) && isupper(c)) +# define isWORDCHAR_LC(c) (isascii(c) && (isalnum(c) || (c) == '_')) +# define isXDIGIT_LC(c) (isascii(c) && isxdigit(c)) +# define toLOWER_LC(c) (isascii(c) ? tolower(c) : (c)) +# define toUPPER_LC(c) (isascii(c) ? toupper(c) : (c)) # endif #endif /* USE_NEXT_CTYPE */ -#define isPSXSPC_LC(c) (isSPACE_LC(c) || (c) == '\v') -#define isBLANK_LC(c) isBLANK(c) /* could be wrong */ +#define isPSXSPC_LC(c) isSPACE_LC(c) + +/* For internal core Perl use only. If the input is Latin1, use the Latin1 + * macro; otherwise use the function. Won't compile if 'c' isn't unsigned, as + * won't match function prototype. 
The macros do bounds checking, so have + * duplicate checks here, so could create versions of the macros that don't, + * but experiments show that gcc optimizes them out anyway. */ + +/* Note that all ignore 'use bytes' */ +#define _generic_uni(classnum, function, c) ((c) < 256 \ + ? _generic_isCC(c, classnum) \ + : function(c)) +#define isALNUMC_uni(c) _generic_uni(_CC_ALNUMC, is_uni_alnumc, c) +#define isALNUM_uni(c) isWORDCHAR_uni(c) +#define isALPHA_uni(c) _generic_uni(_CC_ALPHA, is_uni_alpha, c) +#define isASCII_uni(c) isASCII(c) +#define isBLANK_uni(c) _generic_uni(_CC_BLANK, is_HORIZWS_cp_high, c) +#define isCNTRL_uni(c) isCNTRL_L1(c) /* All controls are in Latin1 */ +#define isDIGIT_uni(c) _generic_uni(_CC_DIGIT, is_uni_digit, c) +#define isGRAPH_uni(c) _generic_uni(_CC_GRAPH, is_uni_graph, c) +#define isIDFIRST_uni(c) _generic_uni(_CC_IDFIRST, _is_uni_perl_idstart, c) +#define isLOWER_uni(c) _generic_uni(_CC_LOWER, is_uni_lower, c) +#define isPRINT_uni(c) _generic_uni(_CC_PRINT, is_uni_print, c) + +/* Posix and regular space are identical above Latin1 */ +#define isPSXSPC_uni(c) _generic_uni(_CC_PSXSPC, is_XPERLSPACE_cp_high, c) + +#define isPUNCT_uni(c) _generic_uni(_CC_PUNCT, is_uni_punct, c) +#define isSPACE_uni(c) _generic_uni(_CC_SPACE, is_XPERLSPACE_cp_high, c) +#define isUPPER_uni(c) _generic_uni(_CC_UPPER, is_uni_upper, c) +#define isVERTWS_uni(c) _generic_uni(_CC_VERTSPACE, is_VERTWS_cp_high, c) +#define isWORDCHAR_uni(c) _generic_uni(_CC_WORDCHAR, is_uni_alnum, c) +#define isXDIGIT_uni(c) _generic_uni(_CC_XDIGIT, is_XDIGIT_cp_high, c) -/* For use in the macros just below. If the input is Latin1, use the Latin1 - * (_L1) version of the macro; otherwise use the function. Won't compile if - * 'c' isn't unsigned, as won't match function prototype. The macros do bounds - * checking, so have duplicate checks here, so could create versions of the - * macros that don't, but experiments show that gcc optimizes them out anyway. 
- */ -#define generic_uni(macro, function, c) ((c) < 256 \ - ? CAT2(macro, _L1)(c) \ - : function(c)) - -#define isALNUM_uni(c) generic_uni(isWORDCHAR, is_uni_alnum, c) -#define isIDFIRST_uni(c) generic_uni(isIDFIRST, is_uni_idfirst, c) -#define isALPHA_uni(c) generic_uni(isALPHA, is_uni_alpha, c) -#define isSPACE_uni(c) generic_uni(isSPACE, is_uni_space, c) -#define isDIGIT_uni(c) generic_uni(isDIGIT, is_uni_digit, c) -#define isUPPER_uni(c) generic_uni(isUPPER, is_uni_upper, c) -#define isLOWER_uni(c) generic_uni(isLOWER, is_uni_lower, c) -#define isASCII_uni(c) isASCII(c) -/* All controls are in Latin1 */ -#define isCNTRL_uni(c) ((c) < 256 ? isCNTRL_L1(c) : 0) -#define isGRAPH_uni(c) generic_uni(isGRAPH, is_uni_graph, c) -#define isPRINT_uni(c) generic_uni(isPRINT, is_uni_print, c) -#define isPUNCT_uni(c) generic_uni(isPUNCT, is_uni_punct, c) -#define isXDIGIT_uni(c) generic_uni(isXDIGIT, is_uni_xdigit, c) -#define toUPPER_uni(c,s,l) to_uni_upper(c,s,l) -#define toTITLE_uni(c,s,l) to_uni_title(c,s,l) -#define toLOWER_uni(c,s,l) to_uni_lower(c,s,l) #define toFOLD_uni(c,s,l) to_uni_fold(c,s,l) +#define toLOWER_uni(c,s,l) to_uni_lower(c,s,l) +#define toTITLE_uni(c,s,l) to_uni_title(c,s,l) +#define toUPPER_uni(c,s,l) to_uni_upper(c,s,l) + +#define _gnrc_is_LC_uvchr(latin1, above_latin1, c) \ + (c < 256 ? latin1(c) : above_latin1(NATIVE_TO_UNI(c))) +#define isALNUMC_LC_uvchr(c) _gnrc_is_LC_uvchr(isALNUMC_LC, is_uni_alnumc_lc, c) +#define isALNUM_LC_uvchr(c) isWORDCHAR_LC_uvchr(c) +#define isALPHA_LC_uvchr(c) _gnrc_is_LC_uvchr(isALPHA_LC, is_uni_alpha_lc, c) +#define isASCII_LC_uvchr(c) isASCII_LC(c) +#define isBLANK_LC_uvchr(c) _gnrc_is_LC_uvchr(isBLANK_LC, is_HORIZWS_cp_high, c) +#define isCNTRL_LC_uvchr(c) (c < 256 ? 
isCNTRL_LC(c) : 0) +#define isDIGIT_LC_uvchr(c) _gnrc_is_LC_uvchr(isDIGIT_LC, is_uni_digit_lc, c) +#define isGRAPH_LC_uvchr(c) _gnrc_is_LC_uvchr(isGRAPH_LC, is_uni_graph_lc, c) +#define isIDFIRST_LC_uvchr(c) _gnrc_is_LC_uvchr(isIDFIRST_LC, \ + is_uni_idfirst_lc, c) +#define isLOWER_LC_uvchr(c) _gnrc_is_LC_uvchr(isLOWER_LC, is_uni_lower_lc, c) +#define isPRINT_LC_uvchr(c) _gnrc_is_LC_uvchr(isPRINT_LC, is_uni_print_lc, c) +#define isPSXSPC_LC_uvchr(c) isSPACE_LC_uvchr(c) /* space is identical to posix + space under locale */ +#define isPUNCT_LC_uvchr(c) _gnrc_is_LC_uvchr(isPUNCT_LC, is_uni_punct_lc, c) +#define isSPACE_LC_uvchr(c) _gnrc_is_LC_uvchr(isSPACE_LC, \ + is_XPERLSPACE_cp_high, c) +#define isUPPER_LC_uvchr(c) _gnrc_is_LC_uvchr(isUPPER_LC, is_uni_upper_lc, c) +#define isWORDCHAR_LC_uvchr(c) _gnrc_is_LC_uvchr(isWORDCHAR_LC, \ + is_uni_alnum_lc, c) +#define isXDIGIT_LC_uvchr(c) _gnrc_is_LC_uvchr(isXDIGIT_LC, is_XDIGIT_cp_high, c) + + +#define isBLANK_LC_uni(c) isBLANK_LC_uvchr(UNI_TO_NATIVE(c)) + +/* For internal core Perl use only. If the input is in the Latin1 range, use + * the Latin1 macro 'classnum' on 'p' which is a pointer to a UTF-8 string. + * Otherwise use the value given by the 'utf8' parameter. This relies on the + * fact that ASCII characters have the same representation whether utf8 or not. + * Note that it assumes that the utf8 has been validated, and ignores 'use + * bytes' */ +#define _generic_utf8_utf8(classnum, p, utf8) (UTF8_IS_INVARIANT(*(p)) \ + ? _generic_isCC(*(p), classnum) \ + : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ + ? _generic_isCC( \ + TWO_BYTE_UTF8_TO_UNI(*(p), \ + *((p)+1 )), \ + classnum) \ + : utf8) +/* Like the above, but calls 'function(p)' to get the utf8 value */ +#define _generic_utf8(classnum, function, p) \ + _generic_utf8_utf8(classnum, p, function(p)) + +/* Like the above, but should be used only when it is known that there are no + * characters in the range 128-255 which the class is TRUE for. 
Hence it can + * skip the tests for this range */ +#define _generic_utf8_no_upper_latin1(classnum, function, p) \ + (UTF8_IS_INVARIANT(*(p)) \ + ? _generic_isCC(*(p), classnum) \ + : (UTF8_IS_ABOVE_LATIN1(*(p))) \ + ? function(p) \ + : 0) + +/* NOTE that some of these macros have very similar ones in regcharclass.h. + * For example, there is (at the time of this writing) an 'is_SPACE_utf8()' + * there, differing in name only by an underscore from the one here + * 'isSPACE_utf8(). The difference is that the ones here are probably more + * efficient and smaller, using an O(1) array lookup for Latin1-range code + * points; the regcharclass.h ones are implemented as a series of + * "if-else-if-else ..." */ + +#define isALNUMC_utf8(p) _generic_utf8(_CC_ALNUMC, is_utf8_alnumc, p) +#define isALNUM_utf8(p) isWORDCHAR_utf8(p) /* back compat */ +#define isALPHA_utf8(p) _generic_utf8(_CC_ALPHA, is_utf8_alpha, p) +#define isASCII_utf8(p) isASCII(*p) /* Because ASCII is invariant under + utf8, the non-utf8 macro works + */ +#define isBLANK_utf8(p) _generic_utf8(_CC_BLANK, is_HORIZWS_high, p) +#define isCNTRL_utf8(p) _generic_utf8_utf8(_CC_CNTRL, p, 0) +#define isDIGIT_utf8(p) _generic_utf8_no_upper_latin1(_CC_DIGIT, \ + is_utf8_digit, p) +#define isGRAPH_utf8(p) _generic_utf8(_CC_GRAPH, is_utf8_graph, p) +#define isIDCONT_utf8(p) _generic_utf8(_CC_WORDCHAR, is_utf8_xidcont, p) -/* Posix and regular space differ only in U+000B, which is in Latin1 */ -#define isPSXSPC_uni(c) ((c) < 256 ? isPSXSPC_L1(c) : isSPACE_uni(c)) -#define isBLANK_uni(c) isBLANK(c) /* could be wrong */ - -#define isALNUM_LC_uvchr(c) (c < 256 ? isALNUM_LC(c) : is_uni_alnum_lc(c)) -#define isIDFIRST_LC_uvchr(c) (c < 256 ? isIDFIRST_LC(c) : is_uni_idfirst_lc(c)) -#define isALPHA_LC_uvchr(c) (c < 256 ? isALPHA_LC(c) : is_uni_alpha_lc(c)) -#define isSPACE_LC_uvchr(c) (c < 256 ? isSPACE_LC(c) : is_uni_space_lc(c)) -#define isDIGIT_LC_uvchr(c) (c < 256 ? 
isDIGIT_LC(c) : is_uni_digit_lc(c)) -#define isUPPER_LC_uvchr(c) (c < 256 ? isUPPER_LC(c) : is_uni_upper_lc(c)) -#define isLOWER_LC_uvchr(c) (c < 256 ? isLOWER_LC(c) : is_uni_lower_lc(c)) -#define isCNTRL_LC_uvchr(c) (c < 256 ? isCNTRL_LC(c) : is_uni_cntrl_lc(c)) -#define isGRAPH_LC_uvchr(c) (c < 256 ? isGRAPH_LC(c) : is_uni_graph_lc(c)) -#define isPRINT_LC_uvchr(c) (c < 256 ? isPRINT_LC(c) : is_uni_print_lc(c)) -#define isPUNCT_LC_uvchr(c) (c < 256 ? isPUNCT_LC(c) : is_uni_punct_lc(c)) - -#define isPSXSPC_LC_uni(c) (isSPACE_LC_uni(c) ||(c) == '\f') -#define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */ - -#define isALNUM_utf8(p) is_utf8_alnum(p) /* To prevent S_scan_word in toke.c from hanging, we have to make sure that * IDFIRST is an alnum. See - * http://rt.perl.org/rt3/Ticket/Display.html?id=74022 - * for more detail than you ever wanted to know about. This used to be not the - * XID version, but we decided to go with the more modern Unicode definition */ -#define isIDFIRST_utf8(p) (is_utf8_xidfirst(p) && is_utf8_alnum(p)) -#define isIDCONT_utf8(p) is_utf8_xidcont(p) -#define isALPHA_utf8(p) is_utf8_alpha(p) -#define isSPACE_utf8(p) is_utf8_space(p) -#define isDIGIT_utf8(p) is_utf8_digit(p) -#define isUPPER_utf8(p) is_utf8_upper(p) -#define isLOWER_utf8(p) is_utf8_lower(p) -/* Because ASCII is invariant under utf8, the non-utf8 macro works */ -#define isASCII_utf8(p) isASCII(p) -#define isCNTRL_utf8(p) is_utf8_cntrl(p) -#define isGRAPH_utf8(p) is_utf8_graph(p) -#define isPRINT_utf8(p) is_utf8_print(p) -#define isPUNCT_utf8(p) is_utf8_punct(p) -#define isXDIGIT_utf8(p) is_utf8_xdigit(p) -#define toUPPER_utf8(p,s,l) to_utf8_upper(p,s,l) -#define toTITLE_utf8(p,s,l) to_utf8_title(p,s,l) + * http://rt.perl.org/rt3/Ticket/Display.html?id=74022 for more detail than you + * ever wanted to know about. (In the ASCII range, there isn't a difference.) 
+ * This used to be not the XID version, but we decided to go with the more + * modern Unicode definition */ +#define isIDFIRST_utf8(p) _generic_utf8(_CC_IDFIRST, \ + _is_utf8_perl_idstart, p) + +#define isLOWER_utf8(p) _generic_utf8(_CC_LOWER, is_utf8_lower, p) +#define isPRINT_utf8(p) _generic_utf8(_CC_PRINT, is_utf8_print, p) + +/* Posix and regular space are identical above Latin1 */ +#define isPSXSPC_utf8(p) _generic_utf8(_CC_PSXSPC, is_XPERLSPACE_high, p) + +#define isPUNCT_utf8(p) _generic_utf8(_CC_PUNCT, is_utf8_punct, p) +#define isSPACE_utf8(p) _generic_utf8(_CC_SPACE, is_XPERLSPACE_high, p) +#define isUPPER_utf8(p) _generic_utf8(_CC_UPPER, is_utf8_upper, p) +#define isVERTWS_utf8(p) _generic_utf8(_CC_VERTSPACE, is_VERTWS_high, p) +#define isWORDCHAR_utf8(p) _generic_utf8(_CC_WORDCHAR, is_utf8_alnum, p) +#define isXDIGIT_utf8(p) _generic_utf8_no_upper_latin1(_CC_XDIGIT, \ + is_XDIGIT_high, p) + #define toLOWER_utf8(p,s,l) to_utf8_lower(p,s,l) +#define toTITLE_utf8(p,s,l) to_utf8_title(p,s,l) +#define toUPPER_utf8(p,s,l) to_utf8_upper(p,s,l) -#define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f') -#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */ - -#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(utf8_to_uvchr(p, 0)) - -#define 
isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f') -#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */ +/* For internal core Perl use only. If the input is in the Latin1 range, use + * the macro 'macro' on 'p' which is a pointer to a UTF-8 string. Otherwise + * use the value given by the 'utf8' parameter. This relies on the fact that + * ASCII characters have the same representation whether utf8 or not. Note + * that it assumes that the utf8 has been validated, and ignores 'use bytes' */ +#define _generic_LC_utf8_utf8(macro, p, utf8) \ + (UTF8_IS_INVARIANT(*(p)) \ + ? macro(*(p)) \ + : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ + ? macro(TWO_BYTE_UTF8_TO_UNI(*(p), *((p)+1))) \ + : utf8) + +#define _generic_LC_utf8(macro, utf8_func, p) \ + _generic_LC_utf8_utf8(macro, p, utf8_func(p)) + +#define isALNUMC_LC_utf8(p) _generic_LC_utf8(isALNUMC_LC, is_utf8_alnumc, p) +#define isALNUM_LC_utf8(p) isWORDCHAR_LC_utf8(p) +#define isALPHA_LC_utf8(p) _generic_LC_utf8(isALPHA_LC, is_utf8_alpha, p) +#define isASCII_LC_utf8(p) isASCII_LC(*p) +#define isBLANK_LC_utf8(p) _generic_LC_utf8(isBLANK_LC, is_HORIZWS_high, p) +#define isCNTRL_LC_utf8(p) _generic_LC_utf8_utf8(isCNTRL_LC, p, 0) +#define isDIGIT_LC_utf8(p) _generic_LC_utf8(isDIGIT_LC, is_utf8_digit, p) +#define isGRAPH_LC_utf8(p) _generic_LC_utf8(isGRAPH_LC, is_utf8_graph, p) +#define isIDFIRST_LC_utf8(p) _generic_LC_utf8(isIDFIRST_LC, \ + _is_utf8_perl_idstart, p) +#define isLOWER_LC_utf8(p) _generic_LC_utf8(isLOWER_LC, is_utf8_lower, p) +#define isPRINT_LC_utf8(p) _generic_LC_utf8(isPRINT_LC, is_utf8_print, p) +#define isPSXSPC_LC_utf8(p) isSPACE_LC_utf8(p) /* space is identical to posix + space under locale */ +#define isPUNCT_LC_utf8(p) _generic_LC_utf8(isPUNCT_LC, is_utf8_punct, p) +#define isSPACE_LC_utf8(p) _generic_LC_utf8(isSPACE_LC, is_XPERLSPACE_high, p) +#define isUPPER_LC_utf8(p) _generic_LC_utf8(isUPPER_LC, is_utf8_upper, p) +#define isWORDCHAR_LC_utf8(p) _generic_LC_utf8(isWORDCHAR_LC, is_utf8_alnum, p) 
+#define isXDIGIT_LC_utf8(p) _generic_LC_utf8(isXDIGIT_LC, is_XDIGIT_high, p) /* This conversion works both ways, strangely enough. On EBCDIC platforms, - * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII */ + * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't + * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but + * that is EOT on ASCII; ST on EBCDIC */ # define toCTRL(c) (toUPPER(NATIVE_TO_UNI(c)) ^ 64) /* Line numbers are unsigned, 32 bits. */ typedef U32 line_t; -#define NOLINE ((line_t) 4294967295UL) +#define NOLINE ((line_t) 4294967295UL) /* = FFFFFFFF */ /* Helpful alias for version prescan */ #define is_LAX_VERSION(a,b) \ @@ -971,6 +1280,8 @@ typedef U32 line_t; } \ return a; +#define READ_XDIGIT(s) (isALPHA(*(s)) ? ((*(s)++ + 9) & 0xf) : (*(s)++ & 0xf)) + /* =head1 Memory Management @@ -1003,8 +1314,8 @@ The XSUB-writer's interface to the C C function. =for apidoc Am|void|Move|void* src|void* dest|int nitems|type The XSUB-writer's interface to the C C function. The C is the -source, C is the destination, C is the number of items, and C is -the type. Can do overlapping moves. See also C. +source, C is the destination, C is the number of items, and +C is the type. Can do overlapping moves. See also C. =for apidoc Am|void *|MoveD|void* src|void* dest|int nitems|type Like C but returns dest. Useful for encouraging compilers to tail-call @@ -1012,8 +1323,8 @@ optimise. =for apidoc Am|void|Copy|void* src|void* dest|int nitems|type The XSUB-writer's interface to the C C function. The C is the -source, C is the destination, C is the number of items, and C is -the type. May fail on overlapping copies. See also C. +source, C is the destination, C is the number of items, and +C is the type. May fail on overlapping copies. See also C. =for apidoc Am|void *|CopyD|void* src|void* dest|int nitems|type @@ -1030,7 +1341,7 @@ destination, C is the number of items, and C is the type. Like C but returns dest. 
Useful for encouraging compilers to tail-call optimise. -=for apidoc Am|void|StructCopy|type src|type dest|type +=for apidoc Am|void|StructCopy|type *src|type *dest|type This is an architecture-independent macro to copy one structure to another. =for apidoc Am|void|PoisonWith|void* dest|int nitems|type|U8 byte @@ -1063,13 +1374,13 @@ PoisonWith(0xEF) for catching access to freed memory. * overly eager compilers that will bleat about e.g. * (U16)n > (size_t)~0/sizeof(U16) always being false. */ #ifdef PERL_MALLOC_WRAP -#define MEM_WRAP_CHECK(n,t) MEM_WRAP_CHECK_1(n,t,PL_memory_wrap) +#define MEM_WRAP_CHECK(n,t) \ + (void)(sizeof(t) > 1 && ((MEM_SIZE)(n)+0.0) > MEM_SIZE_MAX/sizeof(t) && (croak_memory_wrap(),0)) #define MEM_WRAP_CHECK_1(n,t,a) \ (void)(sizeof(t) > 1 && ((MEM_SIZE)(n)+0.0) > MEM_SIZE_MAX/sizeof(t) && (Perl_croak_nocontext("%s",(a)),0)) #define MEM_WRAP_CHECK_(n,t) MEM_WRAP_CHECK(n,t), -#define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (Perl_croak_nocontext("%s",PL_memory_wrap),0):0),((n-1+PERL_STRLEN_ROUNDUP_QUANTUM)&~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM-1))) - +#define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (croak_memory_wrap(),0):0),((n-1+PERL_STRLEN_ROUNDUP_QUANTUM)&~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM-1))) #else #define MEM_WRAP_CHECK(n,t) @@ -1244,8 +1555,8 @@ void Perl_mem_log_del_sv(const SV *sv, const char *filename, const int linenumbe * Local variables: * c-indentation-style: bsd * c-basic-offset: 4 - * indent-tabs-mode: t + * indent-tabs-mode: nil * End: * - * ex: set ts=8 sts=4 sw=4 noet: + * ex: set ts=8 sts=4 sw=4 et: */