library routine C<isascii()>. In these cases, the variants whose names contain
C<LC> are the same as the corresponding ones without.
+Also note that, because all ASCII characters are UTF-8 invariant (meaning they
+have the exact same representation (always a single byte) whether encoded in
+UTF-8 or not), C<isASCII> will give the correct results when called with any
+byte in any string, whether or not that string is encoded in UTF-8.
+Similarly, C<isASCII_utf8> will work properly on any string, whether or not
+it is encoded in UTF-8.
+
=for apidoc Am|bool|isBLANK|char ch
Returns a boolean indicating whether the specified character is a
character considered to be a blank, analogous to C<m/[[:blank:]]/>.
=head1 Character case changing
-=for apidoc Am|char|toUPPER|char ch
-Converts the specified character to uppercase, if possible; otherwise returns
-the input character itself.
+=for apidoc Am|U8|toUPPER|U8 ch
+Converts the specified character to uppercase. If the input is anything but an
+ASCII lowercase character, that input character itself is returned. Variant
+C<toUPPER_A> is equivalent.
+
+=for apidoc Am|UV|toUPPER_uni|UV cp|U8* s|STRLEN* lenp
+Converts the Unicode code point C<cp> to its uppercase version, and
+stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
+that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
+bytes since the uppercase version may be longer than the original character.
+
+The first code point of the uppercased version is returned
+(but note, as explained just above, that there may be more.)
+
+=for apidoc Am|UV|toUPPER_utf8|U8* p|U8* s|STRLEN* lenp
+Converts the UTF-8 encoded character at C<p> to its uppercase version, and
+stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
+that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
+bytes since the uppercase version may be longer than the original character.
+
+The first code point of the uppercased version is returned
+(but note, as explained just above, that there may be more.)
+
+The input character at C<p> is assumed to be well-formed.
+
+=for apidoc Am|U8|toFOLD|U8 ch
+Converts the specified character to foldcase. If the input is anything but an
+ASCII uppercase character, that input character itself is returned. Variant
+C<toFOLD_A> is equivalent. (There is no equivalent C<toFOLD_L1> for the full
+Latin1 range, as the full generality of L</toFOLD_uni> is needed there.)
+
+=for apidoc Am|UV|toFOLD_uni|UV cp|U8* s|STRLEN* lenp
+Converts the Unicode code point C<cp> to its foldcase version, and
+stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
+that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
+bytes since the foldcase version may be longer than the original character.
+
+The first code point of the foldcased version is returned
+(but note, as explained just above, that there may be more.)
+
+=for apidoc Am|UV|toFOLD_utf8|U8* p|U8* s|STRLEN* lenp
+Converts the UTF-8 encoded character at C<p> to its foldcase version, and
+stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
+that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
+bytes since the foldcase version may be longer than the original character.
+
+The first code point of the foldcased version is returned
+(but note, as explained just above, that there may be more.)
+
+The input character at C<p> is assumed to be well-formed.
+
+=for apidoc Am|U8|toLOWER|U8 ch
+Converts the specified character to lowercase. If the input is anything but an
+ASCII uppercase character, that input character itself is returned. Variant
+C<toLOWER_A> is equivalent.
+
+=for apidoc Am|U8|toLOWER_L1|U8 ch
+Converts the specified Latin1 character to lowercase. The results are undefined if
+the input doesn't fit in a byte.
+
+=for apidoc Am|U8|toLOWER_LC|U8 ch
+Converts the specified character to lowercase using the current locale's rules,
+if possible; otherwise returns the input character itself.
+
+=for apidoc Am|UV|toLOWER_uni|UV cp|U8* s|STRLEN* lenp
+Converts the Unicode code point C<cp> to its lowercase version, and
+stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
+that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
+bytes since the lowercase version may be longer than the original character.
+
+The first code point of the lowercased version is returned
+(but note, as explained just above, that there may be more.)
+
+=for apidoc Am|UV|toLOWER_utf8|U8* p|U8* s|STRLEN* lenp
+Converts the UTF-8 encoded character at C<p> to its lowercase version, and
+stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
+that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
+bytes since the lowercase version may be longer than the original character.
+
+The first code point of the lowercased version is returned
+(but note, as explained just above, that there may be more.)
+
+The input character at C<p> is assumed to be well-formed.
+
+=for apidoc Am|U8|toLOWER_LC|U8 ch
+Converts the specified character to lowercase using the current locale's rules,
+if possible; otherwise returns the input character itself.
+
+=for apidoc Am|U8|toTITLE|U8 ch
+Converts the specified character to titlecase. If the input is anything but an
+ASCII lowercase character, that input character itself is returned. Variant
+C<toTITLE_A> is equivalent. (There is no C<toTITLE_L1> for the full Latin1 range,
+as the full generality of L</toTITLE_uni> is needed there. Titlecase is not a
+concept used in locale handling, so there is no functionality for that.)
+
+=for apidoc Am|UV|toTITLE_uni|UV cp|U8* s|STRLEN* lenp
+Converts the Unicode code point C<cp> to its titlecase version, and
+stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
+that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
+bytes since the titlecase version may be longer than the original character.
+
+The first code point of the titlecased version is returned
+(but note, as explained just above, that there may be more.)
+
+=for apidoc Am|UV|toTITLE_utf8|U8* p|U8* s|STRLEN* lenp
+Converts the UTF-8 encoded character at C<p> to its titlecase version, and
+stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
+that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
+bytes since the titlecase version may be longer than the original character.
+
+The first code point of the titlecased version is returned
+(but note, as explained just above, that there may be more.)
-=for apidoc Am|char|toLOWER|char ch
-Converts the specified character to lowercase, if possible; otherwise returns
-the input character itself.
+The input character at C<p> is assumed to be well-formed.
=cut
-XXX Still undocumented isVERTWS_uni and _utf8, and the other toUPPER etc functions
+XXX Still undocumented isVERTWS_uni and _utf8; it's unclear what their names
+really should be. Also toUPPER_LC and toFOLD_LC, which are subject to change.
Note that these macros are repeated in Devel::PPPort, so should also be
patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
#define FITS_IN_8_BITS(c) ((sizeof(c) == 1) || !(((WIDEST_UTYPE)(c)) & ~0xFF))
#ifdef EBCDIC
-# define isASCII(c) (FITS_IN_8_BITS(c) && (NATIVE_TO_UNI((U8) (c)) < 128))
+# define isASCII(c) (FITS_IN_8_BITS(c) \
+ && (NATIVE_TO_LATIN1((U8) (c)) < 128))
#else
# define isASCII(c) ((WIDEST_UTYPE)(c) < 128)
#endif
/* The 1U keeps Solaris from griping when shifting sets the uppermost bit */
# define _CC_mask(classnum) (1U << (classnum))
# define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \
- && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(classnum)))
+ && (PL_charclass[(U8) (c)] & _CC_mask(classnum)))
/* The mask for the _A versions of the macros; it just adds in the bit for
* ASCII. */
/* The _A version makes sure that both the desired bit and the ASCII bit
* are present */
# define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \
- && ((PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask_A(classnum)) \
+ && ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \
== _CC_mask_A(classnum)))
# define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA)
/* Either participates in a fold with a character above 255, or is a
* multi-char fold */
-# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
+# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
# define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA)
# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \
# define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \
_generic_isCC(c, _CC_IS_IN_SOME_FOLD)
#else /* No perl.h. */
+# define isBLANK_A(c) ((c) == ' ' || (c) == '\t')
+# define isIDFIRST_A(c) (isALPHA_A(c) || (c) == '_')
+# define isWORDCHAR_A(c) (isALPHANUMERIC_A(c) || (c) == '_')
+# define isPSXSPC_A(c) (isSPACE_A(c) || (c) == '\v')
# ifdef EBCDIC
# define isALPHA_A(c) (isASCII(c) && isALPHA(c))
# define isALPHANUMERIC_A(c) (isASCII(c) && isALPHANUMERIC(c))
-# define isBLANK_A(c) (isASCII(c) && isBLANK(c))
# define isCNTRL_A(c) (isASCII(c) && isCNTRL(c))
# define isDIGIT_A(c) (isASCII(c) && isDIGIT(c))
# define isGRAPH_A(c) (isASCII(c) && isGRAPH(c))
-# define isIDFIRST_A(c) (isASCII(c) && isIDFIRST(c))
# define isLOWER_A(c) (isASCII(c) && isLOWER(c))
# define isPRINT_A(c) (isASCII(c) && isPRINT(c))
-# define isPSXSPC_A(c) (isASCII(c) && isPSXSPC(c))
# define isPUNCT_A(c) (isASCII(c) && isPUNCT(c))
# define isSPACE_A(c) (isASCII(c) && isSPACE(c))
# define isUPPER_A(c) (isASCII(c) && isUPPER(c))
-# define isWORDCHAR_A(c) (isASCII(c) && isWORDCHAR(c))
# define isXDIGIT_A(c) (isASCII(c) && isXDIGIT(c))
# else /* ASCII platform, no perl.h */
# define isALPHA_A(c) (isUPPER_A(c) || isLOWER_A(c))
# define isALPHANUMERIC_A(c) (isALPHA_A(c) || isDIGIT_A(c))
-# define isBLANK_A(c) ((c) == ' ' || (c) == '\t')
-# define isCNTRL_A(c) (FITS_IN_8_BITS(c) && ((U8) (c) < ' ' || (c) == 127))
+# define isCNTRL_A(c) (isASCII(c) && ! isPRINT_A(c))
# define isDIGIT_A(c) ((c) <= '9' && (c) >= '0')
-# define isGRAPH_A(c) (isWORDCHAR_A(c) || isPUNCT_A(c))
-# define isIDFIRST_A(c) (isALPHA_A(c) || (c) == '_')
+/* Graphic characters are the printable ones other than the space itself.
+ * ('!=' must be written as a single token.) */
+# define isGRAPH_A(c) (isPRINT_A(c) && (c) != ' ')
# define isLOWER_A(c) ((c) >= 'a' && (c) <= 'z')
# define isPRINT_A(c) (((c) >= 32 && (c) < 127))
-# define isPSXSPC_A(c) (isSPACE_A(c) || (c) == '\v')
-# define isPUNCT_A(c) (((c) >= 33 && (c) <= 47) \
- || ((c) >= 58 && (c) <= 64) \
- || ((c) >= 91 && (c) <= 96) \
- || ((c) >= 123 && (c) <= 126))
+/* Punctuation is any graphic character that is not a letter or digit.  The
+ * _A form of the alphanumeric test is used so that this macro is built only
+ * from the other ASCII-restricted (_A) macros in this branch. */
+# define isPUNCT_A(c) (isGRAPH_A(c) && ! isALPHANUMERIC_A(c))
# define isSPACE_A(c) ((c) == ' ' \
|| (c) == '\t' \
|| (c) == '\n' \
|| (c) =='\r' \
|| (c) == '\f')
# define isUPPER_A(c) ((c) <= 'Z' && (c) >= 'A')
-# define isWORDCHAR_A(c) (isALPHA_A(c) || isDIGIT_A(c) || (c) == '_')
# define isXDIGIT_A(c) (isDIGIT_A(c) \
|| ((c) >= 'a' && (c) <= 'f') \
|| ((c) <= 'F' && (c) >= 'A'))
* for backwards compatibility */
/* ALPHAU includes Unicode semantics for latin1 characters. It has an extra
* >= AA test to speed up ASCII-only tests at the expense of the others */
-# define isALPHA_L1(c) (isALPHA(c) || (NATIVE_TO_UNI((U8) c) >= 0xAA \
- && ((NATIVE_TO_UNI((U8) c) >= 0xC0 \
- && NATIVE_TO_UNI((U8) c) != 0xD7 && NATIVE_TO_UNI((U8) c) != 0xF7) \
- || NATIVE_TO_UNI((U8) c) == 0xAA \
- || NATIVE_TO_UNI((U8) c) == 0xB5 \
- || NATIVE_TO_UNI((U8) c) == 0xBA)))
+# define isALPHA_L1(c) (isALPHA(c) || (NATIVE_TO_LATIN1((U8) c) >= 0xAA \
+ && ((NATIVE_TO_LATIN1((U8) c) >= 0xC0 \
+ && NATIVE_TO_LATIN1((U8) c) != 0xD7 && NATIVE_TO_LATIN1((U8) c) != 0xF7) \
+ || NATIVE_TO_LATIN1((U8) c) == 0xAA \
+ || NATIVE_TO_LATIN1((U8) c) == 0xB5 \
+ || NATIVE_TO_LATIN1((U8) c) == 0xBA)))
# define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) \
|| (c) == ' ' \
|| (c) == '-' \
|| (c) == '(' \
|| (c) == ')' \
|| (c) == ':' \
- || NATIVE_TO_UNI((U8) c) == 0xA0)
+ || NATIVE_TO_LATIN1((U8) c) == 0xA0)
#endif
/* Macros that differ between EBCDIC and ASCII. Where C89 defines a function,
#ifdef EBCDIC
# define isALPHA(c) isalpha(c)
# define isALPHANUMERIC(c) isalnum(c)
-# define isBLANK(c) ((c) == ' ' || (c) == '\t' || NATIVE_TO_UNI(c) == 0xA0)
+# define isBLANK(c) ((c) == ' ' || (c) == '\t' || NATIVE_TO_LATIN1(c) == 0xA0)
# define isCNTRL(c) iscntrl(c)
# define isDIGIT(c) isdigit(c)
# define isGRAPH(c) isgraph(c)
# define toUPPER(c) (isLOWER(c) ? (c) - ('a' - 'A') : (c))
#endif
+/* In the ASCII range, these are equivalent to what they're here defined to be
+ * (fold is defined as lowercase, and title as uppercase).  By creating these
+ * definitions, other code doesn't have to be aware of this detail */
+#define toFOLD(c) toLOWER(c)
+#define toFOLD_LC(c) toLOWER_LC(c)
+#define toTITLE(c) toUPPER(c)
+
+/* The _A names are plain synonyms for the corresponding macros without the
+ * suffix */
+#define toLOWER_A(c) toLOWER(c)
+#define toUPPER_A(c) toUPPER(c)
+#define toFOLD_A(c) toFOLD(c)
+#define toTITLE_A(c) toTITLE(c)
/* Use table lookup for speed; returns the input itself if it is
* out-of-range */
-#define toLOWER_LATIN1(c) (FITS_IN_8_BITS(c) \
- ? UNI_TO_NATIVE(PL_latin1_lc[ \
- NATIVE_TO_UNI( (U8) (c)) ]) \
- : UNICODE_REPLACEMENT)
+#define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \
+ ? (c) \
+ : PL_latin1_lc[ (U8) (c) ])
+#define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */
+
/* Modified uc. Is correct uc except for three non-ascii chars which are
* all mapped to one of them, and these need special handling; returns the
* input itself if it is out-of-range */
-#define toUPPER_LATIN1_MOD(c) (FITS_IN_8_BITS(c) \
- ? UNI_TO_NATIVE(PL_mod_latin1_uc[ \
- NATIVE_TO_UNI( (U8) (c)) ]) \
- : UNICODE_REPLACEMENT)
-
+#define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \
+ ? (c) \
+ : PL_mod_latin1_uc[ (U8) (c) ])
#ifdef USE_NEXT_CTYPE
# define isALPHANUMERIC_LC(c) NXIsAlNum((unsigned int)(c))
? _generic_isCC(*(p), classnum) \
: (UTF8_IS_DOWNGRADEABLE_START(*(p))) \
? _generic_isCC( \
- TWO_BYTE_UTF8_TO_UNI(*(p), \
+ TWO_BYTE_UTF8_TO_NATIVE(*(p), \
*((p)+1 )), \
- classnum) \
+ classnum) \
: utf8)
/* Like the above, but calls 'above_latin1(p)' to get the utf8 value. 'above_latin1'
* can be a macro */
#define isXDIGIT_utf8(p) _generic_utf8_no_upper_latin1(_CC_XDIGIT, p, \
is_XDIGIT_high(p))
+#define toFOLD_utf8(p,s,l) to_utf8_fold(p,s,l)
#define toLOWER_utf8(p,s,l) to_utf8_lower(p,s,l)
#define toTITLE_utf8(p,s,l) to_utf8_title(p,s,l)
#define toUPPER_utf8(p,s,l) to_utf8_upper(p,s,l)
* use the value given by the 'utf8' parameter. This relies on the fact that
* ASCII characters have the same representation whether utf8 or not. Note
* that it assumes that the utf8 has been validated, and ignores 'use bytes' */
-#define _generic_LC_utf8(macro, p, utf8) \
- (UTF8_IS_INVARIANT(*(p)) \
- ? macro(*(p)) \
- : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \
- ? macro(TWO_BYTE_UTF8_TO_UNI(*(p), *((p)+1))) \
+#define _generic_LC_utf8(macro, p, utf8) \
+ (UTF8_IS_INVARIANT(*(p)) \
+ ? macro(*(p)) \
+ : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \
+ ? macro(TWO_BYTE_UTF8_TO_NATIVE(*(p), *((p)+1))) \
: utf8)
#define _generic_LC_swash_utf8(macro, classnum, p) \
* CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't
* necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but
* that is EOT on ASCII; ST on EBCDIC */
-# define toCTRL(c) (toUPPER(NATIVE_TO_UNI(c)) ^ 64)
+# define toCTRL(c) (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64)
/* Line numbers are unsigned, 32 bits. */
typedef U32 line_t;
* (U16)n > (size_t)~0/sizeof(U16) always being false. */
#ifdef PERL_MALLOC_WRAP
#define MEM_WRAP_CHECK(n,t) \
- (void)(sizeof(t) > 1 && ((MEM_SIZE)(n)+0.0) > MEM_SIZE_MAX/sizeof(t) && (Perl_croak_memory_wrap(),0))
+ (void)(sizeof(t) > 1 && ((MEM_SIZE)(n)+0.0) > MEM_SIZE_MAX/sizeof(t) && (croak_memory_wrap(),0))
#define MEM_WRAP_CHECK_1(n,t,a) \
(void)(sizeof(t) > 1 && ((MEM_SIZE)(n)+0.0) > MEM_SIZE_MAX/sizeof(t) && (Perl_croak_nocontext("%s",(a)),0))
#define MEM_WRAP_CHECK_(n,t) MEM_WRAP_CHECK(n,t),
-#define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (Perl_croak_memory_wrap(),0):0),((n-1+PERL_STRLEN_ROUNDUP_QUANTUM)&~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM-1)))
+/* Round n up using PERL_STRLEN_ROUNDUP_QUANTUM (the mask arithmetic assumes
+ * the quantum is a power of two), first calling croak_memory_wrap() if n is
+ * greater than MEM_SIZE_MAX less two quanta.  n is parenthesized everywhere
+ * it is used so that an expression passed as the argument is rounded as a
+ * whole rather than being split by operator precedence. */
+#define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (croak_memory_wrap(),0):0),(((n)-1+PERL_STRLEN_ROUNDUP_QUANTUM)&~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM-1)))
#else
#define MEM_WRAP_CHECK(n,t)
# define deprecate(s) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), "Use of " s " is deprecated")
#endif
+/* Internal macros to deal with gids and uids.  They are only defined when
+ * PERL_CORE is defined.
+ *
+ * sv_setuid(sv, uid) / sv_setgid(sv, gid) store an id into an SV, and
+ * SvUID(sv) / SvGID(sv) read it back.  Which SV accessor is used depends on
+ * the configured id type:
+ *   - size greater than IVSIZE:  the NV accessors (sv_setnv / SvNV)
+ *   - otherwise, sign <= 0:      the IV accessors (sv_setiv / SvIV)
+ *   - otherwise:                 the UV accessors (sv_setuv / SvUV)
+ * NOTE(review): presumably a sign value <= 0 denotes a signed type; confirm
+ * against whatever defines Uid_t_sign and Gid_t_sign. */
+#ifdef PERL_CORE
+
+/* User ids */
+# if Uid_t_size > IVSIZE
+# define sv_setuid(sv, uid) sv_setnv((sv), (NV)(uid))
+# define SvUID(sv) SvNV(sv)
+# else
+# if Uid_t_sign <= 0
+# define sv_setuid(sv, uid) sv_setiv((sv), (IV)(uid))
+# define SvUID(sv) SvIV(sv)
+# else
+# define sv_setuid(sv, uid) sv_setuv((sv), (UV)(uid))
+# define SvUID(sv) SvUV(sv)
+# endif
+# endif /* Uid_t_size */
+
+/* Group ids; same selection rule as for user ids, keyed on the Gid_t_*
+ * symbols */
+# if Gid_t_size > IVSIZE
+# define sv_setgid(sv, gid) sv_setnv((sv), (NV)(gid))
+# define SvGID(sv) SvNV(sv)
+# else
+# if Gid_t_sign <= 0
+# define sv_setgid(sv, gid) sv_setiv((sv), (IV)(gid))
+# define SvGID(sv) SvIV(sv)
+# else
+# define sv_setgid(sv, gid) sv_setuv((sv), (UV)(gid))
+# define SvGID(sv) SvUV(sv)
+# endif
+# endif /* Gid_t_size */
+
+#endif
+
#endif /* HANDY_H */
/*