perl5.git.perl.org Git - perl5.git/blame_incremental

Commit	Line	Data
	1	/* handy.h
	2	*
	3	* Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000,
	4	* 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/* IMPORTANT NOTE: Everything whose name begins with an underscore is for
	12	* internal core Perl use only. */
	13
	14	#ifndef PERL_HANDY_H_ /* Guard against nested #inclusion */
	15	#define PERL_HANDY_H_
	16
	17	#ifndef PERL_CORE
	18	# define Null(type) ((type)NULL)
	19
	20	/*
	21	=head1 Handy Values
	22
	23	=for apidoc AmU||Nullch
	24	Null character pointer. (No longer available when C<PERL_CORE> is
	25	defined.)
	26
	27	=for apidoc AmU||Nullsv
	28	Null SV pointer. (No longer available when C<PERL_CORE> is defined.)
	29
	30	=cut
	31	*/
	32
	33	# define Nullch Null(char*)
	34	# define Nullfp Null(PerlIO*)
	35	# define Nullsv Null(SV*)
	36	#endif
	37
	38	#ifdef TRUE
	39	#undef TRUE
	40	#endif
	41	#ifdef FALSE
	42	#undef FALSE
	43	#endif
	44	#define TRUE (1)
	45	#define FALSE (0)
	46
	47	/* The MUTABLE_*() macros cast pointers to the types shown, in such a way
	48	* (compiler permitting) that casting away const-ness will give a warning;
	49	* e.g.:
	50	*
	51	* const SV *sv = ...;
	52	* AV *av1 = (AV*)sv; <== BAD: the const has been silently cast away
	53	* AV *av2 = MUTABLE_AV(sv); <== GOOD: it may warn
	54	*/
	55
	56	#if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
	57	# define MUTABLE_PTR(p) ({ void *_p = (p); _p; })
	58	#else
	59	# define MUTABLE_PTR(p) ((void *) (p))
	60	#endif
	61
	62	#define MUTABLE_AV(p) ((AV *)MUTABLE_PTR(p))
	63	#define MUTABLE_CV(p) ((CV *)MUTABLE_PTR(p))
	64	#define MUTABLE_GV(p) ((GV *)MUTABLE_PTR(p))
	65	#define MUTABLE_HV(p) ((HV *)MUTABLE_PTR(p))
	66	#define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p))
	67	#define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p))
	68
	69	#if defined(I_STDBOOL) && !defined(PERL_BOOL_AS_CHAR)
	70	# include <stdbool.h>
	71	# ifndef HAS_BOOL
	72	# define HAS_BOOL 1
	73	# endif
	74	#endif
	75
	76	/* bool is built-in for g++-2.6.3 and later, which might be used
	77	for extensions. <_G_config.h> defines _G_HAVE_BOOL, but we can't
	78	be sure _G_config.h will be included before this file. _G_config.h
	79	also defines _G_HAVE_BOOL for both gcc and g++, but only g++
	80	actually has bool. Hence, _G_HAVE_BOOL is pretty useless for us.
	81	g++ can be identified by __GNUG__.
	82	Andy Dougherty February 2000
	83	*/
	84	#ifdef __GNUG__ /* GNU g++ has bool built-in */
	85	# ifndef PERL_BOOL_AS_CHAR
	86	# ifndef HAS_BOOL
	87	# define HAS_BOOL 1
	88	# endif
	89	# endif
	90	#endif
	91
	92	#ifndef HAS_BOOL
	93	# ifdef bool
	94	# undef bool
	95	# endif
	96	# define bool char
	97	# define HAS_BOOL 1
	98	#endif
	99
	100	/* cast-to-bool. A simple (bool) cast may not do the right thing: if bool is
	101	* defined as char for example, then the cast from int is
	102	* implementation-defined. (bool)!!(cbool) in a ternary triggers a bug in xlc on
	103	* AIX */
	104	#define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0)
	105
	106	/* Try to figure out __func__ or __FUNCTION__ equivalent, if any.
	107	* XXX Should really be a Configure probe, with HAS__FUNCTION__
	108	* and FUNCTION__ as results.
	109	* XXX Similarly, a Configure probe for __FILE__ and __LINE__ is needed. */
	110	#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C)) /* C99 or close enough. */
	111	# define FUNCTION__ __func__
	112	#elif (defined(USING_MSVC6)) || /* MSVC6 has neither __func__ nor __FUNCTION__ and no good workarounds, either. */ \
	113	(defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tru64, -c99 not known, only -std1). */
	114	# define FUNCTION__ ""
	115	#else
	116	# define FUNCTION__ __FUNCTION__ /* Common extension. */
	117	#endif
	118
	119	/* XXX A note on the perl source internal type system. The
	120	original intent was that I32 be *exactly* 32 bits.
	121
	122	Currently, we only guarantee that I32 is *at least* 32 bits.
	123	Specifically, if int is 64 bits, then so is I32. (This is the case
	124	for the Cray.) This has the advantage of meshing nicely with
	125	standard library calls (where we pass an I32 and the library is
	126	expecting an int), but the disadvantage that an I32 is not 32 bits.
	127	Andy Dougherty August 1996
	128
	129	There is no guarantee that there is *any* integral type with
	130	exactly 32 bits. It is perfectly legal for a system to have
	131	sizeof(short) == sizeof(int) == sizeof(long) == 8.
	132
	133	Similarly, there is no guarantee that I16 and U16 have exactly 16
	134	bits.
	135
	136	For dealing with issues that may arise from various 32/64-bit
	137	systems, we will ask Configure to check out
	138
	139	SHORTSIZE == sizeof(short)
	140	INTSIZE == sizeof(int)
	141	LONGSIZE == sizeof(long)
	142	LONGLONGSIZE == sizeof(long long) (if HAS_LONG_LONG)
	143	PTRSIZE == sizeof(void *)
	144	DOUBLESIZE == sizeof(double)
	145	LONG_DOUBLESIZE == sizeof(long double) (if HAS_LONG_DOUBLE).
	146
	147	*/
	148
	149	#ifdef I_INTTYPES /* e.g. Linux has int64_t without <inttypes.h> */
	150	# include <inttypes.h>
	151	# ifdef INT32_MIN_BROKEN
	152	# undef INT32_MIN
	153	# define INT32_MIN (-2147483647-1)
	154	# endif
	155	# ifdef INT64_MIN_BROKEN
	156	# undef INT64_MIN
	157	# define INT64_MIN (-9223372036854775807LL-1)
	158	# endif
	159	#endif
	160
	161	typedef I8TYPE I8;
	162	typedef U8TYPE U8;
	163	typedef I16TYPE I16;
	164	typedef U16TYPE U16;
	165	typedef I32TYPE I32;
	166	typedef U32TYPE U32;
	167
	168	#ifdef HAS_QUAD
	169	typedef I64TYPE I64;
	170	typedef U64TYPE U64;
	171	#endif
	172
	173	/* INT64_C/UINT64_C are C99 from <stdint.h> (so they will not be
	174	* available in strict C89 mode), but they are nice, so let's define
	175	* them if necessary. */
	176	#if defined(HAS_QUAD)
	177	# undef PeRl_INT64_C
	178	# undef PeRl_UINT64_C
	179	/* Prefer the native integer types (int and long) over long long
	180	* (which is not C89) and Win32-specific __int64. */
	181	# if QUADKIND == QUAD_IS_INT && INTSIZE == 8
	182	# define PeRl_INT64_C(c) (c)
	183	# define PeRl_UINT64_C(c) CAT2(c,U)
	184	# endif
	185	# if QUADKIND == QUAD_IS_LONG && LONGSIZE == 8
	186	# define PeRl_INT64_C(c) CAT2(c,L)
	187	# define PeRl_UINT64_C(c) CAT2(c,UL)
	188	# endif
	189	# if QUADKIND == QUAD_IS_LONG_LONG && defined(HAS_LONG_LONG)
	190	# define PeRl_INT64_C(c) CAT2(c,LL)
	191	# define PeRl_UINT64_C(c) CAT2(c,ULL)
	192	# endif
	193	# if QUADKIND == QUAD_IS___INT64
	194	# define PeRl_INT64_C(c) CAT2(c,I64)
	195	# define PeRl_UINT64_C(c) CAT2(c,UI64)
	196	# endif
	197	# ifndef PeRl_INT64_C
	198	# define PeRl_INT64_C(c) ((I64)(c)) /* last resort */
	199	# define PeRl_UINT64_C(c) ((U64)(c))
	200	# endif
	201	/* In OS X the INT64_C/UINT64_C are defined with LL/ULL, which will
	202	* not fly with C89-pedantic gcc, so let's undefine them first so that
	203	* we can redefine them with our native integer preferring versions. */
	204	# if defined(PERL_DARWIN) && defined(PERL_GCC_PEDANTIC)
	205	# undef INT64_C
	206	# undef UINT64_C
	207	# endif
	208	# ifndef INT64_C
	209	# define INT64_C(c) PeRl_INT64_C(c)
	210	# endif
	211	# ifndef UINT64_C
	212	# define UINT64_C(c) PeRl_UINT64_C(c)
	213	# endif
	214	#endif
	215
	216	#if defined(UINT8_MAX) && defined(INT16_MAX) && defined(INT32_MAX)
	217
	218	/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.
	219	Please search CHAR_MAX in perl.h for further details. */
	220	#define U8_MAX UINT8_MAX
	221	#define U8_MIN UINT8_MIN
	222
	223	#define I16_MAX INT16_MAX
	224	#define I16_MIN INT16_MIN
	225	#define U16_MAX UINT16_MAX
	226	#define U16_MIN UINT16_MIN
	227
	228	#define I32_MAX INT32_MAX
	229	#define I32_MIN INT32_MIN
	230	#ifndef UINT32_MAX_BROKEN /* e.g. HP-UX with gcc messes this up */
	231	# define U32_MAX UINT32_MAX
	232	#else
	233	# define U32_MAX 4294967295U
	234	#endif
	235	#define U32_MIN UINT32_MIN
	236
	237	#else
	238
	239	/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.
	240	Please search CHAR_MAX in perl.h for further details. */
	241	#define U8_MAX PERL_UCHAR_MAX
	242	#define U8_MIN PERL_UCHAR_MIN
	243
	244	#define I16_MAX PERL_SHORT_MAX
	245	#define I16_MIN PERL_SHORT_MIN
	246	#define U16_MAX PERL_USHORT_MAX
	247	#define U16_MIN PERL_USHORT_MIN
	248
	249	#if LONGSIZE > 4
	250	# define I32_MAX PERL_INT_MAX
	251	# define I32_MIN PERL_INT_MIN
	252	# define U32_MAX PERL_UINT_MAX
	253	# define U32_MIN PERL_UINT_MIN
	254	#else
	255	# define I32_MAX PERL_LONG_MAX
	256	# define I32_MIN PERL_LONG_MIN
	257	# define U32_MAX PERL_ULONG_MAX
	258	# define U32_MIN PERL_ULONG_MIN
	259	#endif
	260
	261	#endif
	262
	263	/* log(2) (i.e., log base 10 of 2) is pretty close to 0.30103, just in case
	264	* anyone is grepping for it */
	265	#define BIT_DIGITS(N) (((N)*146)/485 + 1) /* log10(2) =~ 146/485 */
	266	#define TYPE_DIGITS(T) BIT_DIGITS(sizeof(T) * 8)
	267	#define TYPE_CHARS(T) (TYPE_DIGITS(T) + 2) /* sign, NUL */
	268
	269	/* Unused by core; should be deprecated */
	270	#define Ctl(ch) ((ch) & 037)
	271
	272	#if defined(PERL_CORE) || defined(PERL_EXT)
	273	# ifndef MIN
	274	# define MIN(a,b) ((a) < (b) ? (a) : (b))
	275	# endif
	276	# ifndef MAX
	277	# define MAX(a,b) ((a) > (b) ? (a) : (b))
	278	# endif
	279	#endif
	280
	281	/* This is a helper macro to avoid preprocessor issues, replaced by nothing
	282	* unless under DEBUGGING, where it expands to an assert of its argument,
	283	* followed by a comma (hence the comma operator). If we just used a straight
	284	* assert(), we would get a comma with nothing before it when not DEBUGGING.
	285	*
	286	* We also use empty definition under Coverity since the __ASSERT_()
	287	* checks often check for things that Really Cannot Happen, and Coverity
	288	* detects that and gets all excited. */
	289
	290	#if defined(DEBUGGING) && !defined(__COVERITY__)
	291	# define __ASSERT_(statement) assert(statement),
	292	#else
	293	# define __ASSERT_(statement)
	294	#endif
	295
	296	/*
	297	=head1 SV-Body Allocation
	298
	299	=for apidoc Ama|SV*|newSVpvs|"literal string" s
	300	Like C<newSVpvn>, but takes a literal string instead of a
	301	string/length pair.
	302
	303	=for apidoc Ama|SV*|newSVpvs_flags|"literal string" s|U32 flags
	304	Like C<newSVpvn_flags>, but takes a literal string instead of
	305	a string/length pair.
	306
	307	=for apidoc Ama|SV*|newSVpvs_share|"literal string" s
	308	Like C<newSVpvn_share>, but takes a literal string instead of
	309	a string/length pair and omits the hash parameter.
	310
	311	=for apidoc Am|void|sv_catpvs_flags|SV* sv|"literal string" s|I32 flags
	312	Like C<sv_catpvn_flags>, but takes a literal string instead
	313	of a string/length pair.
	314
	315	=for apidoc Am|void|sv_catpvs_nomg|SV* sv|"literal string" s
	316	Like C<sv_catpvn_nomg>, but takes a literal string instead of
	317	a string/length pair.
	318
	319	=for apidoc Am|void|sv_catpvs|SV* sv|"literal string" s
	320	Like C<sv_catpvn>, but takes a literal string instead of a
	321	string/length pair.
	322
	323	=for apidoc Am|void|sv_catpvs_mg|SV* sv|"literal string" s
	324	Like C<sv_catpvn_mg>, but takes a literal string instead of a
	325	string/length pair.
	326
	327	=for apidoc Am|void|sv_setpvs|SV* sv|"literal string" s
	328	Like C<sv_setpvn>, but takes a literal string instead of a
	329	string/length pair.
	330
	331	=for apidoc Am|void|sv_setpvs_mg|SV* sv|"literal string" s
	332	Like C<sv_setpvn_mg>, but takes a literal string instead of a
	333	string/length pair.
	334
	335	=for apidoc Am|SV *|sv_setref_pvs|SV *const rv|const char *const classname|"literal string" s
	336	Like C<sv_setref_pvn>, but takes a literal string instead of
	337	a string/length pair.
	338
	339	=head1 Memory Management
	340
	341	=for apidoc Ama|char*|savepvs|"literal string" s
	342	Like C<savepvn>, but takes a literal string instead of a
	343	string/length pair.
	344
	345	=for apidoc Ama|char*|savesharedpvs|"literal string" s
	346	A version of C<savepvs()> which allocates the duplicate string in memory
	347	which is shared between threads.
	348
	349	=head1 GV Functions
	350
	351	=for apidoc Am|HV*|gv_stashpvs|"literal string" name|I32 create
	352	Like C<gv_stashpvn>, but takes a literal string instead of a
	353	string/length pair.
	354
	355	=head1 Hash Manipulation Functions
	356
	357	=for apidoc Am|SV**|hv_fetchs|HV* tb|"literal string" key|I32 lval
	358	Like C<hv_fetch>, but takes a literal string instead of a
	359	string/length pair.
	360
	361	=for apidoc Am|SV**|hv_stores|HV* tb|"literal string" key|NULLOK SV* val
	362	Like C<hv_store>, but takes a literal string instead of a
	363	string/length pair
	364	and omits the hash parameter.
	365
	366	=head1 Lexer interface
	367
	368	=for apidoc Amx|void|lex_stuff_pvs|"literal string" pv|U32 flags
	369
	370	Like L</lex_stuff_pvn>, but takes a literal string instead of
	371	a string/length pair.
	372
	373	=cut
	374	*/
	375
	376	/* concatenating with "" ensures that only literal strings are accepted as
	377	* argument */
	378	#define STR_WITH_LEN(s) ("" s ""), (sizeof(s)-1)
	379
	380	/* note that STR_WITH_LEN() can't be used as argument to macros or functions
	381	* that under some configurations might be macros, which means that it requires
	382	* the full Perl_xxx(aTHX_ ...) form for any API calls where it's used.
	383	*/
	384
	385	/* STR_WITH_LEN() shortcuts */
	386	#define newSVpvs(str) Perl_newSVpvn(aTHX_ STR_WITH_LEN(str))
	387	#define newSVpvs_flags(str,flags) \
	388	Perl_newSVpvn_flags(aTHX_ STR_WITH_LEN(str), flags)
	389	#define newSVpvs_share(str) Perl_newSVpvn_share(aTHX_ STR_WITH_LEN(str), 0)
	390	#define sv_catpvs_flags(sv, str, flags) \
	391	Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), flags)
	392	#define sv_catpvs_nomg(sv, str) \
	393	Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), 0)
	394	#define sv_catpvs(sv, str) \
	395	Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC)
	396	#define sv_catpvs_mg(sv, str) \
	397	Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC|SV_SMAGIC)
	398	#define sv_setpvs(sv, str) Perl_sv_setpvn(aTHX_ sv, STR_WITH_LEN(str))
	399	#define sv_setpvs_mg(sv, str) Perl_sv_setpvn_mg(aTHX_ sv, STR_WITH_LEN(str))
	400	#define sv_setref_pvs(rv, classname, str) \
	401	Perl_sv_setref_pvn(aTHX_ rv, classname, STR_WITH_LEN(str))
	402	#define savepvs(str) Perl_savepvn(aTHX_ STR_WITH_LEN(str))
	403	#define savesharedpvs(str) Perl_savesharedpvn(aTHX_ STR_WITH_LEN(str))
	404	#define gv_stashpvs(str, create) \
	405	Perl_gv_stashpvn(aTHX_ STR_WITH_LEN(str), create)
	406	#define gv_fetchpvs(namebeg, add, sv_type) \
	407	Perl_gv_fetchpvn_flags(aTHX_ STR_WITH_LEN(namebeg), add, sv_type)
	408	#define gv_fetchpvn(namebeg, len, add, sv_type) \
	409	Perl_gv_fetchpvn_flags(aTHX_ namebeg, len, add, sv_type)
	410	#define sv_catxmlpvs(dsv, str, utf8) \
	411	Perl_sv_catxmlpvn(aTHX_ dsv, STR_WITH_LEN(str), utf8)
	412
	413
	414	#define lex_stuff_pvs(pv,flags) Perl_lex_stuff_pvn(aTHX_ STR_WITH_LEN(pv), flags)
	415
	416	#define get_cvs(str, flags) \
	417	Perl_get_cvn_flags(aTHX_ STR_WITH_LEN(str), (flags))
	418
	419	/*
	420	=head1 Miscellaneous Functions
	421
	422	=for apidoc Am|bool|strNE|char* s1|char* s2
	423	Test two C<NUL>-terminated strings to see if they are different. Returns true
	424	or false.
	425
	426	=for apidoc Am|bool|strEQ|char* s1|char* s2
	427	Test two C<NUL>-terminated strings to see if they are equal. Returns true or
	428	false.
	429
	430	=for apidoc Am|bool|strLT|char* s1|char* s2
	431	Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than the
	432	second, C<s2>. Returns true or false.
	433
	434	=for apidoc Am|bool|strLE|char* s1|char* s2
	435	Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than or
	436	equal to the second, C<s2>. Returns true or false.
	437
	438	=for apidoc Am|bool|strGT|char* s1|char* s2
	439	Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
	440	the second, C<s2>. Returns true or false.
	441
	442	=for apidoc Am|bool|strGE|char* s1|char* s2
	443	Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
	444	or equal to the second, C<s2>. Returns true or false.
	445
	446	=for apidoc Am|bool|strnNE|char* s1|char* s2|STRLEN len
	447	Test two C<NUL>-terminated strings to see if they are different. The C<len>
	448	parameter indicates the number of bytes to compare. Returns true or false. (A
	449	wrapper for C<strncmp>).
	450
	451	=for apidoc Am|bool|strnEQ|char* s1|char* s2|STRLEN len
	452	Test two C<NUL>-terminated strings to see if they are equal. The C<len>
	453	parameter indicates the number of bytes to compare. Returns true or false. (A
	454	wrapper for C<strncmp>).
	455
	456	=for apidoc Am|bool|memEQ|char* s1|char* s2|STRLEN len
	457	Test two buffers (which may contain embedded C<NUL> characters) to see if they
	458	are equal. The C<len> parameter indicates the number of bytes to compare.
	459	Returns true if the buffers are equal, false otherwise.
	460
	461	=for apidoc Am|bool|memNE|char* s1|char* s2|STRLEN len
	462	Test two buffers (which may contain embedded C<NUL> characters) to see if they
	463	are not equal. The C<len> parameter indicates the number of bytes to compare.
	464	Returns true if the buffers differ, false if they are equal.
	465
	466	=cut
	467
	468	New macros should use the following conventions for their names (which are
	469	based on the underlying C library functions):
	470
	471	(mem | str n? ) (EQ | NE | LT | LE | GT | GE | (( BEGIN | END ) P? )) l? s?
	472
	473	Each has two main parameters, string-like operands that are compared
	474	against each other, as specified by the macro name. Some macros may
	475	additionally have one or potentially even two length parameters. If a length
	476	parameter applies to both string parameters, it will be positioned third;
	477	otherwise any length parameter immediately follows the string parameter it
	478	applies to.
	479
	480	If the prefix to the name is 'str', the string parameter is a pointer to a C
	481	language string. Such a string does not contain embedded NUL bytes; its
	482	length may be unknown, but can be calculated by C<strlen()>, since it is
	483	terminated by a NUL, which isn't included in its length.
	484
	485	The optional 'n' following 'str' means that there is a third parameter,
	486	giving the maximum number of bytes to look at in each string. Even if both
	487	strings are longer than the length parameter, those extra bytes will be
	488	unexamined.
	489
	490	The 's' suffix means that the 2nd byte string parameter is a literal C
	491	double-quoted string. Its length will automatically be calculated by the
	492	macro, so no length parameter will ever be needed for it.
	493
	494	If the prefix is 'mem', the string parameters don't have to be C strings;
	495	they may contain embedded NUL bytes, do not necessarily have a terminating
	496	NUL, and their lengths can be known only through other means, which in
	497	practice are additional parameter(s) passed to the function. All 'mem'
	498	functions have at least one length parameter. Barring any 'l' or 's' suffix,
	499	there is a single length parameter, in position 3, which applies to both
	500	string parameters. The 's' suffix means, as described above, that the 2nd

1

/* handy.h

2

*

3

* Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000,

4

* 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others

5

*

6

* You may distribute under the terms of either the GNU General Public

7

* License or the Artistic License, as specified in the README file.

*

*/

/* IMPORTANT NOTE: Everything whose name begins with an underscore is for

12

* internal core Perl use only. */

13

14

#ifndef PERL_HANDY_H_ /* Guard against nested #inclusion */

15

#define PERL_HANDY_H_

16

17

#ifndef PERL_CORE

18

# define Null(type) ((type)NULL)

/*

=head1 Handy Values

=for apidoc AmU||Nullch

24

Null character pointer. (No longer available when C<PERL_CORE> is

25

defined.)

26

27

=for apidoc AmU||Nullsv

28

Null SV pointer. (No longer available when C<PERL_CORE> is defined.)

=cut

*/

# define Nullch Null(char*)

34

# define Nullfp Null(PerlIO*)

35

# define Nullsv Null(SV*)

#endif

#ifdef TRUE

#undef TRUE

#endif

#ifdef FALSE

#undef FALSE

#endif

#define TRUE (1)

#define FALSE (0)

/* The MUTABLE_*() macros cast pointers to the types shown, in such a way

48

* (compiler permitting) that casting away const-ness will give a warning;

49

* e.g.:

50

*

51

* const SV *sv = ...;

52

* AV *av1 = (AV*)sv; <== BAD: the const has been silently cast away

53

* AV *av2 = MUTABLE_AV(sv); <== GOOD: it may warn

54

*/

55

56

#if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)

57

# define MUTABLE_PTR(p) ({ void *_p = (p); _p; })

58

#else

59

# define MUTABLE_PTR(p) ((void *) (p))

60

#endif

61

62

#define MUTABLE_AV(p) ((AV *)MUTABLE_PTR(p))

63

#define MUTABLE_CV(p) ((CV *)MUTABLE_PTR(p))

64

#define MUTABLE_GV(p) ((GV *)MUTABLE_PTR(p))

65

#define MUTABLE_HV(p) ((HV *)MUTABLE_PTR(p))

66

#define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p))

67

#define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p))

68

69

#if defined(I_STDBOOL) && !defined(PERL_BOOL_AS_CHAR)

70

# include <stdbool.h>

# ifndef HAS_BOOL

# define HAS_BOOL 1

# endif

#endif

/* bool is built-in for g++-2.6.3 and later, which might be used

77

for extensions. <_G_config.h> defines _G_HAVE_BOOL, but we can't

78

be sure _G_config.h will be included before this file. _G_config.h

79

also defines _G_HAVE_BOOL for both gcc and g++, but only g++

80

actually has bool. Hence, _G_HAVE_BOOL is pretty useless for us.

81

g++ can be identified by __GNUG__.

82

Andy Dougherty February 2000

83

*/

84

#ifdef __GNUG__ /* GNU g++ has bool built-in */

85

# ifndef PERL_BOOL_AS_CHAR

# ifndef HAS_BOOL

# define HAS_BOOL 1

# endif

# endif

#endif

#ifndef HAS_BOOL

# ifdef bool

# undef bool

# endif

# define bool char

# define HAS_BOOL 1

#endif

/* cast-to-bool. A simple (bool) cast may not do the right thing: if bool is

101

* defined as char for example, then the cast from int is

102

* implementation-defined. (bool)!!(cbool) in a ternary triggers a bug in xlc on

103

* AIX */

104

#define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0)

105

106

/* Try to figure out __func__ or __FUNCTION__ equivalent, if any.

107

* XXX Should really be a Configure probe, with HAS__FUNCTION__

108

* and FUNCTION__ as results.

109

* XXX Similarly, a Configure probe for __FILE__ and __LINE__ is needed. */

110

#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C)) /* C99 or close enough. */

111

# define FUNCTION__ __func__

112

#elif (defined(USING_MSVC6)) || /* MSVC6 has neither __func__ nor __FUNCTION__ and no good workarounds, either. */ \

113

(defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tru64, -c99 not known, only -std1). */

114

# define FUNCTION__ ""

115

#else

116

# define FUNCTION__ __FUNCTION__ /* Common extension. */

117

#endif

118

119

/* XXX A note on the perl source internal type system. The

120

original intent was that I32 be *exactly* 32 bits.

121

122

Currently, we only guarantee that I32 is *at least* 32 bits.

123

Specifically, if int is 64 bits, then so is I32. (This is the case

124

for the Cray.) This has the advantage of meshing nicely with

125

standard library calls (where we pass an I32 and the library is

126

expecting an int), but the disadvantage that an I32 is not 32 bits.

127

Andy Dougherty August 1996

128

129

There is no guarantee that there is *any* integral type with

130

exactly 32 bits. It is perfectly legal for a system to have

131

sizeof(short) == sizeof(int) == sizeof(long) == 8.

132

133

Similarly, there is no guarantee that I16 and U16 have exactly 16

134

bits.

135

136

For dealing with issues that may arise from various 32/64-bit

137

systems, we will ask Configure to check out

138

139

SHORTSIZE == sizeof(short)

140

INTSIZE == sizeof(int)

141

LONGSIZE == sizeof(long)

142

LONGLONGSIZE == sizeof(long long) (if HAS_LONG_LONG)

143

PTRSIZE == sizeof(void *)

144

DOUBLESIZE == sizeof(double)

145

LONG_DOUBLESIZE == sizeof(long double) (if HAS_LONG_DOUBLE).

*/

#ifdef I_INTTYPES /* e.g. Linux has int64_t without <inttypes.h> */

150

# include <inttypes.h>

151

# ifdef INT32_MIN_BROKEN

152

# undef INT32_MIN

153

# define INT32_MIN (-2147483647-1)

154

# endif

155

# ifdef INT64_MIN_BROKEN

156

# undef INT64_MIN

157

# define INT64_MIN (-9223372036854775807LL-1)

# endif

#endif

typedef I8TYPE I8;

typedef U8TYPE U8;

typedef I16TYPE I16;

typedef U16TYPE U16;

typedef I32TYPE I32;

typedef U32TYPE U32;

#ifdef HAS_QUAD

typedef I64TYPE I64;

typedef U64TYPE U64;

#endif

/* INT64_C/UINT64_C are C99 from <stdint.h> (so they will not be

174

* available in strict C89 mode), but they are nice, so let's define

175

* them if necessary. */

176

#if defined(HAS_QUAD)

177

# undef PeRl_INT64_C

178

# undef PeRl_UINT64_C

179

/* Prefer the native integer types (int and long) over long long

180

* (which is not C89) and Win32-specific __int64. */

181

# if QUADKIND == QUAD_IS_INT && INTSIZE == 8

182

# define PeRl_INT64_C(c) (c)

183

# define PeRl_UINT64_C(c) CAT2(c,U)

184

# endif

185

# if QUADKIND == QUAD_IS_LONG && LONGSIZE == 8

186

# define PeRl_INT64_C(c) CAT2(c,L)

187

# define PeRl_UINT64_C(c) CAT2(c,UL)

188

# endif

189

# if QUADKIND == QUAD_IS_LONG_LONG && defined(HAS_LONG_LONG)

190

# define PeRl_INT64_C(c) CAT2(c,LL)

191

# define PeRl_UINT64_C(c) CAT2(c,ULL)

192

# endif

193

# if QUADKIND == QUAD_IS___INT64

194

# define PeRl_INT64_C(c) CAT2(c,I64)

195

# define PeRl_UINT64_C(c) CAT2(c,UI64)

196

# endif

197

# ifndef PeRl_INT64_C

198

# define PeRl_INT64_C(c) ((I64)(c)) /* last resort */

199

# define PeRl_UINT64_C(c) ((U64)(c))

200

# endif

201

/* In OS X the INT64_C/UINT64_C are defined with LL/ULL, which will

202

* not fly with C89-pedantic gcc, so let's undefine them first so that

203

* we can redefine them with our native integer preferring versions. */

204

# if defined(PERL_DARWIN) && defined(PERL_GCC_PEDANTIC)

# undef INT64_C

# undef UINT64_C

# endif

# ifndef INT64_C

# define INT64_C(c) PeRl_INT64_C(c)

210

# endif

211

# ifndef UINT64_C

212

# define UINT64_C(c) PeRl_UINT64_C(c)

# endif

#endif

#if defined(UINT8_MAX) && defined(INT16_MAX) && defined(INT32_MAX)

217

218

/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.

219

Please search CHAR_MAX in perl.h for further details. */

220

#define U8_MAX UINT8_MAX

221

#define U8_MIN UINT8_MIN

222

223

#define I16_MAX INT16_MAX

224

#define I16_MIN INT16_MIN

225

#define U16_MAX UINT16_MAX

226

#define U16_MIN UINT16_MIN

227

228

#define I32_MAX INT32_MAX

229

#define I32_MIN INT32_MIN

230

#ifndef UINT32_MAX_BROKEN /* e.g. HP-UX with gcc messes this up */

231

# define U32_MAX UINT32_MAX

232

#else

233

# define U32_MAX 4294967295U

234

#endif

235

#define U32_MIN UINT32_MIN

#else

/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.

240

Please search CHAR_MAX in perl.h for further details. */

241

#define U8_MAX PERL_UCHAR_MAX

242

#define U8_MIN PERL_UCHAR_MIN

243

244

#define I16_MAX PERL_SHORT_MAX

245

#define I16_MIN PERL_SHORT_MIN

246

#define U16_MAX PERL_USHORT_MAX

247

#define U16_MIN PERL_USHORT_MIN

248

249

#if LONGSIZE > 4

250

# define I32_MAX PERL_INT_MAX

251

# define I32_MIN PERL_INT_MIN

252

# define U32_MAX PERL_UINT_MAX

253

# define U32_MIN PERL_UINT_MIN

254

#else

255

# define I32_MAX PERL_LONG_MAX

256

# define I32_MIN PERL_LONG_MIN

257

# define U32_MAX PERL_ULONG_MAX

258

# define U32_MIN PERL_ULONG_MIN

#endif

#endif

/* log(2) (i.e., log base 10 of 2) is pretty close to 0.30103, just in case

264

* anyone is grepping for it */

265

#define BIT_DIGITS(N) (((N)*146)/485 + 1) /* log10(2) =~ 146/485 */

266

#define TYPE_DIGITS(T) BIT_DIGITS(sizeof(T) * 8)

267

#define TYPE_CHARS(T) (TYPE_DIGITS(T) + 2) /* sign, NUL */

268

269

/* Unused by core; should be deprecated */

270

#define Ctl(ch) ((ch) & 037)

271

272

#if defined(PERL_CORE) || defined(PERL_EXT)

273

# ifndef MIN

274

# define MIN(a,b) ((a) < (b) ? (a) : (b))

275

# endif

276

# ifndef MAX

277

# define MAX(a,b) ((a) > (b) ? (a) : (b))

# endif

#endif

/* This is a helper macro to avoid preprocessor issues, replaced by nothing

282

* unless under DEBUGGING, where it expands to an assert of its argument,

283

* followed by a comma (hence the comma operator). If we just used a straight

284

* assert(), we would get a comma with nothing before it when not DEBUGGING.

285

*

286

* We also use empty definition under Coverity since the __ASSERT_()

287

* checks often check for things that Really Cannot Happen, and Coverity

288

* detects that and gets all excited. */

289

290

#if defined(DEBUGGING) && !defined(__COVERITY__)

291

# define __ASSERT_(statement) assert(statement),

292

#else

293

# define __ASSERT_(statement)

#endif

/*

=head1 SV-Body Allocation

298

299

=for apidoc Ama|SV*|newSVpvs|"literal string" s

300

Like C<newSVpvn>, but takes a literal string instead of a

301

string/length pair.

302

303

=for apidoc Ama|SV*|newSVpvs_flags|"literal string" s|U32 flags

304

Like C<newSVpvn_flags>, but takes a literal string instead of

305

a string/length pair.

306

307

=for apidoc Ama|SV*|newSVpvs_share|"literal string" s

308

Like C<newSVpvn_share>, but takes a literal string instead of

309

a string/length pair and omits the hash parameter.

310

311

=for apidoc Am|void|sv_catpvs_flags|SV* sv|"literal string" s|I32 flags

312

Like C<sv_catpvn_flags>, but takes a literal string instead

313

of a string/length pair.

314

315

=for apidoc Am|void|sv_catpvs_nomg|SV* sv|"literal string" s

316

Like C<sv_catpvn_nomg>, but takes a literal string instead of

317

a string/length pair.

318

319

=for apidoc Am|void|sv_catpvs|SV* sv|"literal string" s

320

Like C<sv_catpvn>, but takes a literal string instead of a

321

string/length pair.

322

323

=for apidoc Am|void|sv_catpvs_mg|SV* sv|"literal string" s

324

Like C<sv_catpvn_mg>, but takes a literal string instead of a

325

string/length pair.

326

327

=for apidoc Am|void|sv_setpvs|SV* sv|"literal string" s

328

Like C<sv_setpvn>, but takes a literal string instead of a

329

string/length pair.

330

331

=for apidoc Am|void|sv_setpvs_mg|SV* sv|"literal string" s

332

Like C<sv_setpvn_mg>, but takes a literal string instead of a

333

string/length pair.

334

335

=for apidoc Am|SV *|sv_setref_pvs|SV *const rv|const char *const classname|"literal string" s

336

Like C<sv_setref_pvn>, but takes a literal string instead of

337

a string/length pair.

338

339

=head1 Memory Management

340

341

=for apidoc Ama|char*|savepvs|"literal string" s

342

Like C<savepvn>, but takes a literal string instead of a

343

string/length pair.

344

345

=for apidoc Ama|char*|savesharedpvs|"literal string" s

346

A version of C<savepvs()> which allocates the duplicate string in memory

347

which is shared between threads.

=head1 GV Functions

=for apidoc Am|HV*|gv_stashpvs|"literal string" name|I32 create

352

Like C<gv_stashpvn>, but takes a literal string instead of a

353

string/length pair.

354

355

=head1 Hash Manipulation Functions

356

357

=for apidoc Am|SV**|hv_fetchs|HV* tb|"literal string" key|I32 lval

358

Like C<hv_fetch>, but takes a literal string instead of a

string/length pair.

=for apidoc Am|SV**|hv_stores|HV* tb|"literal string" key|NULLOK SV* val

Like C<hv_store>, but takes a literal string instead of a

363

string/length pair

364

and omits the hash parameter.

365

366

=head1 Lexer interface

367

368

=for apidoc Amx|void|lex_stuff_pvs|"literal string" pv|U32 flags

369

370

Like L</lex_stuff_pvn>, but takes a literal string instead of

371

a string/length pair.

=cut

*/

/* STR_WITH_LEN(s) expands to TWO comma-separated items: the string and its
 * length (not counting the terminating NUL).  Pasting "" onto both sides of
 * 's' ensures that only literal strings are accepted as the argument. */
#define STR_WITH_LEN(s)  ("" s ""), (sizeof(s) - 1)

379

380

/* note that STR_WITH_LEN() can't be used as argument to macros or functions

381

* that under some configurations might be macros, which means that it requires

382

* the full Perl_xxx(aTHX_ ...) form for any API calls where it's used.

383

*/

384

385

/* STR_WITH_LEN() shortcuts.
 *
 * Each macro below takes a literal string and forwards it, together with its
 * compile-time length, to the corresponding string/length function.  The
 * explicit Perl_xxx(aTHX_ ...) form is used because STR_WITH_LEN() expands to
 * two arguments and so cannot be passed through another macro.
 * (gv_fetchpvn is the exception: it takes an explicit pointer and length.) */
#define newSVpvs(str) Perl_newSVpvn(aTHX_ STR_WITH_LEN(str))
#define newSVpvs_flags(str,flags) \
    Perl_newSVpvn_flags(aTHX_ STR_WITH_LEN(str), flags)
#define newSVpvs_share(str) Perl_newSVpvn_share(aTHX_ STR_WITH_LEN(str), 0)
#define sv_catpvs_flags(sv, str, flags) \
    Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), flags)
#define sv_catpvs_nomg(sv, str) \
    Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), 0)
#define sv_catpvs(sv, str) \
    Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC)
#define sv_catpvs_mg(sv, str) \
    Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC|SV_SMAGIC)
#define sv_setpvs(sv, str) Perl_sv_setpvn(aTHX_ sv, STR_WITH_LEN(str))
#define sv_setpvs_mg(sv, str) Perl_sv_setpvn_mg(aTHX_ sv, STR_WITH_LEN(str))
#define sv_setref_pvs(rv, classname, str) \
    Perl_sv_setref_pvn(aTHX_ rv, classname, STR_WITH_LEN(str))
#define savepvs(str) Perl_savepvn(aTHX_ STR_WITH_LEN(str))
#define savesharedpvs(str) Perl_savesharedpvn(aTHX_ STR_WITH_LEN(str))
#define gv_stashpvs(str, create) \
    Perl_gv_stashpvn(aTHX_ STR_WITH_LEN(str), create)
#define gv_fetchpvs(namebeg, add, sv_type) \
    Perl_gv_fetchpvn_flags(aTHX_ STR_WITH_LEN(namebeg), add, sv_type)
#define gv_fetchpvn(namebeg, len, add, sv_type) \
    Perl_gv_fetchpvn_flags(aTHX_ namebeg, len, add, sv_type)
#define sv_catxmlpvs(dsv, str, utf8) \
    Perl_sv_catxmlpvn(aTHX_ dsv, STR_WITH_LEN(str), utf8)


#define lex_stuff_pvs(pv,flags) Perl_lex_stuff_pvn(aTHX_ STR_WITH_LEN(pv), flags)

#define get_cvs(str, flags) \
    Perl_get_cvn_flags(aTHX_ STR_WITH_LEN(str), (flags))

418

419

/*

420

=head1 Miscellaneous Functions

421

422

=for apidoc Am|bool|strNE|char* s1|char* s2

423

Test two C<NUL>-terminated strings to see if they are different. Returns true

424

or false.

425

426

=for apidoc Am|bool|strEQ|char* s1|char* s2

427

Test two C<NUL>-terminated strings to see if they are equal. Returns true or

428

false.

429

430

=for apidoc Am|bool|strLT|char* s1|char* s2

431

Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than the

432

second, C<s2>. Returns true or false.

433

434

=for apidoc Am|bool|strLE|char* s1|char* s2

435

Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than or

436

equal to the second, C<s2>. Returns true or false.

437

438

=for apidoc Am|bool|strGT|char* s1|char* s2

439

Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than

440

the second, C<s2>. Returns true or false.

441

442

=for apidoc Am|bool|strGE|char* s1|char* s2

443

Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than

444

or equal to the second, C<s2>. Returns true or false.

445

446

447

Test two C<NUL>-terminated strings to see if they are different. The C<len>

448

parameter indicates the number of bytes to compare. Returns true or false. (A

449

wrapper for C<strncmp>).

450

451

452

Test two C<NUL>-terminated strings to see if they are equal. The C<len>

453

parameter indicates the number of bytes to compare. Returns true or false. (A

454

wrapper for C<strncmp>).

455

456

457

Test two buffers (which may contain embedded C<NUL> characters) to see if they

458

are equal. The C<len> parameter indicates the number of bytes to compare.

459

Returns true if equal, or false if non-equal.

460

461

462

Test two buffers (which may contain embedded C<NUL> characters) to see if they

463

are not equal. The C<len> parameter indicates the number of bytes to compare.

464

Returns true if non-equal, or false if equal.

=cut

New macros should use the following conventions for their names (which are

469

based on the underlying C library functions):

470

471

(mem | str n? ) (EQ | NE | LT | LE | GT | GE | (( BEGIN | END ) P? )) l? s?

472

473

Each has two main parameters, string-like operands that are compared

474

against each other, as specified by the macro name. Some macros may

475

additionally have one or potentially even two length parameters. If a length

476

parameter applies to both string parameters, it will be positioned third;

477

otherwise any length parameter immediately follows the string parameter it

478

applies to.

479

480

If the prefix to the name is 'str', the string parameter is a pointer to a C

481

language string. Such a string does not contain embedded NUL bytes; its

482

length may be unknown, but can be calculated by C<strlen()>, since it is

483

terminated by a NUL, which isn't included in its length.

484

485

The optional 'n' following 'str' means that there is a third parameter,

486

giving the maximum number of bytes to look at in each string. Even if both

487

strings are longer than the length parameter, those extra bytes will be

488

unexamined.

489

490

The 's' suffix means that the 2nd byte string parameter is a literal C

491

double-quoted string. Its length will automatically be calculated by the

492

macro, so no length parameter will ever be needed for it.

493

494

If the prefix is 'mem', the string parameters don't have to be C strings;

495

they may contain embedded NUL bytes, do not necessarily have a terminating

496

NUL, and their lengths can be known only through other means, which in

497

practice are additional parameter(s) passed to the function. All 'mem'

498

functions have at least one length parameter. Barring any 'l' or 's' suffix,

499

there is a single length parameter, in position 3, which applies to both

500

string parameters. The 's' suffix means, as described above, that the 2nd

501

string is a literal double-quoted C string (hence its length is calculated by

502

the macro, and the length parameter to the function applies just to the first

503

string parameter, and hence is positioned just after it). An 'l' suffix

504

means that the 2nd string parameter has its own length parameter, and the

505

signature will look like memFOOl(s1, l1, s2, l2).

506

507

BEGIN (and END) are for testing if the 2nd string is an initial (or final)

508

substring of the 1st string. 'P' if present indicates that the substring

509

must be a "proper" one in the mathematical sense that the first one must be

510

strictly larger than the 2nd.

*/

/* Whole-string comparisons of two NUL-terminated strings, built on strcmp().
 * Each evaluates to 1 if the named relation holds and 0 otherwise. */
#define strNE(s1,s2) (strcmp(s1,s2) != 0)
#define strEQ(s1,s2) (strcmp(s1,s2) == 0)
#define strLT(s1,s2) (strcmp(s1,s2) < 0)
#define strLE(s1,s2) (strcmp(s1,s2) <= 0)
#define strGT(s1,s2) (strcmp(s1,s2) > 0)
#define strGE(s1,s2) (strcmp(s1,s2) >= 0)

521

522

/* Length-limited string comparisons, built on strncmp(): at most 'l' bytes of
 * each string are examined. */
#define strnNE(s1,s2,l) (strncmp(s1,s2,l) != 0)
#define strnEQ(s1,s2,l) (strncmp(s1,s2,l) == 0)

524

525

/* Buffer (in)equality over exactly 'l' bytes, built on memcmp().  The buffers
 * may contain embedded NULs.  Each evaluates to 1 (true) or 0 (false). */
#define memNE(s1,s2,l) (memcmp(s1,s2,l) != 0)
#define memEQ(s1,s2,l) (memcmp(s1,s2,l) == 0)

/* memEQ and memNE where second comparand is a string constant.  'l' is the
 * length of s1; the lengths are compared first, so the bytes are only looked
 * at when 'l' equals the literal's length. */
#define memEQs(s1, l, s2) \
        (((sizeof(s2)-1) == (l)) && memEQ((s1), ("" s2 ""), (sizeof(s2)-1)))
#define memNEs(s1, l, s2) (! memEQs(s1, l, s2))

532

533

/* Keep these private until we decide it was a good idea */

534

#if defined(PERL_CORE) || defined(PERL_EXT) || defined(PERL_EXT_POSIX)

535

536

/* True iff the NUL-terminated string s1 starts with the literal string s2. */
#define strBEGINs(s1,s2) \
        (strncmp((s1), "" s2 "", sizeof(s2) - 1) == 0)

537

538

/* True iff the buffer s1, of length l, starts with the literal string s2.
 * The length check comes first, so s1 is never read past its end. */
#define memBEGINs(s1, l, s2) \
        (   (l) >= sizeof(s2) - 1 \
         && memEQ(s1, "" s2 "", sizeof(s2)-1))

541

/* Like memBEGINs, but s2 must be a proper initial substring: the buffer s1
 * (length l) has to be strictly longer than the literal string s2. */
#define memBEGINPs(s1, l, s2) \
        (   (l) > sizeof(s2) - 1 \
         && memEQ(s1, "" s2 "", sizeof(s2)-1))

544

/* True iff the buffer s1, of length l, ends with the literal string s2.
 * The length check comes first, so the start of the compared tail is never
 * before s1.  s1 is parenthesized because it takes part in pointer
 * arithmetic; without that, an argument such as 'c ? a : b' would mis-bind. */
#define memENDs(s1, l, s2) \
        (   (l) >= sizeof(s2) - 1 \
         && memEQ((s1) + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1))

547

/* Like memENDs, but s2 must be a proper final substring: the buffer s1
 * (length l) has to be strictly longer than the literal string s2.  The
 * literal's length is sizeof(s2) - 1 (sizeof counts the trailing NUL), so
 * that is the value l is compared against.  s1 is parenthesized because it
 * takes part in pointer arithmetic. */
#define memENDPs(s1, l, s2) \
        (   (l) > sizeof(s2) - 1 \
         && memEQ((s1) + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1))

550

#endif /* End of making macros private */

551

552

/* Ordering comparisons over exactly 'l' bytes of two buffers, built on
 * memcmp().  Each evaluates to 1 (true) or 0 (false). */
#define memLT(s1,s2,l) (memcmp(s1,s2,l) < 0)
#define memLE(s1,s2,l) (memcmp(s1,s2,l) <= 0)
#define memGT(s1,s2,l) (memcmp(s1,s2,l) > 0)
#define memGE(s1,s2,l) (memcmp(s1,s2,l) >= 0)

/*

* Character classes.

*

* Unfortunately, the introduction of locales means that we

561

* can't trust isupper(), etc. to tell the truth. And when

562

* it comes to /\w+/ with tainting enabled, we *must* be able

563

* to trust our character classes.

564

*

565

* Therefore, the default tests in the text of Perl will be

566

* independent of locale. Any code that wants to depend on

567

* the current locale will use the tests that begin with "lc".

568

*/

569

570

/* Define CTYPE256, unless it already is, whenever HAS_SETLOCALE is defined.
 * XXX Is there a better test for this? */
#if defined(HAS_SETLOCALE) && !defined(CTYPE256)
#  define CTYPE256
#endif

/*

=head1 Character classification

579

This section is about functions (really macros) that classify characters

580

into types, such as punctuation versus alphabetic, etc. Most of these are

581

analogous to regular expression character classes. (See

582

L<perlrecharclass/POSIX Character Classes>.) There are several variants for

583

each class. (Not all macros have all variants; each item below lists the

584

ones valid for it.) None are affected by C<use bytes>, and only the ones

585

with C<LC> in the name are affected by the current locale.

586

587

The base function, e.g., C<isALPHA()>, takes an octet (either a C<char> or a

588

C<U8>) as input and returns a boolean as to whether or not the character

589

represented by that octet is (or on non-ASCII platforms, corresponds to) an

590

ASCII character in the named class based on platform, Unicode, and Perl rules.

591

If the input is a number that doesn't fit in an octet, FALSE is returned.

592

593

Variant C<isI<FOO>_A> (e.g., C<isALPHA_A()>) is identical to the base function

594

with no suffix C<"_A">. This variant is used to emphasize by its name that

595

only ASCII-range characters can return TRUE.

596

597

Variant C<isI<FOO>_L1> imposes the Latin-1 (or EBCDIC equivalent) character set

598

onto the platform. That is, the code points that are ASCII are unaffected,

599

since ASCII is a subset of Latin-1. But the non-ASCII code points are treated

600

as if they are Latin-1 characters. For example, C<isWORDCHAR_L1()> will return

601

true when called with the code point 0xDF, which is a word character in both

602

ASCII and EBCDIC (though it represents different characters in each).

603

604

Variant C<isI<FOO>_uvchr> is like the C<isI<FOO>_L1> variant, but accepts any UV code

605

point as input. If the code point is larger than 255, Unicode rules are used

606

to determine if it is in the character class. For example,

607

C<isWORDCHAR_uvchr(0x100)> returns TRUE, since 0x100 is LATIN CAPITAL LETTER A

608

WITH MACRON in Unicode, and is a word character.

609

610

Variant C<isI<FOO>_utf8_safe> is like C<isI<FOO>_uvchr>, but is used for UTF-8

611

encoded strings. Each call classifies one character, even if the string

612

contains many. This variant takes two parameters. The first, C<p>, is a

613

pointer to the first byte of the character to be classified. (Recall that it

614

may take more than one byte to represent a character in UTF-8 strings.) The

615

second parameter, C<e>, points to anywhere in the string beyond the first

616

character, up to one byte past the end of the entire string. The suffix

617

C<_safe> in the function's name indicates that it will not attempt to read

618

beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is true (this

619

is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the input

620

character is malformed in some way, the program may croak, or the function may

621

return FALSE, at the discretion of the implementation, and subject to change in

622

future releases.

623

624

Variant C<isI<FOO>_utf8> is like C<isI<FOO>_utf8_safe>, but takes just a single

625

parameter, C<p>, which has the same meaning as the corresponding parameter does

626

in C<isI<FOO>_utf8_safe>. The function therefore can't check if it is reading

627

beyond the end of the string. Starting in Perl v5.30, it will take a second

628

parameter, becoming a synonym for C<isI<FOO>_utf8_safe>. At that time every

629

program that uses it will have to be changed to successfully compile. In the

630

meantime, the first runtime call to C<isI<FOO>_utf8> from each call point in the

631

program will raise a deprecation warning, enabled by default. You can convert

632

your program now to use C<isI<FOO>_utf8_safe>, and avoid the warnings, and get an

633

extra measure of protection, or you can wait until v5.30, when you'll be forced

634

to add the C<e> parameter.

635

636

Variant C<isI<FOO>_LC> is like the C<isI<FOO>_A> and C<isI<FOO>_L1> variants, but the

637

result is based on the current locale, which is what C<LC> in the name stands

638

for. If Perl can determine that the current locale is a UTF-8 locale, it uses

639

the published Unicode rules; otherwise, it uses the C library function that

640

gives the named classification. For example, C<isDIGIT_LC()> when not in a

641

UTF-8 locale returns the result of calling C<isdigit()>. FALSE is always

642

returned if the input won't fit into an octet. On some platforms where the C

643

library function is known to be defective, Perl changes its result to follow

644

the POSIX standard's rules.

645

646

Variant C<isI<FOO>_LC_uvchr> is like C<isI<FOO>_LC>, but is defined on any UV. It

647

returns the same as C<isI<FOO>_LC> for input code points less than 256, and

648

returns the hard-coded, not-affected-by-locale, Unicode results for larger ones.

649

650

Variant C<isI<FOO>_LC_utf8_safe> is like C<isI<FOO>_LC_uvchr>, but is used for UTF-8

651

encoded strings. Each call classifies one character, even if the string

652

contains many. This variant takes two parameters. The first, C<p>, is a

653

pointer to the first byte of the character to be classified. (Recall that it

654

may take more than one byte to represent a character in UTF-8 strings.) The

655

second parameter, C<e>, points to anywhere in the string beyond the first

656

character, up to one byte past the end of the entire string. The suffix

657

C<_safe> in the function's name indicates that it will not attempt to read

658

beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is true (this

659

is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the input

660

character is malformed in some way, the program may croak, or the function may

661

return FALSE, at the discretion of the implementation, and subject to change in

662

future releases.

663

664

Variant C<isI<FOO>_LC_utf8> is like C<isI<FOO>_LC_utf8_safe>, but takes just a single

665

parameter, C<p>, which has the same meaning as the corresponding parameter does

666

in C<isI<FOO>_LC_utf8_safe>. The function therefore can't check if it is reading

667

beyond the end of the string. Starting in Perl v5.30, it will take a second

668

parameter, becoming a synonym for C<isI<FOO>_LC_utf8_safe>. At that time every

669

program that uses it will have to be changed to successfully compile. In the

670

meantime, the first runtime call to C<isI<FOO>_LC_utf8> from each call point in

671

the program will raise a deprecation warning, enabled by default. You can

672

convert your program now to use C<isI<FOO>_LC_utf8_safe>, and avoid the warnings,

673

and get an extra measure of protection, or you can wait until v5.30, when

674

you'll be forced to add the C<e> parameter.

675

676

=for apidoc Am|bool|isALPHA|char ch

677

Returns a boolean indicating whether the specified character is an

678

alphabetic character, analogous to C<m/[[:alpha:]]/>.

679

See the L<top of this section|/Character classification> for an explanation of

680

variants

681

C<isALPHA_A>, C<isALPHA_L1>, C<isALPHA_uvchr>, C<isALPHA_utf8_safe>,

682

C<isALPHA_LC>, C<isALPHA_LC_uvchr>, and C<isALPHA_LC_utf8_safe>.

683

684

=for apidoc Am|bool|isALPHANUMERIC|char ch

685

Returns a boolean indicating whether the specified character is either an

686

alphabetic character or decimal digit, analogous to C<m/[[:alnum:]]/>.

687

See the L<top of this section|/Character classification> for an explanation of

688

variants

689

C<isALPHANUMERIC_A>, C<isALPHANUMERIC_L1>, C<isALPHANUMERIC_uvchr>,

690

C<isALPHANUMERIC_utf8_safe>, C<isALPHANUMERIC_LC>, C<isALPHANUMERIC_LC_uvchr>,

691

and C<isALPHANUMERIC_LC_utf8_safe>.

692

693

=for apidoc Am|bool|isASCII|char ch

694

Returns a boolean indicating whether the specified character is one of the 128

695

characters in the ASCII character set, analogous to C<m/[[:ascii:]]/>.

696

On non-ASCII platforms, it returns TRUE iff this

697

character corresponds to an ASCII character. Variants C<isASCII_A()> and

698

C<isASCII_L1()> are identical to C<isASCII()>.

699

See the L<top of this section|/Character classification> for an explanation of

700

variants

701

C<isASCII_uvchr>, C<isASCII_utf8_safe>, C<isASCII_LC>, C<isASCII_LC_uvchr>, and

702

C<isASCII_LC_utf8_safe>. Note, however, that some platforms do not have the C

703

library routine C<isascii()>. In these cases, the variants whose names contain

704

C<LC> are the same as the corresponding ones without.

705

706

Also note, that because all ASCII characters are UTF-8 invariant (meaning they

707

have the exact same representation (always a single byte) whether encoded in

708

UTF-8 or not), C<isASCII> will give the correct results when called with any

709

byte in any string encoded or not in UTF-8. And similarly C<isASCII_utf8_safe>

710

will work properly on any string encoded or not in UTF-8.

711

712

=for apidoc Am|bool|isBLANK|char ch

713

Returns a boolean indicating whether the specified character is a

714

character considered to be a blank, analogous to C<m/[[:blank:]]/>.

715

See the L<top of this section|/Character classification> for an explanation of

716

variants

717

C<isBLANK_A>, C<isBLANK_L1>, C<isBLANK_uvchr>, C<isBLANK_utf8_safe>,

718

C<isBLANK_LC>, C<isBLANK_LC_uvchr>, and C<isBLANK_LC_utf8_safe>. Note,

719

however, that some platforms do not have the C library routine

720

C<isblank()>. In these cases, the variants whose names contain C<LC> are

721

the same as the corresponding ones without.

722

723

=for apidoc Am|bool|isCNTRL|char ch

724

Returns a boolean indicating whether the specified character is a

725

control character, analogous to C<m/[[:cntrl:]]/>.

726

See the L<top of this section|/Character classification> for an explanation of

727

variants

728

C<isCNTRL_A>, C<isCNTRL_L1>, C<isCNTRL_uvchr>, C<isCNTRL_utf8_safe>,

729

C<isCNTRL_LC>, C<isCNTRL_LC_uvchr>, and C<isCNTRL_LC_utf8_safe>. On EBCDIC

730

platforms, you almost always want to use the C<isCNTRL_L1> variant.

731

732

=for apidoc Am|bool|isDIGIT|char ch

733

Returns a boolean indicating whether the specified character is a

734

digit, analogous to C<m/[[:digit:]]/>.

735

Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>.

736

See the L<top of this section|/Character classification> for an explanation of

737

variants

738

C<isDIGIT_uvchr>, C<isDIGIT_utf8_safe>, C<isDIGIT_LC>, C<isDIGIT_LC_uvchr>, and

739

C<isDIGIT_LC_utf8_safe>.

740

741

=for apidoc Am|bool|isGRAPH|char ch

742

Returns a boolean indicating whether the specified character is a

743

graphic character, analogous to C<m/[[:graph:]]/>.

744

See the L<top of this section|/Character classification> for an explanation of

745

variants C<isGRAPH_A>, C<isGRAPH_L1>, C<isGRAPH_uvchr>, C<isGRAPH_utf8_safe>,

746

C<isGRAPH_LC>, C<isGRAPH_LC_uvchr>, and C<isGRAPH_LC_utf8_safe>.

747

748

=for apidoc Am|bool|isLOWER|char ch

749

Returns a boolean indicating whether the specified character is a

750

lowercase character, analogous to C<m/[[:lower:]]/>.

751

See the L<top of this section|/Character classification> for an explanation of

752

variants

753

C<isLOWER_A>, C<isLOWER_L1>, C<isLOWER_uvchr>, C<isLOWER_utf8_safe>,

754

C<isLOWER_LC>, C<isLOWER_LC_uvchr>, and C<isLOWER_LC_utf8_safe>.

755

756

=for apidoc Am|bool|isOCTAL|char ch

757

Returns a boolean indicating whether the specified character is an

758

octal digit, [0-7].

759

The only two variants are C<isOCTAL_A> and C<isOCTAL_L1>; each is identical to

760

C<isOCTAL>.

761

762

=for apidoc Am|bool|isPUNCT|char ch

763

Returns a boolean indicating whether the specified character is a

764

punctuation character, analogous to C<m/[[:punct:]]/>.

765

Note that the definition of what is punctuation isn't as

766

straightforward as one might desire. See L<perlrecharclass/POSIX Character

767

Classes> for details.

768

See the L<top of this section|/Character classification> for an explanation of

769

variants C<isPUNCT_A>, C<isPUNCT_L1>, C<isPUNCT_uvchr>, C<isPUNCT_utf8_safe>,

770

C<isPUNCT_LC>, C<isPUNCT_LC_uvchr>, and C<isPUNCT_LC_utf8_safe>.

771

772

=for apidoc Am|bool|isSPACE|char ch

773

Returns a boolean indicating whether the specified character is a

774

whitespace character. This is analogous

775

to what C<m/\s/> matches in a regular expression. Starting in Perl 5.18

776

this also matches what C<m/[[:space:]]/> does. Prior to 5.18, only the

777

locale forms of this macro (the ones with C<LC> in their names) matched

778

precisely what C<m/[[:space:]]/> does. In those releases, the only difference,

779

in the non-locale variants, was that C<isSPACE()> did not match a vertical tab.

780

(See L</isPSXSPC> for a macro that matches a vertical tab in all releases.)

781

See the L<top of this section|/Character classification> for an explanation of

782

variants

783

C<isSPACE_A>, C<isSPACE_L1>, C<isSPACE_uvchr>, C<isSPACE_utf8_safe>,

784

C<isSPACE_LC>, C<isSPACE_LC_uvchr>, and C<isSPACE_LC_utf8_safe>.

785

786

=for apidoc Am|bool|isPSXSPC|char ch

787

(short for Posix Space)

788

Starting in 5.18, this is identical in all its forms to the

789

corresponding C<isSPACE()> macros.

790

The locale forms of this macro are identical to their corresponding

791

C<isSPACE()> forms in all Perl releases. In releases prior to 5.18, the

792

non-locale forms differ from their C<isSPACE()> forms only in that the

793

C<isSPACE()> forms don't match a Vertical Tab, and the C<isPSXSPC()> forms do.

794

Otherwise they are identical. Thus this macro is analogous to what

795

C<m/[[:space:]]/> matches in a regular expression.

796

See the L<top of this section|/Character classification> for an explanation of

797

variants C<isPSXSPC_A>, C<isPSXSPC_L1>, C<isPSXSPC_uvchr>, C<isPSXSPC_utf8_safe>,

798

C<isPSXSPC_LC>, C<isPSXSPC_LC_uvchr>, and C<isPSXSPC_LC_utf8_safe>.

799

800

=for apidoc Am|bool|isUPPER|char ch

801

Returns a boolean indicating whether the specified character is an

802

uppercase character, analogous to C<m/[[:upper:]]/>.

803

See the L<top of this section|/Character classification> for an explanation of

804

variants C<isUPPER_A>, C<isUPPER_L1>, C<isUPPER_uvchr>, C<isUPPER_utf8_safe>,

805

C<isUPPER_LC>, C<isUPPER_LC_uvchr>, and C<isUPPER_LC_utf8_safe>.

806

807

=for apidoc Am|bool|isPRINT|char ch

808

Returns a boolean indicating whether the specified character is a

809

printable character, analogous to C<m/[[:print:]]/>.

810

See the L<top of this section|/Character classification> for an explanation of

811

variants

812

C<isPRINT_A>, C<isPRINT_L1>, C<isPRINT_uvchr>, C<isPRINT_utf8_safe>,

813

C<isPRINT_LC>, C<isPRINT_LC_uvchr>, and C<isPRINT_LC_utf8_safe>.

814

815

=for apidoc Am|bool|isWORDCHAR|char ch

816

Returns a boolean indicating whether the specified character is a character

817

that is a word character, analogous to what C<m/\w/> and C<m/[[:word:]]/> match

818

in a regular expression. A word character is an alphabetic character, a

819

decimal digit, a connecting punctuation character (such as an underscore), or

820

a "mark" character that attaches to one of those (like some sort of accent).

821

C<isALNUM()> is a synonym provided for backward compatibility, even though a

822

word character includes more than the standard C language meaning of

823

alphanumeric.

824

See the L<top of this section|/Character classification> for an explanation of

825

variants C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>, and

826

C<isWORDCHAR_utf8_safe>. C<isWORDCHAR_LC>, C<isWORDCHAR_LC_uvchr>, and

827

C<isWORDCHAR_LC_utf8_safe> are also as described there, but additionally

828

include the platform's native underscore.

829

830

=for apidoc Am|bool|isXDIGIT|char ch

831

Returns a boolean indicating whether the specified character is a hexadecimal

832

digit. In the ASCII range these are C<[0-9A-Fa-f]>. Variants C<isXDIGIT_A()>

833

and C<isXDIGIT_L1()> are identical to C<isXDIGIT()>.

834

See the L<top of this section|/Character classification> for an explanation of

835

variants

836

C<isXDIGIT_uvchr>, C<isXDIGIT_utf8_safe>, C<isXDIGIT_LC>, C<isXDIGIT_LC_uvchr>,

837

and C<isXDIGIT_LC_utf8_safe>.

838

839

=for apidoc Am|bool|isIDFIRST|char ch

840

Returns a boolean indicating whether the specified character can be the first

841

character of an identifier. This is very close to, but not quite the same as

842

the official Unicode property C<XID_Start>. The difference is that this

843

returns true only if the input character also matches L</isWORDCHAR>.

844

See the L<top of this section|/Character classification> for an explanation of

845

variants

846

C<isIDFIRST_A>, C<isIDFIRST_L1>, C<isIDFIRST_uvchr>, C<isIDFIRST_utf8_safe>,

847

C<isIDFIRST_LC>, C<isIDFIRST_LC_uvchr>, and C<isIDFIRST_LC_utf8_safe>.

848

849

=for apidoc Am|bool|isIDCONT|char ch

850

Returns a boolean indicating whether the specified character can be the

851

second or succeeding character of an identifier. This is very close to, but

852

not quite the same as the official Unicode property C<XID_Continue>. The

853

difference is that this returns true only if the input character also matches

854

L</isWORDCHAR>. See the L<top of this section|/Character classification> for

855

an

856

explanation of variants C<isIDCONT_A>, C<isIDCONT_L1>, C<isIDCONT_uvchr>,

857

C<isIDCONT_utf8_safe>, C<isIDCONT_LC>, C<isIDCONT_LC_uvchr>, and

858

C<isIDCONT_LC_utf8_safe>.

859

860

=head1 Miscellaneous Functions

861

862

=for apidoc Am|U8|READ_XDIGIT|char* str

863

Returns the value of an ASCII-range hex digit and advances the string pointer.

864

Behaviour is only well defined when isXDIGIT(*str) is true.

865

866

=head1 Character case changing

867

Perl uses "full" Unicode case mappings. This means that converting a single

868

character to another case may result in a sequence of more than one character.

869

For example, the uppercase of C<E<223>> (LATIN SMALL LETTER SHARP S) is the two

870

character sequence C<SS>. This presents some complications. The lowercase of

871

all characters in the range 0..255 is a single character, and thus

872

C<L</toLOWER_L1>> is furnished. But, C<toUPPER_L1> can't exist, as it couldn't

873

return a valid result for all legal inputs. Instead C<L</toUPPER_uvchr>> has

874

an API that does allow every possible legal result to be returned. Likewise

875

no other function that is crippled by not being able to give the correct

876

results for the full range of possible inputs has been implemented here.

877

878

=for apidoc Am|U8|toUPPER|U8 ch

879

Converts the specified character to uppercase. If the input is anything but an

880

ASCII lowercase character, that input character itself is returned. Variant

881

C<toUPPER_A> is equivalent.

882

883

884

Converts the code point C<cp> to its uppercase version, and

885

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code

886

point is interpreted as native if less than 256; otherwise as Unicode. Note

887

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

888

bytes since the uppercase version may be longer than the original character.

889

890

The first code point of the uppercased version is returned

891

(but note, as explained at L<the top of this section|/Character case

892

changing>, that there may be more.)

893

894

=for apidoc Am|UV|toUPPER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp

895

Converts the first UTF-8 encoded character in the sequence starting at C<p> and

896

extending no further than S<C<e - 1>> to its uppercase version, and

897

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note

898

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

899

bytes since the uppercase version may be longer than the original character.

900

901

The first code point of the uppercased version is returned

902

(but note, as explained at L<the top of this section|/Character case

903

changing>, that there may be more).

904

905

The suffix C<_safe> in the function's name indicates that it will not attempt

906

to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is

907

true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the

908

input character is malformed in some way, the program may croak, or the

909

function may return the REPLACEMENT CHARACTER, at the discretion of the

910

implementation, and subject to change in future releases.

911

912

=for apidoc Am|UV|toUPPER_utf8|U8* p|U8* s|STRLEN* lenp

913

This is like C<L</toUPPER_utf8_safe>>, but doesn't have the C<e>

914

parameter. The function therefore can't check if it is reading

915

beyond the end of the string. Starting in Perl v5.30, it will take the C<e>

916

parameter, becoming a synonym for C<toUPPER_utf8_safe>. At that time every

917

program that uses it will have to be changed to successfully compile. In the

918

meantime, the first runtime call to C<toUPPER_utf8> from each call point in the

919

program will raise a deprecation warning, enabled by default. You can convert

920

your program now to use C<toUPPER_utf8_safe>, and avoid the warnings, and get an

921

extra measure of protection, or you can wait until v5.30, when you'll be forced

922

to add the C<e> parameter.

923

924

=for apidoc Am|U8|toFOLD|U8 ch

925

Converts the specified character to foldcase. If the input is anything but an

926

ASCII uppercase character, that input character itself is returned. Variant

927

C<toFOLD_A> is equivalent. (There is no equivalent C<toFOLD_L1> for the full

928

Latin1 range, as the full generality of L</toFOLD_uvchr> is needed there.)

929

930

931

Converts the code point C<cp> to its foldcase version, and

932

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code

933

point is interpreted as native if less than 256; otherwise as Unicode. Note

934

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

935

bytes since the foldcase version may be longer than the original character.

936

937

The first code point of the foldcased version is returned

938

(but note, as explained at L<the top of this section|/Character case

939

changing>, that there may be more).

940

941

=for apidoc Am|UV|toFOLD_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp

942

Converts the first UTF-8 encoded character in the sequence starting at C<p> and

943

extending no further than S<C<e - 1>> to its foldcase version, and

944

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note

945

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

946

bytes since the foldcase version may be longer than the original character.

947

948

The first code point of the foldcased version is returned

949

(but note, as explained at L<the top of this section|/Character case

950

changing>, that there may be more).

951

952

The suffix C<_safe> in the function's name indicates that it will not attempt

953

to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is

954

true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the

955

input character is malformed in some way, the program may croak, or the

956

function may return the REPLACEMENT CHARACTER, at the discretion of the

957

implementation, and subject to change in future releases.

958

959

=for apidoc Am|UV|toFOLD_utf8|U8* p|U8* s|STRLEN* lenp

960

This is like C<L</toFOLD_utf8_safe>>, but doesn't have the C<e>

961

parameter. The function therefore can't check if it is reading

962

beyond the end of the string. Starting in Perl v5.30, it will take the C<e>

963

parameter, becoming a synonym for C<toFOLD_utf8_safe>. At that time every

964

program that uses it will have to be changed to successfully compile. In the

965

meantime, the first runtime call to C<toFOLD_utf8> from each call point in the

966

program will raise a deprecation warning, enabled by default. You can convert

967

your program now to use C<toFOLD_utf8_safe>, and avoid the warnings, and get an

968

extra measure of protection, or you can wait until v5.30, when you'll be forced

969

to add the C<e> parameter.

970

971

=for apidoc Am|U8|toLOWER|U8 ch

972

Converts the specified character to lowercase. If the input is anything but an

973

ASCII uppercase character, that input character itself is returned. Variant

974

C<toLOWER_A> is equivalent.

975

976

=for apidoc Am|U8|toLOWER_L1|U8 ch

977

Converts the specified Latin1 character to lowercase. The results are

978

undefined if the input doesn't fit in a byte.

979

980

=for apidoc Am|U8|toLOWER_LC|U8 ch

981

Converts the specified character to lowercase using the current locale's rules,

982

if possible; otherwise returns the input character itself.

983

984

985

=for apidoc Am|UV|toLOWER_uvchr|UV cp|U8* s|STRLEN* lenp

Converts the code point C<cp> to its lowercase version, and

986

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code

987

point is interpreted as native if less than 256; otherwise as Unicode. Note

988

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

989

bytes since the lowercase version may be longer than the original character.

990

991

The first code point of the lowercased version is returned

992

(but note, as explained at L<the top of this section|/Character case

993

changing>, that there may be more).

994

995

996

=for apidoc Am|UV|toLOWER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp

997

Converts the first UTF-8 encoded character in the sequence starting at C<p> and

998

extending no further than S<C<e - 1>> to its lowercase version, and

999

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note

1000

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

1001

bytes since the lowercase version may be longer than the original character.

1002

1003

The first code point of the lowercased version is returned

1004

(but note, as explained at L<the top of this section|/Character case

1005

changing>, that there may be more).

1006

1007

The suffix C<_safe> in the function's name indicates that it will not attempt

1008

to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is

1009

true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the

1010

input character is malformed in some way, the program may croak, or the

1011

function may return the REPLACEMENT CHARACTER, at the discretion of the

1012

implementation, and subject to change in future releases.

1013

1014

=for apidoc Am|UV|toLOWER_utf8|U8* p|U8* s|STRLEN* lenp

1015

This is like C<L</toLOWER_utf8_safe>>, but doesn't have the C<e>

1016

parameter. The function therefore can't check if it is reading

1017

beyond the end of the string. Starting in Perl v5.30, it will take the C<e>

1018

parameter, becoming a synonym for C<toLOWER_utf8_safe>. At that time every

1019

program that uses it will have to be changed to successfully compile. In the

1020

meantime, the first runtime call to C<toLOWER_utf8> from each call point in the

1021

program will raise a deprecation warning, enabled by default. You can convert

1022

your program now to use C<toLOWER_utf8_safe>, and avoid the warnings, and get an

1023

extra measure of protection, or you can wait until v5.30, when you'll be forced

1024

to add the C<e> parameter.

1025

1026

=for apidoc Am|U8|toTITLE|U8 ch

1027

Converts the specified character to titlecase. If the input is anything but an

1028

ASCII lowercase character, that input character itself is returned. Variant

1029

C<toTITLE_A> is equivalent. (There is no C<toTITLE_L1> for the full Latin1

1030

range, as the full generality of L</toTITLE_uvchr> is needed there. Titlecase is

1031

not a concept used in locale handling, so there is no functionality for that.)

1032

1033

1034

=for apidoc Am|UV|toTITLE_uvchr|UV cp|U8* s|STRLEN* lenp

Converts the code point C<cp> to its titlecase version, and

1035

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code

1036

point is interpreted as native if less than 256; otherwise as Unicode. Note

1037

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

1038

bytes since the titlecase version may be longer than the original character.

1039

1040

The first code point of the titlecased version is returned

1041

(but note, as explained at L<the top of this section|/Character case

1042

changing>, that there may be more).

1043

1044

=for apidoc Am|UV|toTITLE_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp

1045

Converts the first UTF-8 encoded character in the sequence starting at C<p> and

1046

extending no further than S<C<e - 1>> to its titlecase version, and

1047

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note

1048

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

1049

bytes since the titlecase version may be longer than the original character.

1050

1051

The first code point of the titlecased version is returned

1052

(but note, as explained at L<the top of this section|/Character case

1053

changing>, that there may be more).

1054

1055

The suffix C<_safe> in the function's name indicates that it will not attempt

1056

to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is

1057

true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the

1058

input character is malformed in some way, the program may croak, or the

1059

function may return the REPLACEMENT CHARACTER, at the discretion of the

1060

implementation, and subject to change in future releases.

1061

1062

=for apidoc Am|UV|toTITLE_utf8|U8* p|U8* s|STRLEN* lenp

1063

This is like C<L</toTITLE_utf8_safe>>, but doesn't have the C<e>

1064

parameter. The function therefore can't check if it is reading

1065

beyond the end of the string. Starting in Perl v5.30, it will take the C<e>

1066

parameter, becoming a synonym for C<toTITLE_utf8_safe>. At that time every

1067

program that uses it will have to be changed to successfully compile. In the

1068

meantime, the first runtime call to C<toTITLE_utf8> from each call point in the

1069

program will raise a deprecation warning, enabled by default. You can convert

1070

your program now to use C<toTITLE_utf8_safe>, and avoid the warnings, and get an

1071

extra measure of protection, or you can wait until v5.30, when you'll be forced

1072

to add the C<e> parameter.

=cut

XXX Still undocumented isVERTWS_uvchr and _utf8; it's unclear what their names

1077

really should be. Also toUPPER_LC and toFOLD_LC, which are subject to change,

1078

and aren't general purpose as they don't work on U+DF, and assert against that.

1079

1080

Note that these macros are repeated in Devel::PPPort, so should also be

1081

patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc

*/

/* Specify the widest unsigned type on the platform. Use U64TYPE because U64

1086

* is known only in the perl core, and this macro can be called from outside

1087

* that */

1088

#ifndef HAS_QUAD
/* No quad type was found: fall back to U32 */
# define WIDEST_UTYPE U32
#else
/* A quad type is available: use U64TYPE */
# define WIDEST_UTYPE U64TYPE
#endif

1093

1094

/* FITS_IN_8_BITS(c) returns true if c doesn't have a bit set other than in

1095

* the lower 8. It is designed to be hopefully bomb-proof, making sure that no

1096

* bits of information are lost even on a 64-bit machine, but to get the

1097

* compiler to optimize it out if possible. This is because Configure makes

1098

* sure that the machine has an 8-bit byte, so if c is stored in a byte, the

1099

* sizeof() guarantees that this evaluates to a constant true at compile time.

1100

*

1101

* For Coverity, be always true, because otherwise Coverity thinks

1102

* it finds several expressions that are always true, independent

1103

* of operands. Well, they are, but that is kind of the point.

1104

*/

1105

#ifdef __COVERITY__
/* Under Coverity the test is simply a constant true, so the analyzer does not
 * report the expressions built on it as being independent of their operands */
# define FITS_IN_8_BITS(c) (1)
#else
/* OR-ing 'c' with 0 makes the compiler reject a non-integer argument (such as
 * a pointer).  A one-byte operand passes through the sizeof() test alone;
 * anything wider passes only if no bit above the low eight is set. */
# define FITS_IN_8_BITS(c) ( (sizeof(c) == 1) \
                            || ((((WIDEST_UTYPE)((c) | 0)) & ~0xFF) == 0))
#endif

#ifdef EBCDIC

# ifndef _ALL_SOURCE

/* The native libc isascii() et al. functions return the wrong results

1117

* on at least z/OS unless this is defined. */

1118

# error _ALL_SOURCE should probably be defined

1119

# endif

1120

#else

1121

/* There is a simple definition of ASCII for ASCII platforms. But the

1122

* EBCDIC one isn't so simple, so is defined using table look-up like the

1123

* other macros below.

1124

*

1125

* The cast here is used instead of '(c) >= 0', because some compilers emit

1126

* a warning that that test is always true when the parameter is an

1127

* unsigned type. khw supposes that it could be written as

1128

* && ((c) == '\0' || (c) > 0)

1129

* to avoid the message, but the cast will likely avoid extra branches even

1130

* with stupid compilers.

1131

*

1132

* The '| 0' part ensures a compiler error if c is not integer (like e.g.,

1133

* a pointer) */

1134

# define isASCII(c) ((WIDEST_UTYPE)((c) | 0) < 128)

1135

#endif

1136

1137

/* Take the eight possible bit patterns of the lower 3 bits and you get the

1138

* lower 3 bits of the 8 octal digits, in both ASCII and EBCDIC, so those bits

1139

* can be ignored. If the rest match '0', we have an octal */

1140

#define isOCTAL_A(c) (((WIDEST_UTYPE)((c) | 0) & ~7) == '0')

1141

1142

#ifdef H_PERL /* If have access to perl.h, lookup in its table */

1143

1144

/* Character class numbers. For internal core Perl use only. The ones less

1145

* than 32 are used in PL_charclass[] and the ones up through the one that

1146

* corresponds to <_HIGHEST_REGCOMP_DOT_H_SYNC> are used by regcomp.h and

1147

* related files. PL_charclass ones use names used in l1_char_class_tab.h but

1148

* their actual definitions are here. If that file has a name not used here,

1149

* it won't compile.

1150

*

1151

* The first group of these is ordered in what I (khw) estimate to be the

1152

* frequency of their use. This gives a slight edge to exiting a loop earlier

1153

* (in reginclass() in regexec.c) */

1154

# define _CC_WORDCHAR 0 /* \w and [:word:] */

1155

# define _CC_DIGIT 1 /* \d and [:digit:] */

1156

# define _CC_ALPHA 2 /* [:alpha:] */

1157

# define _CC_LOWER 3 /* [:lower:] */

1158

# define _CC_UPPER 4 /* [:upper:] */

1159

# define _CC_PUNCT 5 /* [:punct:] */

1160

# define _CC_PRINT 6 /* [:print:] */

1161

# define _CC_ALPHANUMERIC 7 /* [:alnum:] */

1162

# define _CC_GRAPH 8 /* [:graph:] */

1163

# define _CC_CASED 9 /* [:lower:] or [:upper:] under /i */

1164

1165

#define _FIRST_NON_SWASH_CC 10

1166

/* The character classes above are implemented with swashes. The second group

1167

* (just below) contains the ones implemented without. These are also sorted

1168

* in rough order of the frequency of their use, except that \v should be last,

1169

* as it isn't a real Posix character class, and some (small) inefficiencies in

1170

* regular expression handling would be introduced by putting it in the middle

1171

* of those that are. Also, cntrl and ascii come after the others as it may be

1172

* useful to group these which have no members that match above Latin1, (or

1173

* above ASCII in the latter case) */

1174

1175

# define _CC_SPACE 10 /* \s, [:space:] */

1176

# define _CC_PSXSPC _CC_SPACE /* XXX Temporary, can be removed

1177

when the deprecated isFOO_utf8()

1178

functions are removed */

1179

# define _CC_BLANK 11 /* [:blank:] */

1180

# define _CC_XDIGIT 12 /* [:xdigit:] */

1181

# define _CC_CNTRL 13 /* [:cntrl:] */

1182

# define _CC_ASCII 14 /* [:ascii:] */

1183

# define _CC_VERTSPACE 15 /* \v */

1184

1185

# define _HIGHEST_REGCOMP_DOT_H_SYNC _CC_VERTSPACE

1186

1187

/* The members of the third group below do not need to be coordinated with data

1188

* structures in regcomp.[ch] and regexec.c. */

1189

# define _CC_IDFIRST 16

1190

# define _CC_CHARNAME_CONT 17

1191

# define _CC_NONLATIN1_FOLD 18

1192

# define _CC_NONLATIN1_SIMPLE_FOLD 19

1193

# define _CC_QUOTEMETA 20

1194

# define _CC_NON_FINAL_FOLD 21

1195

# define _CC_IS_IN_SOME_FOLD 22

1196

# define _CC_MNEMONIC_CNTRL 23

1197

1198

# define _CC_IDCONT 24 /* XXX Temporary, can be removed when the deprecated

1199

isFOO_utf8() functions are removed */

1200

1201

/* This next group is only used on EBCDIC platforms, so theoretically could be

1202

* shared with something entirely different that's only on ASCII platforms */

1203

# define _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE 28

1204

# define _CC_UTF8_IS_START 29

1205

# define _CC_UTF8_IS_DOWNGRADEABLE_START 30

1206

# define _CC_UTF8_IS_CONTINUATION 31

1207

/* Unused: 25-27 (24 is temporarily taken by _CC_IDCONT)

1208

* If more bits are needed, one could add a second word for non-64bit

1209

* QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd

1210

* word or not. The IS_IN_SOME_FOLD bit is the most easily expendable, as it

1211

* is used only for optimization (as of this writing), and differs in the

1212

* Latin1 range from the ALPHA bit only in two relatively unimportant

1213

* characters: the masculine and feminine ordinal indicators, so removing it

1214

* would just cause /i regexes which match them to run less efficiently.

1215

* Similarly the EBCDIC-only bits are used just for speed, and could be

1216

* replaced by other means */

1217

1218

#if defined(PERL_CORE) || defined(PERL_EXT)

1219

/* An enum version of the character class numbers, to help compilers

1220

* optimize */

1221

typedef enum {

1222

_CC_ENUM_ALPHA = _CC_ALPHA,

1223

_CC_ENUM_ALPHANUMERIC = _CC_ALPHANUMERIC,

1224

_CC_ENUM_ASCII = _CC_ASCII,

1225

_CC_ENUM_BLANK = _CC_BLANK,

1226

_CC_ENUM_CASED = _CC_CASED,

1227

_CC_ENUM_CNTRL = _CC_CNTRL,

1228

_CC_ENUM_DIGIT = _CC_DIGIT,

1229

_CC_ENUM_GRAPH = _CC_GRAPH,

1230

_CC_ENUM_LOWER = _CC_LOWER,

1231

_CC_ENUM_PRINT = _CC_PRINT,

1232

_CC_ENUM_PUNCT = _CC_PUNCT,

1233

_CC_ENUM_SPACE = _CC_SPACE,

1234

_CC_ENUM_UPPER = _CC_UPPER,

1235

_CC_ENUM_VERTSPACE = _CC_VERTSPACE,

1236

_CC_ENUM_WORDCHAR = _CC_WORDCHAR,

1237

_CC_ENUM_XDIGIT = _CC_XDIGIT

1238

} _char_class_number;

1239

#endif

1240

1241

#define POSIX_SWASH_COUNT _FIRST_NON_SWASH_CC

1242

#define POSIX_CC_COUNT (_HIGHEST_REGCOMP_DOT_H_SYNC + 1)

1243

1244

#if defined(PERL_IN_UTF8_C) \

1245

|| defined(PERL_IN_REGCOMP_C) \

1246

|| defined(PERL_IN_REGEXEC_C)

1247

# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \

1248

|| _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \

1249

|| _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9

1250

#error Need to adjust order of swash_property_names[]

1251

# endif

1252

1253

/* This is declared static in each of the few files that this is #defined for

1254

* to keep them from being publicly accessible. Hence there is a small amount

1255

* of wasted space */

1256

1257

/* Property names for the swash-implemented character classes.  The entries
 * are listed in _CC_ class-number order (0 through 9); the preprocessor check
 * preceding this table errors out if those numbers are ever changed without
 * reordering it. */
static const char* const swash_property_names[] = {

"XPosixWord", /* 0: _CC_WORDCHAR */

"XPosixDigit", /* 1: _CC_DIGIT */

"XPosixAlpha", /* 2: _CC_ALPHA */

"XPosixLower", /* 3: _CC_LOWER */

"XPosixUpper", /* 4: _CC_UPPER */

"XPosixPunct", /* 5: _CC_PUNCT */

"XPosixPrint", /* 6: _CC_PRINT */

"XPosixAlnum", /* 7: _CC_ALPHANUMERIC */

"XPosixGraph", /* 8: _CC_GRAPH */

"Cased" /* 9: _CC_CASED */

};

#endif

START_EXTERN_C

# ifdef DOINIT

EXTCONST U32 PL_charclass[] = {

1274

# include "l1_char_class_tab.h"

1275

};

1276

1277

# else /* ! DOINIT */

1278

EXTCONST U32 PL_charclass[];

# endif

END_EXTERN_C

/* The 1U keeps Solaris from griping when shifting sets the uppermost bit */

1283

# define _CC_mask(classnum) (1U << (classnum))

1284

1285

/* For internal core Perl use only: the base macro for defining macros like

1286

* isALPHA */

1287

# define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \

1288

&& (PL_charclass[(U8) (c)] & _CC_mask(classnum)))

1289

1290

/* The mask for the _A versions of the macros; it just adds in the bit for

1291

* ASCII. */

1292

# define _CC_mask_A(classnum) (_CC_mask(classnum) | _CC_mask(_CC_ASCII))

1293

1294

/* For internal core Perl use only: the base macro for defining macros like

1295

* isALPHA_A. The foo_A version makes sure that both the desired bit and

1296

* the ASCII bit are present */

1297

# define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \

1298

&& ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \

1299

== _CC_mask_A(classnum)))

1300

1301

# define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA)

1302

# define isALPHANUMERIC_A(c) _generic_isCC_A(c, _CC_ALPHANUMERIC)

1303

# define isBLANK_A(c) _generic_isCC_A(c, _CC_BLANK)

1304

# define isCNTRL_A(c) _generic_isCC_A(c, _CC_CNTRL)

1305

# define isDIGIT_A(c) _generic_isCC(c, _CC_DIGIT) /* No non-ASCII digits */

1306

# define isGRAPH_A(c) _generic_isCC_A(c, _CC_GRAPH)

1307

# define isLOWER_A(c) _generic_isCC_A(c, _CC_LOWER)

1308

# define isPRINT_A(c) _generic_isCC_A(c, _CC_PRINT)

1309

# define isPUNCT_A(c) _generic_isCC_A(c, _CC_PUNCT)

1310

# define isSPACE_A(c) _generic_isCC_A(c, _CC_SPACE)

1311

# define isUPPER_A(c) _generic_isCC_A(c, _CC_UPPER)

1312

# define isWORDCHAR_A(c) _generic_isCC_A(c, _CC_WORDCHAR)

1313

# define isXDIGIT_A(c) _generic_isCC(c, _CC_XDIGIT) /* No non-ASCII xdigits

1314

*/

1315

# define isIDFIRST_A(c) _generic_isCC_A(c, _CC_IDFIRST)

1316

# define isALPHA_L1(c) _generic_isCC(c, _CC_ALPHA)

1317

# define isALPHANUMERIC_L1(c) _generic_isCC(c, _CC_ALPHANUMERIC)

1318

# define isBLANK_L1(c) _generic_isCC(c, _CC_BLANK)

1319

1320

/* continuation character for legal NAME in \N{NAME} */

1321

# define isCHARNAME_CONT(c) _generic_isCC(c, _CC_CHARNAME_CONT)

1322

1323

# define isCNTRL_L1(c) _generic_isCC(c, _CC_CNTRL)

1324

# define isGRAPH_L1(c) _generic_isCC(c, _CC_GRAPH)

1325

# define isLOWER_L1(c) _generic_isCC(c, _CC_LOWER)

1326

# define isPRINT_L1(c) _generic_isCC(c, _CC_PRINT)

1327

# define isPSXSPC_L1(c) isSPACE_L1(c)

1328

# define isPUNCT_L1(c) _generic_isCC(c, _CC_PUNCT)

1329

# define isSPACE_L1(c) _generic_isCC(c, _CC_SPACE)

1330

# define isUPPER_L1(c) _generic_isCC(c, _CC_UPPER)

1331

# define isWORDCHAR_L1(c) _generic_isCC(c, _CC_WORDCHAR)

1332

# define isIDFIRST_L1(c) _generic_isCC(c, _CC_IDFIRST)

1333

1334

# ifdef EBCDIC

1335

# define isASCII(c) _generic_isCC(c, _CC_ASCII)

1336

# endif

1337

1338

/* Participates in a single-character fold with a character above 255 */

1339

# define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_SIMPLE_FOLD)))

1340

1341

/* Like the above, but also can be part of a multi-char fold */

1342

# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD)))

1343

1344

# define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA)

1345

# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \

1346

_generic_isCC(c, _CC_NON_FINAL_FOLD)

1347

# define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \

1348

_generic_isCC(c, _CC_IS_IN_SOME_FOLD)

1349

# define _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \

1350

_generic_isCC(c, _CC_MNEMONIC_CNTRL)

1351

#else /* else we don't have perl.h H_PERL */

1352

1353

/* If we don't have perl.h, we are compiling a utility program. Below we

1354

* hard-code various macro definitions that wouldn't otherwise be available

1355

* to it. Most are coded based on first principles. These are written to

1356

* avoid EBCDIC vs. ASCII #ifdef's as much as possible. */

1357

/* True for the ten decimal digit characters '0' through '9' */
# define isDIGIT_A(c) ((c) >= '0' && (c) <= '9')

1358

/* True for the two horizontal blanks: TAB and SPACE */
# define isBLANK_A(c) ((c) == '\t' || (c) == ' ')

1359

# define isSPACE_A(c) (isBLANK_A(c) \

|| (c) == '\n' \

|| (c) == '\r' \

|| (c) == '\v' \

|| (c) == '\f')

/* On EBCDIC, there are gaps between 'i' and 'j'; 'r' and 's'. Same for

1365

* uppercase. The tests for those aren't necessary on ASCII, but hurt only

1366

* performance (if optimization isn't on), and allow the same code to be

1367

* used for both platform types */

1368

# define isLOWER_A(c) ((c) >= 'a' && (c) <= 'z' \

1369

&& ( (c) <= 'i' \

1370

|| ((c) >= 'j' && (c) <= 'r') \

1371

|| (c) >= 's'))

1372

# define isUPPER_A(c) ((c) >= 'A' && (c) <= 'Z' \

1373

&& ( (c) <= 'I' \

1374

|| ((c) >= 'J' && (c) <= 'R') \

1375

|| (c) >= 'S'))

1376

# define isALPHA_A(c) (isUPPER_A(c) || isLOWER_A(c))

1377

# define isALPHANUMERIC_A(c) (isALPHA_A(c) || isDIGIT_A(c))

1378

# define isWORDCHAR_A(c) (isALPHANUMERIC_A(c) || (c) == '_')

1379

# define isIDFIRST_A(c) (isALPHA_A(c) || (c) == '_')

1380

# define isXDIGIT_A(c) (isDIGIT_A(c) \

1381

|| ((c) >= 'a' && (c) <= 'f') \

1382

|| ((c) <= 'F' && (c) >= 'A'))

1383

# define isPUNCT_A(c) ((c) == '-' || (c) == '!' || (c) == '"' \

1384

|| (c) == '#' || (c) == '$' || (c) == '%' \

1385

|| (c) == '&' || (c) == '\'' || (c) == '(' \

1386

|| (c) == ')' || (c) == '*' || (c) == '+' \

1387

|| (c) == ',' || (c) == '.' || (c) == '/' \

1388

|| (c) == ':' || (c) == ';' || (c) == '<' \

1389

|| (c) == '=' || (c) == '>' || (c) == '?' \

1390

|| (c) == '@' || (c) == '[' || (c) == '\\' \

1391

|| (c) == ']' || (c) == '^' || (c) == '_' \

1392

|| (c) == '`' || (c) == '{' || (c) == '|' \

1393

|| (c) == '}' || (c) == '~')

1394

# define isGRAPH_A(c) (isALPHANUMERIC_A(c) || isPUNCT_A(c))

1395

# define isPRINT_A(c) (isGRAPH_A(c) || (c) == ' ')

1396

1397

# ifdef EBCDIC

1398

/* The below is accurate for the 3 EBCDIC code pages traditionally

1399

* supported by perl. The only difference between them in the controls

1400

* is the position of \n, and that is represented symbolically below */

1401

# define isCNTRL_A(c) ((c) == '\0' || (c) == '\a' || (c) == '\b' \

1402

|| (c) == '\f' || (c) == '\n' || (c) == '\r' \

1403

|| (c) == '\t' || (c) == '\v' \

1404

|| ((c) <= 3 && (c) >= 1) /* SOH, STX, ETX */ \

1405

|| (c) == 7 /* U+7F DEL */ \

1406

|| ((c) <= 0x13 && (c) >= 0x0E) /* SO, SI */ \

1407

/* DLE, DC[1-3] */ \

1408

|| (c) == 0x18 /* U+18 CAN */ \

1409

|| (c) == 0x19 /* U+19 EOM */ \

1410

|| ((c) <= 0x1F && (c) >= 0x1C) /* [FGRU]S */ \

1411

|| (c) == 0x26 /* U+17 ETB */ \

1412

|| (c) == 0x27 /* U+1B ESC */ \

1413

|| (c) == 0x2D /* U+05 ENQ */ \

1414

|| (c) == 0x2E /* U+06 ACK */ \

1415

|| (c) == 0x32 /* U+16 SYN */ \

1416

|| (c) == 0x37 /* U+04 EOT */ \

1417

|| (c) == 0x3C /* U+14 DC4 */ \

1418

|| (c) == 0x3D /* U+15 NAK */ \

1419

|| (c) == 0x3F)/* U+1A SUB */

1420

# define isASCII(c) (isCNTRL_A(c) || isPRINT_A(c))

1421

# else /* isASCII is already defined for ASCII platforms, so can use that to

1422

define isCNTRL */

1423

# define isCNTRL_A(c) (isASCII(c) && ! isPRINT_A(c))

1424

# endif

1425

1426

/* The _L1 macros may be unnecessary for the utilities; I (khw) added them

1427

* during debugging, and it seems best to keep them. We may be called

1428

* without NATIVE_TO_LATIN1 being defined. On ASCII platforms, it doesn't

1429

* do anything anyway, so make it not a problem */

1430

# if ! defined(EBCDIC) && ! defined(NATIVE_TO_LATIN1)

1431

# define NATIVE_TO_LATIN1(ch) (ch)

1432

# endif

1433

# define isALPHA_L1(c) (isUPPER_L1(c) || isLOWER_L1(c))

1434

# define isALPHANUMERIC_L1(c) (isALPHA_L1(c) || isDIGIT_A(c))

1435

/* True if 'c' is a blank in the Latin1 range: either an ASCII blank (per
 * isBLANK_A) or the character whose Latin1 code point is 0xA0 (NO-BREAK
 * SPACE).  Values that don't fit in 8 bits yield false.  The argument is
 * parenthesized before being cast to U8 so that an expression argument (e.g.
 * 'x ? y : z') is converted as a whole instead of the cast binding only to
 * its first operand. */
# define isBLANK_L1(c) (isBLANK_A(c) \
                        || (FITS_IN_8_BITS(c) \
                            && NATIVE_TO_LATIN1((U8) (c)) == 0xA0))

1438

# define isCNTRL_L1(c) (FITS_IN_8_BITS(c) && (! isPRINT_L1(c)))

1439

# define isGRAPH_L1(c) (isPRINT_L1(c) && (! isBLANK_L1(c)))

1440

/* True if 'c' is a lowercase letter in the Latin1 range: an ASCII lowercase
 * letter (per isLOWER_A), or a character whose Latin1 code point is 0xDF or
 * above other than 0xF7 (DIVISION SIGN), or one of 0xAA, 0xBA, 0xB5 (the two
 * ordinal indicators and MICRO SIGN).  Values that don't fit in 8 bits yield
 * false.  The argument is parenthesized before each cast to U8 so that an
 * expression argument is converted as a whole. */
# define isLOWER_L1(c) (isLOWER_A(c) \
                        || (FITS_IN_8_BITS(c) \
                            && ((   NATIVE_TO_LATIN1((U8) (c)) >= 0xDF \
                                 && NATIVE_TO_LATIN1((U8) (c)) != 0xF7) \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xAA \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xBA \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xB5)))

1447

/* True if 'c' is printable in the Latin1 range: an ASCII printable (per
 * isPRINT_A), or any character whose Latin1 code point is 0xA0 or above.
 * Values that don't fit in 8 bits yield false.  The argument is parenthesized
 * before being cast to U8 so that an expression argument is converted as a
 * whole. */
# define isPRINT_L1(c) (isPRINT_A(c) \
                        || (FITS_IN_8_BITS(c) \
                            && NATIVE_TO_LATIN1((U8) (c)) >= 0xA0))

1450

/* True if 'c' is punctuation in the Latin1 range: ASCII punctuation (per
 * isPUNCT_A), or a character whose Latin1 code point is one of 0xA1, 0xA7,
 * 0xAB, 0xB6, 0xB7, 0xBB, 0xBF.  Values that don't fit in 8 bits yield false.
 * The argument is parenthesized before each cast to U8 so that an expression
 * argument is converted as a whole. */
# define isPUNCT_L1(c) (isPUNCT_A(c) \
                        || (FITS_IN_8_BITS(c) \
                            && (   NATIVE_TO_LATIN1((U8) (c)) == 0xA1 \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xA7 \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xAB \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xB6 \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xB7 \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xBB \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xBF)))

1459

/* True if 'c' is white space in the Latin1 range: ASCII white space (per
 * isSPACE_A), or a character whose Latin1 code point is 0x85 (NEXT LINE) or
 * 0xA0 (NO-BREAK SPACE).  Values that don't fit in 8 bits yield false.  The
 * argument is parenthesized before each cast to U8 so that an expression
 * argument is converted as a whole. */
# define isSPACE_L1(c) (isSPACE_A(c) \
                        || (FITS_IN_8_BITS(c) \
                            && (   NATIVE_TO_LATIN1((U8) (c)) == 0x85 \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xA0)))

1463

/* True if 'c' is an uppercase letter in the Latin1 range: an ASCII uppercase
 * letter (per isUPPER_A), or a character whose Latin1 code point lies in
 * 0xC0..0xDE other than 0xD7 (MULTIPLICATION SIGN).  Values that don't fit in
 * 8 bits yield false.  The argument is parenthesized before each cast to U8
 * so that an expression argument is converted as a whole. */
# define isUPPER_L1(c) (isUPPER_A(c) \
                        || (FITS_IN_8_BITS(c) \
                            && (   NATIVE_TO_LATIN1((U8) (c)) >= 0xC0 \
                                && NATIVE_TO_LATIN1((U8) (c)) <= 0xDE \
                                && NATIVE_TO_LATIN1((U8) (c)) != 0xD7)))

1468

# define isWORDCHAR_L1(c) (isIDFIRST_L1(c) || isDIGIT_A(c))

1469

/* True if 'c' may begin an identifier in the Latin1 range: a Latin1
 * alphabetic (per isALPHA_L1) or the underscore.  '_' is a native character
 * literal, so it is compared against the native 'c' directly, the same way
 * isWORDCHAR_A and isIDFIRST_A do; translating 'c' to Latin1 first would
 * compare values from two different encodings wherever NATIVE_TO_LATIN1 is
 * not the identity. */
# define isIDFIRST_L1(c) (isALPHA_L1(c) || (c) == '_')

1470

# define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) \

|| isBLANK_L1(c) \

|| (c) == '-' \

|| (c) == '(' \

|| (c) == ')')

/* The following are not fully accurate in the above-ASCII range. I (khw)

1476

* don't think it's necessary to be so for the purposes where this gets

1477

* compiled */

1478

# define _isQUOTEMETA(c) (FITS_IN_8_BITS(c) && ! isWORDCHAR_L1(c))

1479

# define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) isALPHA_L1(c)

1480

1481

/* And these aren't accurate at all. They are useful only for above

1482

* Latin1, which utilities and bootstrapping don't deal with */

1483

# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 0

1484

# define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0

1485

# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0

1486

1487

/* Many of the macros later in this file are defined in terms of these. By

1488

* implementing them with a function, which converts the class number into

1489

* a call to the desired macro, all of the later ones work. However, that

1490

* function won't be actually defined when building a utility program (no

1491

* perl.h), and so a compiler error will be generated if one is attempted

1492

* to be used. And the above-Latin1 code points require Unicode tables to

1493

* be present, something unlikely to be the case when bootstrapping */

1494

# define _generic_isCC(c, classnum) \

1495

(FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), TRUE))

1496

# define _generic_isCC_A(c, classnum) \

1497

(FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), FALSE))

1498

#endif /* End of no perl.h H_PERL */

1499

1500

#define isALPHANUMERIC(c) isALPHANUMERIC_A(c)

1501

#define isALPHA(c) isALPHA_A(c)

1502

#define isASCII_A(c) isASCII(c)

1503

#define isASCII_L1(c) isASCII(c)

1504

#define isBLANK(c) isBLANK_A(c)

1505

#define isCNTRL(c) isCNTRL_A(c)

1506

#define isDIGIT(c) isDIGIT_A(c)

1507

#define isGRAPH(c) isGRAPH_A(c)

1508

#define isIDFIRST(c) isIDFIRST_A(c)

1509

#define isLOWER(c) isLOWER_A(c)

1510

#define isPRINT(c) isPRINT_A(c)

1511

#define isPSXSPC_A(c) isSPACE_A(c)

1512

#define isPSXSPC(c) isPSXSPC_A(c)

1513

#define isPSXSPC_L1(c) isSPACE_L1(c)

1514

#define isPUNCT(c) isPUNCT_A(c)

1515

#define isSPACE(c) isSPACE_A(c)

1516

#define isUPPER(c) isUPPER_A(c)

1517

#define isWORDCHAR(c) isWORDCHAR_A(c)

1518

#define isXDIGIT(c) isXDIGIT_A(c)

1519

1520

/* ASCII casing. These could also be written as

1521

#define toLOWER(c) (isASCII(c) ? toLOWER_LATIN1(c) : (c))

1522

#define toUPPER(c) (isASCII(c) ? toUPPER_LATIN1_MOD(c) : (c))

1523

which uses table lookup and mask instead of subtraction. (This would

1524

work because the _MOD does not apply in the ASCII range) */

1525

#define toLOWER(c) (isUPPER(c) ? (U8)((c) + ('a' - 'A')) : (c))

1526

#define toUPPER(c) (isLOWER(c) ? (U8)((c) - ('a' - 'A')) : (c))

1527

1528

/* In the ASCII range, these are equivalent to what they're here defined to be.

1529

* But by creating these definitions, other code doesn't have to be aware of

1530

* this detail */

1531

#define toFOLD(c) toLOWER(c)

1532

#define toTITLE(c) toUPPER(c)

1533

1534

#define toLOWER_A(c) toLOWER(c)

1535

#define toUPPER_A(c) toUPPER(c)

1536

#define toFOLD_A(c) toFOLD(c)

1537

#define toTITLE_A(c) toTITLE(c)

1538

1539

/* Use table lookup for speed; returns the input itself if is out-of-range */

1540

#define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \

1541

? (c) \

1542

: PL_latin1_lc[ (U8) (c) ])

1543

#define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */

1544

1545

/* Modified uc. Is correct uc except for three non-ascii chars which are

1546

* all mapped to one of them, and these need special handling; returns the

1547

* input itself if is out-of-range */

1548

#define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \

1549

? (c) \

1550

: PL_mod_latin1_uc[ (U8) (c) ])

1551

#define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale

1552

1553

/* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */

1554

1555

/* For internal core Perl use only: the base macro for defining macros like

1556

* isALPHA_LC, which uses the current LC_CTYPE locale. 'c' is the code point

1557

* (0-255) to check. In a UTF-8 locale, the result is the same as calling

1558

* isFOO_L1(); the 'utf8_locale_classnum' parameter is something like

1559

* _CC_UPPER, which gives the class number for doing this. For non-UTF-8

1560

* locales, the code to actually do the test is passed in 'non_utf8'. If

1561

* 'c' is above 255, 0 is returned. For accessing the full range of possible

1562

* code points under locale rules, use the macros based on _generic_LC_uvchr

1563

* instead of this. */

1564

#define _generic_LC_base(c, utf8_locale_classnum, non_utf8) \

1565

(! FITS_IN_8_BITS(c) \

1566

? 0 \

1567

: IN_UTF8_CTYPE_LOCALE \

1568

? cBOOL(PL_charclass[(U8) (c)] & _CC_mask(utf8_locale_classnum)) \

1569

: cBOOL(non_utf8))

1570

1571

/* For internal core Perl use only: a helper macro for defining macros like

1572

* isALPHA_LC. 'c' is the code point (0-255) to check. The function name to

1573

* actually do this test is passed in 'non_utf8_func', which is called on 'c',

1574

* casting 'c' to the macro _LC_CAST, which should not be parenthesized. See

1575

* _generic_LC_base for more info */

1576

#define _generic_LC(c, utf8_locale_classnum, non_utf8_func) \

1577

_generic_LC_base(c,utf8_locale_classnum, \

1578

non_utf8_func( (_LC_CAST) (c)))

1579

1580

/* For internal core Perl use only: like _generic_LC, but also returns TRUE if

1581

* 'c' is the platform's native underscore character */

1582

#define _generic_LC_underscore(c,utf8_locale_classnum,non_utf8_func) \

1583

_generic_LC_base(c, utf8_locale_classnum, \

1584

(non_utf8_func( (_LC_CAST) (c)) \

1585

|| (char)(c) == '_'))

1586

1587

/* These next three are also for internal core Perl use only: case-change

1588

* helper macros */

1589

#define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \

1590

? (c) \

1591

: (IN_UTF8_CTYPE_LOCALE) \

1592

? PL_latin1_lc[ (U8) (c) ] \

1593

: (cast)function((cast)(c)))

1594

1595

/* Note that the result can be larger than a byte in a UTF-8 locale. It

1596

* returns a single value, so can't adequately return the upper case of LATIN

1597

* SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two

1598

* values "SS"); instead it asserts against that under DEBUGGING, and

1599

* otherwise returns its input */

1600

#define _generic_toUPPER_LC(c, function, cast) \

1601

(! FITS_IN_8_BITS(c) \

1602

? (c) \

1603

: ((! IN_UTF8_CTYPE_LOCALE) \

1604

? (cast)function((cast)(c)) \

1605

: ((((U8)(c)) == MICRO_SIGN) \

1606

? GREEK_CAPITAL_LETTER_MU \

1607

: ((((U8)(c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \

1608

? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS \

1609

: ((((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S) \

1610

? (__ASSERT_(0) (c)) \

1611

: PL_mod_latin1_uc[ (U8) (c) ])))))

1612

1613

/* Note that the result can be larger than a byte in a UTF-8 locale. It

1614

* returns a single value, so can't adequately return the fold case of LATIN

1615

* SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two

1616

* values "ss"); instead it asserts against that under DEBUGGING, and

1617

* otherwise returns its input */

1618

#define _generic_toFOLD_LC(c, function, cast) \

1619

((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \

1620

? GREEK_SMALL_LETTER_MU \

1621

: (__ASSERT_(! IN_UTF8_CTYPE_LOCALE \

1622

|| (c) != LATIN_SMALL_LETTER_SHARP_S) \

1623

_generic_toLOWER_LC(c, function, cast)))

1624

1625

/* Use the libc versions for these if available. */

1626

#if defined(HAS_ISASCII)

1627

# define isASCII_LC(c) (FITS_IN_8_BITS(c) && isascii( (U8) (c)))

1628

#else

1629

# define isASCII_LC(c) isASCII(c)

1630

#endif

1631

1632

#if defined(HAS_ISBLANK)

1633

# define isBLANK_LC(c) _generic_LC(c, _CC_BLANK, isblank)

1634

#else /* Unlike isASCII, varies if in a UTF-8 locale */

1635

# define isBLANK_LC(c) ((IN_UTF8_CTYPE_LOCALE) ? isBLANK_L1(c) : isBLANK(c))

#endif

#define _LC_CAST U8

#ifdef WIN32

/* The Windows functions don't bother to follow the POSIX standard, which

1642

* for example says that something can't both be a printable and a control.

1643

* But Windows treats the \t control as a printable, and does such things

1644

* as making superscripts into both digits and punctuation. This tames

1645

* these flaws by assuming that the definitions of both controls and space

1646

* are correct, and then making sure that other definitions don't have

1647

* weirdnesses, by making sure that isalnum() isn't also ispunct(), etc.

1648

* Not all possible weirdnesses are checked for, just the ones that were

1649

* detected on actual Microsoft code pages */

1650

1651

# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl)

1652

# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace)

1653

1654

# define isALPHA_LC(c) (_generic_LC(c, _CC_ALPHA, isalpha) \

1655

&& isALPHANUMERIC_LC(c))

1656

# define isALPHANUMERIC_LC(c) (_generic_LC(c, _CC_ALPHANUMERIC, isalnum) && \

1657

! isPUNCT_LC(c))

1658

# define isDIGIT_LC(c) (_generic_LC(c, _CC_DIGIT, isdigit) && \

1659

isALPHANUMERIC_LC(c))

1660

# define isGRAPH_LC(c) (_generic_LC(c, _CC_GRAPH, isgraph) && isPRINT_LC(c))

1661

# define isIDFIRST_LC(c) (((c) == '_') \

1662

|| (_generic_LC(c, _CC_IDFIRST, isalpha) && ! isPUNCT_LC(c)))

1663

# define isLOWER_LC(c) (_generic_LC(c, _CC_LOWER, islower) && isALPHA_LC(c))

1664

# define isPRINT_LC(c) (_generic_LC(c, _CC_PRINT, isprint) && ! isCNTRL_LC(c))

1665

# define isPUNCT_LC(c) (_generic_LC(c, _CC_PUNCT, ispunct) && ! isCNTRL_LC(c))

1666

# define isUPPER_LC(c) (_generic_LC(c, _CC_UPPER, isupper) && isALPHA_LC(c))

1667

# define isWORDCHAR_LC(c) (((c) == '_') || isALPHANUMERIC_LC(c))

1668

# define isXDIGIT_LC(c) (_generic_LC(c, _CC_XDIGIT, isxdigit) \

1669

&& isALPHANUMERIC_LC(c))

1670

1671

# define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8)

1672

# define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8)

1673

# define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8)

1674

1675

#elif defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII))

1676

/* For most other platforms */

1677

1678

# define isALPHA_LC(c) _generic_LC(c, _CC_ALPHA, isalpha)

1679

# define isALPHANUMERIC_LC(c) _generic_LC(c, _CC_ALPHANUMERIC, isalnum)

1680

# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl)

1681

# define isDIGIT_LC(c) _generic_LC(c, _CC_DIGIT, isdigit)

1682

# define isGRAPH_LC(c) _generic_LC(c, _CC_GRAPH, isgraph)

1683

# define isIDFIRST_LC(c) _generic_LC_underscore(c, _CC_IDFIRST, isalpha)

1684

# define isLOWER_LC(c) _generic_LC(c, _CC_LOWER, islower)

1685

# define isPRINT_LC(c) _generic_LC(c, _CC_PRINT, isprint)

1686

# define isPUNCT_LC(c) _generic_LC(c, _CC_PUNCT, ispunct)

1687

# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace)

1688

# define isUPPER_LC(c) _generic_LC(c, _CC_UPPER, isupper)

1689

# define isWORDCHAR_LC(c) _generic_LC_underscore(c, _CC_WORDCHAR, isalnum)

1690

# define isXDIGIT_LC(c) _generic_LC(c, _CC_XDIGIT, isxdigit)

1691

1692

1693

# define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8)

1694

# define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8)

1695

# define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8)

1696

1697

#else /* The final fallback position */

1698

1699

# define isALPHA_LC(c) (isascii(c) && isalpha(c))

1700

# define isALPHANUMERIC_LC(c) (isascii(c) && isalnum(c))

1701

# define isCNTRL_LC(c) (isascii(c) && iscntrl(c))

1702

# define isDIGIT_LC(c) (isascii(c) && isdigit(c))

1703

# define isGRAPH_LC(c) (isascii(c) && isgraph(c))

1704

# define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_'))

1705

# define isLOWER_LC(c) (isascii(c) && islower(c))

1706

# define isPRINT_LC(c) (isascii(c) && isprint(c))

1707

# define isPUNCT_LC(c) (isascii(c) && ispunct(c))

1708

# define isSPACE_LC(c) (isascii(c) && isspace(c))

1709

# define isUPPER_LC(c) (isascii(c) && isupper(c))

1710

# define isWORDCHAR_LC(c) (isascii(c) && (isalnum(c) || (c) == '_'))

1711

# define isXDIGIT_LC(c) (isascii(c) && isxdigit(c))

1712

1713

# define toLOWER_LC(c) (isascii(c) ? tolower(c) : (c))

1714

# define toUPPER_LC(c) (isascii(c) ? toupper(c) : (c))

1715

# define toFOLD_LC(c) (isascii(c) ? tolower(c) : (c))

#endif

#define isIDCONT(c) isWORDCHAR(c)

1720

#define isIDCONT_A(c) isWORDCHAR_A(c)

1721

#define isIDCONT_L1(c) isWORDCHAR_L1(c)

1722

#define isIDCONT_LC(c) isWORDCHAR_LC(c)

1723

#define isPSXSPC_LC(c) isSPACE_LC(c)

1724

1725

/* For internal core Perl use only: the base macros for defining macros like

1726

* isALPHA_uvchr. 'c' is the code point to check. 'classnum' is the POSIX class

1727

* number defined earlier in this file. _generic_uvchr() is used for POSIX

1728

* classes where there is a macro or function 'above_latin1' that takes the

1729

* single argument 'c' and returns the desired value. These exist for those

1730

* classes which have simple definitions, avoiding the overhead of a hash

1731

* lookup or inversion list binary search. _generic_swash_uvchr() can be used

1732

* for classes where that overhead is faster than a direct lookup.

1733

* _generic_uvchr() won't compile if 'c' isn't unsigned, as it won't match the

1734

* 'above_latin1' prototype. _generic_isCC() macro does bounds checking, so

1735

* there are duplicate checks here; versions of the macros could be created that

1736

* don't, but experiments show that gcc optimizes them out anyway. */

1737

1738

/* Note that all ignore 'use bytes' */

1739

#define _generic_uvchr(classnum, above_latin1, c) ((c) < 256 \

1740

? _generic_isCC(c, classnum) \

1741

: above_latin1(c))

1742

#define _generic_swash_uvchr(classnum, c) ((c) < 256 \

1743

? _generic_isCC(c, classnum) \

1744

: _is_uni_FOO(classnum, c))

1745

#define isALPHA_uvchr(c) _generic_swash_uvchr(_CC_ALPHA, c)

1746

#define isALPHANUMERIC_uvchr(c) _generic_swash_uvchr(_CC_ALPHANUMERIC, c)

1747

#define isASCII_uvchr(c) isASCII(c)

1748

#define isBLANK_uvchr(c) _generic_uvchr(_CC_BLANK, is_HORIZWS_cp_high, c)

1749

#define isCNTRL_uvchr(c) isCNTRL_L1(c) /* All controls are in Latin1 */

1750

#define isDIGIT_uvchr(c) _generic_swash_uvchr(_CC_DIGIT, c)

1751

#define isGRAPH_uvchr(c) _generic_swash_uvchr(_CC_GRAPH, c)

1752

#define isIDCONT_uvchr(c) \

1753

_generic_uvchr(_CC_WORDCHAR, _is_uni_perl_idcont, c)

1754

#define isIDFIRST_uvchr(c) \

1755

_generic_uvchr(_CC_IDFIRST, _is_uni_perl_idstart, c)

1756

#define isLOWER_uvchr(c) _generic_swash_uvchr(_CC_LOWER, c)

1757

#define isPRINT_uvchr(c) _generic_swash_uvchr(_CC_PRINT, c)

1758

1759

#define isPUNCT_uvchr(c) _generic_swash_uvchr(_CC_PUNCT, c)

1760

#define isSPACE_uvchr(c) _generic_uvchr(_CC_SPACE, is_XPERLSPACE_cp_high, c)

1761

#define isPSXSPC_uvchr(c) isSPACE_uvchr(c)

1762

1763

#define isUPPER_uvchr(c) _generic_swash_uvchr(_CC_UPPER, c)

1764

#define isVERTWS_uvchr(c) _generic_uvchr(_CC_VERTSPACE, is_VERTWS_cp_high, c)

1765

#define isWORDCHAR_uvchr(c) _generic_swash_uvchr(_CC_WORDCHAR, c)

1766

#define isXDIGIT_uvchr(c) _generic_uvchr(_CC_XDIGIT, is_XDIGIT_cp_high, c)

1767

1768

#define toFOLD_uvchr(c,s,l) to_uni_fold(c,s,l)

1769

#define toLOWER_uvchr(c,s,l) to_uni_lower(c,s,l)

1770

#define toTITLE_uvchr(c,s,l) to_uni_title(c,s,l)

1771

#define toUPPER_uvchr(c,s,l) to_uni_upper(c,s,l)

1772

1773

/* For backwards compatibility, even though '_uni' should mean official Unicode

1774

* code points, in Perl it means native for those below 256 */

1775

#define isALPHA_uni(c) isALPHA_uvchr(c)

1776

#define isALPHANUMERIC_uni(c) isALPHANUMERIC_uvchr(c)

1777

#define isASCII_uni(c) isASCII_uvchr(c)

1778

#define isBLANK_uni(c) isBLANK_uvchr(c)

1779

#define isCNTRL_uni(c) isCNTRL_uvchr(c)

1780

#define isDIGIT_uni(c) isDIGIT_uvchr(c)

1781

#define isGRAPH_uni(c) isGRAPH_uvchr(c)

1782

#define isIDCONT_uni(c) isIDCONT_uvchr(c)

1783

#define isIDFIRST_uni(c) isIDFIRST_uvchr(c)

1784

#define isLOWER_uni(c) isLOWER_uvchr(c)

1785

#define isPRINT_uni(c) isPRINT_uvchr(c)

1786

#define isPUNCT_uni(c) isPUNCT_uvchr(c)

1787

#define isSPACE_uni(c) isSPACE_uvchr(c)

1788

#define isPSXSPC_uni(c) isPSXSPC_uvchr(c)

1789

#define isUPPER_uni(c) isUPPER_uvchr(c)

1790

#define isVERTWS_uni(c) isVERTWS_uvchr(c)

1791

#define isWORDCHAR_uni(c) isWORDCHAR_uvchr(c)

1792

#define isXDIGIT_uni(c) isXDIGIT_uvchr(c)

1793

#define toFOLD_uni(c,s,l) toFOLD_uvchr(c,s,l)

1794

#define toLOWER_uni(c,s,l) toLOWER_uvchr(c,s,l)

1795

#define toTITLE_uni(c,s,l) toTITLE_uvchr(c,s,l)

1796

#define toUPPER_uni(c,s,l) toUPPER_uvchr(c,s,l)

1797

1798

/* For internal core Perl use only: the base macros for defining macros like

1799

* isALPHA_LC_uvchr. These are like isALPHA_LC, but the input can be any code

1800

* point, not just 0-255. Like _generic_uvchr, there are two versions, one for

1801

* simple class definitions; the other for more complex. These are like

1802

* _generic_uvchr, so see it for more info. */

1803

/* Dispatch on the size of code point 'c': values below 256 are handed to the
 * macro 'latin1', anything else to 'above_latin1'.  'c' is parenthesized in
 * the range test so that an expression argument (e.g. 'x & mask') is compared
 * as a whole.  Note that 'c' is expanded more than once. */
#define _generic_LC_uvchr(latin1, above_latin1, c)                           \
    ((c) < 256 ? latin1(c) : above_latin1(c))

1805

/* Like _generic_LC_uvchr, but code points of 256 and above are looked up with
 * _is_uni_FOO(classnum, c) instead of a caller-supplied macro.  'c' is
 * parenthesized in the range test so that an expression argument is compared
 * as a whole.  Note that 'c' is expanded more than once. */
#define _generic_LC_swash_uvchr(latin1, classnum, c)                         \
    ((c) < 256 ? latin1(c) : _is_uni_FOO(classnum, c))

1807

1808

#define isALPHA_LC_uvchr(c) _generic_LC_swash_uvchr(isALPHA_LC, _CC_ALPHA, c)

1809

#define isALPHANUMERIC_LC_uvchr(c) _generic_LC_swash_uvchr(isALPHANUMERIC_LC, \

1810

_CC_ALPHANUMERIC, c)

1811

#define isASCII_LC_uvchr(c) isASCII_LC(c)

1812

#define isBLANK_LC_uvchr(c) _generic_LC_uvchr(isBLANK_LC, \

1813

is_HORIZWS_cp_high, c)

1814

/* Code points of 256 and above always yield 0; smaller ones are tested with
 * isCNTRL_LC().  'c' is parenthesized in the range test so that an expression
 * argument is compared as a whole. */
#define isCNTRL_LC_uvchr(c) ((c) < 256 ? isCNTRL_LC(c) : 0)

1815

#define isDIGIT_LC_uvchr(c) _generic_LC_swash_uvchr(isDIGIT_LC, _CC_DIGIT, c)

1816

#define isGRAPH_LC_uvchr(c) _generic_LC_swash_uvchr(isGRAPH_LC, _CC_GRAPH, c)

1817

#define isIDCONT_LC_uvchr(c) _generic_LC_uvchr(isIDCONT_LC, \

1818

_is_uni_perl_idcont, c)

1819

#define isIDFIRST_LC_uvchr(c) _generic_LC_uvchr(isIDFIRST_LC, \

1820

_is_uni_perl_idstart, c)

1821

#define isLOWER_LC_uvchr(c) _generic_LC_swash_uvchr(isLOWER_LC, _CC_LOWER, c)

1822

#define isPRINT_LC_uvchr(c) _generic_LC_swash_uvchr(isPRINT_LC, _CC_PRINT, c)

1823

#define isPSXSPC_LC_uvchr(c) isSPACE_LC_uvchr(c)

1824

#define isPUNCT_LC_uvchr(c) _generic_LC_swash_uvchr(isPUNCT_LC, _CC_PUNCT, c)

1825

#define isSPACE_LC_uvchr(c) _generic_LC_uvchr(isSPACE_LC, \

1826

is_XPERLSPACE_cp_high, c)

1827

#define isUPPER_LC_uvchr(c) _generic_LC_swash_uvchr(isUPPER_LC, _CC_UPPER, c)

1828

#define isWORDCHAR_LC_uvchr(c) _generic_LC_swash_uvchr(isWORDCHAR_LC, \

1829

_CC_WORDCHAR, c)

1830

#define isXDIGIT_LC_uvchr(c) _generic_LC_uvchr(isXDIGIT_LC, \

1831

is_XDIGIT_cp_high, c)

1832

1833

#define isBLANK_LC_uni(c) isBLANK_LC_uvchr(UNI_TO_NATIVE(c))

1834

1835

/* For internal core Perl use only: the base macros for defining macros like

1836

* isALPHA_utf8. These are like the earlier defined macros, but take an input

1837

* UTF-8 encoded string 'p'. If the input is in the Latin1 range, use

1838

* the Latin1 macro 'classnum' on 'p'. Otherwise use the value given by the

1839

* 'utf8' parameter. This relies on the fact that ASCII characters have the

1840

* same representation whether utf8 or not. Note that it assumes that the utf8

1841

* has been validated, and ignores 'use bytes' */

1842

/* Expands to a call of _is_utf8_FOO() for the class '_CC_' ## enum_name on the
 * string 'p'.  'name' is used only to build the "is<name>_utf8" and
 * "is<name>_utf8_safe" strings handed to that function; 'use_locale' is passed
 * through unchanged, followed by the caller's __FILE__ and __LINE__.  'p' is
 * parenthesized before the cast so that a pointer expression such as 's + 1'
 * is converted as a whole. */
#define _base_generic_utf8(enum_name, name, p, use_locale )                 \
    _is_utf8_FOO(CAT2(_CC_, enum_name),                                     \
                 (const U8 *) (p),                                          \
                 "is" STRINGIFY(name) "_utf8",                              \
                 "is" STRINGIFY(name) "_utf8_safe",                         \
                 1, use_locale, __FILE__,__LINE__)

1848

1849

#define _generic_utf8(name, p) _base_generic_utf8(name, name, p, 0)

1850

1851

/* The "_safe" macros make sure that we don't attempt to read beyond 'e', but

1852

* they don't otherwise go out of their way to look for malformed UTF-8. If

1853

* they can return accurate results without knowing if the input is otherwise

1854

* malformed, they do so. For example isASCII is accurate in spite of any

1855

* non-length malformations because it looks only at a single byte. Likewise

1856

* isDIGIT looks just at the first byte for code points 0-255, as all UTF-8

1857

* variant ones return FALSE. But, if the input has to be well-formed in order

1858

* for the results to be accurate, the macros will test and if malformed will

1859

* call a routine to die

1860

*

1861

* Except for toke.c, the macros do assume that e > p, asserting that on

1862

* DEBUGGING builds. Much code that calls these depends on this being true,

1863

* for other reasons. toke.c is treated specially as using the regular

1864

* assertion breaks it in many ways. All strings that these operate on there

1865

* are supposed to have an extra NUL character at the end, so that *e = \0. A

1866

* bunch of code in toke.c assumes that this is true, so the assertion allows

1867

* for that */

1868

#ifdef PERL_IN_TOKE_C

1869

# define _utf8_safe_assert(p,e) ((e) > (p) || ((e) == (p) && *(p) == '\0'))

1870

#else

1871

# define _utf8_safe_assert(p,e) ((e) > (p))

1872

#endif

1873

1874

#define _generic_utf8_safe(classnum, p, e, above_latin1) \

1875

(__ASSERT_(_utf8_safe_assert(p, e)) \

1876

(UTF8_IS_INVARIANT(*(p))) \

1877

? _generic_isCC(*(p), classnum) \

1878

: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \

1879

? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \

1880

? _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \

1881

classnum) \

1882

: (_force_out_malformed_utf8_message( \

1883

(U8 *) (p), (U8 *) (e), 0, 1), 0)) \

1884

: above_latin1))

1885

/* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value.

1886

* 'above_latin1' can be a macro */

1887

#define _generic_func_utf8_safe(classnum, above_latin1, p, e) \

1888

_generic_utf8_safe(classnum, p, e, above_latin1(p, e))

1889

#define _generic_non_swash_utf8_safe(classnum, above_latin1, p, e) \

1890

_generic_utf8_safe(classnum, p, e, \

1891

(UNLIKELY((e) - (p) < UTF8SKIP(p)) \

1892

? (_force_out_malformed_utf8_message( \

1893

(U8 *) (p), (U8 *) (e), 0, 1), 0) \

1894

: above_latin1(p)))

1895

/* Like the above, but passes classnum to _is_utf8_FOO_with_len(), instead of having an

1896

* 'above_latin1' parameter */

1897

#define _generic_swash_utf8_safe(classnum, p, e) \

1898

_generic_utf8_safe(classnum, p, e, _is_utf8_FOO_with_len(classnum, p, e))

1899

1900

/* Like the above, but should be used only when it is known that there are no

1901

* characters in the upper-Latin1 range (128-255 on ASCII platforms) which the

1902

* class is TRUE for. Hence it can skip the tests for this range.

1903

* 'above_latin1' should include its arguments */

1904

#define _generic_utf8_safe_no_upper_latin1(classnum, p, e, above_latin1) \

1905

(__ASSERT_(_utf8_safe_assert(p, e)) \

1906

(UTF8_IS_INVARIANT(*(p))) \

1907

? _generic_isCC(*(p), classnum) \

1908

: (UTF8_IS_DOWNGRADEABLE_START(*(p))) \

1909

? 0 /* Note that doesn't check validity for latin1 */ \

1910

: above_latin1)

1911

1912

/* NOTE that some of these macros have very similar ones in regcharclass.h.

1913

* For example, there is (at the time of this writing) an 'is_SPACE_utf8()'

1914

* there, differing in name only by an underscore from the one here

1915

* 'isSPACE_utf8()'. The difference is that the ones here are probably more

1916

* efficient and smaller, using an O(1) array lookup for Latin1-range code

1917

* points; the regcharclass.h ones are implemented as a series of

1918

* "if-else-if-else ..." */

1919

1920

#define isALPHA_utf8(p) _generic_utf8(ALPHA, p)

1921

#define isALPHANUMERIC_utf8(p) _generic_utf8(ALPHANUMERIC, p)

1922

#define isASCII_utf8(p) _generic_utf8(ASCII, p)

1923

#define isBLANK_utf8(p) _generic_utf8(BLANK, p)

1924

#define isCNTRL_utf8(p) _generic_utf8(CNTRL, p)

1925

#define isDIGIT_utf8(p) _generic_utf8(DIGIT, p)

1926

#define isGRAPH_utf8(p) _generic_utf8(GRAPH, p)

1927

#define isIDCONT_utf8(p) _generic_utf8(IDCONT, p)

1928

#define isIDFIRST_utf8(p) _generic_utf8(IDFIRST, p)

1929

#define isLOWER_utf8(p) _generic_utf8(LOWER, p)

1930

#define isPRINT_utf8(p) _generic_utf8(PRINT, p)

1931

#define isPSXSPC_utf8(p) _generic_utf8(PSXSPC, p)

1932

#define isPUNCT_utf8(p) _generic_utf8(PUNCT, p)

1933

#define isSPACE_utf8(p) _generic_utf8(SPACE, p)

1934

#define isUPPER_utf8(p) _generic_utf8(UPPER, p)

1935

#define isVERTWS_utf8(p) _generic_utf8(VERTSPACE, p)

1936

#define isWORDCHAR_utf8(p) _generic_utf8(WORDCHAR, p)

1937

#define isXDIGIT_utf8(p) _generic_utf8(XDIGIT, p)

1938

1939

#define isALPHA_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_ALPHA, p, e)

1940

#define isALPHANUMERIC_utf8_safe(p, e) \

1941

_generic_swash_utf8_safe(_CC_ALPHANUMERIC, p, e)

1942

#define isASCII_utf8_safe(p, e) \

1943

/* Because ASCII is invariant under utf8, the non-utf8 macro \

1944

* works */ \

1945

(__ASSERT_(_utf8_safe_assert(p, e)) isASCII(*(p)))

1946

#define isBLANK_utf8_safe(p, e) \

1947

_generic_non_swash_utf8_safe(_CC_BLANK, is_HORIZWS_high, p, e)

1948

1949

#ifdef EBCDIC

1950

/* Because all controls are UTF-8 invariants in EBCDIC, we can use this

1951

* more efficient macro instead of the more general one */

1952

# define isCNTRL_utf8_safe(p, e) \

1953

(__ASSERT_(_utf8_safe_assert(p, e)) isCNTRL_L1(*(p)))

1954

#else

1955

# define isCNTRL_utf8_safe(p, e) _generic_utf8_safe(_CC_CNTRL, p, e, 0)

1956

#endif

1957

1958

#define isDIGIT_utf8_safe(p, e) \

1959

_generic_utf8_safe_no_upper_latin1(_CC_DIGIT, p, e, \

1960

_is_utf8_FOO_with_len(_CC_DIGIT, p, e))

1961

#define isGRAPH_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_GRAPH, p, e)

1962

#define isIDCONT_utf8_safe(p, e) _generic_func_utf8_safe(_CC_WORDCHAR, \

1963

_is_utf8_perl_idcont_with_len, p, e)

1964

1965

/* To prevent S_scan_word in toke.c from hanging, we have to make sure that

1966

* IDFIRST is an alnum. See

1967

* http://rt.perl.org/rt3/Ticket/Display.html?id=74022 for more detail than you

1968

* ever wanted to know about. (In the ASCII range, there isn't a difference.)

1969

* This used to be not the XID version, but we decided to go with the more

1970

* modern Unicode definition */

1971

#define isIDFIRST_utf8_safe(p, e) \

1972

_generic_func_utf8_safe(_CC_IDFIRST, \

1973

_is_utf8_perl_idstart_with_len, (U8 *) (p), (U8 *) (e))

1974

1975

#define isLOWER_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_LOWER, p, e)

1976

#define isPRINT_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_PRINT, p, e)

1977

#define isPSXSPC_utf8_safe(p, e) isSPACE_utf8_safe(p, e)

1978

#define isPUNCT_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_PUNCT, p, e)

1979

#define isSPACE_utf8_safe(p, e) \

1980

_generic_non_swash_utf8_safe(_CC_SPACE, is_XPERLSPACE_high, p, e)

1981

#define isUPPER_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_UPPER, p, e)

1982

#define isVERTWS_utf8_safe(p, e) \

1983

_generic_non_swash_utf8_safe(_CC_VERTSPACE, is_VERTWS_high, p, e)

1984

#define isWORDCHAR_utf8_safe(p, e) \

1985

_generic_swash_utf8_safe(_CC_WORDCHAR, p, e)

1986

#define isXDIGIT_utf8_safe(p, e) \

1987

_generic_utf8_safe_no_upper_latin1(_CC_XDIGIT, p, e, \

1988

(UNLIKELY((e) - (p) < UTF8SKIP(p)) \

1989

? (_force_out_malformed_utf8_message( \

1990

(U8 *) (p), (U8 *) (e), 0, 1), 0) \

1991

: is_XDIGIT_high(p)))

1992

1993

#define toFOLD_utf8(p,s,l) to_utf8_fold(p,s,l)

1994

#define toLOWER_utf8(p,s,l) to_utf8_lower(p,s,l)

1995

#define toTITLE_utf8(p,s,l) to_utf8_title(p,s,l)

1996

#define toUPPER_utf8(p,s,l) to_utf8_upper(p,s,l)

1997

1998

/* For internal core use only, subject to change */

1999

#define _toFOLD_utf8_flags(p,e,s,l,f) _to_utf8_fold_flags (p,e,s,l,f, "", 0)

2000

#define _toLOWER_utf8_flags(p,e,s,l,f) _to_utf8_lower_flags(p,e,s,l,f, "", 0)

2001

#define _toTITLE_utf8_flags(p,e,s,l,f) _to_utf8_title_flags(p,e,s,l,f, "", 0)

2002

#define _toUPPER_utf8_flags(p,e,s,l,f) _to_utf8_upper_flags(p,e,s,l,f, "", 0)

2003

2004

#define toFOLD_utf8_safe(p,e,s,l) _toFOLD_utf8_flags(p,e,s,l, FOLD_FLAGS_FULL)

2005

#define toLOWER_utf8_safe(p,e,s,l) _toLOWER_utf8_flags(p,e,s,l, 0)

2006

#define toTITLE_utf8_safe(p,e,s,l) _toTITLE_utf8_flags(p,e,s,l, 0)

2007

#define toUPPER_utf8_safe(p,e,s,l) _toUPPER_utf8_flags(p,e,s,l, 0)

2008

2009

/* For internal core Perl use only: the base macros for defining macros like

2010

* isALPHA_LC_utf8. These are like _generic_utf8, but if the first code point

2011

* in 'p' is within the 0-255 range, it uses locale rules (requested by passing

2012

* 1 as the 'use_locale' argument of _base_generic_utf8) */

2013

#define _generic_LC_utf8(name, p) _base_generic_utf8(name, name, p, 1)

2014

2015

#define isALPHA_LC_utf8(p) _generic_LC_utf8(ALPHA, p)

2016

#define isALPHANUMERIC_LC_utf8(p) _generic_LC_utf8(ALPHANUMERIC, p)

2017

#define isASCII_LC_utf8(p) _generic_LC_utf8(ASCII, p)

2018

#define isBLANK_LC_utf8(p) _generic_LC_utf8(BLANK, p)

2019

#define isCNTRL_LC_utf8(p) _generic_LC_utf8(CNTRL, p)

2020

#define isDIGIT_LC_utf8(p) _generic_LC_utf8(DIGIT, p)

2021

#define isGRAPH_LC_utf8(p) _generic_LC_utf8(GRAPH, p)

2022

#define isIDCONT_LC_utf8(p) _generic_LC_utf8(IDCONT, p)

2023

#define isIDFIRST_LC_utf8(p) _generic_LC_utf8(IDFIRST, p)

2024

#define isLOWER_LC_utf8(p) _generic_LC_utf8(LOWER, p)

2025

#define isPRINT_LC_utf8(p) _generic_LC_utf8(PRINT, p)

2026

#define isPSXSPC_LC_utf8(p) _generic_LC_utf8(PSXSPC, p)

2027

#define isPUNCT_LC_utf8(p) _generic_LC_utf8(PUNCT, p)

2028

#define isSPACE_LC_utf8(p) _generic_LC_utf8(SPACE, p)

2029

#define isUPPER_LC_utf8(p) _generic_LC_utf8(UPPER, p)

2030

#define isWORDCHAR_LC_utf8(p) _generic_LC_utf8(WORDCHAR, p)

2031

#define isXDIGIT_LC_utf8(p) _generic_LC_utf8(XDIGIT, p)

2032

2033

/* For internal core Perl use only: the base macros for defining macros like

2034

* isALPHA_LC_utf8_safe. These are like _generic_utf8, but if the first code

2035

* point in 'p' is within the 0-255 range, it uses locale rules from the

2036

* passed-in 'macro' parameter */

2037

#define _generic_LC_utf8_safe(macro, p, e, above_latin1) \

2038

(__ASSERT_(_utf8_safe_assert(p, e)) \

2039

(UTF8_IS_INVARIANT(*(p))) \

2040

? macro(*(p)) \

2041

: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \

2042

? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \

2043

? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \

2044

: (_force_out_malformed_utf8_message( \

2045

(U8 *) (p), (U8 *) (e), 0, 1), 0)) \

2046

: above_latin1))

2047

2048

#define _generic_LC_swash_utf8_safe(macro, classnum, p, e) \

2049

_generic_LC_utf8_safe(macro, p, e, \

2050

_is_utf8_FOO_with_len(classnum, p, e))

2051

2052

#define _generic_LC_func_utf8_safe(macro, above_latin1, p, e) \

2053

_generic_LC_utf8_safe(macro, p, e, above_latin1(p, e))

2054

2055

/* Like _generic_LC_func_utf8_safe, but 'above_latin1' takes only 'p'.  Before
 * calling it, this checks that at least UTF8SKIP(p) bytes are available
 * between 'p' and 'e'; if not, _force_out_malformed_utf8_message() is called
 * and the result is 0.  'macro' is the locale-aware macro (e.g. isBLANK_LC)
 * that is passed as the first argument of _generic_LC_utf8_safe. */
#define _generic_LC_non_swash_utf8_safe(macro, above_latin1, p, e)          \
    _generic_LC_utf8_safe(macro, p, e,                                      \
                          (UNLIKELY((e) - (p) < UTF8SKIP(p))                \
                           ? (_force_out_malformed_utf8_message(            \
                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)     \
                           : above_latin1(p)))

2061

2062

#define isALPHANUMERIC_LC_utf8_safe(p, e) \

2063

_generic_LC_swash_utf8_safe(isALPHANUMERIC_LC, \

2064

_CC_ALPHANUMERIC, p, e)

2065

#define isALPHA_LC_utf8_safe(p, e) \

2066

_generic_LC_swash_utf8_safe(isALPHA_LC, _CC_ALPHA, p, e)

2067

#define isASCII_LC_utf8_safe(p, e) \

2068

(__ASSERT_(_utf8_safe_assert(p, e)) isASCII_LC(*(p)))

2069

#define isBLANK_LC_utf8_safe(p, e) \

2070

_generic_LC_non_swash_utf8_safe(isBLANK_LC, is_HORIZWS_high, p, e)

2071

#define isCNTRL_LC_utf8_safe(p, e) \

2072

_generic_LC_utf8_safe(isCNTRL_LC, p, e, 0)

2073

#define isDIGIT_LC_utf8_safe(p, e) \

2074

_generic_LC_swash_utf8_safe(isDIGIT_LC, _CC_DIGIT, p, e)

2075

#define isGRAPH_LC_utf8_safe(p, e) \

2076

_generic_LC_swash_utf8_safe(isGRAPH_LC, _CC_GRAPH, p, e)

2077

#define isIDCONT_LC_utf8_safe(p, e) \

2078

_generic_LC_func_utf8_safe(isIDCONT_LC, \

2079

_is_utf8_perl_idcont_with_len, p, e)

2080

#define isIDFIRST_LC_utf8_safe(p, e) \

2081

_generic_LC_func_utf8_safe(isIDFIRST_LC, \

2082

_is_utf8_perl_idstart_with_len, p, e)

2083

#define isLOWER_LC_utf8_safe(p, e) \

2084

_generic_LC_swash_utf8_safe(isLOWER_LC, _CC_LOWER, p, e)

2085

#define isPRINT_LC_utf8_safe(p, e) \

2086

_generic_LC_swash_utf8_safe(isPRINT_LC, _CC_PRINT, p, e)

2087

#define isPSXSPC_LC_utf8_safe(p, e) isSPACE_LC_utf8_safe(p, e)

2088

#define isPUNCT_LC_utf8_safe(p, e) \

2089

_generic_LC_swash_utf8_safe(isPUNCT_LC, _CC_PUNCT, p, e)

2090

#define isSPACE_LC_utf8_safe(p, e) \

2091

_generic_LC_non_swash_utf8_safe(isSPACE_LC, is_XPERLSPACE_high, p, e)

2092

#define isUPPER_LC_utf8_safe(p, e) \

2093

_generic_LC_swash_utf8_safe(isUPPER_LC, _CC_UPPER, p, e)

2094

#define isWORDCHAR_LC_utf8_safe(p, e) \

2095

_generic_LC_swash_utf8_safe(isWORDCHAR_LC, _CC_WORDCHAR, p, e)

2096

#define isXDIGIT_LC_utf8_safe(p, e) \

2097

_generic_LC_non_swash_utf8_safe(isXDIGIT_LC, is_XDIGIT_high, p, e)

2098

2099

/* Macros for backwards compatibility and for completeness when the ASCII and

2100

* Latin1 values are identical */

2101

#define isALPHAU(c) isALPHA_L1(c)

2102

#define isDIGIT_L1(c) isDIGIT_A(c)

2103

#define isOCTAL(c) isOCTAL_A(c)

2104

#define isOCTAL_L1(c) isOCTAL_A(c)

2105

#define isXDIGIT_L1(c) isXDIGIT_A(c)

2106

#define isALNUM(c) isWORDCHAR(c)

2107

#define isALNUMU(c) isWORDCHAR_L1(c)

2108

#define isALNUM_LC(c) isWORDCHAR_LC(c)

2109

#define isALNUM_uni(c) isWORDCHAR_uni(c)

2110

#define isALNUM_LC_uvchr(c) isWORDCHAR_LC_uvchr(c)

2111

#define isALNUM_utf8(p) isWORDCHAR_utf8(p)

2112

#define isALNUM_LC_utf8(p) isWORDCHAR_LC_utf8(p)

2113

#define isALNUMC_A(c) isALPHANUMERIC_A(c) /* Mnemonic: "C's alnum" */

2114

#define isALNUMC_L1(c) isALPHANUMERIC_L1(c)

2115

#define isALNUMC(c) isALPHANUMERIC(c)

2116

#define isALNUMC_LC(c) isALPHANUMERIC_LC(c)

2117

#define isALNUMC_uni(c) isALPHANUMERIC_uni(c)

2118

#define isALNUMC_LC_uvchr(c) isALPHANUMERIC_LC_uvchr(c)

2119

#define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p)

2120

#define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p)

2121

2122

/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,

2123

* except that they don't necessarily mean the same characters, e.g. CTRL-D is

2124

* 4 on both systems, but that is EOT on ASCII; ST on EBCDIC.

2125

* '?' is special-cased on EBCDIC to APC, which is the control there that is

2126

* the outlier from the block that contains the other controls, just like

2127

* toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0

2128

* block. If it weren't special cased, it would yield a non-control.

2129

* The conversion works both ways, so toCTRL('D') is 4, and toCTRL(4) is D,

2130

* etc. */

2131

#ifndef EBCDIC

2132

# define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) toUPPER(((U8)(c))) ^ 64)

2133

#else

2134

# define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) \

2135

((isPRINT_A(c)) \

2136

? (UNLIKELY((c) == '?') \

2137

? QUESTION_MARK_CTRL \

2138

: (NATIVE_TO_LATIN1(toUPPER((U8) (c))) ^ 64)) \

2139

: (UNLIKELY((c) == QUESTION_MARK_CTRL) \

2140

? '?' \

2141

: (LATIN1_TO_NATIVE(((U8) (c)) ^ 64)))))

2142

#endif

2143

2144

/* Line numbers are unsigned, 32 bits. */

2145

typedef U32 line_t;

2146

#define NOLINE ((line_t) 4294967295UL) /* = FFFFFFFF */

2147

2148

/* Helpful alias for version prescan */

2149

/* Non-zero when Perl_prescan_version(), called with FALSE as its second
 * argument, returns something other than 'a' itself.  'b' is passed through
 * as the third argument.  'a' is expanded twice, so it must not have side
 * effects; both arguments are parenthesized so that expression arguments are
 * used as a whole. */
#define is_LAX_VERSION(a,b) \
    ((a) != Perl_prescan_version(aTHX_ (a), FALSE, (b), NULL, NULL, NULL, NULL))

2151

2152

/* Non-zero when Perl_prescan_version(), called with TRUE as its second
 * argument, returns something other than 'a' itself.  'b' is passed through
 * as the third argument.  'a' is expanded twice, so it must not have side
 * effects; both arguments are parenthesized so that expression arguments are
 * used as a whole. */
#define is_STRICT_VERSION(a,b) \
    ((a) != Perl_prescan_version(aTHX_ (a), TRUE, (b), NULL, NULL, NULL, NULL))

2154

2155

/* Stores 'c' through 'b' when 'b' is non-NULL, then returns 'a' from the
 * function in which the macro is expanded.  The arguments are parenthesized
 * so that expression arguments are used as a whole.
 *
 * NOTE: this expands to two statements (an 'if' and a 'return', the latter
 * already terminated by ';'), so it must only be used inside a brace-enclosed
 * block, never as the unbraced body of an 'if', 'else' or loop. */
#define BADVERSION(a,b,c) \
    if (b) { \
        *(b) = (c); \
    } \
    return (a);

/* Converts a character known to represent a hexadecimal digit (0-9, A-F, or

2162

* a-f) to its numeric value. READ_XDIGIT's argument is a string pointer,

2163

* which is advanced. The input is validated only by an assert() in DEBUGGING

2164

* builds. In both ASCII and EBCDIC the last 4 bits of the digits are 0-9; and

2165

* the last 4 bits of A-F and a-f are 1-6, so adding 9 yields 10-15 */

2166

#define XDIGIT_VALUE(c) (__ASSERT_(isXDIGIT(c)) (0xf & (isDIGIT(c) \

2167

? (c) \

2168

: ((c) + 9))))

2169

/* Yields the numeric value (0-15) of the hexadecimal digit that 's' points
 * to, and advances 's' by one.  's' must be a modifiable pointer lvalue; it is
 * incremented exactly once.  The digit is checked only through __ASSERT_(). */
#define READ_XDIGIT(s)  (__ASSERT_(isXDIGIT(*(s)))                           \
                         (0xf & (isDIGIT(*(s))                               \
                                 ? (*(s)++)                                  \
                                 : (*(s)++ + 9))))

/* Converts a character known to represent an octal digit (0-7) to its numeric

2174

* value. The input is validated only by an assert() in DEBUGGING builds. In

2175

* both ASCII and EBCDIC the last 3 bits of the octal digits range from 0-7. */

2176

/* Numeric value (0-7) of the octal digit character 'c', taken from its low
 * three bits; 'c' is checked only through __ASSERT_(). */
#define OCTAL_VALUE(c) (__ASSERT_(isOCTAL(c)) ((c) & 7))

2177

2178

/* Efficiently returns a boolean as to if two native characters are equivalent

2179

* case-insensitively. At least one of the characters must be one of [A-Za-z];

2180

* the ALPHA in the name is to remind you of that. This is asserted() in

2181

* DEBUGGING builds. Because [A-Za-z] are invariant under UTF-8, this macro

2182

* works (on valid input) for both non- and UTF-8-encoded bytes.

2183

*

2184

* When one of the inputs is a compile-time constant and gets folded by the

2185

* compiler, this reduces to an AND and a TEST. On both EBCDIC and ASCII

2186

* machines, 'A' and 'a' differ by a single bit; the same with the upper and

2187

* lower case of all other ASCII-range alphabetics. On ASCII platforms, they

2188

* are 32 apart; on EBCDIC, they are 64. At compile time, this uses an

2189

* exclusive 'or' to find that bit and then inverts it to form a mask, with

2190

* just a single 0, in the bit position where the upper- and lowercase differ.

2191

* */

2192

#define isALPHA_FOLD_EQ(c1, c2) \

2193

(__ASSERT_(isALPHA_A(c1) || isALPHA_A(c2)) \

2194

((c1) & ~('A' ^ 'a')) == ((c2) & ~('A' ^ 'a')))

2195

#define isALPHA_FOLD_NE(c1, c2) (! isALPHA_FOLD_EQ((c1), (c2)))

2196

2197

/*
=head1 Memory Management

The XSUB-writer's interface to the C C<malloc> function.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

In 5.9.3, Newx() and friends replace the older New() API, and drop
the first parameter, I<x>, a debug aid which allowed callers to identify
themselves.  This aid has been superseded by a new build option,
PERL_MEM_LOG (see L<perlhacktips/PERL_MEM_LOG>).  The older API is still
there for use in XS modules supporting older perls.

The XSUB-writer's interface to the C C<malloc> function, with
cast.  See also C<L</Newx>>.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

The XSUB-writer's interface to the C C<malloc> function.  The allocated
memory is zeroed with C<memzero>.  See also C<L</Newx>>.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

The XSUB-writer's interface to the C C<realloc> function.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

The XSUB-writer's interface to the C C<realloc> function, with
cast.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

=for apidoc Am|void|Safefree|void* ptr
The XSUB-writer's interface to the C C<free> function.

This should B<ONLY> be used on memory obtained using L</"Newx"> and friends.

The XSUB-writer's interface to the C C<memmove> function.  The C<src> is the
source, C<dest> is the destination, C<nitems> is the number of items, and
C<type> is the type.  Can do overlapping moves.  See also C<L</Copy>>.

Like C<Move> but returns C<dest>.  Useful
for encouraging compilers to tail-call
optimise.

The XSUB-writer's interface to the C C<memcpy> function.  The C<src> is the
source, C<dest> is the destination, C<nitems> is the number of items, and
C<type> is the type.  May fail on overlapping copies.  See also C<L</Move>>.

Like C<Copy> but returns C<dest>.  Useful
for encouraging compilers to tail-call
optimise.

The XSUB-writer's interface to the C C<memzero> function.  The C<dest> is the
destination, C<nitems> is the number of items, and C<type> is the type.

Like C<Zero> but returns dest.  Useful
for encouraging compilers to tail-call
optimise.

This is an architecture-independent macro to copy one structure to another.

Fill up memory with a byte pattern (a byte repeated over and over
again) that hopefully catches attempts to access uninitialized memory.

PoisonWith(0xAB) for catching access to allocated but uninitialized memory.

PoisonWith(0xEF) for catching access to freed memory.

PoisonWith(0xEF) for catching access to freed memory.

=cut */

/* Maintained for backwards-compatibility only.  Use newSV() instead.
 * The first argument, x, is ignored; only len is forwarded. */
#ifndef PERL_CORE
#define NEWSV(x,len)	newSV(len)
#endif


/* All-ones value of type MEM_SIZE. */
#define MEM_SIZE_MAX ((MEM_SIZE)-1)

/* Rounds n up to a multiple of PERL_STRLEN_ROUNDUP_QUANTUM without any
 * overflow check.  The mask arithmetic only yields a multiple when the
 * quantum is a power of two. */
#define _PERL_STRLEN_ROUNDUP_UNCHECKED(n) \
    (((n) - 1 + PERL_STRLEN_ROUNDUP_QUANTUM) \
     & ~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM - 1))


#ifdef PERL_MALLOC_WRAP

/* This expression will be constant-folded at compile time.  It checks
 * whether or not the type of the count n is so small (e.g. U8 or U16, or
 * U32 on 64-bit systems) that there's no way a wrap-around could occur.
 * As well as avoiding the need for a run-time check in some cases, it's
 * designed to avoid compiler warnings like:
 *     comparison is always false due to limited range of data type
 * It's mathematically equivalent to
 *    max(n) * sizeof(t) > MEM_SIZE_MAX
 */

#  define _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) \
    (  sizeof(MEM_SIZE) < sizeof(n) \
    || sizeof(t) > ((MEM_SIZE)1 << 8*(sizeof(MEM_SIZE) - sizeof(n))))

/* This is written in a slightly odd way to avoid various spurious
 * compiler warnings.  We *want* to write the expression as
 *    _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) && (n > C)
 * (for some compile-time constant C), but even when the LHS
 * constant-folds to false at compile-time, g++ insists on emitting
 * warnings about the RHS (e.g. "comparison is always false"), so instead
 * we write it as
 *
 *    (cond ? n : X) > C
 *
 * where X is a constant with X > C always false.  Choosing a value for X
 * is tricky.  If 0, some compilers will complain about 0 > C always being
 * false; if 1, Coverity complains when n happens to be the constant value
 * '1', that cond ? 1 : 1 has the same value on both branches; so use C
 * for X and hope that nothing else whines.
 */

#  define _MEM_WRAP_WILL_WRAP(n,t) \
      ((_MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) ? (MEM_SIZE)(n) : \
            MEM_SIZE_MAX/sizeof(t)) > MEM_SIZE_MAX/sizeof(t))

/* Croaks via croak_memory_wrap() when n items of type t would wrap. */
#  define MEM_WRAP_CHECK(n,t) \
	(void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
        && (croak_memory_wrap(),0))

/* As MEM_WRAP_CHECK, but croaks with the caller-supplied message a. */
#  define MEM_WRAP_CHECK_1(n,t,a) \
	(void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
	&& (Perl_croak_nocontext("%s",(a)),0))

/* Defined in this branch as well so that the name exists in both
 * configurations (the #else branch below already supplies a no-op).
 * a is passed to Perl_croak_nocontext() as the format, b as its argument. */
#  define MEM_WRAP_CHECK_2(n,t,a,b) \
	(void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
	&& (Perl_croak_nocontext((a),(b)),0))

/* Same as MEM_WRAP_CHECK but with a trailing comma operator, for use as the
 * first operand of a larger comma expression. */
#define MEM_WRAP_CHECK_(n,t) MEM_WRAP_CHECK(n,t),

#define PERL_STRLEN_ROUNDUP(n) \
    ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) \
            ? (croak_memory_wrap(),0) : 0), \
     _PERL_STRLEN_ROUNDUP_UNCHECKED(n))
#else

#define MEM_WRAP_CHECK(n,t)
#define MEM_WRAP_CHECK_1(n,t,a)
#define MEM_WRAP_CHECK_2(n,t,a,b)
#define MEM_WRAP_CHECK_(n,t)

#define PERL_STRLEN_ROUNDUP(n) _PERL_STRLEN_ROUNDUP_UNCHECKED(n)

#endif

#ifdef PERL_MEM_LOG
/*
 * If PERL_MEM_LOG is defined, all Newx()s, Renew()s, and Safefree()s
 * go through functions, which are handy for debugging breakpoints, but
 * which more importantly get the immediate calling environment (file and
 * line number, and C function name if available) passed in.  This info can
 * then be used for logging the calls, for which one gets a sample
 * implementation unless -DPERL_MEM_LOG_NOIMPL is also defined.
 *
 * Known problems:
 * - not all memory allocs get logged, only those
 *   that go through Newx() and derivatives (while all
 *   Safefrees do get logged)
 * - __FILE__ and __LINE__ do not work everywhere
 * - __func__ or __FUNCTION__ even less so
 * - I think more goes on after the perlio frees but
 *   the thing is that STDERR gets closed (as do all
 *   the file descriptors)
 * - no deeper calling stack than the caller of the Newx()
 *   or the kind, but do I look like a C reflection/introspection
 *   utility to you?
 * - the function prototypes for the logging functions
 *   probably should maybe be somewhere else than handy.h
 * - one could consider inlining (macrofying) the logging
 *   for speed, but I am too lazy
 * - one could imagine recording the allocations in a hash,
 *   (keyed by the allocation address?), and maintain that
 *   through reallocs and frees, but how to do that without
 *   any News() happening...?
 * - lots of -Ddefines to get useful/controllable output
 * - lots of ENV reads
 */

# ifdef PERL_CORE
#  ifndef PERL_MEM_LOG_NOIMPL
enum mem_log_type {
  MLT_ALLOC,
  MLT_REALLOC,
  MLT_FREE,
  MLT_NEW_SV,
  MLT_DEL_SV
};
#  endif
#  if defined(PERL_IN_SV_C)  /* those are only used in sv.c */
void Perl_mem_log_new_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname);
void Perl_mem_log_del_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname);
#  endif
# endif

#endif

/* Logging hooks wrapped around the allocator calls in Newx(), Renew() and
 * Safefree().  With PERL_MEM_LOG they forward to the Perl_mem_log_*()
 * functions together with the call site's file, line and function name. */
#ifdef PERL_MEM_LOG
#define MEM_LOG_ALLOC(n,t,a)     Perl_mem_log_alloc(n,sizeof(t),STRINGIFY(t),a,__FILE__,__LINE__,FUNCTION__)
#define MEM_LOG_REALLOC(n,t,v,a) Perl_mem_log_realloc(n,sizeof(t),STRINGIFY(t),v,a,__FILE__,__LINE__,FUNCTION__)
#define MEM_LOG_FREE(a)          Perl_mem_log_free(a,__FILE__,__LINE__,FUNCTION__)
#endif

/* Any hook not defined above simply evaluates to its last argument, a. */
#ifndef MEM_LOG_ALLOC
#define MEM_LOG_ALLOC(n,t,a)     (a)
#endif
#ifndef MEM_LOG_REALLOC
#define MEM_LOG_REALLOC(n,t,v,a) (a)
#endif
#ifndef MEM_LOG_FREE
#define MEM_LOG_FREE(a)          (a)
#endif


/* Allocate room for n items of type t and assign the result to v.
 *   Newx   - safemalloc(), result cast to (t*)
 *   Newxc  - safemalloc(), result cast to (c*)
 *   Newxz  - safecalloc(), result cast to (t*)
 * Each one first runs MEM_WRAP_CHECK_(n,t) and passes the allocator's result
 * through MEM_LOG_ALLOC(). */
#define Newx(v,n,t) \
    (v = (MEM_WRAP_CHECK_(n,t) \
          (t*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t))))))
#define Newxc(v,n,t,c) \
    (v = (MEM_WRAP_CHECK_(n,t) \
          (c*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t))))))
#define Newxz(v,n,t) \
    (v = (MEM_WRAP_CHECK_(n,t) \
          (t*)MEM_LOG_ALLOC(n,t,safecalloc((n),sizeof(t)))))


#ifndef PERL_CORE
/* pre 5.9.x compatibility: the leading x argument is accepted and ignored */
#define New(x,v,n,t)	Newx(v,n,t)
#define Newc(x,v,n,t,c)	Newxc(v,n,t,c)
#define Newz(x,v,n,t)	Newxz(v,n,t)
#endif


/* Resize the allocation held in v to n items of type t via saferealloc() and
 * assign the result back to v.  Renewc casts the result to (c*) instead of
 * (t*).  Both run MEM_WRAP_CHECK_(n,t) first and pass the result through
 * MEM_LOG_REALLOC(). */
#define Renew(v,n,t) \
	  (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t))))))
#define Renewc(v,n,t,c) \
	  (v = (MEM_WRAP_CHECK_(n,t) (c*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t))))))


/* Releases d with safefree(), routing the pointer through MEM_LOG_FREE().
 * Under PERL_POISON a non-null d is freed and then the pointer variable
 * itself is overwritten with the Poison() pattern; a null d does nothing. */
#ifdef PERL_POISON
#define Safefree(d) \
  ((d) ? (void)(safefree(MEM_LOG_FREE((Malloc_t)(d))), Poison(&(d), 1, Malloc_t)) : (void) 0)
#else
#define Safefree(d)	safefree(MEM_LOG_FREE((Malloc_t)(d)))
#endif


/* assert()s that p, viewed as a void pointer, is not null. */
#define perl_assert_ptr(p) assert( ((void*)(p)) != 0 )



/* Typed wrappers around memmove / memcpy / memzero operating on n items of
 * type t.  Each runs MEM_WRAP_CHECK_(n,t) and asserts that its pointer
 * arguments are non-null.  Move/Copy/Zero discard the result; the *D
 * variants evaluate to whatever the underlying call returns. */
#define Move(s,d,n,t) \
    (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), \
     (void)memmove((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define Copy(s,d,n,t) \
    (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), \
     (void)memcpy((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define Zero(d,n,t) \
    (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), \
     (void)memzero((char*)(d), (n) * sizeof(t)))

#define MoveD(s,d,n,t) \
    (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), \
     memmove((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define CopyD(s,d,n,t) \
    (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), \
     memcpy((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define ZeroD(d,n,t) \
    (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), \
     memzero((char*)(d), (n) * sizeof(t)))


/* Fill n items of type t at d with the byte b (via memset).  PoisonNew uses
 * 0xAB, PoisonFree uses 0xEF, and Poison is an alias for PoisonFree. */
#define PoisonWith(d,n,t,b) \
    (MEM_WRAP_CHECK_(n,t) (void)memset((char*)(d), (U8)(b), (n) * sizeof(t)))
#define PoisonNew(d,n,t)	PoisonWith(d,n,t,0xAB)
#define PoisonFree(d,n,t)	PoisonWith(d,n,t,0xEF)
#define Poison(d,n,t)		PoisonFree(d,n,t)


/* Expands to x only when PERL_POISON is defined; otherwise to nothing. */
#ifdef PERL_POISON
#  define PERL_POISON_EXPR(x) x
#else
#  define PERL_POISON_EXPR(x)
#endif


/* Copies the object of type t at s onto the one at d by plain assignment.
 * Note the argument order: source first, destination second. */
#define StructCopy(s,d,t) (*((t*)(d)) = *((t*)(s)))


/* C_ARRAY_LENGTH is the number of elements in the C array (so you
 * want your zero-based indices to be less than but not equal to).
 *
 * C_ARRAY_END is one past the last: half-open/half-closed range,
 * not last-inclusive range.
 *
 * Both rely on sizeof(a), so a must be a real array, not a pointer. */
#define C_ARRAY_LENGTH(a)	(sizeof(a)/sizeof((a)[0]))
#define C_ARRAY_END(a)		((a) + C_ARRAY_LENGTH(a))

/* Perl_va_copy(s, d) copies the va_list s into d; note that the argument
 * order is the reverse of va_copy's.  Uses va_copy or __va_copy when either
 * is available as a macro, otherwise falls back to Copy(). */
#ifdef NEED_VA_COPY
# ifdef va_copy
#  define Perl_va_copy(s, d) va_copy(d, s)
# elif defined(__va_copy)
#  define Perl_va_copy(s, d) __va_copy(d, s)
# else
#  define Perl_va_copy(s, d) Copy(s, d, 1, va_list)
# endif
#endif

/* convenience debug macros: printf-style format fragments and the matching
 * argument fragments for the interpreter pointer.  Without USE_ITHREADS they
 * all expand to nothing. */
#ifdef USE_ITHREADS
#define pTHX_FORMAT  "Perl interpreter: 0x%p"
#define pTHX__FORMAT ", Perl interpreter: 0x%p"
#define pTHX_VALUE_   (void *)my_perl,
#define pTHX_VALUE    (void *)my_perl
#define pTHX__VALUE_ ,(void *)my_perl,
#define pTHX__VALUE  ,(void *)my_perl
#else
#define pTHX_FORMAT
#define pTHX__FORMAT
#define pTHX_VALUE_
#define pTHX_VALUE
#define pTHX__VALUE_
#define pTHX__VALUE
#endif /* USE_ITHREADS */


/* Perl_deprecate was not part of the public API, and did not have a deprecate()
   shortcut macro defined without -DPERL_CORE. Neither codesearch.google.com nor
   CPAN::Unpack show any users outside the core.

   The s, message and when arguments are pasted next to string literals, so
   they must themselves be string literals. */
#ifdef PERL_CORE
#  define deprecate(s) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),    \
                                            "Use of " s " is deprecated")
#  define deprecate_disappears_in(when,message) \
              Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),    \
                               message ", and will disappear in Perl " when)
#  define deprecate_fatal_in(when,message) \
              Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),    \
                               message ". Its use will be fatal in Perl " when)
#endif


/* Internal macros to deal with gids and uids.  The SV slot used depends on
 * the id type: NV when it is wider than an IV, IV when Uid_t_sign /
 * Gid_t_sign is <= 0, and UV otherwise. */
#ifdef PERL_CORE

#  if Uid_t_size > IVSIZE
#    define sv_setuid(sv, uid)       sv_setnv((sv), (NV)(uid))
#    define SvUID(sv)                SvNV(sv)
#  elif Uid_t_sign <= 0
#    define sv_setuid(sv, uid)       sv_setiv((sv), (IV)(uid))
#    define SvUID(sv)                SvIV(sv)
#  else
#    define sv_setuid(sv, uid)       sv_setuv((sv), (UV)(uid))
#    define SvUID(sv)                SvUV(sv)
#  endif /* Uid_t_size */

#  if Gid_t_size > IVSIZE
#    define sv_setgid(sv, gid)       sv_setnv((sv), (NV)(gid))
#    define SvGID(sv)                SvNV(sv)
#  elif Gid_t_sign <= 0
#    define sv_setgid(sv, gid)       sv_setiv((sv), (IV)(gid))
#    define SvGID(sv)                SvIV(sv)
#  else
#    define sv_setgid(sv, gid)       sv_setuv((sv), (UV)(gid))
#    define SvGID(sv)                SvUV(sv)
#  endif /* Gid_t_size */

#endif

#endif /* PERL_HANDY_H_ */

/*
 * ex: set ts=8 sts=4 sw=4 et:
 */