perl5.git.perl.org Git - perl5.git/blame_incremental

Commit	Line	Data
	1	/* handy.h
	2	*
	3	* Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000,
	4	* 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/* IMPORTANT NOTE: Everything whose name begins with an underscore is for
	12	* internal core Perl use only. */
	13
	14	#ifndef PERL_HANDY_H_ /* Guard against nested #inclusion */
	15	#define PERL_HANDY_H_
	16
	17	#ifndef PERL_CORE
	18	# define Null(type) ((type)NULL)
	19
	20	/*
	21	=head1 Handy Values
	22
	23	=for apidoc AmU||Nullch
	24	Null character pointer. (No longer available when C<PERL_CORE> is
	25	defined.)
	26
	27	=for apidoc AmU||Nullsv
	28	Null SV pointer. (No longer available when C<PERL_CORE> is defined.)
	29
	30	=cut
	31	*/
	32
	33	# define Nullch Null(char*)
	34	# define Nullfp Null(PerlIO*)
	35	# define Nullsv Null(SV*)
	36	#endif
	37
	38	#ifdef TRUE
	39	#undef TRUE
	40	#endif
	41	#ifdef FALSE
	42	#undef FALSE
	43	#endif
	44	#define TRUE (1)
	45	#define FALSE (0)
	46
	47	/* The MUTABLE_*() macros cast pointers to the types shown, in such a way
	48	* (compiler permitting) that casting away const-ness will give a warning;
	49	* e.g.:
	50	*
	51	* const SV *sv = ...;
	52	* AV *av1 = (AV*)sv; <== BAD: the const has been silently cast away
	53	* AV *av2 = MUTABLE_AV(sv); <== GOOD: it may warn
	54	*/
	55
	56	#if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
	57	# define MUTABLE_PTR(p) ({ void *_p = (p); _p; })
	58	#else
	59	# define MUTABLE_PTR(p) ((void *) (p))
	60	#endif
	61
	62	#define MUTABLE_AV(p) ((AV *)MUTABLE_PTR(p))
	63	#define MUTABLE_CV(p) ((CV *)MUTABLE_PTR(p))
	64	#define MUTABLE_GV(p) ((GV *)MUTABLE_PTR(p))
	65	#define MUTABLE_HV(p) ((HV *)MUTABLE_PTR(p))
	66	#define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p))
	67	#define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p))
	68
	69	#if defined(I_STDBOOL) && !defined(PERL_BOOL_AS_CHAR)
	70	# include <stdbool.h>
	71	# ifndef HAS_BOOL
	72	# define HAS_BOOL 1
	73	# endif
	74	#endif
	75
	76	/* bool is built-in for g++-2.6.3 and later, which might be used
	77	for extensions. <_G_config.h> defines _G_HAVE_BOOL, but we can't
	78	be sure _G_config.h will be included before this file. _G_config.h
	79	also defines _G_HAVE_BOOL for both gcc and g++, but only g++
	80	actually has bool. Hence, _G_HAVE_BOOL is pretty useless for us.
	81	g++ can be identified by __GNUG__.
	82	Andy Dougherty February 2000
	83	*/
	84	#ifdef __GNUG__ /* GNU g++ has bool built-in */
	85	# ifndef PERL_BOOL_AS_CHAR
	86	# ifndef HAS_BOOL
	87	# define HAS_BOOL 1
	88	# endif
	89	# endif
	90	#endif
	91
	92	#ifndef HAS_BOOL
	93	# ifdef bool
	94	# undef bool
	95	# endif
	96	# define bool char
	97	# define HAS_BOOL 1
	98	#endif
	99
	100	/* cast-to-bool. A simple (bool) cast may not do the right thing: if bool is
	101	* defined as char for example, then the cast from int is
	102	* implementation-defined. (bool)!!(cbool) in a ternary triggers a bug in xlc on
	103	* AIX */
	104	#define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0)
	105
	106	/* Try to figure out __func__ or __FUNCTION__ equivalent, if any.
	107	* XXX Should really be a Configure probe, with HAS__FUNCTION__
	108	* and FUNCTION__ as results.
	109	* XXX Similarly, a Configure probe for __FILE__ and __LINE__ is needed. */
	110	#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C)) /* C99 or close enough. */
	111	# define FUNCTION__ __func__
	112	#elif (defined(USING_MSVC6)) || /* MSVC6 has neither __func__ nor __FUNCTION__ and no good workarounds, either. */ \
	113	(defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tru64, -c99 not known, only -std1). */
	114	# define FUNCTION__ ""
	115	#else
	116	# define FUNCTION__ __FUNCTION__ /* Common extension. */
	117	#endif
	118
	119	/* XXX A note on the perl source internal type system. The
	120	original intent was that I32 be *exactly* 32 bits.
	121
	122	Currently, we only guarantee that I32 is *at least* 32 bits.
	123	Specifically, if int is 64 bits, then so is I32. (This is the case
	124	for the Cray.) This has the advantage of meshing nicely with
	125	standard library calls (where we pass an I32 and the library is
	126	expecting an int), but the disadvantage that an I32 is not 32 bits.
	127	Andy Dougherty August 1996
	128
	129	There is no guarantee that there is *any* integral type with
	130	exactly 32 bits. It is perfectly legal for a system to have
	131	sizeof(short) == sizeof(int) == sizeof(long) == 8.
	132
	133	Similarly, there is no guarantee that I16 and U16 have exactly 16
	134	bits.
	135
	136	For dealing with issues that may arise from various 32/64-bit
	137	systems, we will ask Configure to check out
	138
	139	SHORTSIZE == sizeof(short)
	140	INTSIZE == sizeof(int)
	141	LONGSIZE == sizeof(long)
	142	LONGLONGSIZE == sizeof(long long) (if HAS_LONG_LONG)
	143	PTRSIZE == sizeof(void *)
	144	DOUBLESIZE == sizeof(double)
	145	LONG_DOUBLESIZE == sizeof(long double) (if HAS_LONG_DOUBLE).
	146
	147	*/
	148
	149	#ifdef I_INTTYPES /* e.g. Linux has int64_t without <inttypes.h> */
	150	# include <inttypes.h>
	151	# ifdef INT32_MIN_BROKEN
	152	# undef INT32_MIN
	153	# define INT32_MIN (-2147483647-1)
	154	# endif
	155	# ifdef INT64_MIN_BROKEN
	156	# undef INT64_MIN
	157	# define INT64_MIN (-9223372036854775807LL-1)
	158	# endif
	159	#endif
	160
	161	typedef I8TYPE I8;
	162	typedef U8TYPE U8;
	163	typedef I16TYPE I16;
	164	typedef U16TYPE U16;
	165	typedef I32TYPE I32;
	166	typedef U32TYPE U32;
	167
	168	#ifdef QUADKIND
	169	typedef I64TYPE I64;
	170	typedef U64TYPE U64;
	171	#endif
	172
	173	#if defined(UINT8_MAX) && defined(INT16_MAX) && defined(INT32_MAX)
	174
	175	/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.
	176	Please search CHAR_MAX in perl.h for further details. */
	177	#define U8_MAX UINT8_MAX
	178	#define U8_MIN UINT8_MIN
	179
	180	#define I16_MAX INT16_MAX
	181	#define I16_MIN INT16_MIN
	182	#define U16_MAX UINT16_MAX
	183	#define U16_MIN UINT16_MIN
	184
	185	#define I32_MAX INT32_MAX
	186	#define I32_MIN INT32_MIN
	187	#ifndef UINT32_MAX_BROKEN /* e.g. HP-UX with gcc messes this up */
	188	# define U32_MAX UINT32_MAX
	189	#else
	190	# define U32_MAX 4294967295U
	191	#endif
	192	#define U32_MIN UINT32_MIN
	193
	194	#else
	195
	196	/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.
	197	Please search CHAR_MAX in perl.h for further details. */
	198	#define U8_MAX PERL_UCHAR_MAX
	199	#define U8_MIN PERL_UCHAR_MIN
	200
	201	#define I16_MAX PERL_SHORT_MAX
	202	#define I16_MIN PERL_SHORT_MIN
	203	#define U16_MAX PERL_USHORT_MAX
	204	#define U16_MIN PERL_USHORT_MIN
	205
	206	#if LONGSIZE > 4
	207	# define I32_MAX PERL_INT_MAX
	208	# define I32_MIN PERL_INT_MIN
	209	# define U32_MAX PERL_UINT_MAX
	210	# define U32_MIN PERL_UINT_MIN
	211	#else
	212	# define I32_MAX PERL_LONG_MAX
	213	# define I32_MIN PERL_LONG_MIN
	214	# define U32_MAX PERL_ULONG_MAX
	215	# define U32_MIN PERL_ULONG_MIN
	216	#endif
	217
	218	#endif
	219
	220	/* These C99 typedefs are useful sometimes for, say, loop variables whose
	221	* maximum values are small, but for which speed trumps size. If we have a C99
	222	* compiler, use that. Otherwise, a plain 'int' should be good enough.
	223	*
	224	* Restrict these to core for now until we are more certain this is a good
	225	* idea. */
	226	#if defined(PERL_CORE) || defined(PERL_EXT)
	227	# ifdef I_STDINT
	228	typedef int_fast8_t PERL_INT_FAST8_T;
	229	typedef uint_fast8_t PERL_UINT_FAST8_T;
	230	typedef int_fast16_t PERL_INT_FAST16_T;
	231	typedef uint_fast16_t PERL_UINT_FAST16_T;
	232	# else
	233	typedef int PERL_INT_FAST8_T;
	234	typedef unsigned int PERL_UINT_FAST8_T;
	235	typedef int PERL_INT_FAST16_T;
	236	typedef unsigned int PERL_UINT_FAST16_T;
	237	# endif
	238	#endif
	239
	240	/* log(2) (i.e., log base 10 of 2) is pretty close to 0.30103, just in case
	241	* anyone is grepping for it */
	242	#define BIT_DIGITS(N) (((N)*146)/485 + 1) /* log10(2) =~ 146/485 */
	243	#define TYPE_DIGITS(T) BIT_DIGITS(sizeof(T) * 8)
	244	#define TYPE_CHARS(T) (TYPE_DIGITS(T) + 2) /* sign, NUL */
	245
	246	/* Unused by core; should be deprecated */
	247	#define Ctl(ch) ((ch) & 037)
	248
	249	#if defined(PERL_CORE) || defined(PERL_EXT)
	250	# ifndef MIN
	251	# define MIN(a,b) ((a) < (b) ? (a) : (b))
	252	# endif
	253	# ifndef MAX
	254	# define MAX(a,b) ((a) > (b) ? (a) : (b))
	255	# endif
	256	#endif
	257
	258	/* Returns a boolean as to whether the input unsigned number is a power of 2
	259	* (2**0, 2**1, etc). In other words if it has just a single bit set.
	260	* If not, subtracting 1 would leave the uppermost bit set, so the & would
	261	* yield non-zero */
	262	#if defined(PERL_CORE) || defined(PERL_EXT)
	263	# define isPOWER_OF_2(n) (n && (n & (n-1)) == 0)
	264	#endif
	265
	266	/* This is a helper macro to avoid preprocessor issues, replaced by nothing
	267	* unless under DEBUGGING, where it expands to an assert of its argument,
	268	* followed by a comma (hence the comma operator). If we just used a straight
	269	* assert(), we would get a comma with nothing before it when not DEBUGGING.
	270	*
	271	* We also use empty definition under Coverity since the __ASSERT__
	272	* checks often check for things that Really Cannot Happen, and Coverity
	273	* detects that and gets all excited. */
	274
	275	#if defined(DEBUGGING) && !defined(__COVERITY__)
	276	# define __ASSERT_(statement) assert(statement),
	277	#else
	278	# define __ASSERT_(statement)
	279	#endif
	280
	281	/*
	282	=head1 SV Manipulation Functions
	283
	284	=for apidoc Ama|SV*|newSVpvs|"literal string" s
	285	Like C<newSVpvn>, but takes a literal string instead of a
	286	string/length pair.
	287
	288	=for apidoc Ama|SV*|newSVpvs_flags|"literal string" s|U32 flags
	289	Like C<newSVpvn_flags>, but takes a literal string instead of
	290	a string/length pair.
	291
	292	=for apidoc Ama|SV*|newSVpvs_share|"literal string" s
	293	Like C<newSVpvn_share>, but takes a literal string instead of
	294	a string/length pair and omits the hash parameter.
	295
	296	=for apidoc Am|void|sv_catpvs_flags|SV* sv|"literal string" s|I32 flags
	297	Like C<sv_catpvn_flags>, but takes a literal string instead
	298	of a string/length pair.
	299
	300	=for apidoc Am|void|sv_catpvs_nomg|SV* sv|"literal string" s
	301	Like C<sv_catpvn_nomg>, but takes a literal string instead of
	302	a string/length pair.
	303
	304	=for apidoc Am|void|sv_catpvs|SV* sv|"literal string" s
	305	Like C<sv_catpvn>, but takes a literal string instead of a
	306	string/length pair.
	307
	308	=for apidoc Am|void|sv_catpvs_mg|SV* sv|"literal string" s
	309	Like C<sv_catpvn_mg>, but takes a literal string instead of a
	310	string/length pair.
	311
	312	=for apidoc Am|void|sv_setpvs|SV* sv|"literal string" s
	313	Like C<sv_setpvn>, but takes a literal string instead of a
	314	string/length pair.
	315
	316	=for apidoc Am|void|sv_setpvs_mg|SV* sv|"literal string" s
	317	Like C<sv_setpvn_mg>, but takes a literal string instead of a
	318	string/length pair.
	319
	320	=for apidoc Am|SV *|sv_setref_pvs|"literal string" s
	321	Like C<sv_setref_pvn>, but takes a literal string instead of
	322	a string/length pair.
	323
	324	=head1 Memory Management
	325
	326	=for apidoc Ama|char*|savepvs|"literal string" s
	327	Like C<savepvn>, but takes a literal string instead of a
	328	string/length pair.
	329
	330	=for apidoc Ama|char*|savesharedpvs|"literal string" s
	331	A version of C<savepvs()> which allocates the duplicate string in memory
	332	which is shared between threads.
	333
	334	=head1 GV Functions
	335
	336	=for apidoc Am|HV*|gv_stashpvs|"literal string" name|I32 create
	337	Like C<gv_stashpvn>, but takes a literal string instead of a
	338	string/length pair.
	339
	340	=head1 Hash Manipulation Functions
	341
	342	=for apidoc Am|SV**|hv_fetchs|HV* tb|"literal string" key|I32 lval
	343	Like C<hv_fetch>, but takes a literal string instead of a
	344	string/length pair.
	345
	346	=for apidoc Am|SV**|hv_stores|HV* tb|"literal string" key|SV* val
	347	Like C<hv_store>, but takes a literal string instead of a
	348	string/length pair
	349	and omits the hash parameter.
	350
	351	=head1 Lexer interface
	352
	353	=for apidoc Amx|void|lex_stuff_pvs|"literal string" pv|U32 flags
	354
	355	Like L</lex_stuff_pvn>, but takes a literal string instead of
	356	a string/length pair.
	357
	358	=cut
	359	*/
	360
	361	/* concatenating with "" ensures that only literal strings are accepted as
	362	* argument */
	363	#define STR_WITH_LEN(s) ("" s ""), (sizeof(s)-1)
	364
	365	/* note that STR_WITH_LEN() can't be used as argument to macros or functions
	366	* that under some configurations might be macros, which means that it requires
	367	* the full Perl_xxx(aTHX_ ...) form for any API calls where it's used.
	368	*/
	369
	370	/* STR_WITH_LEN() shortcuts */
	371	#define newSVpvs(str) Perl_newSVpvn(aTHX_ STR_WITH_LEN(str))
	372	#define newSVpvs_flags(str,flags) \
	373	Perl_newSVpvn_flags(aTHX_ STR_WITH_LEN(str), flags)
	374	#define newSVpvs_share(str) Perl_newSVpvn_share(aTHX_ STR_WITH_LEN(str), 0)
	375	#define sv_catpvs_flags(sv, str, flags) \
	376	Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), flags)
	377	#define sv_catpvs_nomg(sv, str) \
	378	Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), 0)
	379	#define sv_catpvs(sv, str) \
	380	Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC)
	381	#define sv_catpvs_mg(sv, str) \
	382	Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC|SV_SMAGIC)
	383	#define sv_setpvs(sv, str) Perl_sv_setpvn(aTHX_ sv, STR_WITH_LEN(str))
	384	#define sv_setpvs_mg(sv, str) Perl_sv_setpvn_mg(aTHX_ sv, STR_WITH_LEN(str))
	385	#define sv_setref_pvs(rv, classname, str) \
	386	Perl_sv_setref_pvn(aTHX_ rv, classname, STR_WITH_LEN(str))
	387	#define savepvs(str) Perl_savepvn(aTHX_ STR_WITH_LEN(str))
	388	#define savesharedpvs(str) Perl_savesharedpvn(aTHX_ STR_WITH_LEN(str))
	389	#define gv_stashpvs(str, create) \
	390	Perl_gv_stashpvn(aTHX_ STR_WITH_LEN(str), create)
	391	#define gv_fetchpvs(namebeg, add, sv_type) \
	392	Perl_gv_fetchpvn_flags(aTHX_ STR_WITH_LEN(namebeg), add, sv_type)
	393	#define gv_fetchpvn(namebeg, len, add, sv_type) \
	394	Perl_gv_fetchpvn_flags(aTHX_ namebeg, len, add, sv_type)
	395	#define sv_catxmlpvs(dsv, str, utf8) \
	396	Perl_sv_catxmlpvn(aTHX_ dsv, STR_WITH_LEN(str), utf8)
	397
	398
	399	#define lex_stuff_pvs(pv,flags) Perl_lex_stuff_pvn(aTHX_ STR_WITH_LEN(pv), flags)
	400
	401	#define get_cvs(str, flags) \
	402	Perl_get_cvn_flags(aTHX_ STR_WITH_LEN(str), (flags))
	403
	404	/*
	405	=head1 Miscellaneous Functions
	406
	407	=for apidoc Am|bool|strNE|char* s1|char* s2
	408	Test two C<NUL>-terminated strings to see if they are different. Returns true
	409	or false.
	410
	411	=for apidoc Am|bool|strEQ|char* s1|char* s2
	412	Test two C<NUL>-terminated strings to see if they are equal. Returns true or
	413	false.
	414
	415	=for apidoc Am|bool|strLT|char* s1|char* s2
	416	Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than the
	417	second, C<s2>. Returns true or false.
	418
	419	=for apidoc Am|bool|strLE|char* s1|char* s2
	420	Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than or
	421	equal to the second, C<s2>. Returns true or false.
	422
	423	=for apidoc Am|bool|strGT|char* s1|char* s2
	424	Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
	425	the second, C<s2>. Returns true or false.
	426
	427	=for apidoc Am|bool|strGE|char* s1|char* s2
	428	Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
	429	or equal to the second, C<s2>. Returns true or false.
	430
	431	=for apidoc Am|bool|strnNE|char* s1|char* s2|STRLEN len
	432	Test two C<NUL>-terminated strings to see if they are different. The C<len>
	433	parameter indicates the number of bytes to compare. Returns true or false. (A
	434	wrapper for C<strncmp>).
	435
	436	=for apidoc Am|bool|strnEQ|char* s1|char* s2|STRLEN len
	437	Test two C<NUL>-terminated strings to see if they are equal. The C<len>
	438	parameter indicates the number of bytes to compare. Returns true or false. (A
	439	wrapper for C<strncmp>).
	440
	441	=for apidoc Am|bool|memEQ|char* s1|char* s2|STRLEN len
	442	Test two buffers (which may contain embedded C<NUL> characters) to see if they
	443	are equal. The C<len> parameter indicates the number of bytes to compare.
	444	Returns true or false.
	445
	446	=for apidoc Am|bool|memNE|char* s1|char* s2|STRLEN len
	447	Test two buffers (which may contain embedded C<NUL> characters) to see if they
	448	are not equal. The C<len> parameter indicates the number of bytes to compare.
	449	Returns true or false.
	450
	451	=cut
	452
	453	New macros should use the following conventions for their names (which are
	454	based on the underlying C library functions):
	455
	456	(mem | str n? ) (EQ | NE | LT | LE | GT | GE | (( BEGIN | END ) P? )) l? s?
	457
	458	Each has two main parameters, string-like operands that are compared
	459	against each other, as specified by the macro name. Some macros may
	460	additionally have one or potentially even two length parameters. If a length
	461	parameter applies to both string parameters, it will be positioned third;
	462	otherwise any length parameter immediately follows the string parameter it
	463	applies to.
	464
	465	If the prefix to the name is 'str', the string parameter is a pointer to a C
	466	language string. Such a string does not contain embedded NUL bytes; its
	467	length may be unknown, but can be calculated by C<strlen()>, since it is
	468	terminated by a NUL, which isn't included in its length.
	469
	470	The optional 'n' following 'str' means that there is a third parameter,
	471	giving the maximum number of bytes to look at in each string. Even if both
	472	strings are longer than the length parameter, those extra bytes will be
	473	unexamined.
	474
	475	The 's' suffix means that the 2nd byte string parameter is a literal C
	476	double-quoted string. Its length will automatically be calculated by the
	477	macro, so no length parameter will ever be needed for it.
	478
	479	If the prefix is 'mem', the string parameters don't have to be C strings;
	480	they may contain embedded NUL bytes, do not necessarily have a terminating
	481	NUL, and their lengths can be known only through other means, which in
	482	practice are additional parameter(s) passed to the function. All 'mem'
	483	functions have at least one length parameter. Barring any 'l' or 's' suffix,
	484	there is a single length parameter, in position 3, which applies to both
	485	string parameters. The 's' suffix means, as described above, that the 2nd
	486	string is a literal double-quoted C string (hence its length is calculated by
	487	the macro, and the length parameter to the function applies just to the first
	488	string parameter, and hence is positioned just after it). An 'l' suffix
	489	means that the 2nd string parameter has its own length parameter, and the
	490	signature will look like memFOOl(s1, l1, s2, l2).
	491
	492	BEGIN (and END) are for testing if the 2nd string is an initial (or final)
	493	substring of the 1st string. 'P' if present indicates that the substring
	494	must be a "proper" one in the mathematical sense that the first one must be
	495	strictly larger than the 2nd.
	496
	497	*/
	498
	499
	500	#define strNE(s1,s2) (strcmp(s1,s2) != 0)

1

/* handy.h

2

*

3

4

* 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others

5

*

6

* You may distribute under the terms of either the GNU General Public

7

* License or the Artistic License, as specified in the README file.

*

*/

/* IMPORTANT NOTE: Everything whose name begins with an underscore is for

12

* internal core Perl use only. */

13

14

#ifndef PERL_HANDY_H_ /* Guard against nested #inclusion */

15

#define PERL_HANDY_H_

16

17

#ifndef PERL_CORE

18

# define Null(type) ((type)NULL)

/*

=head1 Handy Values

=for apidoc AmU||Nullch

24

Null character pointer. (No longer available when C<PERL_CORE> is

25

defined.)

26

27

=for apidoc AmU||Nullsv

28

Null SV pointer. (No longer available when C<PERL_CORE> is defined.)

=cut

*/

# define Nullch Null(char*)

34

# define Nullfp Null(PerlIO*)

35

# define Nullsv Null(SV*)

#endif

#ifdef TRUE

#undef TRUE

#endif

#ifdef FALSE

#undef FALSE

#endif

#define TRUE (1)

#define FALSE (0)

/* The MUTABLE_*() macros cast pointers to the types shown, in such a way

48

* (compiler permitting) that casting away const-ness will give a warning;

49

* e.g.:

50

*

51

* const SV *sv = ...;

52

* AV *av1 = (AV*)sv; <== BAD: the const has been silently cast away

53

* AV *av2 = MUTABLE_AV(sv); <== GOOD: it may warn

54

*/

55

56

#if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)

57

# define MUTABLE_PTR(p) ({ void *_p = (p); _p; })

58

#else

59

# define MUTABLE_PTR(p) ((void *) (p))

60

#endif

61

62

#define MUTABLE_AV(p) ((AV *)MUTABLE_PTR(p))

63

#define MUTABLE_CV(p) ((CV *)MUTABLE_PTR(p))

64

#define MUTABLE_GV(p) ((GV *)MUTABLE_PTR(p))

65

#define MUTABLE_HV(p) ((HV *)MUTABLE_PTR(p))

66

#define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p))

67

#define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p))

68

69

#if defined(I_STDBOOL) && !defined(PERL_BOOL_AS_CHAR)

70

# include <stdbool.h>

# ifndef HAS_BOOL

# define HAS_BOOL 1

# endif

#endif

/* bool is built-in for g++-2.6.3 and later, which might be used

77

for extensions. <_G_config.h> defines _G_HAVE_BOOL, but we can't

78

be sure _G_config.h will be included before this file. _G_config.h

79

also defines _G_HAVE_BOOL for both gcc and g++, but only g++

80

actually has bool. Hence, _G_HAVE_BOOL is pretty useless for us.

81

g++ can be identified by __GNUG__.

82

Andy Dougherty February 2000

83

*/

84

#ifdef __GNUG__ /* GNU g++ has bool built-in */

85

# ifndef PERL_BOOL_AS_CHAR

# ifndef HAS_BOOL

# define HAS_BOOL 1

# endif

# endif

#endif

#ifndef HAS_BOOL

# ifdef bool

# undef bool

# endif

# define bool char

# define HAS_BOOL 1

#endif

/* cast-to-bool. A simple (bool) cast may not do the right thing: if bool is

101

* defined as char for example, then the cast from int is

102

* implementation-defined (bool)!!(cbool) in a ternary triggers a bug in xlc on

103

* AIX */

104

#define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0)

105

106

/* Try to figure out __func__ or __FUNCTION__ equivalent, if any.

107

* XXX Should really be a Configure probe, with HAS__FUNCTION__

108

* and FUNCTION__ as results.

109

* XXX Similarly, a Configure probe for __FILE__ and __LINE__ is needed. */

110

#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C)) /* C99 or close enough. */

111

# define FUNCTION__ __func__

112

#elif (defined(USING_MSVC6)) || /* MSVC6 has neither __func__ nor __FUNCTION__ and no good workarounds, either. */ \

113

(defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tru64, -c99 not known, only -std1). */

114

# define FUNCTION__ ""

115

#else

116

# define FUNCTION__ __FUNCTION__ /* Common extension. */

117

#endif

118

119

/* XXX A note on the perl source internal type system. The

120

original intent was that I32 be *exactly* 32 bits.

121

122

Currently, we only guarantee that I32 is *at least* 32 bits.

123

Specifically, if int is 64 bits, then so is I32. (This is the case

124

for the Cray.) This has the advantage of meshing nicely with

125

standard library calls (where we pass an I32 and the library is

126

expecting an int), but the disadvantage that an I32 is not 32 bits.

127

Andy Dougherty August 1996

128

129

There is no guarantee that there is *any* integral type with

130

exactly 32 bits. It is perfectly legal for a system to have

131

sizeof(short) == sizeof(int) == sizeof(long) == 8.

132

133

Similarly, there is no guarantee that I16 and U16 have exactly 16

134

bits.

135

136

For dealing with issues that may arise from various 32/64-bit

137

systems, we will ask Configure to check out

138

139

SHORTSIZE == sizeof(short)

140

INTSIZE == sizeof(int)

141

LONGSIZE == sizeof(long)

142

LONGLONGSIZE == sizeof(long long) (if HAS_LONG_LONG)

143

PTRSIZE == sizeof(void *)

144

DOUBLESIZE == sizeof(double)

145

LONG_DOUBLESIZE == sizeof(long double) (if HAS_LONG_DOUBLE).

*/

#ifdef I_INTTYPES /* e.g. Linux has int64_t without <inttypes.h> */

150

# include <inttypes.h>

151

# ifdef INT32_MIN_BROKEN

152

# undef INT32_MIN

153

# define INT32_MIN (-2147483647-1)

154

# endif

155

# ifdef INT64_MIN_BROKEN

156

# undef INT64_MIN

157

# define INT64_MIN (-9223372036854775807LL-1)

# endif

#endif

typedef I8TYPE I8;

typedef U8TYPE U8;

typedef I16TYPE I16;

typedef U16TYPE U16;

typedef I32TYPE I32;

typedef U32TYPE U32;

#ifdef QUADKIND

typedef I64TYPE I64;

typedef U64TYPE U64;

#endif

#if defined(UINT8_MAX) && defined(INT16_MAX) && defined(INT32_MAX)

174

175

/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.

176

Please search CHAR_MAX in perl.h for further details. */

177

#define U8_MAX UINT8_MAX

178

#define U8_MIN UINT8_MIN

179

180

#define I16_MAX INT16_MAX

181

#define I16_MIN INT16_MIN

182

#define U16_MAX UINT16_MAX

183

#define U16_MIN UINT16_MIN

184

185

#define I32_MAX INT32_MAX

186

#define I32_MIN INT32_MIN

187

#ifndef UINT32_MAX_BROKEN /* e.g. HP-UX with gcc messes this up */

188

# define U32_MAX UINT32_MAX

189

#else

190

# define U32_MAX 4294967295U

191

#endif

192

#define U32_MIN UINT32_MIN

#else

/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.

197

Please search CHAR_MAX in perl.h for further details. */

198

#define U8_MAX PERL_UCHAR_MAX

199

#define U8_MIN PERL_UCHAR_MIN

200

201

#define I16_MAX PERL_SHORT_MAX

202

#define I16_MIN PERL_SHORT_MIN

203

#define U16_MAX PERL_USHORT_MAX

204

#define U16_MIN PERL_USHORT_MIN

205

206

#if LONGSIZE > 4

207

# define I32_MAX PERL_INT_MAX

208

# define I32_MIN PERL_INT_MIN

209

# define U32_MAX PERL_UINT_MAX

210

# define U32_MIN PERL_UINT_MIN

211

#else

212

# define I32_MAX PERL_LONG_MAX

213

# define I32_MIN PERL_LONG_MIN

214

# define U32_MAX PERL_ULONG_MAX

215

# define U32_MIN PERL_ULONG_MIN

#endif

#endif

/* These C99 typedefs are useful sometimes for, say, loop variables whose

221

* maximum values are small, but for which speed trumps size. If we have a C99

222

* compiler, use that. Otherwise, a plain 'int' should be good enough.

223

*

224

* Restrict these to core for now until we are more certain this is a good

225

* idea. */

226

#if defined(PERL_CORE) || defined(PERL_EXT)

227

# ifdef I_STDINT

228

typedef int_fast8_t PERL_INT_FAST8_T;

229

typedef uint_fast8_t PERL_UINT_FAST8_T;

230

typedef int_fast16_t PERL_INT_FAST16_T;

231

typedef uint_fast16_t PERL_UINT_FAST16_T;

232

# else

233

typedef int PERL_INT_FAST8_T;

234

typedef unsigned int PERL_UINT_FAST8_T;

235

typedef int PERL_INT_FAST16_T;

236

typedef unsigned int PERL_UINT_FAST16_T;

# endif

#endif

/* log(2) (i.e., log base 10 of 2) is pretty close to 0.30103, just in case

241

* anyone is grepping for it */

242

#define BIT_DIGITS(N) (((N)*146)/485 + 1) /* log10(2) =~ 146/485 */

243

#define TYPE_DIGITS(T) BIT_DIGITS(sizeof(T) * 8)

244

#define TYPE_CHARS(T) (TYPE_DIGITS(T) + 2) /* sign, NUL */

245

246

/* Unused by core; should be deprecated */

247

#define Ctl(ch) ((ch) & 037)

248

249

#if defined(PERL_CORE) || defined(PERL_EXT)

250

# ifndef MIN

251

# define MIN(a,b) ((a) < (b) ? (a) : (b))

252

# endif

253

# ifndef MAX

254

# define MAX(a,b) ((a) > (b) ? (a) : (b))

# endif

#endif

/* Returns a boolean as to whether the input unsigned number is a power of 2

259

* (2**0, 2**1, etc). In other words if it has just a single bit set.

260

* If not, subtracting 1 would leave the uppermost bit set, so the & would

261

* yield non-zero */

262

#if defined(PERL_CORE) || defined(PERL_EXT)

263

# define isPOWER_OF_2(n) (n && (n & (n-1)) == 0)

264

#endif

265

266

/* This is a helper macro to avoid preprocessor issues, replaced by nothing

267

* unless under DEBUGGING, where it expands to an assert of its argument,

268

* followed by a comma (hence the comma operator). If we just used a straight

269

* assert(), we would get a comma with nothing before it when not DEBUGGING.

270

*

271

* We also use empty definition under Coverity since the __ASSERT__

272

* checks often check for things that Really Cannot Happen, and Coverity

273

* detects that and gets all excited. */

274

275

#if defined(DEBUGGING) && !defined(__COVERITY__)

276

# define __ASSERT_(statement) assert(statement),

277

#else

278

# define __ASSERT_(statement)

#endif

/*

=head1 SV Manipulation Functions

283

284

=for apidoc Ama|SV*|newSVpvs|"literal string" s

285

Like C<newSVpvn>, but takes a literal string instead of a

286

string/length pair.

287

288

=for apidoc Ama|SV*|newSVpvs_flags|"literal string" s|U32 flags

289

Like C<newSVpvn_flags>, but takes a literal string instead of

290

a string/length pair.

291

292

=for apidoc Ama|SV*|newSVpvs_share|"literal string" s

293

Like C<newSVpvn_share>, but takes a literal string instead of

294

a string/length pair and omits the hash parameter.

295

296

297

Like C<sv_catpvn_flags>, but takes a literal string instead

298

of a string/length pair.

299

300

=for apidoc Am|void|sv_catpvs_nomg|SV* sv|"literal string" s

301

Like C<sv_catpvn_nomg>, but takes a literal string instead of

302

a string/length pair.

303

304

=for apidoc Am|void|sv_catpvs|SV* sv|"literal string" s

305

Like C<sv_catpvn>, but takes a literal string instead of a

306

string/length pair.

307

308

=for apidoc Am|void|sv_catpvs_mg|SV* sv|"literal string" s

309

Like C<sv_catpvn_mg>, but takes a literal string instead of a

310

string/length pair.

311

312

=for apidoc Am|void|sv_setpvs|SV* sv|"literal string" s

313

Like C<sv_setpvn>, but takes a literal string instead of a

314

string/length pair.

315

316

=for apidoc Am|void|sv_setpvs_mg|SV* sv|"literal string" s

317

Like C<sv_setpvn_mg>, but takes a literal string instead of a

318

string/length pair.

319

320

=for apidoc Am|SV *|sv_setref_pvs|"literal string" s

321

Like C<sv_setref_pvn>, but takes a literal string instead of

322

a string/length pair.

323

324

=head1 Memory Management

325

326

=for apidoc Ama|char*|savepvs|"literal string" s

327

Like C<savepvn>, but takes a literal string instead of a

328

string/length pair.

329

330

=for apidoc Ama|char*|savesharedpvs|"literal string" s

331

A version of C<savepvs()> which allocates the duplicate string in memory

332

which is shared between threads.

=head1 GV Functions

=for apidoc Am|HV*|gv_stashpvs|"literal string" name|I32 create

337

Like C<gv_stashpvn>, but takes a literal string instead of a

338

string/length pair.

339

340

=head1 Hash Manipulation Functions

341

342

343

Like C<hv_fetch>, but takes a literal string instead of a

string/length pair.

Like C<hv_store>, but takes a literal string instead of a

348

string/length pair

349

and omits the hash parameter.

350

351

=head1 Lexer interface

352

353

=for apidoc Amx|void|lex_stuff_pvs|"literal string" pv|U32 flags

354

355

Like L</lex_stuff_pvn>, but takes a literal string instead of

356

a string/length pair.

=cut

*/

/* concatenating with "" ensures that only literal strings are accepted as

362

* argument */

363

/* Expands to TWO comma-separated arguments: the literal itself, followed by
 * its length in bytes (the terminating NUL is not counted). */
#define STR_WITH_LEN(s)  ("" s ""), (sizeof("" s "") - 1)

364

365

/* note that STR_WITH_LEN() can't be used as argument to macros or functions

366

* that under some configurations might be macros, which means that it requires

367

* the full Perl_xxx(aTHX_ ...) form for any API calls where it's used.

368

*/

369

370

/* STR_WITH_LEN() shortcuts */

371

/* SV creation from a literal string */
#define newSVpvs(str) Perl_newSVpvn(aTHX_ STR_WITH_LEN(str))
#define newSVpvs_flags(str,flags) \
    Perl_newSVpvn_flags(aTHX_ STR_WITH_LEN(str), flags)
#define newSVpvs_share(str) Perl_newSVpvn_share(aTHX_ STR_WITH_LEN(str), 0)

/* Appending a literal string to an SV; the variants differ only in the
 * magic flags handed to Perl_sv_catpvn_flags() */
#define sv_catpvs_flags(sv, str, flags) \
    Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), flags)
#define sv_catpvs_nomg(sv, str) \
    Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), 0)
#define sv_catpvs(sv, str) \
    Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC)
#define sv_catpvs_mg(sv, str) \
    Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC|SV_SMAGIC)

/* Assigning a literal string to an SV */
#define sv_setpvs(sv, str) Perl_sv_setpvn(aTHX_ sv, STR_WITH_LEN(str))
#define sv_setpvs_mg(sv, str) Perl_sv_setpvn_mg(aTHX_ sv, STR_WITH_LEN(str))
#define sv_setref_pvs(rv, classname, str) \
    Perl_sv_setref_pvn(aTHX_ rv, classname, STR_WITH_LEN(str))

/* Duplicating a literal string */
#define savepvs(str) Perl_savepvn(aTHX_ STR_WITH_LEN(str))
#define savesharedpvs(str) Perl_savesharedpvn(aTHX_ STR_WITH_LEN(str))

/* Stash / GV lookup.  Note that gv_fetchpvn() takes an explicit
 * pointer/length pair rather than a literal. */
#define gv_stashpvs(str, create) \
    Perl_gv_stashpvn(aTHX_ STR_WITH_LEN(str), create)
#define gv_fetchpvs(namebeg, add, sv_type) \
    Perl_gv_fetchpvn_flags(aTHX_ STR_WITH_LEN(namebeg), add, sv_type)
#define gv_fetchpvn(namebeg, len, add, sv_type) \
    Perl_gv_fetchpvn_flags(aTHX_ namebeg, len, add, sv_type)
#define sv_catxmlpvs(dsv, str, utf8) \
    Perl_sv_catxmlpvn(aTHX_ dsv, STR_WITH_LEN(str), utf8)

#define lex_stuff_pvs(pv,flags) Perl_lex_stuff_pvn(aTHX_ STR_WITH_LEN(pv), flags)

#define get_cvs(str, flags) \
    Perl_get_cvn_flags(aTHX_ STR_WITH_LEN(str), (flags))

403

404

/*

405

=head1 Miscellaneous Functions

406

407

=for apidoc Am|bool|strNE|char* s1|char* s2

408

Test two C<NUL>-terminated strings to see if they are different. Returns true

409

or false.

410

411

=for apidoc Am|bool|strEQ|char* s1|char* s2

412

Test two C<NUL>-terminated strings to see if they are equal. Returns true or

413

false.

414

415

=for apidoc Am|bool|strLT|char* s1|char* s2

416

Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than the

417

second, C<s2>. Returns true or false.

418

419

=for apidoc Am|bool|strLE|char* s1|char* s2

420

Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than or

421

equal to the second, C<s2>. Returns true or false.

422

423

=for apidoc Am|bool|strGT|char* s1|char* s2

424

Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than

425

the second, C<s2>. Returns true or false.

426

427

=for apidoc Am|bool|strGE|char* s1|char* s2

428

Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than

429

or equal to the second, C<s2>. Returns true or false.

430

431

432

Test two C<NUL>-terminated strings to see if they are different. The C<len>

433

parameter indicates the number of bytes to compare. Returns true or false. (A

434

wrapper for C<strncmp>).

435

436

437

Test two C<NUL>-terminated strings to see if they are equal. The C<len>

438

parameter indicates the number of bytes to compare. Returns true or false. (A

439

wrapper for C<strncmp>).

440

441

442

Test two buffers (which may contain embedded C<NUL> characters), to see if they

443

are equal. The C<len> parameter indicates the number of bytes to compare.

444

Returns true (non-zero) if equal, or false (zero) if non-equal.

445

446

447

Test two buffers (which may contain embedded C<NUL> characters), to see if they

448

are not equal. The C<len> parameter indicates the number of bytes to compare.

449

Returns true (non-zero) if non-equal, or false (zero) if equal.

=cut

New macros should use the following conventions for their names (which are

454

based on the underlying C library functions):

455

456

(mem | str n? ) (EQ | NE | LT | LE | GT | GE | (( BEGIN | END ) P? )) l? s?

457

458

Each has two main parameters, string-like operands that are compared

459

against each other, as specified by the macro name. Some macros may

460

additionally have one or potentially even two length parameters. If a length

461

parameter applies to both string parameters, it will be positioned third;

462

otherwise any length parameter immediately follows the string parameter it

463

applies to.

464

465

If the prefix to the name is 'str', the string parameter is a pointer to a C

466

language string. Such a string does not contain embedded NUL bytes; its

467

length may be unknown, but can be calculated by C<strlen()>, since it is

468

terminated by a NUL, which isn't included in its length.

469

470

The optional 'n' following 'str' means that there is a third parameter,

471

giving the maximum number of bytes to look at in each string. Even if both

472

strings are longer than the length parameter, those extra bytes will be

473

unexamined.

474

475

The 's' suffix means that the 2nd byte string parameter is a literal C

476

double-quoted string. Its length will automatically be calculated by the

477

macro, so no length parameter will ever be needed for it.

478

479

If the prefix is 'mem', the string parameters don't have to be C strings;

480

they may contain embedded NUL bytes, do not necessarily have a terminating

481

NUL, and their lengths can be known only through other means, which in

482

practice are additional parameter(s) passed to the function. All 'mem'

483

functions have at least one length parameter. Barring any 'l' or 's' suffix,

484

there is a single length parameter, in position 3, which applies to both

485

string parameters. The 's' suffix means, as described above, that the 2nd

486

string is a literal double-quoted C string (hence its length is calculated by

487

the macro, and the length parameter to the function applies just to the first

488

string parameter, and hence is positioned just after it). An 'l' suffix

489

means that the 2nd string parameter has its own length parameter, and the

490

signature will look like memFOOl(s1, l1, s2, l2).

491

492

BEGIN (and END) are for testing if the 2nd string is an initial (or final)

493

substring of the 1st string. 'P' if present indicates that the substring

494

must be a "proper" one in the mathematical sense that the first one must be

495

strictly larger than the 2nd.

*/

/* Comparisons of two NUL-terminated strings; each yields 1 (true) or 0. */
#define strNE(s1,s2) (strcmp(s1,s2) != 0)
#define strEQ(s1,s2) (strcmp(s1,s2) == 0)
#define strLT(s1,s2) (strcmp(s1,s2) < 0)
#define strLE(s1,s2) (strcmp(s1,s2) <= 0)
#define strGT(s1,s2) (strcmp(s1,s2) > 0)
#define strGE(s1,s2) (strcmp(s1,s2) >= 0)

/* As above, but examine at most 'l' bytes of each string */
#define strnNE(s1,s2,l) (strncmp(s1,s2,l) != 0)
#define strnEQ(s1,s2,l) (strncmp(s1,s2,l) == 0)

509

510

/* Byte-wise comparison of two buffers of 'l' bytes each (embedded NULs are
 * allowed); each yields 1 (true) or 0. */
#define memNE(s1,s2,l) (memcmp(s1,s2,l) != 0)
#define memEQ(s1,s2,l) (memcmp(s1,s2,l) == 0)

/* memEQ and memNE where second comparand is a string constant.  'l' is the
 * length of s1 only; the lengths must match for the buffers to be equal. */
#define memEQs(s1, l, s2) \
        (((sizeof(s2)-1) == (l)) && memEQ((s1), ("" s2 ""), (sizeof(s2)-1)))
#define memNEs(s1, l, s2) (! memEQs(s1, l, s2))

517

518

/* Keep these private until we decide it was a good idea */

519

#if defined(PERL_CORE) || defined(PERL_EXT) || defined(PERL_EXT_POSIX)

520

521

/* True iff the NUL-terminated string s1 starts with the literal s2 */
#define strBEGINs(s1,s2)  (0 == strncmp((s1), ("" s2 ""), sizeof(s2) - 1))

522

523

/* True iff the literal s2 is an initial substring of the l-byte buffer s1.
 * The length test comes first so memEQ() never reads past the end of s1. */
#define memBEGINs(s1, l, s2) \
            (   (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1 \
             && memEQ(s1, "" s2 "", sizeof(s2)-1))
/* As memBEGINs, but s2 must be a *proper* initial substring: s1 has to be
 * strictly longer than s2. */
#define memBEGINPs(s1, l, s2) \
            (   (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) - 1 \
             && memEQ(s1, "" s2 "", sizeof(s2)-1))

529

/* True iff the literal s2 is a final substring of the l-byte buffer s1.
 * The length test comes first so the pointer arithmetic never goes before
 * the start of s1.  s1 is parenthesized so that an expression argument
 * (e.g. a conditional) is offset as a whole. */
#define memENDs(s1, l, s2) \
            (   (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1 \
             && memEQ((s1) + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1))
/* As memENDs, but s2 must be a *proper* final substring: s1 has to be
 * strictly longer than s2, i.e. l > strlen(s2) == sizeof(s2) - 1. */
#define memENDPs(s1, l, s2) \
            (   (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) - 1 \
             && memEQ((s1) + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1))

535

#endif /* End of making macros private */

536

537

/* Ordering comparisons of two buffers of 'l' bytes each, per memcmp() */
#define memLT(s1,s2,l) (memcmp(s1,s2,l) < 0)
#define memLE(s1,s2,l) (memcmp(s1,s2,l) <= 0)
#define memGT(s1,s2,l) (memcmp(s1,s2,l) > 0)
#define memGE(s1,s2,l) (memcmp(s1,s2,l) >= 0)

/*

* Character classes.

*

* Unfortunately, the introduction of locales means that we

546

* can't trust isupper(), etc. to tell the truth. And when

547

* it comes to /\w+/ with tainting enabled, we *must* be able

548

* to trust our character classes.

549

*

550

* Therefore, the default tests in the text of Perl will be

551

* independent of locale. Any code that wants to depend on

552

* the current locale will use the tests that begin with "lc".

553

*/

554

555

/* Define CTYPE256 when HAS_SETLOCALE is defined, unless something has
 * already defined it.  XXX Is there a better test for this? */
#if defined(HAS_SETLOCALE) && !defined(CTYPE256)
#  define CTYPE256
#endif

/*

=head1 Character classification

564

This section is about functions (really macros) that classify characters

565

into types, such as punctuation versus alphabetic, etc. Most of these are

566

analogous to regular expression character classes. (See

567

L<perlrecharclass/POSIX Character Classes>.) There are several variants for

568

each class. (Not all macros have all variants; each item below lists the

569

ones valid for it.) None are affected by C<use bytes>, and only the ones

570

with C<LC> in the name are affected by the current locale.

571

572

The base function, e.g., C<isALPHA()>, takes an octet (either a C<char> or a

573

C<U8>) as input and returns a boolean as to whether or not the character

574

represented by that octet is (or on non-ASCII platforms, corresponds to) an

575

ASCII character in the named class based on platform, Unicode, and Perl rules.

576

If the input is a number that doesn't fit in an octet, FALSE is returned.

577

578

Variant C<isI<FOO>_A> (e.g., C<isALPHA_A()>) is identical to the base function

579

with no suffix C<"_A">. This variant is used to emphasize by its name that

580

only ASCII-range characters can return TRUE.

581

582

Variant C<isI<FOO>_L1> imposes the Latin-1 (or EBCDIC equivalent) character set

583

onto the platform. That is, the code points that are ASCII are unaffected,

584

since ASCII is a subset of Latin-1. But the non-ASCII code points are treated

585

as if they are Latin-1 characters. For example, C<isWORDCHAR_L1()> will return

586

true when called with the code point 0xDF, which is a word character in both

587

ASCII and EBCDIC (though it represents different characters in each).

588

589

Variant C<isI<FOO>_uvchr> is like the C<isI<FOO>_L1> variant, but accepts any UV code

590

point as input. If the code point is larger than 255, Unicode rules are used

591

to determine if it is in the character class. For example,

592

C<isWORDCHAR_uvchr(0x100)> returns TRUE, since 0x100 is LATIN CAPITAL LETTER A

593

WITH MACRON in Unicode, and is a word character.

594

595

Variant C<isI<FOO>_utf8_safe> is like C<isI<FOO>_uvchr>, but is used for UTF-8

596

encoded strings. Each call classifies one character, even if the string

597

contains many. This variant takes two parameters. The first, C<s>, is a

598

pointer to the first byte of the character to be classified. (Recall that it

599

may take more than one byte to represent a character in UTF-8 strings.) The

600

second parameter, C<e>, points to anywhere in the string beyond the first

601

character, up to one byte past the end of the entire string. The suffix

602

C<_safe> in the function's name indicates that it will not attempt to read

603

beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is true (this

604

is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the input

605

character is malformed in some way, the program may croak, or the function may

606

return FALSE, at the discretion of the implementation, and subject to change in

607

future releases.

608

609

Variant C<isI<FOO>_utf8> is like C<isI<FOO>_utf8_safe>, but takes just a single

610

parameter, C<s>, which has the same meaning as the corresponding parameter does

611

in C<isI<FOO>_utf8_safe>. The function therefore can't check if it is reading

612

beyond the end of the string. Starting in Perl v5.30, it will take a second

613

parameter, becoming a synonym for C<isI<FOO>_utf8_safe>. At that time every

614

program that uses it will have to be changed to successfully compile. In the

615

meantime, the first runtime call to C<isI<FOO>_utf8> from each call point in the

616

program will raise a deprecation warning, enabled by default. You can convert

617

your program now to use C<isI<FOO>_utf8_safe>, and avoid the warnings, and get an

618

extra measure of protection, or you can wait until v5.30, when you'll be forced

619

to add the C<e> parameter.

620

621

Variant C<isI<FOO>_LC> is like the C<isI<FOO>_A> and C<isI<FOO>_L1> variants, but the

622

result is based on the current locale, which is what C<LC> in the name stands

623

for. If Perl can determine that the current locale is a UTF-8 locale, it uses

624

the published Unicode rules; otherwise, it uses the C library function that

625

gives the named classification. For example, C<isDIGIT_LC()> when not in a

626

UTF-8 locale returns the result of calling C<isdigit()>. FALSE is always

627

returned if the input won't fit into an octet. On some platforms where the C

628

library function is known to be defective, Perl changes its result to follow

629

the POSIX standard's rules.

630

631

Variant C<isI<FOO>_LC_uvchr> is like C<isI<FOO>_LC>, but is defined on any UV. It

632

returns the same as C<isI<FOO>_LC> for input code points less than 256, and

633

returns the hard-coded, not-affected-by-locale, Unicode results for larger ones.

634

635

Variant C<isI<FOO>_LC_utf8_safe> is like C<isI<FOO>_LC_uvchr>, but is used for UTF-8

636

encoded strings. Each call classifies one character, even if the string

637

contains many. This variant takes two parameters. The first, C<s>, is a

638

pointer to the first byte of the character to be classified. (Recall that it

639

may take more than one byte to represent a character in UTF-8 strings.) The

640

second parameter, C<e>, points to anywhere in the string beyond the first

641

character, up to one byte past the end of the entire string. The suffix

642

C<_safe> in the function's name indicates that it will not attempt to read

643

beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is true (this

644

is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the input

645

character is malformed in some way, the program may croak, or the function may

646

return FALSE, at the discretion of the implementation, and subject to change in

647

future releases.

648

649

Variant C<isI<FOO>_LC_utf8> is like C<isI<FOO>_LC_utf8_safe>, but takes just a single

650

parameter, C<s>, which has the same meaning as the corresponding parameter does

651

in C<isI<FOO>_LC_utf8_safe>. The function therefore can't check if it is reading

652

beyond the end of the string. Starting in Perl v5.30, it will take a second

653

parameter, becoming a synonym for C<isI<FOO>_LC_utf8_safe>. At that time every

654

program that uses it will have to be changed to successfully compile. In the

655

meantime, the first runtime call to C<isI<FOO>_LC_utf8> from each call point in

656

the program will raise a deprecation warning, enabled by default. You can

657

convert your program now to use C<isI<FOO>_LC_utf8_safe>, and avoid the warnings,

658

and get an extra measure of protection, or you can wait until v5.30, when

659

you'll be forced to add the C<e> parameter.

660

661

=for apidoc Am|bool|isALPHA|char ch

662

Returns a boolean indicating whether the specified character is an

663

alphabetic character, analogous to C<m/[[:alpha:]]/>.

664

See the L<top of this section|/Character classification> for an explanation of

665

variants

666

C<isALPHA_A>, C<isALPHA_L1>, C<isALPHA_uvchr>, C<isALPHA_utf8_safe>,

667

C<isALPHA_LC>, C<isALPHA_LC_uvchr>, and C<isALPHA_LC_utf8_safe>.

668

669

=for apidoc Am|bool|isALPHANUMERIC|char ch

670

Returns a boolean indicating whether the specified character is either an

671

alphabetic character or decimal digit, analogous to C<m/[[:alnum:]]/>.

672

See the L<top of this section|/Character classification> for an explanation of

673

variants

674

C<isALPHANUMERIC_A>, C<isALPHANUMERIC_L1>, C<isALPHANUMERIC_uvchr>,

675

C<isALPHANUMERIC_utf8_safe>, C<isALPHANUMERIC_LC>, C<isALPHANUMERIC_LC_uvchr>,

676

and C<isALPHANUMERIC_LC_utf8_safe>.

677

678

=for apidoc Am|bool|isASCII|char ch

679

Returns a boolean indicating whether the specified character is one of the 128

680

characters in the ASCII character set, analogous to C<m/[[:ascii:]]/>.

681

On non-ASCII platforms, it returns TRUE iff this

682

character corresponds to an ASCII character. Variants C<isASCII_A()> and

683

C<isASCII_L1()> are identical to C<isASCII()>.

684

See the L<top of this section|/Character classification> for an explanation of

685

variants

686

C<isASCII_uvchr>, C<isASCII_utf8_safe>, C<isASCII_LC>, C<isASCII_LC_uvchr>, and

687

C<isASCII_LC_utf8_safe>. Note, however, that some platforms do not have the C

688

library routine C<isascii()>. In these cases, the variants whose names contain

689

C<LC> are the same as the corresponding ones without.

690

691

Also note, that because all ASCII characters are UTF-8 invariant (meaning they

692

have the exact same representation (always a single byte) whether encoded in

693

UTF-8 or not), C<isASCII> will give the correct results when called with any

694

byte in any string encoded or not in UTF-8. And similarly C<isASCII_utf8_safe>

695

will work properly on any string encoded or not in UTF-8.

696

697

=for apidoc Am|bool|isBLANK|char ch

698

Returns a boolean indicating whether the specified character is a

699

character considered to be a blank, analogous to C<m/[[:blank:]]/>.

700

See the L<top of this section|/Character classification> for an explanation of

701

variants

702

C<isBLANK_A>, C<isBLANK_L1>, C<isBLANK_uvchr>, C<isBLANK_utf8_safe>,

703

C<isBLANK_LC>, C<isBLANK_LC_uvchr>, and C<isBLANK_LC_utf8_safe>. Note,

704

however, that some platforms do not have the C library routine

705

C<isblank()>. In these cases, the variants whose names contain C<LC> are

706

the same as the corresponding ones without.

707

708

=for apidoc Am|bool|isCNTRL|char ch

709

Returns a boolean indicating whether the specified character is a

710

control character, analogous to C<m/[[:cntrl:]]/>.

711

See the L<top of this section|/Character classification> for an explanation of

712

variants

713

C<isCNTRL_A>, C<isCNTRL_L1>, C<isCNTRL_uvchr>, C<isCNTRL_utf8_safe>,

714

C<isCNTRL_LC>, C<isCNTRL_LC_uvchr>, and C<isCNTRL_LC_utf8_safe>. On EBCDIC

715

platforms, you almost always want to use the C<isCNTRL_L1> variant.

716

717

=for apidoc Am|bool|isDIGIT|char ch

718

Returns a boolean indicating whether the specified character is a

719

digit, analogous to C<m/[[:digit:]]/>.

720

Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>.

721

See the L<top of this section|/Character classification> for an explanation of

722

variants

723

C<isDIGIT_uvchr>, C<isDIGIT_utf8_safe>, C<isDIGIT_LC>, C<isDIGIT_LC_uvchr>, and

724

C<isDIGIT_LC_utf8_safe>.

725

726

=for apidoc Am|bool|isGRAPH|char ch

727

Returns a boolean indicating whether the specified character is a

728

graphic character, analogous to C<m/[[:graph:]]/>.

729

See the L<top of this section|/Character classification> for an explanation of

730

variants C<isGRAPH_A>, C<isGRAPH_L1>, C<isGRAPH_uvchr>, C<isGRAPH_utf8_safe>,

731

C<isGRAPH_LC>, C<isGRAPH_LC_uvchr>, and C<isGRAPH_LC_utf8_safe>.

732

733

=for apidoc Am|bool|isLOWER|char ch

734

Returns a boolean indicating whether the specified character is a

735

lowercase character, analogous to C<m/[[:lower:]]/>.

736

See the L<top of this section|/Character classification> for an explanation of

737

variants

738

C<isLOWER_A>, C<isLOWER_L1>, C<isLOWER_uvchr>, C<isLOWER_utf8_safe>,

739

C<isLOWER_LC>, C<isLOWER_LC_uvchr>, and C<isLOWER_LC_utf8_safe>.

740

741

=for apidoc Am|bool|isOCTAL|char ch

742

Returns a boolean indicating whether the specified character is an

743

octal digit, [0-7].

744

The only two variants are C<isOCTAL_A> and C<isOCTAL_L1>; each is identical to

745

C<isOCTAL>.

746

747

=for apidoc Am|bool|isPUNCT|char ch

748

Returns a boolean indicating whether the specified character is a

749

punctuation character, analogous to C<m/[[:punct:]]/>.

750

Note that the definition of what is punctuation isn't as

751

straightforward as one might desire. See L<perlrecharclass/POSIX Character

752

Classes> for details.

753

See the L<top of this section|/Character classification> for an explanation of

754

variants C<isPUNCT_A>, C<isPUNCT_L1>, C<isPUNCT_uvchr>, C<isPUNCT_utf8_safe>,

755

C<isPUNCT_LC>, C<isPUNCT_LC_uvchr>, and C<isPUNCT_LC_utf8_safe>.

756

757

=for apidoc Am|bool|isSPACE|char ch

758

Returns a boolean indicating whether the specified character is a

759

whitespace character. This is analogous

760

to what C<m/\s/> matches in a regular expression. Starting in Perl 5.18

761

this also matches what C<m/[[:space:]]/> does. Prior to 5.18, only the

762

locale forms of this macro (the ones with C<LC> in their names) matched

763

precisely what C<m/[[:space:]]/> does. In those releases, the only difference,

764

in the non-locale variants, was that C<isSPACE()> did not match a vertical tab.

765

(See L</isPSXSPC> for a macro that matches a vertical tab in all releases.)

766

See the L<top of this section|/Character classification> for an explanation of

767

variants

768

C<isSPACE_A>, C<isSPACE_L1>, C<isSPACE_uvchr>, C<isSPACE_utf8_safe>,

769

C<isSPACE_LC>, C<isSPACE_LC_uvchr>, and C<isSPACE_LC_utf8_safe>.

770

771

=for apidoc Am|bool|isPSXSPC|char ch

772

(short for Posix Space)

773

Starting in 5.18, this is identical in all its forms to the

774

corresponding C<isSPACE()> macros.

775

The locale forms of this macro are identical to their corresponding

776

C<isSPACE()> forms in all Perl releases. In releases prior to 5.18, the

777

non-locale forms differ from their C<isSPACE()> forms only in that the

778

C<isSPACE()> forms don't match a Vertical Tab, and the C<isPSXSPC()> forms do.

779

Otherwise they are identical. Thus this macro is analogous to what

780

C<m/[[:space:]]/> matches in a regular expression.

781

See the L<top of this section|/Character classification> for an explanation of

782

variants C<isPSXSPC_A>, C<isPSXSPC_L1>, C<isPSXSPC_uvchr>, C<isPSXSPC_utf8_safe>,

783

C<isPSXSPC_LC>, C<isPSXSPC_LC_uvchr>, and C<isPSXSPC_LC_utf8_safe>.

784

785

=for apidoc Am|bool|isUPPER|char ch

786

Returns a boolean indicating whether the specified character is an

787

uppercase character, analogous to C<m/[[:upper:]]/>.

788

See the L<top of this section|/Character classification> for an explanation of

789

variants C<isUPPER_A>, C<isUPPER_L1>, C<isUPPER_uvchr>, C<isUPPER_utf8_safe>,

790

C<isUPPER_LC>, C<isUPPER_LC_uvchr>, and C<isUPPER_LC_utf8_safe>.

791

792

=for apidoc Am|bool|isPRINT|char ch

793

Returns a boolean indicating whether the specified character is a

794

printable character, analogous to C<m/[[:print:]]/>.

795

See the L<top of this section|/Character classification> for an explanation of

796

variants

797

C<isPRINT_A>, C<isPRINT_L1>, C<isPRINT_uvchr>, C<isPRINT_utf8_safe>,

798

C<isPRINT_LC>, C<isPRINT_LC_uvchr>, and C<isPRINT_LC_utf8_safe>.

799

800

=for apidoc Am|bool|isWORDCHAR|char ch

801

Returns a boolean indicating whether the specified character is a character

802

that is a word character, analogous to what C<m/\w/> and C<m/[[:word:]]/> match

803

in a regular expression. A word character is an alphabetic character, a

804

decimal digit, a connecting punctuation character (such as an underscore), or

805

a "mark" character that attaches to one of those (like some sort of accent).

806

C<isALNUM()> is a synonym provided for backward compatibility, even though a

807

word character includes more than the standard C language meaning of

808

alphanumeric.

809

See the L<top of this section|/Character classification> for an explanation of

810

variants C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>, and

811

C<isWORDCHAR_utf8_safe>. C<isWORDCHAR_LC>, C<isWORDCHAR_LC_uvchr>, and

812

C<isWORDCHAR_LC_utf8_safe> are also as described there, but additionally

813

include the platform's native underscore.

814

815

=for apidoc Am|bool|isXDIGIT|char ch

816

Returns a boolean indicating whether the specified character is a hexadecimal

817

digit. In the ASCII range these are C<[0-9A-Fa-f]>. Variants C<isXDIGIT_A()>

818

and C<isXDIGIT_L1()> are identical to C<isXDIGIT()>.

819

See the L<top of this section|/Character classification> for an explanation of

820

variants

821

C<isXDIGIT_uvchr>, C<isXDIGIT_utf8_safe>, C<isXDIGIT_LC>, C<isXDIGIT_LC_uvchr>,

822

and C<isXDIGIT_LC_utf8_safe>.

823

824

=for apidoc Am|bool|isIDFIRST|char ch

825

Returns a boolean indicating whether the specified character can be the first

826

character of an identifier. This is very close to, but not quite the same as

827

the official Unicode property C<XID_Start>. The difference is that this

828

returns true only if the input character also matches L</isWORDCHAR>.

829

See the L<top of this section|/Character classification> for an explanation of

830

variants

831

C<isIDFIRST_A>, C<isIDFIRST_L1>, C<isIDFIRST_uvchr>, C<isIDFIRST_utf8_safe>,

832

C<isIDFIRST_LC>, C<isIDFIRST_LC_uvchr>, and C<isIDFIRST_LC_utf8_safe>.

833

834

=for apidoc Am|bool|isIDCONT|char ch

835

Returns a boolean indicating whether the specified character can be the

836

second or succeeding character of an identifier. This is very close to, but

837

not quite the same as the official Unicode property C<XID_Continue>. The

838

difference is that this returns true only if the input character also matches

839

L</isWORDCHAR>. See the L<top of this section|/Character classification> for

840

an

841

explanation of variants C<isIDCONT_A>, C<isIDCONT_L1>, C<isIDCONT_uvchr>,

842

C<isIDCONT_utf8_safe>, C<isIDCONT_LC>, C<isIDCONT_LC_uvchr>, and

843

C<isIDCONT_LC_utf8_safe>.

844

845

=head1 Miscellaneous Functions

846

847

=for apidoc Am|U8|READ_XDIGIT|char* str

848

Returns the value of an ASCII-range hex digit and advances the string pointer.

849

Behaviour is only well defined when isXDIGIT(*str) is true.

850

851

=head1 Character case changing

852

Perl uses "full" Unicode case mappings. This means that converting a single

853

character to another case may result in a sequence of more than one character.

854

For example, the uppercase of C<E<223>> (LATIN SMALL LETTER SHARP S) is the two

855

character sequence C<SS>. This presents some complications. The lowercase of

856

all characters in the range 0..255 is a single character, and thus

857

C<L</toLOWER_L1>> is furnished. But, C<toUPPER_L1> can't exist, as it couldn't

858

return a valid result for all legal inputs. Instead C<L</toUPPER_uvchr>> has

859

an API that does allow every possible legal result to be returned. Likewise

860

no other function that is crippled by not being able to give the correct

861

results for the full range of possible inputs has been implemented here.

862

863

=for apidoc Am|U8|toUPPER|U8 ch

864

Converts the specified character to uppercase. If the input is anything but an

865

ASCII lowercase character, that input character itself is returned. Variant

866

C<toUPPER_A> is equivalent.

867

868

869

Converts the code point C<cp> to its uppercase version, and

870

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code

871

point is interpreted as native if less than 256; otherwise as Unicode. Note

872

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

873

bytes since the uppercase version may be longer than the original character.

874

875

The first code point of the uppercased version is returned

876

(but note, as explained at L<the top of this section|/Character case

877

changing>, that there may be more).

878

879

=for apidoc Am|UV|toUPPER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp

880

Converts the first UTF-8 encoded character in the sequence starting at C<p> and

881

extending no further than S<C<e - 1>> to its uppercase version, and

882

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note

883

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

884

bytes since the uppercase version may be longer than the original character.

885

886

The first code point of the uppercased version is returned

887

(but note, as explained at L<the top of this section|/Character case

888

changing>, that there may be more).

889

890

The suffix C<_safe> in the function's name indicates that it will not attempt

891

to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is

892

true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the

893

input character is malformed in some way, the program may croak, or the

894

function may return the REPLACEMENT CHARACTER, at the discretion of the

895

implementation, and subject to change in future releases.

896

897

=for apidoc Am|UV|toUPPER_utf8|U8* p|U8* s|STRLEN* lenp

898

This is like C<L</toUPPER_utf8_safe>>, but doesn't have the C<e>

899

parameter. The function therefore can't check if it is reading

900

beyond the end of the string. Starting in Perl v5.30, it will take the C<e>

901

parameter, becoming a synonym for C<toUPPER_utf8_safe>. At that time every

902

program that uses it will have to be changed to successfully compile. In the

903

meantime, the first runtime call to C<toUPPER_utf8> from each call point in the

904

program will raise a deprecation warning, enabled by default. You can convert

905

your program now to use C<toUPPER_utf8_safe>, and avoid the warnings, and get an

906

extra measure of protection, or you can wait until v5.30, when you'll be forced

907

to add the C<e> parameter.

908

909

=for apidoc Am|U8|toFOLD|U8 ch

910

Converts the specified character to foldcase. If the input is anything but an

911

ASCII uppercase character, that input character itself is returned. Variant

912

C<toFOLD_A> is equivalent. (There is no equivalent C<toFOLD_L1> for the full

913

Latin1 range, as the full generality of L</toFOLD_uvchr> is needed there.)

914

915

916

Converts the code point C<cp> to its foldcase version, and

917

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code

918

point is interpreted as native if less than 256; otherwise as Unicode. Note

919

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

920

bytes since the foldcase version may be longer than the original character.

921

922

The first code point of the foldcased version is returned

923

(but note, as explained at L<the top of this section|/Character case

924

changing>, that there may be more).

925

926

=for apidoc Am|UV|toFOLD_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp

927

Converts the first UTF-8 encoded character in the sequence starting at C<p> and

928

extending no further than S<C<e - 1>> to its foldcase version, and

929

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note

930

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

931

bytes since the foldcase version may be longer than the original character.

932

933

The first code point of the foldcased version is returned

934

(but note, as explained at L<the top of this section|/Character case

935

changing>, that there may be more).

936

937

The suffix C<_safe> in the function's name indicates that it will not attempt

938

to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is

939

true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the

940

input character is malformed in some way, the program may croak, or the

941

function may return the REPLACEMENT CHARACTER, at the discretion of the

942

implementation, and subject to change in future releases.

943

944

=for apidoc Am|UV|toFOLD_utf8|U8* p|U8* s|STRLEN* lenp

945

This is like C<L</toFOLD_utf8_safe>>, but doesn't have the C<e>

946

parameter. The function therefore can't check if it is reading

947

beyond the end of the string. Starting in Perl v5.30, it will take the C<e>

948

parameter, becoming a synonym for C<toFOLD_utf8_safe>. At that time every

949

program that uses it will have to be changed to successfully compile. In the

950

meantime, the first runtime call to C<toFOLD_utf8> from each call point in the

951

program will raise a deprecation warning, enabled by default. You can convert

952

your program now to use C<toFOLD_utf8_safe>, and avoid the warnings, and get an

953

extra measure of protection, or you can wait until v5.30, when you'll be forced

954

to add the C<e> parameter.

955

956

=for apidoc Am|U8|toLOWER|U8 ch

957

Converts the specified character to lowercase. If the input is anything but an

958

ASCII uppercase character, that input character itself is returned. Variant

959

C<toLOWER_A> is equivalent.

960

961

=for apidoc Am|U8|toLOWER_L1|U8 ch

962

Converts the specified Latin1 character to lowercase. The results are

963

undefined if the input doesn't fit in a byte.

964

965

=for apidoc Am|U8|toLOWER_LC|U8 ch

966

Converts the specified character to lowercase using the current locale's rules,

967

if possible; otherwise returns the input character itself.

968

969

970

=for apidoc Am|UV|toLOWER_uvchr|UV cp|U8* s|STRLEN* lenp
Converts the code point C<cp> to its lowercase version, and

971

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code

972

point is interpreted as native if less than 256; otherwise as Unicode. Note

973

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

974

bytes since the lowercase version may be longer than the original character.

975

976

The first code point of the lowercased version is returned

977

(but note, as explained at L<the top of this section|/Character case

978

changing>, that there may be more).

979

980

981

=for apidoc Am|UV|toLOWER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp

982

Converts the first UTF-8 encoded character in the sequence starting at C<p> and

983

extending no further than S<C<e - 1>> to its lowercase version, and

984

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note

985

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

986

bytes since the lowercase version may be longer than the original character.

987

988

The first code point of the lowercased version is returned

989

(but note, as explained at L<the top of this section|/Character case

990

changing>, that there may be more).

991

992

The suffix C<_safe> in the function's name indicates that it will not attempt

993

to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is

994

true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the

995

input character is malformed in some way, the program may croak, or the

996

function may return the REPLACEMENT CHARACTER, at the discretion of the

997

implementation, and subject to change in future releases.

998

999

=for apidoc Am|UV|toLOWER_utf8|U8* p|U8* s|STRLEN* lenp

1000

This is like C<L</toLOWER_utf8_safe>>, but doesn't have the C<e>

1001

parameter. The function therefore can't check if it is reading

1002

beyond the end of the string. Starting in Perl v5.30, it will take the C<e>

1003

parameter, becoming a synonym for C<toLOWER_utf8_safe>. At that time every

1004

program that uses it will have to be changed to successfully compile. In the

1005

meantime, the first runtime call to C<toLOWER_utf8> from each call point in the

1006

program will raise a deprecation warning, enabled by default. You can convert

1007

your program now to use C<toLOWER_utf8_safe>, and avoid the warnings, and get an

1008

extra measure of protection, or you can wait until v5.30, when you'll be forced

1009

to add the C<e> parameter.

1010

1011

=for apidoc Am|U8|toTITLE|U8 ch

1012

Converts the specified character to titlecase. If the input is anything but an

1013

ASCII lowercase character, that input character itself is returned. Variant

1014

C<toTITLE_A> is equivalent. (There is no C<toTITLE_L1> for the full Latin1

1015

range, as the full generality of L</toTITLE_uvchr> is needed there. Titlecase is

1016

not a concept used in locale handling, so there is no functionality for that.)

1017

1018

1019

=for apidoc Am|UV|toTITLE_uvchr|UV cp|U8* s|STRLEN* lenp
Converts the code point C<cp> to its titlecase version, and

1020

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code

1021

point is interpreted as native if less than 256; otherwise as Unicode. Note

1022

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

1023

bytes since the titlecase version may be longer than the original character.

1024

1025

The first code point of the titlecased version is returned

1026

(but note, as explained at L<the top of this section|/Character case

1027

changing>, that there may be more).

1028

1029

=for apidoc Am|UV|toTITLE_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp

1030

Converts the first UTF-8 encoded character in the sequence starting at C<p> and

1031

extending no further than S<C<e - 1>> to its titlecase version, and

1032

stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note

1033

that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>

1034

bytes since the titlecase version may be longer than the original character.

1035

1036

The first code point of the titlecased version is returned

1037

(but note, as explained at L<the top of this section|/Character case

1038

changing>, that there may be more).

1039

1040

The suffix C<_safe> in the function's name indicates that it will not attempt

1041

to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is

1042

true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the

1043

input character is malformed in some way, the program may croak, or the

1044

function may return the REPLACEMENT CHARACTER, at the discretion of the

1045

implementation, and subject to change in future releases.

1046

1047

=for apidoc Am|UV|toTITLE_utf8|U8* p|U8* s|STRLEN* lenp

1048

This is like C<L</toTITLE_utf8_safe>>, but doesn't have the C<e>

1049

parameter. The function therefore can't check if it is reading

1050

beyond the end of the string. Starting in Perl v5.30, it will take the C<e>

1051

parameter, becoming a synonym for C<toTITLE_utf8_safe>. At that time every

1052

program that uses it will have to be changed to successfully compile. In the

1053

meantime, the first runtime call to C<toTITLE_utf8> from each call point in the

1054

program will raise a deprecation warning, enabled by default. You can convert

1055

your program now to use C<toTITLE_utf8_safe>, and avoid the warnings, and get an

1056

extra measure of protection, or you can wait until v5.30, when you'll be forced

1057

to add the C<e> parameter.

=cut

XXX Still undocumented isVERTWS_uvchr and _utf8; it's unclear what their names

1062

really should be. Also toUPPER_LC and toFOLD_LC, which are subject to change,

1063

and aren't general purpose as they don't work on U+DF, and assert against that.

1064

1065

Note that these macros are repeated in Devel::PPPort, so should also be

1066

patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc

*/

/* Specify the widest unsigned type on the platform. */

1071

#ifdef QUADKIND

1072

# define WIDEST_UTYPE U64

1073

#else

1074

# define WIDEST_UTYPE U32

1075

#endif

1076

1077

/* FITS_IN_8_BITS(c) returns true if c doesn't have a bit set other than in

1078

* the lower 8. It is designed to be hopefully bomb-proof, making sure that no

1079

* bits of information are lost even on a 64-bit machine, but to get the

1080

* compiler to optimize it out if possible. This is because Configure makes

1081

* sure that the machine has an 8-bit byte, so if c is stored in a byte, the

1082

* sizeof() guarantees that this evaluates to a constant true at compile time.

1083

*

1084

* For Coverity, be always true, because otherwise Coverity thinks

1085

* it finds several expressions that are always true, independent

1086

* of operands. Well, they are, but that is kind of the point.

1087

*/

1088

#ifndef __COVERITY__

1089

/* The '| 0' part ensures a compiler error if c is not integer (like e.g., a

1090

* pointer) */

1091

#define FITS_IN_8_BITS(c) ( (sizeof(c) == 1) \

1092

|| !(((WIDEST_UTYPE)((c) | 0)) & ~0xFF))

1093

#else

1094

#define FITS_IN_8_BITS(c) (1)

#endif

#ifdef EBCDIC

# ifndef _ALL_SOURCE

/* The native libc isascii() et al. functions return the wrong results

1100

* on at least z/OS unless this is defined. */

1101

# error _ALL_SOURCE should probably be defined

1102

# endif

1103

#else

1104

/* There is a simple definition of ASCII for ASCII platforms. But the

1105

* EBCDIC one isn't so simple, so is defined using table look-up like the

1106

* other macros below.

1107

*

1108

* The cast here is used instead of '(c) >= 0', because some compilers emit

1109

* a warning that that test is always true when the parameter is an

1110

* unsigned type. khw supposes that it could be written as

1111

* && ((c) == '\0' || (c) > 0)

1112

* to avoid the message, but the cast will likely avoid extra branches even

1113

* with stupid compilers.

1114

*

1115

* The '| 0' part ensures a compiler error if c is not integer (like e.g.,

1116

* a pointer) */

1117

/* True iff, once widened to the widest unsigned type, c has no bit set above
 * the low seven.  Negative inputs widen to huge values and so are rejected.
 * The '| 0' makes a non-integer argument a compile error. */
# define isASCII(c) ((((WIDEST_UTYPE)((c) | 0)) >> 7) == 0)

1118

#endif

1119

1120

/* Take the eight possible bit patterns of the lower 3 bits and you get the

1121

* lower 3 bits of the 8 octal digits, in both ASCII and EBCDIC, so those bits

1122

* can be ignored. If the rest match '0', we have an octal */

1123

/* The eight octal digits differ only in their low three bits, so forcing
 * those bits on must yield exactly '7' for an octal digit and nothing else.
 * The '| 0' makes a non-integer argument a compile error. */
#define isOCTAL_A(c) ((((WIDEST_UTYPE)((c) | 0)) | 7) == '7')

1124

1125

#ifdef H_PERL /* If have access to perl.h, lookup in its table */

1126

1127

/* Character class numbers. For internal core Perl use only. The ones less

1128

* than 32 are used in PL_charclass[] and the ones up through the one that

1129

* corresponds to <_HIGHEST_REGCOMP_DOT_H_SYNC> are used by regcomp.h and

1130

* related files. PL_charclass ones use names used in l1_char_class_tab.h but

1131

* their actual definitions are here. If that file has a name not used here,

1132

* it won't compile.

1133

*

1134

* The first group of these is ordered in what I (khw) estimate to be the

1135

* frequency of their use. This gives a slight edge to exiting a loop earlier

1136

* (in reginclass() in regexec.c). Except \v should be last, as it isn't a

1137

* real Posix character class, and some (small) inefficiencies in regular

1138

* expression handling would be introduced by putting it in the middle of those

1139

* that are. Also, cntrl and ascii come after the others as it may be useful

1140

* to group these which have no members that match above Latin1, (or above

1141

* ASCII in the latter case) */

1142

1143

# define _CC_WORDCHAR 0 /* \w and [:word:] */

1144

# define _CC_DIGIT 1 /* \d and [:digit:] */

1145

# define _CC_ALPHA 2 /* [:alpha:] */

1146

# define _CC_LOWER 3 /* [:lower:] */

1147

# define _CC_UPPER 4 /* [:upper:] */

1148

# define _CC_PUNCT 5 /* [:punct:] */

1149

# define _CC_PRINT 6 /* [:print:] */

1150

# define _CC_ALPHANUMERIC 7 /* [:alnum:] */

1151

# define _CC_GRAPH 8 /* [:graph:] */

1152

# define _CC_CASED 9 /* [:lower:] or [:upper:] under /i */

1153

# define _CC_SPACE 10 /* \s, [:space:] */

1154

# define _CC_PSXSPC _CC_SPACE /* XXX Temporary, can be removed

1155

when the deprecated isFOO_utf8()

1156

functions are removed */

1157

# define _CC_BLANK 11 /* [:blank:] */

1158

# define _CC_XDIGIT 12 /* [:xdigit:] */

1159

# define _CC_CNTRL 13 /* [:cntrl:] */

1160

# define _CC_ASCII 14 /* [:ascii:] */

1161

# define _CC_VERTSPACE 15 /* \v */

1162

1163

# define _HIGHEST_REGCOMP_DOT_H_SYNC _CC_VERTSPACE

1164

1165

/* The members of the third group below do not need to be coordinated with data

1166

* structures in regcomp.[ch] and regexec.c. */

1167

# define _CC_IDFIRST 16

1168

# define _CC_CHARNAME_CONT 17

1169

# define _CC_NONLATIN1_FOLD 18

1170

# define _CC_NONLATIN1_SIMPLE_FOLD 19

1171

# define _CC_QUOTEMETA 20

1172

# define _CC_NON_FINAL_FOLD 21

1173

# define _CC_IS_IN_SOME_FOLD 22

1174

# define _CC_MNEMONIC_CNTRL 23

1175

1176

# define _CC_IDCONT 24 /* XXX Temporary, can be removed when the deprecated

1177

isFOO_utf8() functions are removed */

1178

1179

/* This next group is only used on EBCDIC platforms, so theoretically could be

1180

* shared with something entirely different that's only on ASCII platforms */

1181

# define _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE 28

1182

# define _CC_UTF8_IS_START 29

1183

# define _CC_UTF8_IS_DOWNGRADEABLE_START 30

1184

# define _CC_UTF8_IS_CONTINUATION 31

1185

/* Unused: 25-27

1186

* If more bits are needed, one could add a second word for non-64bit

1187

* QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd

1188

* word or not. The IS_IN_SOME_FOLD bit is the most easily expendable, as it

1189

* is used only for optimization (as of this writing), and differs in the

1190

* Latin1 range from the ALPHA bit only in two relatively unimportant

1191

* characters: the masculine and feminine ordinal indicators, so removing it

1192

* would just cause /i regexes which match them to run less efficiently.

1193

* Similarly the EBCDIC-only bits are used just for speed, and could be

1194

* replaced by other means */

1195

1196

#if defined(PERL_CORE) || defined(PERL_EXT)

1197

/* An enum version of the character class numbers, to help compilers

1198

* optimize */

1199

typedef enum {

1200

_CC_ENUM_ALPHA = _CC_ALPHA,

1201

_CC_ENUM_ALPHANUMERIC = _CC_ALPHANUMERIC,

1202

_CC_ENUM_ASCII = _CC_ASCII,

1203

_CC_ENUM_BLANK = _CC_BLANK,

1204

_CC_ENUM_CASED = _CC_CASED,

1205

_CC_ENUM_CNTRL = _CC_CNTRL,

1206

_CC_ENUM_DIGIT = _CC_DIGIT,

1207

_CC_ENUM_GRAPH = _CC_GRAPH,

1208

_CC_ENUM_LOWER = _CC_LOWER,

1209

_CC_ENUM_PRINT = _CC_PRINT,

1210

_CC_ENUM_PUNCT = _CC_PUNCT,

1211

_CC_ENUM_SPACE = _CC_SPACE,

1212

_CC_ENUM_UPPER = _CC_UPPER,

1213

_CC_ENUM_VERTSPACE = _CC_VERTSPACE,

1214

_CC_ENUM_WORDCHAR = _CC_WORDCHAR,

1215

_CC_ENUM_XDIGIT = _CC_XDIGIT

1216

} _char_class_number;

1217

#endif

1218

1219

#define POSIX_CC_COUNT (_HIGHEST_REGCOMP_DOT_H_SYNC + 1)

START_EXTERN_C

# ifdef DOINIT

EXTCONST U32 PL_charclass[] = {

1224

# include "l1_char_class_tab.h"

1225

};

1226

1227

# else /* ! DOINIT */

1228

EXTCONST U32 PL_charclass[];

# endif

END_EXTERN_C

/* The 1U keeps Solaris from griping when shifting sets the uppermost bit */

1233

# define _CC_mask(classnum) (1U << (classnum))

1234

1235

/* For internal core Perl use only: the base macro for defining macros like

1236

* isALPHA */

1237

# define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \

1238

&& (PL_charclass[(U8) (c)] & _CC_mask(classnum)))

1239

1240

/* The mask for the _A versions of the macros; it just adds in the bit for

1241

* ASCII. */

1242

# define _CC_mask_A(classnum) (_CC_mask(classnum) | _CC_mask(_CC_ASCII))

1243

1244

/* For internal core Perl use only: the base macro for defining macros like

1245

* isALPHA_A. The foo_A version makes sure that both the desired bit and

1246

* the ASCII bit are present */

1247

# define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \

1248

&& ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \

1249

== _CC_mask_A(classnum)))

1250

1251

# define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA)

1252

# define isALPHANUMERIC_A(c) _generic_isCC_A(c, _CC_ALPHANUMERIC)

1253

# define isBLANK_A(c) _generic_isCC_A(c, _CC_BLANK)

1254

# define isCNTRL_A(c) _generic_isCC_A(c, _CC_CNTRL)

1255

# define isDIGIT_A(c) _generic_isCC(c, _CC_DIGIT) /* No non-ASCII digits */

1256

# define isGRAPH_A(c) _generic_isCC_A(c, _CC_GRAPH)

1257

# define isLOWER_A(c) _generic_isCC_A(c, _CC_LOWER)

1258

# define isPRINT_A(c) _generic_isCC_A(c, _CC_PRINT)

1259

# define isPUNCT_A(c) _generic_isCC_A(c, _CC_PUNCT)

1260

# define isSPACE_A(c) _generic_isCC_A(c, _CC_SPACE)

1261

# define isUPPER_A(c) _generic_isCC_A(c, _CC_UPPER)

1262

# define isWORDCHAR_A(c) _generic_isCC_A(c, _CC_WORDCHAR)

1263

# define isXDIGIT_A(c) _generic_isCC(c, _CC_XDIGIT) /* No non-ASCII xdigits

1264

*/

1265

# define isIDFIRST_A(c) _generic_isCC_A(c, _CC_IDFIRST)

1266

# define isALPHA_L1(c) _generic_isCC(c, _CC_ALPHA)

1267

# define isALPHANUMERIC_L1(c) _generic_isCC(c, _CC_ALPHANUMERIC)

1268

# define isBLANK_L1(c) _generic_isCC(c, _CC_BLANK)

1269

1270

/* continuation character for legal NAME in \N{NAME} */

1271

# define isCHARNAME_CONT(c) _generic_isCC(c, _CC_CHARNAME_CONT)

1272

1273

# define isCNTRL_L1(c) _generic_isCC(c, _CC_CNTRL)

1274

# define isGRAPH_L1(c) _generic_isCC(c, _CC_GRAPH)

1275

# define isLOWER_L1(c) _generic_isCC(c, _CC_LOWER)

1276

# define isPRINT_L1(c) _generic_isCC(c, _CC_PRINT)

1277

# define isPSXSPC_L1(c) isSPACE_L1(c)

1278

# define isPUNCT_L1(c) _generic_isCC(c, _CC_PUNCT)

1279

# define isSPACE_L1(c) _generic_isCC(c, _CC_SPACE)

1280

# define isUPPER_L1(c) _generic_isCC(c, _CC_UPPER)

1281

# define isWORDCHAR_L1(c) _generic_isCC(c, _CC_WORDCHAR)

1282

# define isIDFIRST_L1(c) _generic_isCC(c, _CC_IDFIRST)

1283

1284

# ifdef EBCDIC

1285

# define isASCII(c) _generic_isCC(c, _CC_ASCII)

1286

# endif

1287

1288

/* Participates in a single-character fold with a character above 255 */

1289

# define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_SIMPLE_FOLD)))

1290

1291

/* Like the above, but also can be part of a multi-char fold */

1292

# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD)))

1293

1294

# define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA)

1295

# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \

1296

_generic_isCC(c, _CC_NON_FINAL_FOLD)

1297

# define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \

1298

_generic_isCC(c, _CC_IS_IN_SOME_FOLD)

1299

# define _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \

1300

_generic_isCC(c, _CC_MNEMONIC_CNTRL)

1301

#else /* else we don't have perl.h H_PERL */

1302

1303

/* If we don't have perl.h, we are compiling a utility program. Below we

1304

* hard-code various macro definitions that wouldn't otherwise be available

1305

* to it. Most are coded based on first principles. These are written to

1306

* avoid EBCDIC vs. ASCII #ifdef's as much as possible. */

1307

/* ASCII-range decimal digit test for builds without perl.h: c is a digit
 * when it is neither above '9' nor below '0' (checked in that order). */
# define isDIGIT_A(c) (!((c) > '9') && !((c) < '0'))

1308

# define isBLANK_A(c) ((c) == ' ' || (c) == '\t')

1309

# define isSPACE_A(c) (isBLANK_A(c) \

|| (c) == '\n' \

|| (c) == '\r' \

|| (c) == '\v' \

|| (c) == '\f')

/* On EBCDIC, there are gaps between 'i' and 'j'; 'r' and 's'. Same for

1315

* uppercase. The tests for those aren't necessary on ASCII, but hurt only

1316

* performance (if optimization isn't on), and allow the same code to be

1317

* used for both platform types */

1318

# define isLOWER_A(c) ((c) >= 'a' && (c) <= 'z' \

1319

&& ( (c) <= 'i' \

1320

|| ((c) >= 'j' && (c) <= 'r') \

1321

|| (c) >= 's'))

1322

# define isUPPER_A(c) ((c) >= 'A' && (c) <= 'Z' \

1323

&& ( (c) <= 'I' \

1324

|| ((c) >= 'J' && (c) <= 'R') \

1325

|| (c) >= 'S'))

1326

# define isALPHA_A(c) (isUPPER_A(c) || isLOWER_A(c))

1327

# define isALPHANUMERIC_A(c) (isALPHA_A(c) || isDIGIT_A(c))

1328

# define isWORDCHAR_A(c) (isALPHANUMERIC_A(c) || (c) == '_')

1329

# define isIDFIRST_A(c) (isALPHA_A(c) || (c) == '_')

1330

# define isXDIGIT_A(c) (isDIGIT_A(c) \

1331

|| ((c) >= 'a' && (c) <= 'f') \

1332

|| ((c) <= 'F' && (c) >= 'A'))

1333

# define isPUNCT_A(c) ((c) == '-' || (c) == '!' || (c) == '"' \

1334

|| (c) == '#' || (c) == '$' || (c) == '%' \

1335

|| (c) == '&' || (c) == '\'' || (c) == '(' \

1336

|| (c) == ')' || (c) == '*' || (c) == '+' \

1337

|| (c) == ',' || (c) == '.' || (c) == '/' \

1338

|| (c) == ':' || (c) == ';' || (c) == '<' \

1339

|| (c) == '=' || (c) == '>' || (c) == '?' \

1340

|| (c) == '@' || (c) == '[' || (c) == '\\' \

1341

|| (c) == ']' || (c) == '^' || (c) == '_' \

1342

|| (c) == '`' || (c) == '{' || (c) == '|' \

1343

|| (c) == '}' || (c) == '~')

1344

# define isGRAPH_A(c) (isALPHANUMERIC_A(c) || isPUNCT_A(c))

1345

# define isPRINT_A(c) (isGRAPH_A(c) || (c) == ' ')

1346

1347

# ifdef EBCDIC

1348

/* The below is accurate for the 3 EBCDIC code pages traditionally

1349

* supported by perl. The only difference between them in the controls

1350

* is the position of \n, and that is represented symbolically below */

1351

# define isCNTRL_A(c) ((c) == '\0' || (c) == '\a' || (c) == '\b' \

1352

|| (c) == '\f' || (c) == '\n' || (c) == '\r' \

1353

|| (c) == '\t' || (c) == '\v' \

1354

|| ((c) <= 3 && (c) >= 1) /* SOH, STX, ETX */ \

1355

|| (c) == 7 /* U+7F DEL */ \

1356

|| ((c) <= 0x13 && (c) >= 0x0E) /* SO, SI */ \

1357

/* DLE, DC[1-3] */ \

1358

|| (c) == 0x18 /* U+18 CAN */ \

1359

|| (c) == 0x19 /* U+19 EOM */ \

1360

|| ((c) <= 0x1F && (c) >= 0x1C) /* [FGRU]S */ \

1361

|| (c) == 0x26 /* U+17 ETB */ \

1362

|| (c) == 0x27 /* U+1B ESC */ \

1363

|| (c) == 0x2D /* U+05 ENQ */ \

1364

|| (c) == 0x2E /* U+06 ACK */ \

1365

|| (c) == 0x32 /* U+16 SYN */ \

1366

|| (c) == 0x37 /* U+04 EOT */ \

1367

|| (c) == 0x3C /* U+14 DC4 */ \

1368

|| (c) == 0x3D /* U+15 NAK */ \

1369

|| (c) == 0x3F)/* U+1A SUB */

1370

# define isASCII(c) (isCNTRL_A(c) || isPRINT_A(c))

1371

# else /* isASCII is already defined for ASCII platforms, so can use that to

1372

define isCNTRL */

1373

# define isCNTRL_A(c) (isASCII(c) && ! isPRINT_A(c))

1374

# endif

1375

1376

/* The _L1 macros may be unnecessary for the utilities; I (khw) added them

1377

* during debugging, and it seems best to keep them. We may be called

1378

* without NATIVE_TO_LATIN1 being defined. On ASCII platforms, it doesn't

1379

* do anything anyway, so make it not a problem */

1380

# if ! defined(EBCDIC) && ! defined(NATIVE_TO_LATIN1)

1381

# define NATIVE_TO_LATIN1(ch) (ch)

1382

# endif

1383

# define isALPHA_L1(c) (isUPPER_L1(c) || isLOWER_L1(c))

1384

# define isALPHANUMERIC_L1(c) (isALPHA_L1(c) || isDIGIT_A(c))

1385

/* True if c is a blank in the Latin1 range: either an ASCII-range blank, or a
 * value that fits in 8 bits and maps to Latin1 0xA0.  The argument is
 * parenthesized inside the cast so that an expression argument is converted
 * as a whole, not just its first operand. */
# define isBLANK_L1(c) (isBLANK_A(c)                                        \
                        || (FITS_IN_8_BITS(c)                               \
                            && NATIVE_TO_LATIN1((U8) (c)) == 0xA0))

1388

# define isCNTRL_L1(c) (FITS_IN_8_BITS(c) && (! isPRINT_L1(c)))

1389

# define isGRAPH_L1(c) (isPRINT_L1(c) && (! isBLANK_L1(c)))

1390

/* True if c is lowercase in the Latin1 range: an ASCII-range lowercase
 * letter, or a value that fits in 8 bits and maps to Latin1 0xDF or above
 * (except 0xF7), or to one of 0xAA, 0xBA, 0xB5.  The argument is
 * parenthesized inside each cast so that an expression argument is converted
 * as a whole, not just its first operand. */
# define isLOWER_L1(c) (isLOWER_A(c)                                        \
                        || (FITS_IN_8_BITS(c)                               \
                            && ((   NATIVE_TO_LATIN1((U8) (c)) >= 0xDF      \
                                 && NATIVE_TO_LATIN1((U8) (c)) != 0xF7)     \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xAA       \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xBA       \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xB5)))

1397

/* True if c is printable in the Latin1 range: an ASCII-range printable, or a
 * value that fits in 8 bits and maps to Latin1 0xA0 or above.  The argument
 * is parenthesized inside the cast so that an expression argument is
 * converted as a whole, not just its first operand. */
# define isPRINT_L1(c) (isPRINT_A(c)                                        \
                        || (FITS_IN_8_BITS(c)                               \
                            && NATIVE_TO_LATIN1((U8) (c)) >= 0xA0))

1400

/* True if c is punctuation in the Latin1 range: ASCII-range punctuation, or
 * a value that fits in 8 bits and maps to one of the Latin1 code points 0xA1,
 * 0xA7, 0xAB, 0xB6, 0xB7, 0xBB, 0xBF.  The argument is parenthesized inside
 * each cast so that an expression argument is converted as a whole, not just
 * its first operand. */
# define isPUNCT_L1(c) (isPUNCT_A(c)                                        \
                        || (FITS_IN_8_BITS(c)                               \
                            && (   NATIVE_TO_LATIN1((U8) (c)) == 0xA1       \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xA7       \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xAB       \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xB6       \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xB7       \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xBB       \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xBF)))

1409

/* True if c is white space in the Latin1 range: ASCII-range white space, or
 * a value that fits in 8 bits and maps to Latin1 0x85 or 0xA0.  The argument
 * is parenthesized inside each cast so that an expression argument is
 * converted as a whole, not just its first operand. */
# define isSPACE_L1(c) (isSPACE_A(c)                                        \
                        || (FITS_IN_8_BITS(c)                               \
                            && (   NATIVE_TO_LATIN1((U8) (c)) == 0x85       \
                                || NATIVE_TO_LATIN1((U8) (c)) == 0xA0)))

1413

/* True if c is uppercase in the Latin1 range: an ASCII-range uppercase
 * letter, or a value that fits in 8 bits and maps to Latin1 0xC0 through
 * 0xDE, excluding 0xD7.  The argument is parenthesized inside each cast so
 * that an expression argument is converted as a whole, not just its first
 * operand. */
# define isUPPER_L1(c) (isUPPER_A(c)                                        \
                        || (FITS_IN_8_BITS(c)                               \
                            && (   NATIVE_TO_LATIN1((U8) (c)) >= 0xC0       \
                                && NATIVE_TO_LATIN1((U8) (c)) <= 0xDE       \
                                && NATIVE_TO_LATIN1((U8) (c)) != 0xD7)))

1418

# define isWORDCHAR_L1(c) (isIDFIRST_L1(c) || isDIGIT_A(c))

1419

/* True if c may start an identifier in the Latin1 range: an alphabetic, or
 * the underscore.  Both c and the literal '_' are native code points, so they
 * are compared directly; converting only c to Latin1 first would miss the
 * underscore wherever that conversion is not the identity. */
# define isIDFIRST_L1(c) (isALPHA_L1(c) || (c) == '_')

1420

# define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) \

|| isBLANK_L1(c) \

|| (c) == '-' \

|| (c) == '(' \

|| (c) == ')')

/* The following are not fully accurate in the above-ASCII range. I (khw)

1426

* don't think it's necessary to be so for the purposes where this gets

1427

* compiled */

1428

# define _isQUOTEMETA(c) (FITS_IN_8_BITS(c) && ! isWORDCHAR_L1(c))

1429

# define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) isALPHA_L1(c)

1430

1431

/* And these aren't accurate at all. They are useful only for above

1432

* Latin1, which utilities and bootstrapping don't deal with */

1433

# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 0

1434

# define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0

1435

# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0

1436

1437

/* Many of the macros later in this file are defined in terms of these. By

1438

* implementing them with a function, which converts the class number into

1439

* a call to the desired macro, all of the later ones work. However, that

1440

* function won't be actually defined when building a utility program (no

1441

* perl.h), and so a compiler error will be generated if one is attempted

1442

* to be used. And the above-Latin1 code points require Unicode tables to

1443

* be present, something unlikely to be the case when bootstrapping */

1444

# define _generic_isCC(c, classnum) \

1445

(FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), TRUE))

1446

# define _generic_isCC_A(c, classnum) \

1447

(FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), FALSE))

1448

#endif /* End of no perl.h H_PERL */

1449

1450

#define isALPHANUMERIC(c) isALPHANUMERIC_A(c)

1451

#define isALPHA(c) isALPHA_A(c)

1452

#define isASCII_A(c) isASCII(c)

1453

#define isASCII_L1(c) isASCII(c)

1454

#define isBLANK(c) isBLANK_A(c)

1455

#define isCNTRL(c) isCNTRL_A(c)

1456

#define isDIGIT(c) isDIGIT_A(c)

1457

#define isGRAPH(c) isGRAPH_A(c)

1458

#define isIDFIRST(c) isIDFIRST_A(c)

1459

#define isLOWER(c) isLOWER_A(c)

1460

#define isPRINT(c) isPRINT_A(c)

1461

#define isPSXSPC_A(c) isSPACE_A(c)

1462

#define isPSXSPC(c) isPSXSPC_A(c)

1463

#define isPSXSPC_L1(c) isSPACE_L1(c)

1464

#define isPUNCT(c) isPUNCT_A(c)

1465

#define isSPACE(c) isSPACE_A(c)

1466

#define isUPPER(c) isUPPER_A(c)

1467

#define isWORDCHAR(c) isWORDCHAR_A(c)

1468

#define isXDIGIT(c) isXDIGIT_A(c)

1469

1470

/* ASCII casing. These could also be written as

1471

#define toLOWER(c) (isASCII(c) ? toLOWER_LATIN1(c) : (c))

1472

#define toUPPER(c) (isASCII(c) ? toUPPER_LATIN1_MOD(c) : (c))

1473

which uses table lookup and mask instead of subtraction. (This would

1474

work because the _MOD does not apply in the ASCII range).

1475

1476

These actually are UTF-8 invariant casing, not just ASCII, as any non-ASCII

1477

UTF-8 invariants are neither upper nor lower. (Only on EBCDIC platforms are

1478

there non-ASCII invariants, and all of them are controls.) */

1479

#define toLOWER(c) (isUPPER(c) ? (U8)((c) + ('a' - 'A')) : (c))

1480

#define toUPPER(c) (isLOWER(c) ? (U8)((c) - ('a' - 'A')) : (c))

1481

1482

/* In the ASCII range, these are equivalent to what they're here defined to be.

1483

* But by creating these definitions, other code doesn't have to be aware of

1484

* this detail. Actually this works for all UTF-8 invariants, not just the

1485

* ASCII range. (EBCDIC platforms can have non-ASCII invariants.) */

1486

#define toFOLD(c) toLOWER(c)

1487

#define toTITLE(c) toUPPER(c)

1488

1489

#define toLOWER_A(c) toLOWER(c)

1490

#define toUPPER_A(c) toUPPER(c)

1491

#define toFOLD_A(c) toFOLD(c)

1492

#define toTITLE_A(c) toTITLE(c)

1493

1494

/* Use table lookup for speed; returns the input itself if is out-of-range */

1495

#define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \

1496

? (c) \

1497

: PL_latin1_lc[ (U8) (c) ])

1498

#define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */

1499

1500

/* Modified uc. Is correct uc except for three non-ascii chars which are

1501

* all mapped to one of them, and these need special handling; returns the

1502

* input itself if is out-of-range */

1503

#define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \

1504

? (c) \

1505

: PL_mod_latin1_uc[ (U8) (c) ])

1506

#define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale

1507

1508

/* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */

1509

1510

/* For internal core Perl use only: the base macro for defining macros like

1511

* isALPHA_LC, which uses the current LC_CTYPE locale. 'c' is the code point

1512

* (0-255) to check. In a UTF-8 locale, the result is the same as calling

1513

* isFOO_L1(); the 'utf8_locale_classnum' parameter is something like

1514

* _CC_UPPER, which gives the class number for doing this. For non-UTF-8

1515

* locales, the code to actually do the test is passed in 'non_utf8'. If

1516

* 'c' is above 255, 0 is returned. For accessing the full range of possible

1517

* code points under locale rules, use the macros based on _generic_LC_uvchr

1518

* instead of this. */

1519

#define _generic_LC_base(c, utf8_locale_classnum, non_utf8) \

1520

(! FITS_IN_8_BITS(c) \

1521

? 0 \

1522

: IN_UTF8_CTYPE_LOCALE \

1523

? cBOOL(PL_charclass[(U8) (c)] & _CC_mask(utf8_locale_classnum)) \

1524

: cBOOL(non_utf8))

1525

1526

/* For internal core Perl use only: a helper macro for defining macros like

1527

* isALPHA_LC. 'c' is the code point (0-255) to check. The function name to

1528

* actually do this test is passed in 'non_utf8_func', which is called on 'c',

1529

* casting 'c' to the macro _LC_CAST, which should not be parenthesized. See

1530

* _generic_LC_base for more info */

1531

#define _generic_LC(c, utf8_locale_classnum, non_utf8_func) \

1532

_generic_LC_base(c,utf8_locale_classnum, \

1533

non_utf8_func( (_LC_CAST) (c)))

1534

1535

/* For internal core Perl use only: like _generic_LC, but also returns TRUE if

1536

* 'c' is the platform's native underscore character */

1537

#define _generic_LC_underscore(c,utf8_locale_classnum,non_utf8_func) \

1538

_generic_LC_base(c, utf8_locale_classnum, \

1539

(non_utf8_func( (_LC_CAST) (c)) \

1540

|| (char)(c) == '_'))

1541

1542

/* These next three are also for internal core Perl use only: case-change

1543

* helper macros */

1544

#define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \

1545

? (c) \

1546

: (IN_UTF8_CTYPE_LOCALE) \

1547

? PL_latin1_lc[ (U8) (c) ] \

1548

: (cast)function((cast)(c)))

1549

1550

/* Note that the result can be larger than a byte in a UTF-8 locale. It

1551

* returns a single value, so can't adequately return the upper case of LATIN

1552

* SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two

1553

* values "SS"); instead it asserts against that under DEBUGGING, and

1554

* otherwise returns its input */

1555

#define _generic_toUPPER_LC(c, function, cast) \

1556

(! FITS_IN_8_BITS(c) \

1557

? (c) \

1558

: ((! IN_UTF8_CTYPE_LOCALE) \

1559

? (cast)function((cast)(c)) \

1560

: ((((U8)(c)) == MICRO_SIGN) \

1561

? GREEK_CAPITAL_LETTER_MU \

1562

: ((((U8)(c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \

1563

? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS \

1564

: ((((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S) \

1565

? (__ASSERT_(0) (c)) \

1566

: PL_mod_latin1_uc[ (U8) (c) ])))))

1567

1568

/* Note that the result can be larger than a byte in a UTF-8 locale. It

1569

* returns a single value, so can't adequately return the fold case of LATIN

1570

* SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two

1571

* values "ss"); instead it asserts against that under DEBUGGING, and

1572

* otherwise returns its input */

1573

#define _generic_toFOLD_LC(c, function, cast) \

1574

((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \

1575

? GREEK_SMALL_LETTER_MU \

1576

: (__ASSERT_(! IN_UTF8_CTYPE_LOCALE \

1577

|| (c) != LATIN_SMALL_LETTER_SHARP_S) \

1578

_generic_toLOWER_LC(c, function, cast)))

1579

1580

/* Use the libc versions for these if available. */

1581

#if defined(HAS_ISASCII)

1582

# define isASCII_LC(c) (FITS_IN_8_BITS(c) && isascii( (U8) (c)))

1583

#else

1584

# define isASCII_LC(c) isASCII(c)

1585

#endif

1586

1587

#if defined(HAS_ISBLANK)

1588

# define isBLANK_LC(c) _generic_LC(c, _CC_BLANK, isblank)

1589

#else /* Unlike isASCII, varies if in a UTF-8 locale */

1590

# define isBLANK_LC(c) ((IN_UTF8_CTYPE_LOCALE) ? isBLANK_L1(c) : isBLANK(c))

#endif

#define _LC_CAST U8

#ifdef WIN32

/* The Windows functions don't bother to follow the POSIX standard, which

1597

* for example says that something can't both be a printable and a control.

1598

* But Windows treats the \t control as a printable, and does such things

1599

* as making superscripts into both digits and punctuation. This tames

1600

* these flaws by assuming that the definitions of both controls and space

1601

* are correct, and then making sure that other definitions don't have

1602

* weirdnesses, by making sure that isalnum() isn't also ispunct(), etc.

1603

* Not all possible weirdnesses are checked for, just the ones that were

1604

* detected on actual Microsoft code pages */

1605

1606

# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl)

1607

# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace)

1608

1609

# define isALPHA_LC(c) (_generic_LC(c, _CC_ALPHA, isalpha) \

1610

&& isALPHANUMERIC_LC(c))

1611

# define isALPHANUMERIC_LC(c) (_generic_LC(c, _CC_ALPHANUMERIC, isalnum) && \

1612

! isPUNCT_LC(c))

1613

# define isDIGIT_LC(c) (_generic_LC(c, _CC_DIGIT, isdigit) && \

1614

isALPHANUMERIC_LC(c))

1615

# define isGRAPH_LC(c) (_generic_LC(c, _CC_GRAPH, isgraph) && isPRINT_LC(c))

1616

# define isIDFIRST_LC(c) (((c) == '_') \

1617

|| (_generic_LC(c, _CC_IDFIRST, isalpha) && ! isPUNCT_LC(c)))

1618

# define isLOWER_LC(c) (_generic_LC(c, _CC_LOWER, islower) && isALPHA_LC(c))

1619

# define isPRINT_LC(c) (_generic_LC(c, _CC_PRINT, isprint) && ! isCNTRL_LC(c))

1620

# define isPUNCT_LC(c) (_generic_LC(c, _CC_PUNCT, ispunct) && ! isCNTRL_LC(c))

1621

# define isUPPER_LC(c) (_generic_LC(c, _CC_UPPER, isupper) && isALPHA_LC(c))

1622

# define isWORDCHAR_LC(c) (((c) == '_') || isALPHANUMERIC_LC(c))

1623

# define isXDIGIT_LC(c) (_generic_LC(c, _CC_XDIGIT, isxdigit) \

1624

&& isALPHANUMERIC_LC(c))

1625

1626

# define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8)

1627

# define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8)

1628

# define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8)

1629

1630

#elif defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII))

1631

/* For most other platforms */

1632

1633

# define isALPHA_LC(c) _generic_LC(c, _CC_ALPHA, isalpha)

1634

# define isALPHANUMERIC_LC(c) _generic_LC(c, _CC_ALPHANUMERIC, isalnum)

1635

# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl)

1636

# define isDIGIT_LC(c) _generic_LC(c, _CC_DIGIT, isdigit)

1637

# define isGRAPH_LC(c) _generic_LC(c, _CC_GRAPH, isgraph)

1638

# define isIDFIRST_LC(c) _generic_LC_underscore(c, _CC_IDFIRST, isalpha)

1639

# define isLOWER_LC(c) _generic_LC(c, _CC_LOWER, islower)

1640

# define isPRINT_LC(c) _generic_LC(c, _CC_PRINT, isprint)

1641

# define isPUNCT_LC(c) _generic_LC(c, _CC_PUNCT, ispunct)

1642

# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace)

1643

# define isUPPER_LC(c) _generic_LC(c, _CC_UPPER, isupper)

1644

# define isWORDCHAR_LC(c) _generic_LC_underscore(c, _CC_WORDCHAR, isalnum)

1645

# define isXDIGIT_LC(c) _generic_LC(c, _CC_XDIGIT, isxdigit)

1646

1647

1648

# define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8)

1649

# define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8)

1650

# define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8)

1651

1652

#else /* The final fallback position */

1653

1654

# define isALPHA_LC(c) (isascii(c) && isalpha(c))

1655

# define isALPHANUMERIC_LC(c) (isascii(c) && isalnum(c))

1656

# define isCNTRL_LC(c) (isascii(c) && iscntrl(c))

1657

# define isDIGIT_LC(c) (isascii(c) && isdigit(c))

1658

# define isGRAPH_LC(c) (isascii(c) && isgraph(c))

1659

# define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_'))

1660

# define isLOWER_LC(c) (isascii(c) && islower(c))

1661

# define isPRINT_LC(c) (isascii(c) && isprint(c))

1662

# define isPUNCT_LC(c) (isascii(c) && ispunct(c))

1663

# define isSPACE_LC(c) (isascii(c) && isspace(c))

1664

# define isUPPER_LC(c) (isascii(c) && isupper(c))

1665

# define isWORDCHAR_LC(c) (isascii(c) && (isalnum(c) || (c) == '_'))

1666

# define isXDIGIT_LC(c) (isascii(c) && isxdigit(c))

1667

1668

# define toLOWER_LC(c) (isascii(c) ? tolower(c) : (c))

1669

# define toUPPER_LC(c) (isascii(c) ? toupper(c) : (c))

1670

# define toFOLD_LC(c) (isascii(c) ? tolower(c) : (c))

#endif

#define isIDCONT(c) isWORDCHAR(c)

1675

#define isIDCONT_A(c) isWORDCHAR_A(c)

1676

#define isIDCONT_L1(c) isWORDCHAR_L1(c)

1677

#define isIDCONT_LC(c) isWORDCHAR_LC(c)

1678

#define isPSXSPC_LC(c) isSPACE_LC(c)

1679

1680

/* For internal core Perl use only: the base macros for defining macros like

1681

* isALPHA_uvchr. 'c' is the code point to check. 'classnum' is the POSIX class

1682

* number defined earlier in this file. _generic_uvchr() is used for POSIX

1683

* classes where there is a macro or function 'above_latin1' that takes the

1684

* single argument 'c' and returns the desired value. These exist for those

1685

* classes which have simple definitions, avoiding the overhead of a hash

1686

* lookup or inversion list binary search. _generic_swash_uvchr() can be used

1687

* for classes where that overhead is faster than a direct lookup.

1688

* _generic_uvchr() won't compile if 'c' isn't unsigned, as it won't match the

1689

* 'above_latin1' prototype. _generic_isCC() macro does bounds checking, so

1690

* have duplicate checks here, so could create versions of the macros that

1691

* don't, but experiments show that gcc optimizes them out anyway. */

1692

1693

/* Note that all ignore 'use bytes' */

1694

#define _generic_uvchr(classnum, above_latin1, c) ((c) < 256 \

1695

? _generic_isCC(c, classnum) \

1696

: above_latin1(c))

1697

#define _generic_swash_uvchr(classnum, c) ((c) < 256 \

1698

? _generic_isCC(c, classnum) \

1699

: _is_uni_FOO(classnum, c))

1700

#define isALPHA_uvchr(c) _generic_swash_uvchr(_CC_ALPHA, c)

1701

#define isALPHANUMERIC_uvchr(c) _generic_swash_uvchr(_CC_ALPHANUMERIC, c)

1702

#define isASCII_uvchr(c) isASCII(c)

1703

#define isBLANK_uvchr(c) _generic_uvchr(_CC_BLANK, is_HORIZWS_cp_high, c)

1704

#define isCNTRL_uvchr(c) isCNTRL_L1(c) /* All controls are in Latin1 */

1705

#define isDIGIT_uvchr(c) _generic_swash_uvchr(_CC_DIGIT, c)

1706

#define isGRAPH_uvchr(c) _generic_swash_uvchr(_CC_GRAPH, c)

1707

#define isIDCONT_uvchr(c) \

1708

_generic_uvchr(_CC_WORDCHAR, _is_uni_perl_idcont, c)

1709

#define isIDFIRST_uvchr(c) \

1710

_generic_uvchr(_CC_IDFIRST, _is_uni_perl_idstart, c)

1711

#define isLOWER_uvchr(c) _generic_swash_uvchr(_CC_LOWER, c)

1712

#define isPRINT_uvchr(c) _generic_swash_uvchr(_CC_PRINT, c)

1713

1714

#define isPUNCT_uvchr(c) _generic_swash_uvchr(_CC_PUNCT, c)

1715

#define isSPACE_uvchr(c) _generic_uvchr(_CC_SPACE, is_XPERLSPACE_cp_high, c)

1716

#define isPSXSPC_uvchr(c) isSPACE_uvchr(c)

1717

1718

#define isUPPER_uvchr(c) _generic_swash_uvchr(_CC_UPPER, c)

1719

#define isVERTWS_uvchr(c) _generic_uvchr(_CC_VERTSPACE, is_VERTWS_cp_high, c)

1720

#define isWORDCHAR_uvchr(c) _generic_swash_uvchr(_CC_WORDCHAR, c)

1721

#define isXDIGIT_uvchr(c) _generic_uvchr(_CC_XDIGIT, is_XDIGIT_cp_high, c)

1722

1723

#define toFOLD_uvchr(c,s,l) to_uni_fold(c,s,l)

1724

#define toLOWER_uvchr(c,s,l) to_uni_lower(c,s,l)

1725

#define toTITLE_uvchr(c,s,l) to_uni_title(c,s,l)

1726

#define toUPPER_uvchr(c,s,l) to_uni_upper(c,s,l)

1727

1728

/* For backwards compatibility, even though '_uni' should mean official Unicode

1729

* code points, in Perl it means native for those below 256 */

1730

#define isALPHA_uni(c) isALPHA_uvchr(c)

1731

#define isALPHANUMERIC_uni(c) isALPHANUMERIC_uvchr(c)

1732

#define isASCII_uni(c) isASCII_uvchr(c)

1733

#define isBLANK_uni(c) isBLANK_uvchr(c)

1734

#define isCNTRL_uni(c) isCNTRL_uvchr(c)

1735

#define isDIGIT_uni(c) isDIGIT_uvchr(c)

1736

#define isGRAPH_uni(c) isGRAPH_uvchr(c)

1737

#define isIDCONT_uni(c) isIDCONT_uvchr(c)

1738

#define isIDFIRST_uni(c) isIDFIRST_uvchr(c)

1739

#define isLOWER_uni(c) isLOWER_uvchr(c)

1740

#define isPRINT_uni(c) isPRINT_uvchr(c)

1741

#define isPUNCT_uni(c) isPUNCT_uvchr(c)

1742

#define isSPACE_uni(c) isSPACE_uvchr(c)

1743

#define isPSXSPC_uni(c) isPSXSPC_uvchr(c)

1744

#define isUPPER_uni(c) isUPPER_uvchr(c)

1745

#define isVERTWS_uni(c) isVERTWS_uvchr(c)

1746

#define isWORDCHAR_uni(c) isWORDCHAR_uvchr(c)

1747

#define isXDIGIT_uni(c) isXDIGIT_uvchr(c)

1748

#define toFOLD_uni(c,s,l) toFOLD_uvchr(c,s,l)

1749

#define toLOWER_uni(c,s,l) toLOWER_uvchr(c,s,l)

1750

#define toTITLE_uni(c,s,l) toTITLE_uvchr(c,s,l)

1751

#define toUPPER_uni(c,s,l) toUPPER_uvchr(c,s,l)

1752

1753

/* For internal core Perl use only: the base macros for defining macros like

1754

* isALPHA_LC_uvchr. These are like isALPHA_LC, but the input can be any code

1755

* point, not just 0-255. Like _generic_uvchr, there are two versions, one for

1756

* simple class definitions; the other for more complex. These are like

1757

* _generic_uvchr, so see it for more info. */

1758

/* Selects between two tests according to the code point 'c': when 'c' is
 * below 256 the result is 'latin1(c)', otherwise it is 'above_latin1(c)'.
 * 'c' is parenthesized in the comparison, as in _generic_uvchr(), so that an
 * expression argument (e.g. 'x & 0xFF') is compared as a whole.  Note that 'c'
 * is evaluated twice. */
#define _generic_LC_uvchr(latin1, above_latin1, c)                            \
                            ((c) < 256 ? latin1(c) : above_latin1(c))

1760

/* Like _generic_LC_uvchr(), but for code points of 256 and above the result
 * comes from '_is_uni_FOO(classnum, c)' rather than from a per-class macro.
 * 'c' is parenthesized in the comparison so that an expression argument is
 * compared as a whole.  Note that 'c' is evaluated twice. */
#define _generic_LC_swash_uvchr(latin1, classnum, c)                          \
                            ((c) < 256 ? latin1(c) : _is_uni_FOO(classnum, c))

1762

1763

#define isALPHA_LC_uvchr(c) _generic_LC_swash_uvchr(isALPHA_LC, _CC_ALPHA, c)

1764

#define isALPHANUMERIC_LC_uvchr(c) _generic_LC_swash_uvchr(isALPHANUMERIC_LC, \

1765

_CC_ALPHANUMERIC, c)

1766

#define isASCII_LC_uvchr(c) isASCII_LC(c)

1767

#define isBLANK_LC_uvchr(c) _generic_LC_uvchr(isBLANK_LC, \

1768

is_HORIZWS_cp_high, c)

1769

/* For code points below 256 defer to isCNTRL_LC(); anything at or above 256
 * yields 0.  'c' is parenthesized in the comparison so that an expression
 * argument is compared as a whole. */
#define isCNTRL_LC_uvchr(c) ((c) < 256 ? isCNTRL_LC(c) : 0)

1770

#define isDIGIT_LC_uvchr(c) _generic_LC_swash_uvchr(isDIGIT_LC, _CC_DIGIT, c)

1771

#define isGRAPH_LC_uvchr(c) _generic_LC_swash_uvchr(isGRAPH_LC, _CC_GRAPH, c)

1772

#define isIDCONT_LC_uvchr(c) _generic_LC_uvchr(isIDCONT_LC, \

1773

_is_uni_perl_idcont, c)

1774

#define isIDFIRST_LC_uvchr(c) _generic_LC_uvchr(isIDFIRST_LC, \

1775

_is_uni_perl_idstart, c)

1776

#define isLOWER_LC_uvchr(c) _generic_LC_swash_uvchr(isLOWER_LC, _CC_LOWER, c)

1777

#define isPRINT_LC_uvchr(c) _generic_LC_swash_uvchr(isPRINT_LC, _CC_PRINT, c)

1778

#define isPSXSPC_LC_uvchr(c) isSPACE_LC_uvchr(c)

1779

#define isPUNCT_LC_uvchr(c) _generic_LC_swash_uvchr(isPUNCT_LC, _CC_PUNCT, c)

1780

#define isSPACE_LC_uvchr(c) _generic_LC_uvchr(isSPACE_LC, \

1781

is_XPERLSPACE_cp_high, c)

1782

#define isUPPER_LC_uvchr(c) _generic_LC_swash_uvchr(isUPPER_LC, _CC_UPPER, c)

1783

#define isWORDCHAR_LC_uvchr(c) _generic_LC_swash_uvchr(isWORDCHAR_LC, \

1784

_CC_WORDCHAR, c)

1785

#define isXDIGIT_LC_uvchr(c) _generic_LC_uvchr(isXDIGIT_LC, \

1786

is_XDIGIT_cp_high, c)

1787

1788

#define isBLANK_LC_uni(c) isBLANK_LC_uvchr(UNI_TO_NATIVE(c))

1789

1790

/* For internal core Perl use only: the base macros for defining macros like

1791

* isALPHA_utf8. These are like the earlier defined macros, but take an input

1792

* UTF-8 encoded string 'p'. If the input is in the Latin1 range, use

1793

* the Latin1 macro 'classnum' on 'p'. Otherwise use the value given by the

1794

* 'utf8' parameter. This relies on the fact that ASCII characters have the

1795

* same representation whether utf8 or not. Note that it assumes that the utf8

1796

* has been validated, and ignores 'use bytes' */

1797

/* Expands to a call of _is_utf8_FOO() for the class whose enum constant is
 * _CC_<enum_name>.  'name' is stringified into the two strings
 * "is<name>_utf8" and "is<name>_utf8_safe" that are handed to that function;
 * 'p' is passed cast to 'const U8 *'; 'use_locale' is forwarded as given,
 * along with the __FILE__ and __LINE__ of the expansion site.  'p' is
 * parenthesized so that the cast applies to the whole argument expression
 * rather than to its first operand only. */
#define _base_generic_utf8(enum_name, name, p, use_locale )                 \
    _is_utf8_FOO(CAT2(_CC_, enum_name),                                     \
                 (const U8 *) (p),                                          \
                 "is" STRINGIFY(name) "_utf8",                              \
                 "is" STRINGIFY(name) "_utf8_safe",                         \
                 1, use_locale, __FILE__,__LINE__)

1803

1804

#define _generic_utf8(name, p) _base_generic_utf8(name, name, p, 0)

1805

1806

/* The "_safe" macros make sure that we don't attempt to read beyond 'e', but

1807

* they don't otherwise go out of their way to look for malformed UTF-8. If

1808

* they can return accurate results without knowing if the input is otherwise

1809

* malformed, they do so. For example isASCII is accurate in spite of any

1810

* non-length malformations because it looks only at a single byte. Likewise

1811

* isDIGIT looks just at the first byte for code points 0-255, as all UTF-8

1812

* variant ones return FALSE. But, if the input has to be well-formed in order

1813

* for the results to be accurate, the macros will test and if malformed will

1814

* call a routine to die

1815

*

1816

* Except for toke.c, the macros do assume that e > p, asserting that on

1817

* DEBUGGING builds. Much code that calls these depends on this being true,

1818

* for other reasons. toke.c is treated specially as using the regular

1819

* assertion breaks it in many ways. All strings that these operate on there

1820

* are supposed to have an extra NUL character at the end, so that *e = \0. A

1821

* bunch of code in toke.c assumes that this is true, so the assertion allows

1822

* for that */

1823

#ifdef PERL_IN_TOKE_C

1824

# define _utf8_safe_assert(p,e) ((e) > (p) || ((e) == (p) && *(p) == '\0'))

1825

#else

1826

# define _utf8_safe_assert(p,e) ((e) > (p))

1827

#endif

1828

1829

#define _generic_utf8_safe(classnum, p, e, above_latin1) \

1830

(__ASSERT_(_utf8_safe_assert(p, e)) \

1831

(UTF8_IS_INVARIANT(*(p))) \

1832

? _generic_isCC(*(p), classnum) \

1833

: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \

1834

? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \

1835

? _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \

1836

classnum) \

1837

: (_force_out_malformed_utf8_message( \

1838

(U8 *) (p), (U8 *) (e), 0, 1), 0)) \

1839

: above_latin1))

1840

/* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value.

1841

* 'above_latin1' can be a macro */

1842

#define _generic_func_utf8_safe(classnum, above_latin1, p, e) \

1843

_generic_utf8_safe(classnum, p, e, above_latin1(p, e))

1844

#define _generic_non_swash_utf8_safe(classnum, above_latin1, p, e) \

1845

_generic_utf8_safe(classnum, p, e, \

1846

(UNLIKELY((e) - (p) < UTF8SKIP(p)) \

1847

? (_force_out_malformed_utf8_message( \

1848

(U8 *) (p), (U8 *) (e), 0, 1), 0) \

1849

: above_latin1(p)))

1850

/* Like the above, but passes classnum to _is_utf8_FOO_with_len(), instead of having an

1851

* 'above_latin1' parameter */

1852

#define _generic_swash_utf8_safe(classnum, p, e) \

1853

_generic_utf8_safe(classnum, p, e, _is_utf8_FOO_with_len(classnum, p, e))

1854

1855

/* Like the above, but should be used only when it is known that there are no

1856

* characters in the upper-Latin1 range (128-255 on ASCII platforms) which the

1857

* class is TRUE for. Hence it can skip the tests for this range.

1858

* 'above_latin1' should include its arguments */

1859

#define _generic_utf8_safe_no_upper_latin1(classnum, p, e, above_latin1) \

1860

(__ASSERT_(_utf8_safe_assert(p, e)) \

1861

(UTF8_IS_INVARIANT(*(p))) \

1862

? _generic_isCC(*(p), classnum) \

1863

: (UTF8_IS_DOWNGRADEABLE_START(*(p))) \

1864

? 0 /* Note that doesn't check validity for latin1 */ \

: above_latin1)

#define isALPHA_utf8(p) _generic_utf8(ALPHA, p)

1869

#define isALPHANUMERIC_utf8(p) _generic_utf8(ALPHANUMERIC, p)

1870

#define isASCII_utf8(p) _generic_utf8(ASCII, p)

1871

#define isBLANK_utf8(p) _generic_utf8(BLANK, p)

1872

#define isCNTRL_utf8(p) _generic_utf8(CNTRL, p)

1873

#define isDIGIT_utf8(p) _generic_utf8(DIGIT, p)

1874

#define isGRAPH_utf8(p) _generic_utf8(GRAPH, p)

1875

#define isIDCONT_utf8(p) _generic_utf8(IDCONT, p)

1876

#define isIDFIRST_utf8(p) _generic_utf8(IDFIRST, p)

1877

#define isLOWER_utf8(p) _generic_utf8(LOWER, p)

1878

#define isPRINT_utf8(p) _generic_utf8(PRINT, p)

1879

#define isPSXSPC_utf8(p) _generic_utf8(PSXSPC, p)

1880

#define isPUNCT_utf8(p) _generic_utf8(PUNCT, p)

1881

#define isSPACE_utf8(p) _generic_utf8(SPACE, p)

1882

#define isUPPER_utf8(p) _generic_utf8(UPPER, p)

1883

#define isVERTWS_utf8(p) _generic_utf8(VERTSPACE, p)

1884

#define isWORDCHAR_utf8(p) _generic_utf8(WORDCHAR, p)

1885

#define isXDIGIT_utf8(p) _generic_utf8(XDIGIT, p)

1886

1887

#define isALPHA_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_ALPHA, p, e)

1888

#define isALPHANUMERIC_utf8_safe(p, e) \

1889

_generic_swash_utf8_safe(_CC_ALPHANUMERIC, p, e)

1890

#define isASCII_utf8_safe(p, e) \

1891

/* Because ASCII is invariant under utf8, the non-utf8 macro \

1892

* works */ \

1893

(__ASSERT_(_utf8_safe_assert(p, e)) isASCII(*(p)))

1894

#define isBLANK_utf8_safe(p, e) \

1895

_generic_non_swash_utf8_safe(_CC_BLANK, is_HORIZWS_high, p, e)

1896

1897

#ifdef EBCDIC

1898

/* Because all controls are UTF-8 invariants in EBCDIC, we can use this

1899

* more efficient macro instead of the more general one */

1900

# define isCNTRL_utf8_safe(p, e) \

1901

(__ASSERT_(_utf8_safe_assert(p, e)) isCNTRL_L1(*(p)))

1902

#else

1903

# define isCNTRL_utf8_safe(p, e) _generic_utf8_safe(_CC_CNTRL, p, e, 0)

1904

#endif

1905

1906

#define isDIGIT_utf8_safe(p, e) \

1907

_generic_utf8_safe_no_upper_latin1(_CC_DIGIT, p, e, \

1908

_is_utf8_FOO_with_len(_CC_DIGIT, p, e))

1909

#define isGRAPH_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_GRAPH, p, e)

1910

#define isIDCONT_utf8_safe(p, e) _generic_func_utf8_safe(_CC_WORDCHAR, \

1911

_is_utf8_perl_idcont_with_len, p, e)

1912

1913

/* To prevent S_scan_word in toke.c from hanging, we have to make sure that

1914

* IDFIRST is an alnum. See

1915

* http://rt.perl.org/rt3/Ticket/Display.html?id=74022 for more detail than you

1916

* ever wanted to know about. (In the ASCII range, there isn't a difference.)

1917

* This used to be not the XID version, but we decided to go with the more

1918

* modern Unicode definition */

1919

#define isIDFIRST_utf8_safe(p, e) \

1920

_generic_func_utf8_safe(_CC_IDFIRST, \

1921

_is_utf8_perl_idstart_with_len, (U8 *) (p), (U8 *) (e))

1922

1923

#define isLOWER_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_LOWER, p, e)

1924

#define isPRINT_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_PRINT, p, e)

1925

#define isPSXSPC_utf8_safe(p, e) isSPACE_utf8_safe(p, e)

1926

#define isPUNCT_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_PUNCT, p, e)

1927

#define isSPACE_utf8_safe(p, e) \

1928

_generic_non_swash_utf8_safe(_CC_SPACE, is_XPERLSPACE_high, p, e)

1929

#define isUPPER_utf8_safe(p, e) _generic_swash_utf8_safe(_CC_UPPER, p, e)

1930

#define isVERTWS_utf8_safe(p, e) \

1931

_generic_non_swash_utf8_safe(_CC_VERTSPACE, is_VERTWS_high, p, e)

1932

#define isWORDCHAR_utf8_safe(p, e) \

1933

_generic_swash_utf8_safe(_CC_WORDCHAR, p, e)

1934

#define isXDIGIT_utf8_safe(p, e) \

1935

_generic_utf8_safe_no_upper_latin1(_CC_XDIGIT, p, e, \

1936

(UNLIKELY((e) - (p) < UTF8SKIP(p)) \

1937

? (_force_out_malformed_utf8_message( \

1938

(U8 *) (p), (U8 *) (e), 0, 1), 0) \

1939

: is_XDIGIT_high(p)))

1940

1941

#define toFOLD_utf8(p,s,l) to_utf8_fold(p,s,l)

1942

#define toLOWER_utf8(p,s,l) to_utf8_lower(p,s,l)

1943

#define toTITLE_utf8(p,s,l) to_utf8_title(p,s,l)

1944

#define toUPPER_utf8(p,s,l) to_utf8_upper(p,s,l)

1945

1946

/* For internal core use only, subject to change */

1947

#define _toFOLD_utf8_flags(p,e,s,l,f) _to_utf8_fold_flags (p,e,s,l,f, "", 0)

1948

#define _toLOWER_utf8_flags(p,e,s,l,f) _to_utf8_lower_flags(p,e,s,l,f, "", 0)

1949

#define _toTITLE_utf8_flags(p,e,s,l,f) _to_utf8_title_flags(p,e,s,l,f, "", 0)

1950

#define _toUPPER_utf8_flags(p,e,s,l,f) _to_utf8_upper_flags(p,e,s,l,f, "", 0)

1951

1952

#define toFOLD_utf8_safe(p,e,s,l) _toFOLD_utf8_flags(p,e,s,l, FOLD_FLAGS_FULL)

1953

#define toLOWER_utf8_safe(p,e,s,l) _toLOWER_utf8_flags(p,e,s,l, 0)

1954

#define toTITLE_utf8_safe(p,e,s,l) _toTITLE_utf8_flags(p,e,s,l, 0)

1955

#define toUPPER_utf8_safe(p,e,s,l) _toUPPER_utf8_flags(p,e,s,l, 0)

1956

1957

/* For internal core Perl use only: the base macros for defining macros like

1958

* isALPHA_LC_utf8. These are like _generic_utf8, but if the first code point

1959

* in 'p' is within the 0-255 range, it uses locale rules from the passed-in

1960

* 'name' class (passed on to _base_generic_utf8 with use_locale set to 1) */

1961

#define _generic_LC_utf8(name, p) _base_generic_utf8(name, name, p, 1)

1962

1963

#define isALPHA_LC_utf8(p) _generic_LC_utf8(ALPHA, p)

1964

#define isALPHANUMERIC_LC_utf8(p) _generic_LC_utf8(ALPHANUMERIC, p)

1965

#define isASCII_LC_utf8(p) _generic_LC_utf8(ASCII, p)

1966

#define isBLANK_LC_utf8(p) _generic_LC_utf8(BLANK, p)

1967

#define isCNTRL_LC_utf8(p) _generic_LC_utf8(CNTRL, p)

1968

#define isDIGIT_LC_utf8(p) _generic_LC_utf8(DIGIT, p)

1969

#define isGRAPH_LC_utf8(p) _generic_LC_utf8(GRAPH, p)

1970

#define isIDCONT_LC_utf8(p) _generic_LC_utf8(IDCONT, p)

1971

#define isIDFIRST_LC_utf8(p) _generic_LC_utf8(IDFIRST, p)

1972

#define isLOWER_LC_utf8(p) _generic_LC_utf8(LOWER, p)

1973

#define isPRINT_LC_utf8(p) _generic_LC_utf8(PRINT, p)

1974

#define isPSXSPC_LC_utf8(p) _generic_LC_utf8(PSXSPC, p)

1975

#define isPUNCT_LC_utf8(p) _generic_LC_utf8(PUNCT, p)

1976

#define isSPACE_LC_utf8(p) _generic_LC_utf8(SPACE, p)

1977

#define isUPPER_LC_utf8(p) _generic_LC_utf8(UPPER, p)

1978

#define isWORDCHAR_LC_utf8(p) _generic_LC_utf8(WORDCHAR, p)

1979

#define isXDIGIT_LC_utf8(p) _generic_LC_utf8(XDIGIT, p)

1980

1981

/* For internal core Perl use only: the base macros for defining macros like

1982

* isALPHA_LC_utf8_safe. These are like _generic_utf8, but if the first code

1983

* point in 'p' is within the 0-255 range, it uses locale rules from the

1984

* passed-in 'macro' parameter */

1985

#define _generic_LC_utf8_safe(macro, p, e, above_latin1) \

1986

(__ASSERT_(_utf8_safe_assert(p, e)) \

1987

(UTF8_IS_INVARIANT(*(p))) \

1988

? macro(*(p)) \

1989

: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \

1990

? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \

1991

? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \

1992

: (_force_out_malformed_utf8_message( \

1993

(U8 *) (p), (U8 *) (e), 0, 1), 0)) \

1994

: above_latin1))

1995

1996

#define _generic_LC_swash_utf8_safe(macro, classnum, p, e) \

1997

_generic_LC_utf8_safe(macro, p, e, \

1998

_is_utf8_FOO_with_len(classnum, p, e))

1999

2000

#define _generic_LC_func_utf8_safe(macro, above_latin1, p, e) \

2001

_generic_LC_utf8_safe(macro, p, e, above_latin1(p, e))

2002

2003

/* Like _generic_LC_utf8_safe(), but the above-Latin1 test is 'above_latin1(p)',
 * which is given only 'p' and not the end pointer.  So before calling it,
 * check that UTF8SKIP(p) bytes are available before 'e'; if they are not, call
 * _force_out_malformed_utf8_message() and yield 0.  'macro' is handed on
 * unchanged as the first argument of _generic_LC_utf8_safe(); it is named
 * 'macro' to match _generic_LC_swash_utf8_safe() and
 * _generic_LC_func_utf8_safe(). */
#define _generic_LC_non_swash_utf8_safe(macro, above_latin1, p, e)          \
          _generic_LC_utf8_safe(macro, p, e,                                \
                          (UNLIKELY((e) - (p) < UTF8SKIP(p))                \
                           ? (_force_out_malformed_utf8_message(            \
                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)     \
                           : above_latin1(p)))

2009

2010

#define isALPHANUMERIC_LC_utf8_safe(p, e) \

2011

_generic_LC_swash_utf8_safe(isALPHANUMERIC_LC, \

2012

_CC_ALPHANUMERIC, p, e)

2013

#define isALPHA_LC_utf8_safe(p, e) \

2014

_generic_LC_swash_utf8_safe(isALPHA_LC, _CC_ALPHA, p, e)

2015

#define isASCII_LC_utf8_safe(p, e) \

2016

(__ASSERT_(_utf8_safe_assert(p, e)) isASCII_LC(*(p)))

2017

#define isBLANK_LC_utf8_safe(p, e) \

2018

_generic_LC_non_swash_utf8_safe(isBLANK_LC, is_HORIZWS_high, p, e)

2019

#define isCNTRL_LC_utf8_safe(p, e) \

2020

_generic_LC_utf8_safe(isCNTRL_LC, p, e, 0)

2021

#define isDIGIT_LC_utf8_safe(p, e) \

2022

_generic_LC_swash_utf8_safe(isDIGIT_LC, _CC_DIGIT, p, e)

2023

#define isGRAPH_LC_utf8_safe(p, e) \

2024

_generic_LC_swash_utf8_safe(isGRAPH_LC, _CC_GRAPH, p, e)

2025

#define isIDCONT_LC_utf8_safe(p, e) \

2026

_generic_LC_func_utf8_safe(isIDCONT_LC, \

2027

_is_utf8_perl_idcont_with_len, p, e)

2028

#define isIDFIRST_LC_utf8_safe(p, e) \

2029

_generic_LC_func_utf8_safe(isIDFIRST_LC, \

2030

_is_utf8_perl_idstart_with_len, p, e)

2031

#define isLOWER_LC_utf8_safe(p, e) \

2032

_generic_LC_swash_utf8_safe(isLOWER_LC, _CC_LOWER, p, e)

2033

#define isPRINT_LC_utf8_safe(p, e) \

2034

_generic_LC_swash_utf8_safe(isPRINT_LC, _CC_PRINT, p, e)

2035

#define isPSXSPC_LC_utf8_safe(p, e) isSPACE_LC_utf8_safe(p, e)

2036

#define isPUNCT_LC_utf8_safe(p, e) \

2037

_generic_LC_swash_utf8_safe(isPUNCT_LC, _CC_PUNCT, p, e)

2038

#define isSPACE_LC_utf8_safe(p, e) \

2039

_generic_LC_non_swash_utf8_safe(isSPACE_LC, is_XPERLSPACE_high, p, e)

2040

#define isUPPER_LC_utf8_safe(p, e) \

2041

_generic_LC_swash_utf8_safe(isUPPER_LC, _CC_UPPER, p, e)

2042

#define isWORDCHAR_LC_utf8_safe(p, e) \

2043

_generic_LC_swash_utf8_safe(isWORDCHAR_LC, _CC_WORDCHAR, p, e)

2044

#define isXDIGIT_LC_utf8_safe(p, e) \

2045

_generic_LC_non_swash_utf8_safe(isXDIGIT_LC, is_XDIGIT_high, p, e)

2046

2047

/* Macros for backwards compatibility and for completeness when the ASCII and

2048

* Latin1 values are identical */

2049

#define isALPHAU(c) isALPHA_L1(c)

2050

#define isDIGIT_L1(c) isDIGIT_A(c)

2051

#define isOCTAL(c) isOCTAL_A(c)

2052

#define isOCTAL_L1(c) isOCTAL_A(c)

2053

#define isXDIGIT_L1(c) isXDIGIT_A(c)

2054

#define isALNUM(c) isWORDCHAR(c)

2055

#define isALNUMU(c) isWORDCHAR_L1(c)

2056

#define isALNUM_LC(c) isWORDCHAR_LC(c)

2057

#define isALNUM_uni(c) isWORDCHAR_uni(c)

2058

#define isALNUM_LC_uvchr(c) isWORDCHAR_LC_uvchr(c)

2059

#define isALNUM_utf8(p) isWORDCHAR_utf8(p)

2060

#define isALNUM_LC_utf8(p) isWORDCHAR_LC_utf8(p)

2061

#define isALNUMC_A(c) isALPHANUMERIC_A(c) /* Mnemonic: "C's alnum" */

2062

#define isALNUMC_L1(c) isALPHANUMERIC_L1(c)

2063

#define isALNUMC(c) isALPHANUMERIC(c)

2064

#define isALNUMC_LC(c) isALPHANUMERIC_LC(c)

2065

#define isALNUMC_uni(c) isALPHANUMERIC_uni(c)

2066

#define isALNUMC_LC_uvchr(c) isALPHANUMERIC_LC_uvchr(c)

2067

#define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p)

2068

#define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p)

2069

2070

/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,

2071

* except that they don't necessarily mean the same characters, e.g. CTRL-D is

2072

* 4 on both systems, but that is EOT on ASCII; ST on EBCDIC.

2073

* '?' is special-cased on EBCDIC to APC, which is the control there that is

2074

* the outlier from the block that contains the other controls, just like

2075

* toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0

2076

* block. If it weren't special cased, it would yield a non-control.

2077

* The conversion works both ways, so toCTRL('D') is 4, and toCTRL(4) is D,

2078

* etc. */

2079

#ifndef EBCDIC

2080

# define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) toUPPER(((U8)(c))) ^ 64)

2081

#else

2082

# define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) \

2083

((isPRINT_A(c)) \

2084

? (UNLIKELY((c) == '?') \

2085

? QUESTION_MARK_CTRL \

2086

: (NATIVE_TO_LATIN1(toUPPER((U8) (c))) ^ 64)) \

2087

: (UNLIKELY((c) == QUESTION_MARK_CTRL) \

2088

? '?' \

2089

: (LATIN1_TO_NATIVE(((U8) (c)) ^ 64)))))

2090

#endif

2091

2092

/* Line numbers are unsigned, 32 bits. */

2093

typedef U32 line_t;

2094

#define NOLINE ((line_t) 4294967295UL) /* = FFFFFFFF */

2095

2096

/* Helpful alias for version prescan */

2097

/* TRUE when Perl_prescan_version(), called on 'a' with its second argument
 * FALSE and with 'b' as its third argument, returns something other than 'a'
 * itself.  The arguments are parenthesized so that expression arguments are
 * compared and passed as a whole.  Note that 'a' is evaluated twice. */
#define is_LAX_VERSION(a,b) \
        ((a) != Perl_prescan_version(aTHX_ (a), FALSE, (b), NULL, NULL, NULL, NULL))

2099

2100

/* TRUE when Perl_prescan_version(), called on 'a' with its second argument
 * TRUE and with 'b' as its third argument, returns something other than 'a'
 * itself.  The arguments are parenthesized so that expression arguments are
 * compared and passed as a whole.  Note that 'a' is evaluated twice. */
#define is_STRICT_VERSION(a,b) \
        ((a) != Perl_prescan_version(aTHX_ (a), TRUE, (b), NULL, NULL, NULL, NULL))

2102

2103

/* Error-exit helper: if the pointer 'b' is non-NULL, store 'c' through it;
 * then return 'a' from the enclosing function.  The expansion is an 'if'
 * statement followed by a 'return' statement and already ends in a semicolon,
 * so it may only be used where two statements are acceptable (i.e. not as the
 * unbraced body of an if/else or a loop).  The arguments are parenthesized so
 * that expression arguments are dereferenced, assigned and returned as a
 * whole.  Note that 'b' is evaluated twice when it is non-NULL. */
#define BADVERSION(a,b,c) \
        if (b) { \
            *(b) = (c); \
        } \
        return (a);

/* Converts a character known to represent a hexadecimal digit (0-9, A-F, or

2110

* a-f) to its numeric value. READ_XDIGIT's argument is a string pointer,

2111

* which is advanced. The input is validated only by an assert() in DEBUGGING

2112

* builds. In both ASCII and EBCDIC the last 4 bits of the digits are 0-9; and

2113

* the last 4 bits of A-F and a-f are 1-6, so adding 9 yields 10-15 */

2114

#define XDIGIT_VALUE(c) (__ASSERT_(isXDIGIT(c)) (0xf & (isDIGIT(c) \

2115

? (c) \

2116

: ((c) + 9))))

2117

/* Yields the numeric value (0-15) of the hexadecimal digit that 's' points
 * to, and advances 's' past it.  Digits use their low 4 bits directly; for the
 * other case 9 is added first before masking.  The assertion dereferences
 * '(s)' the same way the rest of the expansion does. */
#define READ_XDIGIT(s)  (__ASSERT_(isXDIGIT(*(s))) (0xf & (isDIGIT(*(s))     \
                                                        ? (*(s)++)            \
                                                        : (*(s)++ + 9))))

/* Converts a character known to represent an octal digit (0-7) to its numeric

2122

* value. The input is validated only by an assert() in DEBUGGING builds. In

2123

* both ASCII and EBCDIC the last 3 bits of the octal digits range from 0-7. */

2124

#define OCTAL_VALUE(c) (__ASSERT_(isOCTAL(c)) (7 & (c)))

2125

2126

/* Efficiently returns a boolean as to if two native characters are equivalent

2127

* case-insensitively. At least one of the characters must be one of [A-Za-z];

2128

* the ALPHA in the name is to remind you of that. This is asserted() in

2129

* DEBUGGING builds. Because [A-Za-z] are invariant under UTF-8, this macro

2130

* works (on valid input) for both non- and UTF-8-encoded bytes.

2131

*

2132

* When one of the inputs is a compile-time constant and gets folded by the

2133

* compiler, this reduces to an AND and a TEST. On both EBCDIC and ASCII

2134

* machines, 'A' and 'a' differ by a single bit; the same with the upper and

2135

* lower case of all other ASCII-range alphabetics. On ASCII platforms, they

2136

* are 32 apart; on EBCDIC, they are 64. At compile time, this uses an

2137

* exclusive 'or' to find that bit and then inverts it to form a mask, with

2138

* just a single 0, in the bit position where the upper- and lowercase differ.

2139

* */

2140

#define isALPHA_FOLD_EQ(c1, c2) \

2141

(__ASSERT_(isALPHA_A(c1) || isALPHA_A(c2)) \

2142

((c1) & ~('A' ^ 'a')) == ((c2) & ~('A' ^ 'a')))

2143

#define isALPHA_FOLD_NE(c1, c2) (! isALPHA_FOLD_EQ((c1), (c2)))

2144

2145

/*

2146

=head1 Memory Management

2147

2148

2149

The XSUB-writer's interface to the C C<malloc> function.

2150

2151

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

2152

2153

In 5.9.3, Newx() and friends replace the older New() API, and drop

2154

the first parameter, I<x>, a debug aid which allowed callers to identify

2155

themselves. This aid has been superseded by a new build option,

2156

PERL_MEM_LOG (see L<perlhacktips/PERL_MEM_LOG>). The older API is still

2157

there for use in XS modules supporting older perls.

2158

2159

2160

The XSUB-writer's interface to the C C<malloc> function, with

2161

cast. See also C<L</Newx>>.

2162

2163

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

2164

2165

2166

The XSUB-writer's interface to the C C<malloc> function. The allocated

2167

memory is zeroed with C<memzero>. See also C<L</Newx>>.

2168

2169

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

2170

2171

2172

The XSUB-writer's interface to the C C<realloc> function.

2173

2174

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

2175

2176

2177

The XSUB-writer's interface to the C C<realloc> function, with

2178

cast.

2179

2180

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

2181

2182

=for apidoc Am|void|Safefree|void* ptr

2183

The XSUB-writer's interface to the C C<free> function.

2184

2185

This should B<ONLY> be used on memory obtained using L</"Newx"> and friends.

2186

2187

2188

The XSUB-writer's interface to the C C<memmove> function. The C<src> is the

2189

source, C<dest> is the destination, C<nitems> is the number of items, and

2190

C<type> is the type. Can do overlapping moves. See also C<L</Copy>>.

2191

2192

2193

Like C<Move> but returns C<dest>. Useful

2194

for encouraging compilers to tail-call

optimise.

The XSUB-writer's interface to the C C<memcpy> function. The C<src> is the

2199

source, C<dest> is the destination, C<nitems> is the number of items, and

2200

C<type> is the type. May fail on overlapping copies. See also C<L</Move>>.

Like C<Copy> but returns C<dest>. Useful

2205

for encouraging compilers to tail-call

optimise.

The XSUB-writer's interface to the C C<memzero> function. The C<dest> is the

2211

destination, C<nitems> is the number of items, and C<type> is the type.

Like C<Zero> but returns dest. Useful

2216

for encouraging compilers to tail-call

optimise.

This is an architecture-independent macro to copy one structure to another.

Fill up memory with a byte pattern (a byte repeated over and over

2225

again) that hopefully catches attempts to access uninitialized memory.

PoisonWith(0xAB) for catching access to allocated but uninitialized memory.

PoisonWith(0xEF) for catching access to freed memory.

PoisonWith(0xEF) for catching access to freed memory.

=cut */

/* Maintained for backwards-compatibility only. Use newSV() instead.
 * The first argument is accepted but discarded; only 'len' is passed on
 * to newSV().  Not defined when PERL_CORE is defined. */
#ifndef PERL_CORE
#  define NEWSV(x,len) newSV(len)
#endif

2245

2246

/* The value obtained by converting -1 to MEM_SIZE; this is the largest
 * MEM_SIZE value provided MEM_SIZE is an unsigned type (assumed here --
 * TODO confirm against its typedef). */
#define MEM_SIZE_MAX ((MEM_SIZE)-1)

2247

2248

/* Round n up to a multiple of PERL_STRLEN_ROUNDUP_QUANTUM by adding
 * (quantum - 1) and masking off the low bits.  No overflow check is made
 * here.  The masking presumes the quantum is a power of two -- confirm
 * where PERL_STRLEN_ROUNDUP_QUANTUM is defined. */
#define _PERL_STRLEN_ROUNDUP_UNCHECKED(n) (((n) - 1 + PERL_STRLEN_ROUNDUP_QUANTUM) & ~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM - 1))

2249

2250

#ifdef PERL_MALLOC_WRAP

2251

2252

/* This expression will be constant-folded at compile time.  It checks
 * whether or not the type of the count n is so small (e.g. U8 or U16, or
 * U32 on 64-bit systems) that there's no way a wrap-around could occur.
 * As well as avoiding the need for a run-time check in some cases, it's
 * designed to avoid compiler warnings like:
 *     comparison is always false due to limited range of data type
 * It's mathematically equivalent to
 *    max(n) * sizeof(t) > MEM_SIZE_MAX
 *
 * n is used only as an operand of sizeof, so it is never evaluated here.
 * The literal 8 is the number of bits assumed per byte.
 */

# define _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) \
    (  sizeof(MEM_SIZE) < sizeof(n) \
    || sizeof(t) > ((MEM_SIZE)1 << 8*(sizeof(MEM_SIZE) - sizeof(n))))

2265

2266

/* This is written in a slightly odd way to avoid various spurious
 * compiler warnings. We *want* to write the expression as
 *     _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) && (n > C)
 * (for some compile-time constant C), but even when the LHS
 * constant-folds to false at compile-time, g++ insists on emitting
 * warnings about the RHS (e.g. "comparison is always false"), so instead
 * we write it as
 *
 *     (cond ? n : X) > C
 *
 * where X is a constant with X > C always false. Choosing a value for X
 * is tricky. If 0, some compilers will complain about 0 > C always being
 * false; if 1, Coverity complains when n happens to be the constant value
 * '1', that cond ? 1 : 1 has the same value on both branches; so use C
 * for X and hope that nothing else whines.
 *
 * Here C is MEM_SIZE_MAX/sizeof(t): the result is true when n elements of
 * type t would need more than MEM_SIZE_MAX bytes.  n is converted to
 * MEM_SIZE before the comparison.
 */

# define _MEM_WRAP_WILL_WRAP(n,t) \
      ((_MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) ? (MEM_SIZE)(n) : \
            MEM_SIZE_MAX/sizeof(t)) > MEM_SIZE_MAX/sizeof(t))

2286

2287

/* Croak via croak_memory_wrap() when n items of type t would not fit in
 * MEM_SIZE; otherwise do nothing.  The whole expansion has type void. */
# define MEM_WRAP_CHECK(n,t) \
    (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
    && (croak_memory_wrap(),0))

/* As MEM_WRAP_CHECK, but croaks with the message 'a', which is passed as
 * the argument of a "%s" format. */
# define MEM_WRAP_CHECK_1(n,t,a) \
    (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
    && (Perl_croak_nocontext("%s",(a)),0))

/* "a" arg must be a string literal */
/* The literal is pasted between two empty strings, so a non-literal
 * argument fails to compile; the result is used directly as the format
 * string of Perl_croak_nocontext(). */
# define MEM_WRAP_CHECK_s(n,t,a) \
    (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
    && (Perl_croak_nocontext("" a ""),0))

/* MEM_WRAP_CHECK followed by a comma, for use as the first operand of a
 * comma expression, e.g. (MEM_WRAP_CHECK_(n,t) expr). */
#define MEM_WRAP_CHECK_(n,t) MEM_WRAP_CHECK(n,t),

2301

2302

/* Checked round-up: calls croak_memory_wrap() when n exceeds
 * MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM, then evaluates to
 * _PERL_STRLEN_ROUNDUP_UNCHECKED(n).  NOTE: n appears twice in the
 * expansion, so an argument with side effects is evaluated twice. */
#define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (croak_memory_wrap(),0) : 0), _PERL_STRLEN_ROUNDUP_UNCHECKED(n))

2303

#else

2304

2305

/* Without PERL_MALLOC_WRAP the wrap checks expand to nothing, and the
 * string-length round-up is performed unchecked. */
#define MEM_WRAP_CHECK(n,t)
#define MEM_WRAP_CHECK_1(n,t,a)
#define MEM_WRAP_CHECK_s(n,t,a)
#define MEM_WRAP_CHECK_(n,t)

#define PERL_STRLEN_ROUNDUP(n) _PERL_STRLEN_ROUNDUP_UNCHECKED(n)

#endif

#ifdef PERL_MEM_LOG

/*

* If PERL_MEM_LOG is defined, all Newx()s, Renew()s, and Safefree()s

2317

* go through functions, which are handy for debugging breakpoints, but

2318

* which more importantly get the immediate calling environment (file and

2319

* line number, and C function name if available) passed in. This info can

2320

* then be used for logging the calls, for which one gets a sample

2321

* implementation unless -DPERL_MEM_LOG_NOIMPL is also defined.

2322

*

2323

* Known problems:

2324

* - not all memory allocs get logged, only those

2325

* that go through Newx() and derivatives (while all

2326

* Safefrees do get logged)

2327

* - __FILE__ and __LINE__ do not work everywhere

2328

* - __func__ or __FUNCTION__ even less so

2329

* - I think more goes on after the perlio frees but

2330

* the thing is that STDERR gets closed (as do all

2331

* the file descriptors)

2332

* - no deeper calling stack than the caller of the Newx()

2333

* or the like, but do I look like a C reflection/introspection

2334

* utility to you?

2335

* - the function prototypes for the logging functions

2336

* should probably live somewhere other than handy.h

2337

* - one could consider inlining (macrofying) the logging

2338

* for speed, but I am too lazy

2339

* - one could imagine recording the allocations in a hash,

2340

* (keyed by the allocation address?), and maintain that

2341

* through reallocs and frees, but how to do that without

2342

* any News() happening...?

2343

* - lots of -Ddefines to get useful/controllable output

2344

* - lots of ENV reads

*/

# ifdef PERL_CORE

# ifndef PERL_MEM_LOG_NOIMPL

/* Event kinds distinguished by the sample memory-logging implementation
 * (only declared when PERL_MEM_LOG_NOIMPL is not defined).  The values
 * are spelled out; they match the implicit numbering. */
enum mem_log_type {
    MLT_ALLOC   = 0,
    MLT_REALLOC = 1,
    MLT_FREE    = 2,
    MLT_NEW_SV  = 3,
    MLT_DEL_SV  = 4
};

# endif

# if defined(PERL_IN_SV_C)  /* those are only used in sv.c */
/* Logging hooks that take the SV plus the caller's file name, line number
 * and function name.  Presumably they record SV creation and deletion
 * (cf. MLT_NEW_SV / MLT_DEL_SV) -- confirm against their definitions. */
void Perl_mem_log_new_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname);
void Perl_mem_log_del_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname);
# endif

# endif

#endif

#ifdef PERL_MEM_LOG
/* Logging variants: each wraps the allocator result 'a' in a call to the
 * matching Perl_mem_log_*() function, passing the item count, the item
 * size, the stringified type name (alloc/realloc only), the old pointer
 * (realloc only) and the call site (__FILE__, __LINE__, FUNCTION__). */
#define MEM_LOG_ALLOC(n,t,a)     Perl_mem_log_alloc(n,sizeof(t),STRINGIFY(t),a,__FILE__,__LINE__,FUNCTION__)
#define MEM_LOG_REALLOC(n,t,v,a) Perl_mem_log_realloc(n,sizeof(t),STRINGIFY(t),v,a,__FILE__,__LINE__,FUNCTION__)
#define MEM_LOG_FREE(a)          Perl_mem_log_free(a,__FILE__,__LINE__,FUNCTION__)
#endif

2370

2371

/* Fallbacks used when no logging variant has been defined: each macro
 * simply yields its pointer argument 'a' and ignores the rest. */
#ifndef MEM_LOG_ALLOC
#define MEM_LOG_ALLOC(n,t,a)     (a)
#endif
#ifndef MEM_LOG_REALLOC
#define MEM_LOG_REALLOC(n,t,v,a) (a)
#endif
#ifndef MEM_LOG_FREE
#define MEM_LOG_FREE(a)          (a)
#endif

2380

2381

/* Allocation macros.  Each runs the wrap check for n items of type t,
 * allocates, passes the result through MEM_LOG_ALLOC, casts it and
 * assigns it to v (which must be an lvalue).
 *   Newx  - safemalloc(n * sizeof(t)), result cast to t*
 *   Newxc - same allocation, result cast to c*
 *   Newxz - safecalloc(n, sizeof(t)), result cast to t*
 * n may be evaluated more than once. */
#define Newx(v,n,t)    (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t))))))
#define Newxc(v,n,t,c) (v = (MEM_WRAP_CHECK_(n,t) (c*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t))))))
#define Newxz(v,n,t)   (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safecalloc((n),sizeof(t)))))

2384

2385

#ifndef PERL_CORE
/* pre 5.9.x compatibility */
/* The leading 'x' argument is discarded; the rest is forwarded to the
 * corresponding Newx* macro. */
#define New(x,v,n,t)    Newx(v,n,t)
#define Newc(x,v,n,t,c) Newxc(v,n,t,c)
#define Newz(x,v,n,t)   Newxz(v,n,t)
#endif

2391

2392

/* Resize the buffer v to hold n items of type t: wrap check, then
 * saferealloc(), with the result passed through MEM_LOG_REALLOC, cast
 * (to t* for Renew, to c* for Renewc) and assigned back to v.
 * v and n each appear more than once in the expansion. */
#define Renew(v,n,t) \
    (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t))))))
#define Renewc(v,n,t,c) \
    (v = (MEM_WRAP_CHECK_(n,t) (c*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t))))))

2396

2397

#ifdef PERL_POISON
/* Poisoning variant: a null d is skipped; otherwise the memory is freed
 * through safefree() (after MEM_LOG_FREE) and the pointer variable d
 * itself is overwritten with the Poison() byte pattern.  d must be an
 * lvalue here because its address is taken. */
#define Safefree(d) \
    ((d) ? (void)(safefree(MEM_LOG_FREE((Malloc_t)(d))), Poison(&(d), 1, Malloc_t)) : (void) 0)
#else
/* Plain variant: pass d, cast to Malloc_t, through MEM_LOG_FREE to
 * safefree(). */
#define Safefree(d) safefree(MEM_LOG_FREE((Malloc_t)(d)))
#endif

2403

2404

/* assert that a valid ptr has been supplied - use this instead of assert(ptr)  *
 * as it handles cases like constant string arguments without throwing warnings *
 * the cast is required, as is the inequality check, to avoid warnings          */
#define perl_assert_ptr(p) assert( ((void*)(p)) != 0 )

2408

2409

2410

/* Typed wrappers around memmove / memcpy / memzero.  Each runs the wrap
 * check for n items of type t, asserts the pointers are non-null, then
 * operates on n * sizeof(t) bytes.  The result is discarded (void).
 * Note the argument order: source first, destination second. */
#define Move(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memmove((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define Copy(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memcpy((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define Zero(d,n,t)   (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), (void)memzero((char*)(d), (n) * sizeof(t)))

2413

2414

/* Like above, but returns a pointer to 'd' */
/* (the value is whatever the underlying memmove / memcpy / memzero call
 * returns; it is not cast to void) */
#define MoveD(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memmove((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define CopyD(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memcpy((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define ZeroD(d,n,t)   (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), memzero((char*)(d), (n) * sizeof(t)))

2418

2419

/* Fill n items of type t at d with the byte b (converted to U8), after
 * the usual wrap check.  PoisonNew uses 0xAB, PoisonFree uses 0xEF, and
 * Poison is an alias for PoisonFree. */
#define PoisonWith(d,n,t,b) (MEM_WRAP_CHECK_(n,t) (void)memset((char*)(d), (U8)(b), (n) * sizeof(t)))
#define PoisonNew(d,n,t)    PoisonWith(d,n,t,0xAB)
#define PoisonFree(d,n,t)   PoisonWith(d,n,t,0xEF)
#define Poison(d,n,t)       PoisonFree(d,n,t)

2423

2424

/* Expands to its argument when PERL_POISON is defined and to nothing
 * otherwise, so poison-only code can be written inline. */
#ifdef PERL_POISON
#  define PERL_POISON_EXPR(x) x
#else
#  define PERL_POISON_EXPR(x)
#endif

2429

2430

/* Assign the object of type t that s points to, to the object of type t
 * that d points to, by plain structure assignment.  Both pointers are
 * cast to t* first.  The expression's value is the assigned object. */
#define StructCopy(s,d,t) (*(t*)(d) = *(t*)(s))

2431

2432

/* C_ARRAY_LENGTH is the number of elements in the C array (so you
 * want your zero-based indices to be strictly less than it).
 *
 * C_ARRAY_END is one past the last: half-open/half-closed range,
 * not last-inclusive range.
 *
 * Both rely on sizeof(a), so 'a' must be an actual array, not a pointer. */
#define C_ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0]))
#define C_ARRAY_END(a)    ((a) + C_ARRAY_LENGTH(a))

#ifdef NEED_VA_COPY
/* Perl_va_copy(s, d) copies the va_list s into d.  Note the argument
 * order is (source, destination), the reverse of va_copy().  It uses
 * va_copy if defined as a macro, else __va_copy, else a Copy() of one
 * va_list object. */
#  ifdef va_copy
#    define Perl_va_copy(s, d) va_copy(d, s)
#  elif defined(__va_copy)
#    define Perl_va_copy(s, d) __va_copy(d, s)
#  else
#    define Perl_va_copy(s, d) Copy(s, d, 1, va_list)
#  endif
#endif

/* convenience debug macros */
/* Under USE_ITHREADS these supply a format fragment and the matching
 * my_perl argument for debugging output; the doubled-underscore forms
 * carry a leading ", " or ",", and the trailing-underscore forms a
 * trailing ",".  Without ithreads they all expand to nothing.
 * NOTE(review): "0x%p" may print a doubled "0x" prefix where the C
 * library's %p already emits one -- confirm on the target platforms. */
#ifdef USE_ITHREADS
#define pTHX_FORMAT  "Perl interpreter: 0x%p"
#define pTHX__FORMAT ", Perl interpreter: 0x%p"
#define pTHX_VALUE_   (void *)my_perl,
#define pTHX_VALUE    (void *)my_perl
#define pTHX__VALUE_ ,(void *)my_perl,
#define pTHX__VALUE  ,(void *)my_perl
#else
#define pTHX_FORMAT
#define pTHX__FORMAT
#define pTHX_VALUE_
#define pTHX_VALUE
#define pTHX__VALUE_
#define pTHX__VALUE
#endif /* USE_ITHREADS */

2466

2467

/* Perl_deprecate was not part of the public API, and did not have a deprecate()
   shortcut macro defined without -DPERL_CORE. Neither codesearch.google.com nor
   CPAN::Unpack show any users outside the core.  */
/* All three macros call Perl_ck_warner_d() in the WARN_DEPRECATED
 * category.  Their string arguments must be string literals, because the
 * message is built by literal concatenation. */
#ifdef PERL_CORE
#  define deprecate(s) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),    \
                                            "Use of " s " is deprecated")
#  define deprecate_disappears_in(when,message) \
              Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),    \
                               message ", and will disappear in Perl " when)
#  define deprecate_fatal_in(when,message) \
              Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),    \
                               message ". Its use will be fatal in Perl " when)
#endif

2480

2481

/* Internal macros to deal with gids and uids */

2482

#ifdef PERL_CORE

2483

2484

/* Store a uid in, or fetch one from, an SV using a representation chosen
 * at compile time: NV when Uid_t_size exceeds IVSIZE, otherwise IV when
 * Uid_t_sign <= 0 (presumably meaning a signed Uid_t -- confirm against
 * the configuration that sets it), otherwise UV. */
#  if Uid_t_size > IVSIZE
#    define sv_setuid(sv, uid)       sv_setnv((sv), (NV)(uid))
#    define SvUID(sv)                SvNV(sv)
#  elif Uid_t_sign <= 0
#    define sv_setuid(sv, uid)       sv_setiv((sv), (IV)(uid))
#    define SvUID(sv)                SvIV(sv)
#  else
#    define sv_setuid(sv, uid)       sv_setuv((sv), (UV)(uid))
#    define SvUID(sv)                SvUV(sv)
#  endif /* Uid_t_size */

2494

2495

/* Store a gid in, or fetch one from, an SV using a representation chosen
 * at compile time: NV when Gid_t_size exceeds IVSIZE, otherwise IV when
 * Gid_t_sign <= 0 (presumably meaning a signed Gid_t -- confirm against
 * the configuration that sets it), otherwise UV. */
#  if Gid_t_size > IVSIZE
#    define sv_setgid(sv, gid)       sv_setnv((sv), (NV)(gid))
#    define SvGID(sv)                SvNV(sv)
#  elif Gid_t_sign <= 0
#    define sv_setgid(sv, gid)       sv_setiv((sv), (IV)(gid))
#    define SvGID(sv)                SvIV(sv)
#  else
#    define sv_setgid(sv, gid)       sv_setuv((sv), (UV)(gid))
#    define SvGID(sv)                SvUV(sv)
#  endif /* Gid_t_size */

#endif

#endif /* PERL_HANDY_H_ */

2509

2510

/*

2511

* ex: set ts=8 sts=4 sw=4 et:

2512

*/