This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
More preparation for 7.0 in versioning
[perl5.git] / handy.h
CommitLineData
a0d0e21e 1/* handy.h
a687059c 2 *
1129b882 3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000,
da5d8dbb 4 * 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others
a687059c 5 *
6e21c824
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8d063cd8 8 *
8d063cd8
LW
9 */
10
4650c663
KW
11/* IMPORTANT NOTE: Everything whose name begins with an underscore is for
12 * internal core Perl use only. */
13
6a5bc5ac
KW
14#ifndef PERL_HANDY_H_ /* Guard against nested #inclusion */
15#define PERL_HANDY_H_
9d745869 16
24792b8d
NC
17#ifndef PERL_CORE
18# define Null(type) ((type)NULL)
954c1994
GS
19
20/*
ccfc67b7 21=head1 Handy Values
954c1994 22
78342678 23=for apidoc AmnU||Nullch
72d33970
FC
24Null character pointer. (No longer available when C<PERL_CORE> is
25defined.)
2307c6d0 26
78342678 27=for apidoc AmnU||Nullsv
72d33970 28Null SV pointer. (No longer available when C<PERL_CORE> is defined.)
954c1994
GS
29
30=cut
31*/
32
24792b8d
NC
33# define Nullch Null(char*)
34# define Nullfp Null(PerlIO*)
35# define Nullsv Null(SV*)
36#endif
8d063cd8 37
641d3f0b
PP
/* Discard any TRUE/FALSE that an earlier header may have defined, then
 * define both here as the parenthesized constants 1 and 0. */
38#ifdef TRUE
39#undef TRUE
40#endif
41#ifdef FALSE
42#undef FALSE
43#endif
44#define TRUE (1)
45#define FALSE (0)
46
cf3f0ffb
DM
47/* The MUTABLE_*() macros cast pointers to the types shown, in such a way
48 * (compiler permitting) that casting away const-ness will give a warning;
49 * e.g.:
50 *
51 * const SV *sv = ...;
52 * AV *av1 = (AV*)sv; <== BAD: the const has been silently cast away
53 * AV *av2 = MUTABLE_AV(sv); <== GOOD: it may warn
54 */
55
b1bc3f34 56#if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
6c2255e0 57# define MUTABLE_PTR(p) ({ void *p_ = (p); p_; })
b1bc3f34
NC
58#else
59# define MUTABLE_PTR(p) ((void *) (p))
60#endif
61
a062e10d 62#define MUTABLE_AV(p) ((AV *)MUTABLE_PTR(p))
ea726b52 63#define MUTABLE_CV(p) ((CV *)MUTABLE_PTR(p))
159b6efe 64#define MUTABLE_GV(p) ((GV *)MUTABLE_PTR(p))
dbebbdb4 65#define MUTABLE_HV(p) ((HV *)MUTABLE_PTR(p))
a45c7426 66#define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p))
b1bc3f34 67#define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p))
27d4fb96 68
f789f6a4 69#if defined(I_STDBOOL) && !defined(PERL_BOOL_AS_CHAR)
bd31be4b
NC
70# include <stdbool.h>
71# ifndef HAS_BOOL
72# define HAS_BOOL 1
73# endif
74#endif
75
8e84507e 76/* bool is built-in for g++-2.6.3 and later, which might be used
c1d22f6b
GS
77 for extensions. <_G_config.h> defines _G_HAVE_BOOL, but we can't
78 be sure _G_config.h will be included before this file. _G_config.h
8e84507e 79 also defines _G_HAVE_BOOL for both gcc and g++, but only g++
c1d22f6b
GS
80 actually has bool. Hence, _G_HAVE_BOOL is pretty useless for us.
81 g++ can be identified by __GNUG__.
82 Andy Dougherty February 2000
5d94fbed 83*/
3609ea0d 84#ifdef __GNUG__ /* GNU g++ has bool built-in */
f789f6a4 85# ifndef PERL_BOOL_AS_CHAR
5d94fbed 86# ifndef HAS_BOOL
c1d22f6b 87# define HAS_BOOL 1
5d94fbed 88# endif
f789f6a4 89# endif
5d94fbed
AD
90#endif
91
/* HAS_BOOL is still unset, so none of the checks above found a usable bool:
 * drop any macro named bool, fall back to plain char, and record that a bool
 * is now available. */
92#ifndef HAS_BOOL
f789f6a4
FC
93# ifdef bool
94# undef bool
95# endif
70d5cb32 96# define bool char
c1d22f6b 97# define HAS_BOOL 1
a687059c 98#endif
0d3e774c 99
25ba28ce
KW
100/*
101=for apidoc Am|bool|cBOOL|bool expr
102
103Cast-to-bool. A simple S<C<(bool) I<expr>>> cast may not do the right thing:
104if C<bool> is defined as C<char>, for example, then the cast from C<int> is
105implementation-defined.
106
107C<(bool)!!(cbool)> in a ternary triggers a bug in xlc on AIX
108
109=cut
110*/
18f5643b 111#define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0)
f2338a2e 112
46c6c7e2 113/* Try to figure out __func__ or __FUNCTION__ equivalent, if any.
e352bcff
JH
114 * XXX Should really be a Configure probe, with HAS__FUNCTION__
115 * and FUNCTION__ as results.
116 * XXX Similarly, a Configure probe for __FILE__ and __LINE__ is needed. */
46c6c7e2
JH
117#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C)) /* C99 or close enough. */
118# define FUNCTION__ __func__
7adf2470 119#elif (defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tru64, -c99 not known, only -std1). */
07798b17 120# define FUNCTION__ "" /* Nothing usable here: expand to an empty string. */
46c6c7e2 121#else
07798b17 122# define FUNCTION__ __FUNCTION__ /* Common extension. */
46c6c7e2
JH
123#endif
124
27d4fb96
PP
125/* XXX A note on the perl source internal type system. The
126 original intent was that I32 be *exactly* 32 bits.
127
128 Currently, we only guarantee that I32 is *at least* 32 bits.
129 Specifically, if int is 64 bits, then so is I32. (This is the case
130 for the Cray.) This has the advantage of meshing nicely with
131 standard library calls (where we pass an I32 and the library is
132 expecting an int), but the disadvantage that an I32 is not 32 bits.
133 Andy Dougherty August 1996
24fef2a7 134
dc45a647
MB
135 There is no guarantee that there is *any* integral type with
136 exactly 32 bits. It is perfectly legal for a system to have
137 sizeof(short) == sizeof(int) == sizeof(long) == 8.
693762b4 138
dc45a647
MB
139 Similarly, there is no guarantee that I16 and U16 have exactly 16
140 bits.
693762b4 141
8e84507e
NIS
142 For dealing with issues that may arise from various 32/64-bit
143 systems, we will ask Configure to check out
8175356b 144
3609ea0d
JH
145 SHORTSIZE == sizeof(short)
146 INTSIZE == sizeof(int)
147 LONGSIZE == sizeof(long)
dc45a647 148 LONGLONGSIZE == sizeof(long long) (if HAS_LONG_LONG)
3609ea0d 149 PTRSIZE == sizeof(void *)
dc45a647
MB
150 DOUBLESIZE == sizeof(double)
151 LONG_DOUBLESIZE == sizeof(long double) (if HAS_LONG_DOUBLE).
8175356b 152
27d4fb96
PP
153*/
154
69512466
JH
155#ifdef I_INTTYPES /* e.g. Linux has int64_t without <inttypes.h> */
156# include <inttypes.h>
dd0eed91
JH
157# ifdef INT32_MIN_BROKEN
158# undef INT32_MIN
159# define INT32_MIN (-2147483647-1)
160# endif
161# ifdef INT64_MIN_BROKEN
162# undef INT64_MIN
163# define INT64_MIN (-9223372036854775807LL-1)
164# endif
69512466
JH
165#endif
166
8175356b
JH
167typedef I8TYPE I8;
168typedef U8TYPE U8;
169typedef I16TYPE I16;
170typedef U16TYPE U16;
171typedef I32TYPE I32;
172typedef U32TYPE U32;
16d89be8 173
74b807c7 174#ifdef QUADKIND
8175356b
JH
175typedef I64TYPE I64;
176typedef U64TYPE U64;
16d89be8 177#endif
8175356b 178
d8668976 179#if defined(UINT8_MAX) && defined(INT16_MAX) && defined(INT32_MAX)
5ff3f7a4 180
5ff3f7a4
GS
181/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.
182 Please search CHAR_MAX in perl.h for further details. */
183#define U8_MAX UINT8_MAX
184#define U8_MIN UINT8_MIN
185
5ff3f7a4
GS
186#define I16_MAX INT16_MAX
187#define I16_MIN INT16_MIN
188#define U16_MAX UINT16_MAX
189#define U16_MIN UINT16_MIN
190
5ff3f7a4
GS
191#define I32_MAX INT32_MAX
192#define I32_MIN INT32_MIN
0e983133
GS
193#ifndef UINT32_MAX_BROKEN /* e.g. HP-UX with gcc messes this up */
194# define U32_MAX UINT32_MAX
195#else
196# define U32_MAX 4294967295U
197#endif
5ff3f7a4
GS
198#define U32_MIN UINT32_MIN
199
200#else
201
5c9fa16e
KA
202/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.
203 Please search CHAR_MAX in perl.h for further details. */
27d4fb96
PP
204#define U8_MAX PERL_UCHAR_MAX
205#define U8_MIN PERL_UCHAR_MIN
79072805 206
27d4fb96
PP
207#define I16_MAX PERL_SHORT_MAX
208#define I16_MIN PERL_SHORT_MIN
209#define U16_MAX PERL_USHORT_MAX
210#define U16_MIN PERL_USHORT_MIN
79072805 211
c4f23d77 212#if LONGSIZE > 4
27d4fb96
PP
213# define I32_MAX PERL_INT_MAX
214# define I32_MIN PERL_INT_MIN
215# define U32_MAX PERL_UINT_MAX
216# define U32_MIN PERL_UINT_MIN
79072805 217#else
27d4fb96
PP
218# define I32_MAX PERL_LONG_MAX
219# define I32_MIN PERL_LONG_MIN
220# define U32_MAX PERL_ULONG_MAX
221# define U32_MIN PERL_ULONG_MIN
79072805
LW
222#endif
223
5ff3f7a4
GS
224#endif
225
247cee9f
KW
226/* These C99 typedefs are useful sometimes for, say, loop variables whose
227 * maximum values are small, but for which speed trumps size. If we have a C99
228 * compiler, use that. Otherwise, a plain 'int' should be good enough.
229 *
230 * Restrict these to core for now until we are more certain this is a good
231 * idea. */
232#if defined(PERL_CORE) || defined(PERL_EXT)
233# ifdef I_STDINT
234 typedef int_fast8_t PERL_INT_FAST8_T;
235 typedef uint_fast8_t PERL_UINT_FAST8_T;
236 typedef int_fast16_t PERL_INT_FAST16_T;
237 typedef uint_fast16_t PERL_UINT_FAST16_T;
238# else
239 typedef int PERL_INT_FAST8_T;
240 typedef unsigned int PERL_UINT_FAST8_T;
241 typedef int PERL_INT_FAST16_T;
242 typedef unsigned int PERL_UINT_FAST16_T;
243# endif
244#endif
245
464decb6 246/* log(2) (i.e., log base 10 of 2) is pretty close to 0.30103, just in case
ab350cbd
KW
247 * anyone is grepping for it. So BIT_DIGITS gives the number of decimal digits
248 * required to represent any possible unsigned number containing N bits.
249 * TYPE_DIGITS gives the number of decimal digits required to represent any
250 * possible unsigned number of type T. */
464decb6 251#define BIT_DIGITS(N) (((N)*146)/485 + 1) /* log10(2) =~ 146/485 */
fc36a67e
PP
/* TYPE_DIGITS hard-codes 8 bits per byte when turning sizeof(T) into a bit
 * count. */
252#define TYPE_DIGITS(T) BIT_DIGITS(sizeof(T) * 8)
253#define TYPE_CHARS(T) (TYPE_DIGITS(T) + 2) /* sign, NUL */
254
88794300 255/* Unused by core; should be deprecated */
/* Keeps only the low five bits of ch (037 is octal for 0x1F). */
ff68c719 256#define Ctl(ch) ((ch) & 037)
8d063cd8 257
98fce2a4
KW
258#if defined(PERL_CORE) || defined(PERL_EXT)
/* Fallback MIN/MAX, defined only when nothing earlier has supplied them.
 * Each argument appears twice in the expansion, so an argument with side
 * effects may be evaluated twice. */
259# ifndef MIN
260# define MIN(a,b) ((a) < (b) ? (a) : (b))
261# endif
262# ifndef MAX
263# define MAX(a,b) ((a) > (b) ? (a) : (b))
264# endif
265#endif
266
84ff4fa9
KW
267/* Returns a boolean as to whether the input unsigned number is a power of 2
268 * (2**0, 2**1, etc). In other words if it has just a single bit set.
269 * If not, subtracting 1 would leave the uppermost bit set, so the & would
270 * yield non-zero */
271#if defined(PERL_CORE) || defined(PERL_EXT)
011b1419 272# define isPOWER_OF_2(n) ((n) && ((n) & ((n)-1)) == 0)
84ff4fa9
KW
273#endif
274
d223e1ea 275/* Returns a mask with the lowest n bits set.  n must be smaller than the
 * width in bits of the UINTMAX_C(1) operand: shifting by the full width (or
 * more) is undefined behavior in C, so this macro cannot be used to build an
 * all-ones mask. */
fae1e72b 276#define nBIT_MASK(n) ((UINTMAX_C(1) << (n)) - 1)
d223e1ea 277
1381ccb1
KW
278/* The largest unsigned number that will fit into n bits; simply another name
 * for nBIT_MASK(n), so the same restriction on n applies. */
279#define nBIT_UMAX(n) nBIT_MASK(n)
280
8d9433eb
KW
281/*
282=for apidoc Am|void|__ASSERT_|bool expr
283
284This is a helper macro to avoid preprocessor issues, replaced by nothing
285unless under DEBUGGING, where it expands to an assert of its argument,
286followed by a comma (hence the comma operator). If we just used a straight
287assert(), we would get a comma with nothing before it when not DEBUGGING.
288
289=cut
290
291We also use an empty definition under Coverity since the __ASSERT_
292checks often check for things that Really Cannot Happen, and Coverity
293detects that and gets all excited. */
3e94db23 294
e7ae132e
KW
295#if defined(DEBUGGING) && !defined(__COVERITY__) \
296 && ! defined(PERL_SMALL_MACRO_BUFFER)
0f092d08
KW
297# define __ASSERT_(statement) assert(statement),
298#else
299# define __ASSERT_(statement)
300#endif
301
3fe05580 302/*
a4ee4fb5 303=head1 SV Manipulation Functions
3fe05580 304
3bb9fd01 305=for apidoc Ama|SV*|newSVpvs|"literal string"
1568d13a 306Like C<newSVpvn>, but takes a literal string instead of a
30a15352 307string/length pair.
3fe05580 308
3bb9fd01 309=for apidoc Ama|SV*|newSVpvs_flags|"literal string"|U32 flags
1568d13a 310Like C<newSVpvn_flags>, but takes a literal string instead of
30a15352 311a string/length pair.
84bafc02 312
3bb9fd01 313=for apidoc Ama|SV*|newSVpvs_share|"literal string"
1568d13a 314Like C<newSVpvn_share>, but takes a literal string instead of
30a15352 315a string/length pair and omits the hash parameter.
3fe05580 316
3bb9fd01 317=for apidoc Am|void|sv_catpvs_flags|SV* sv|"literal string"|I32 flags
1568d13a 318Like C<sv_catpvn_flags>, but takes a literal string instead
30a15352 319of a string/length pair.
9dcc53ea 320
3bb9fd01 321=for apidoc Am|void|sv_catpvs_nomg|SV* sv|"literal string"
1568d13a 322Like C<sv_catpvn_nomg>, but takes a literal string instead of
0c395ea5 323a string/length pair.
9dcc53ea 324
3bb9fd01 325=for apidoc Am|void|sv_catpvs|SV* sv|"literal string"
1568d13a 326Like C<sv_catpvn>, but takes a literal string instead of a
0c395ea5 327string/length pair.
3fe05580 328
3bb9fd01 329=for apidoc Am|void|sv_catpvs_mg|SV* sv|"literal string"
1568d13a 330Like C<sv_catpvn_mg>, but takes a literal string instead of a
9dcc53ea
Z
331string/length pair.
332
3bb9fd01 333=for apidoc Am|void|sv_setpvs|SV* sv|"literal string"
1568d13a 334Like C<sv_setpvn>, but takes a literal string instead of a
0c395ea5 335string/length pair.
3fe05580 336
3bb9fd01 337=for apidoc Am|void|sv_setpvs_mg|SV* sv|"literal string"
1568d13a 338Like C<sv_setpvn_mg>, but takes a literal string instead of a
9dcc53ea
Z
339string/length pair.
340
3bb9fd01 341=for apidoc Am|SV *|sv_setref_pvs|SV *const rv|const char *const classname|"literal string"
1568d13a 342Like C<sv_setref_pvn>, but takes a literal string instead of
0c395ea5 343a string/length pair.
9dcc53ea 344
3fe05580
MHM
345=head1 Memory Management
346
3bb9fd01 347=for apidoc Ama|char*|savepvs|"literal string"
1568d13a 348Like C<savepvn>, but takes a literal string instead of a
30a15352 349string/length pair.
3fe05580 350
3bb9fd01 351=for apidoc Ama|char*|savesharedpvs|"literal string"
9dcc53ea
Z
352A version of C<savepvs()> which allocates the duplicate string in memory
353which is shared between threads.
354
3fe05580
MHM
355=head1 GV Functions
356
3bb9fd01 357=for apidoc Am|HV*|gv_stashpvs|"name"|I32 create
1568d13a 358Like C<gv_stashpvn>, but takes a literal string instead of a
0c395ea5 359string/length pair.
3fe05580
MHM
360
361=head1 Hash Manipulation Functions
362
3bb9fd01 363=for apidoc Am|SV**|hv_fetchs|HV* tb|"key"|I32 lval
1568d13a 364Like C<hv_fetch>, but takes a literal string instead of a
0c395ea5 365string/length pair.
3fe05580 366
3bb9fd01 367=for apidoc Am|SV**|hv_stores|HV* tb|"key"|SV* val
1568d13a 368Like C<hv_store>, but takes a literal string instead of a
0c395ea5 369string/length pair
3fe05580
MHM
370and omits the hash parameter.
371
510966aa
Z
372=head1 Lexer interface
373
3bb9fd01 374=for apidoc Amx|void|lex_stuff_pvs|"pv"|U32 flags
510966aa 375
1568d13a 376Like L</lex_stuff_pvn>, but takes a literal string instead of
0c395ea5 377a string/length pair.
510966aa 378
3fe05580
MHM
379=cut
380*/
381
a34e53fc
KW
382/*
383=head1 Handy Values
2efa8cc7 384
a34e53fc
KW
385=for apidoc Amu|pair|STR_WITH_LEN|"literal string"
386
387Returns two comma separated tokens of the input literal string, and its length.
388This is a convenience macro which helps out in some API calls.
389Note that it can't be used as an argument to macros or functions that under
390some configurations might be macros, which means that it requires the full
391Perl_xxx(aTHX_ ...) form for any API calls where it's used.
392
393=cut
394*/
395
a34e53fc 396#define STR_WITH_LEN(s) ("" s ""), (sizeof(s)-1)
ba3a79e7
GA
397
398/* STR_WITH_LEN() shortcuts */
399#define newSVpvs(str) Perl_newSVpvn(aTHX_ STR_WITH_LEN(str))
84bafc02
NC
400#define newSVpvs_flags(str,flags) \
401 Perl_newSVpvn_flags(aTHX_ STR_WITH_LEN(str), flags)
ba3a79e7 402#define newSVpvs_share(str) Perl_newSVpvn_share(aTHX_ STR_WITH_LEN(str), 0)
9dcc53ea
Z
403#define sv_catpvs_flags(sv, str, flags) \
404 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), flags)
405#define sv_catpvs_nomg(sv, str) \
406 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), 0)
407#define sv_catpvs(sv, str) \
408 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC)
409#define sv_catpvs_mg(sv, str) \
410 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC|SV_SMAGIC)
3fe05580 411#define sv_setpvs(sv, str) Perl_sv_setpvn(aTHX_ sv, STR_WITH_LEN(str))
9dcc53ea
Z
412#define sv_setpvs_mg(sv, str) Perl_sv_setpvn_mg(aTHX_ sv, STR_WITH_LEN(str))
413#define sv_setref_pvs(rv, classname, str) \
414 Perl_sv_setref_pvn(aTHX_ rv, classname, STR_WITH_LEN(str))
ba3a79e7 415#define savepvs(str) Perl_savepvn(aTHX_ STR_WITH_LEN(str))
9dcc53ea
Z
416#define savesharedpvs(str) Perl_savesharedpvn(aTHX_ STR_WITH_LEN(str))
417#define gv_stashpvs(str, create) \
418 Perl_gv_stashpvn(aTHX_ STR_WITH_LEN(str), create)
419#define gv_fetchpvs(namebeg, add, sv_type) \
420 Perl_gv_fetchpvn_flags(aTHX_ STR_WITH_LEN(namebeg), add, sv_type)
421#define gv_fetchpvn(namebeg, len, add, sv_type) \
422 Perl_gv_fetchpvn_flags(aTHX_ namebeg, len, add, sv_type)
423#define sv_catxmlpvs(dsv, str, utf8) \
424 Perl_sv_catxmlpvn(aTHX_ dsv, STR_WITH_LEN(str), utf8)
4ac46235 425
ba3a79e7 426
510966aa
Z
427#define lex_stuff_pvs(pv,flags) Perl_lex_stuff_pvn(aTHX_ STR_WITH_LEN(pv), flags)
428
b96d8cd9
NC
429#define get_cvs(str, flags) \
430 Perl_get_cvn_flags(aTHX_ STR_WITH_LEN(str), (flags))
5c1737d1 431
9b6e9510 432/* internal helpers */
4a1bbd3d
KW
433/* Transitional */
434#ifndef PERL_MAJOR_VERSION
435# define PERL_MAJOR_VERSION PERL_REVISION
436#else
437# undef PERL_REVISION /* We don't want code to be using these */
438#endif
439#ifndef PERL_MINOR_VERSION
440# define PERL_MINOR_VERSION PERL_VERSION
441#else
442# undef PERL_VERSION
443#endif
444#ifndef PERL_MICRO_VERSION
445# define PERL_MICRO_VERSION PERL_SUBVERSION
446#else
447# undef PERL_SUBVERSION
448#endif
449
/* Fold a (major, minor, patch) triple into one integer so that versions can
 * be compared arithmetically.  Note that the factor of 10 multiplies the
 * major term only: the result is major*10000000 + minor*1000 + patch. */
450#define PERL_JNP_TO_DECIMAL_(maJor,miNor,Patch) \
451 /* '10*' leaves room for things like alpha, beta, releases */ \
452 (10 * ((maJor) * 1000000) + ((miNor) * 1000) + (Patch))
/* The version of the perl being compiled, in that same packed form. */
9b6e9510 453#define PERL_DECIMAL_VERSION_ \
4a1bbd3d
KW
454 PERL_JNP_TO_DECIMAL_(PERL_MAJOR_VERSION, PERL_MINOR_VERSION, \
455 PERL_MICRO_VERSION)
9b6e9510
KW
456
457/*
4a1bbd3d 458=for apidoc AmR|bool|PERL_VERSION_EQ|const U8 major|const U8 minor|const U8 patch
9b6e9510 459
4a1bbd3d 460Returns whether or not the perl currently being compiled has the specified
9b6e9510
KW
461relationship to the perl given by the parameters. For example,
462
463 #if PERL_VERSION_GT(5,24,2)
464 code that will only be compiled on perls after v5.24.2
465 #else
466 fallback code
467 #endif
468
469Note that this is usable in making compile-time decisions
470
471The possible comparisons are C<PERL_VERSION_EQ>, C<PERL_VERSION_NE>,
472C<PERL_VERSION_GE>, C<PERL_VERSION_GT>, C<PERL_VERSION_LE>, and
473C<PERL_VERSION_LT>.
474
4a1bbd3d
KW
475You may use the special value '*' for the final number to mean ALL possible
476values for it. Thus,
477
478 #if PERL_VERSION_EQ(5,31,'*')
479
480means all perls in the 5.31 series. And
481
482 #if PERL_VERSION_NE(5,24,'*')
483
484means all perls EXCEPT 5.24 ones. And
485
486 #if PERL_VERSION_LE(5,9,'*')
487
488is effectively
489
490 #if PERL_VERSION_LT(5,10,0)
491
492This means you don't have to think so much when converting from the existing
493deprecated C<PERL_VERSION> to using this macro:
494
495 #if PERL_VERSION <= 9
496
497becomes
498
499 #if PERL_VERSION_LE(5,9,'*')
500
501=for apidoc AmRh|bool|PERL_VERSION_NE|const U8 major|const U8 minor|const U8 patch
502=for apidoc AmRh|bool|PERL_VERSION_GE|const U8 major|const U8 minor|const U8 patch
503=for apidoc AmRh|bool|PERL_VERSION_GT|const U8 major|const U8 minor|const U8 patch
504=for apidoc AmRh|bool|PERL_VERSION_LE|const U8 major|const U8 minor|const U8 patch
505=for apidoc AmRh|bool|PERL_VERSION_LT|const U8 major|const U8 minor|const U8 patch
9b6e9510
KW
506
507=cut
508*/
509
4a1bbd3d
KW
/* Compile-time comparisons of the perl being built (PERL_DECIMAL_VERSION_)
 * against the version j.n.p.  Each expands to an integer constant expression,
 * so all of them may be used in #if directives.
 *
 *   j, n, p   major, minor and patch numbers of the version to compare with.
 *             p may instead be '*', standing for every patch level of j.n.
 *
 * Each macro evaluates to non-zero when the relationship holds, else to 0.
 *
 * N.B. These don't work if the patch version is 42 or 92, as those are what
 * '*' is in ASCII and EBCDIC respectively */
# define PERL_VERSION_EQ(j,n,p)                                             \
              (((p) == '*')                                                 \
               ? (   (j) == PERL_MAJOR_VERSION                              \
                  && (n) == PERL_MINOR_VERSION)                             \
               : (PERL_DECIMAL_VERSION_ == PERL_JNP_TO_DECIMAL_(j,n,p)))
# define PERL_VERSION_NE(j,n,p) (! PERL_VERSION_EQ(j,n,p))

/* < '*' effectively means < 0, i.e. older than every release of j.n */
# define PERL_VERSION_LT(j,n,p)                                             \
    (PERL_DECIMAL_VERSION_ < PERL_JNP_TO_DECIMAL_( (j),                     \
                                                   (n),                     \
                                                 (((p) == '*') ? 0 : (p))))
# define PERL_VERSION_GE(j,n,p)  (! PERL_VERSION_LT(j,n,p))

/* <= '*' effectively means < j.(n+1).0.  For a numeric patch the comparison
 * must be inclusive: with a strict '<' here, PERL_VERSION_LE(j,n,p) would be
 * false (and PERL_VERSION_GT(j,n,p) true) on exactly j.n.p. */
# define PERL_VERSION_LE(j,n,p)                                             \
    (((p) == '*')                                                           \
     ? (PERL_DECIMAL_VERSION_ <  PERL_JNP_TO_DECIMAL_((j), ((n)+1), 0))     \
     : (PERL_DECIMAL_VERSION_ <= PERL_JNP_TO_DECIMAL_((j), (n), (p))))
# define PERL_VERSION_GT(j,n,p) (! PERL_VERSION_LE(j,n,p))
9b6e9510 530
954c1994 531/*
ccfc67b7
JH
532=head1 Miscellaneous Functions
533
954c1994 534=for apidoc Am|bool|strNE|char* s1|char* s2
dc6b0978
KW
535Test two C<NUL>-terminated strings to see if they are different. Returns true
536or false.
954c1994
GS
537
538=for apidoc Am|bool|strEQ|char* s1|char* s2
dc6b0978
KW
539Test two C<NUL>-terminated strings to see if they are equal. Returns true or
540false.
954c1994
GS
541
542=for apidoc Am|bool|strLT|char* s1|char* s2
dc6b0978
KW
543Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than the
544second, C<s2>. Returns true or false.
954c1994
GS
545
546=for apidoc Am|bool|strLE|char* s1|char* s2
dc6b0978
KW
547Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than or
548equal to the second, C<s2>. Returns true or false.
954c1994
GS
549
550=for apidoc Am|bool|strGT|char* s1|char* s2
dc6b0978
KW
551Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
552the second, C<s2>. Returns true or false.
954c1994
GS
553
554=for apidoc Am|bool|strGE|char* s1|char* s2
dc6b0978
KW
555Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
556or equal to the second, C<s2>. Returns true or false.
954c1994
GS
557
558=for apidoc Am|bool|strnNE|char* s1|char* s2|STRLEN len
dc6b0978
KW
559Test two C<NUL>-terminated strings to see if they are different. The C<len>
560parameter indicates the number of bytes to compare. Returns true or false. (A
954c1994
GS
561wrapper for C<strncmp>).
562
563=for apidoc Am|bool|strnEQ|char* s1|char* s2|STRLEN len
dc6b0978
KW
564Test two C<NUL>-terminated strings to see if they are equal. The C<len>
565parameter indicates the number of bytes to compare. Returns true or false. (A
566wrapper for C<strncmp>).
954c1994 567
bd18bd40
KW
568=for apidoc Am|bool|memEQ|char* s1|char* s2|STRLEN len
569Test two buffers (which may contain embedded C<NUL> characters) to see if they
570are equal. The C<len> parameter indicates the number of bytes to compare.
571Returns true if the buffers are equal, false otherwise.
572
3bb9fd01 573=for apidoc Am|bool|memEQs|char* s1|STRLEN l1|"s2"
2d8eeddb
KW
574Like L</memEQ>, but the second string is a literal enclosed in double quotes,
575C<l1> gives the number of bytes in C<s1>.
576Returns true if C<s1> has the same length as the literal and the same bytes; false otherwise.
577
bd18bd40
KW
578=for apidoc Am|bool|memNE|char* s1|char* s2|STRLEN len
579Test two buffers (which may contain embedded C<NUL> characters) to see if they
580are not equal. The C<len> parameter indicates the number of bytes to compare.
581Returns true if the buffers differ, false if they are equal.
582
3bb9fd01 583=for apidoc Am|bool|memNEs|char* s1|STRLEN l1|"s2"
2d8eeddb
KW
584Like L</memNE>, but the second string is a literal enclosed in double quotes,
585C<l1> gives the number of bytes in C<s1>.
586Returns true if C<s1> differs from the literal in length or content; false if they are equal.
587
4aada8b9
KW
588=for apidoc Am|bool|memCHRs|"list"|char c
589Returns the position of the first occurrence of the byte C<c> in the literal
590string C<"list">, or NULL if C<c> doesn't appear in C<"list">. All bytes are
591treated as unsigned char. Thus this macro can be used to determine if C<c> is
592in a set of particular characters. Unlike L<strchr(3)>, it works even if C<c>
593is C<NUL> (and the set doesn't include C<NUL>).
594
954c1994 595=cut
fc169e00
KW
596
597New macros should use the following conventions for their names (which are
598based on the underlying C library functions):
599
600 (mem | str n? ) (EQ | NE | LT | LE | GT | GE | (( BEGIN | END ) P? )) l? s?
601
602 Each has two main parameters, string-like operands that are compared
603 against each other, as specified by the macro name. Some macros may
604 additionally have one or potentially even two length parameters. If a length
605 parameter applies to both string parameters, it will be positioned third;
606 otherwise any length parameter immediately follows the string parameter it
607 applies to.
608
609 If the prefix to the name is 'str', the string parameter is a pointer to a C
610 language string. Such a string does not contain embedded NUL bytes; its
611 length may be unknown, but can be calculated by C<strlen()>, since it is
612 terminated by a NUL, which isn't included in its length.
613
a3815e44 614 The optional 'n' following 'str' means that there is a third parameter,
fc169e00
KW
615 giving the maximum number of bytes to look at in each string. Even if both
616 strings are longer than the length parameter, those extra bytes will be
617 unexamined.
618
619 The 's' suffix means that the 2nd byte string parameter is a literal C
620 double-quoted string. Its length will automatically be calculated by the
621 macro, so no length parameter will ever be needed for it.
622
623 If the prefix is 'mem', the string parameters don't have to be C strings;
624 they may contain embedded NUL bytes, do not necessarily have a terminating
625 NUL, and their lengths can be known only through other means, which in
626 practice are additional parameter(s) passed to the function. All 'mem'
627 functions have at least one length parameter. Barring any 'l' or 's' suffix,
628 there is a single length parameter, in position 3, which applies to both
629 string parameters. The 's' suffix means, as described above, that the 2nd
630 string is a literal double-quoted C string (hence its length is calculated by
631 the macro, and the length parameter to the function applies just to the first
632 string parameter, and hence is positioned just after it). An 'l' suffix
633 means that the 2nd string parameter has its own length parameter, and the
634 signature will look like memFOOl(s1, l1, s2, l2).
635
636 BEGIN (and END) are for testing if the 2nd string is an initial (or final)
637 substring of the 1st string. 'P' if present indicates that the substring
638 must be a "proper" one in the mathematical sense that the first one must be
639 strictly larger than the 2nd.
640
954c1994
GS
641*/
642
62946e08 643
75400963
KW
/* Boolean wrappers that compare the result of strcmp() against 0. */
644#define strNE(s1,s2) (strcmp(s1,s2) != 0)
645#define strEQ(s1,s2) (strcmp(s1,s2) == 0)
8d063cd8
LW
646#define strLT(s1,s2) (strcmp(s1,s2) < 0)
647#define strLE(s1,s2) (strcmp(s1,s2) <= 0)
648#define strGT(s1,s2) (strcmp(s1,s2) > 0)
649#define strGE(s1,s2) (strcmp(s1,s2) >= 0)
62946e08 650
75400963
KW
/* As above, but through strncmp(), with l as its length argument. */
651#define strnNE(s1,s2,l) (strncmp(s1,s2,l) != 0)
652#define strnEQ(s1,s2,l) (strncmp(s1,s2,l) == 0)
378cc40b 653
9d3980bc
KW
/* memEQ is true when memcmp() reports the first l bytes of s1 and s2 as
 * equal; memNE is its negation.  Both operands are cast to const void *. */
654#define memEQ(s1,s2,l) (memcmp(((const void *) (s1)), ((const void *) (s2)), l) == 0)
655#define memNE(s1,s2,l) (! memEQ(s1,s2,l))
36477c24 656
085b7534 657/* memEQ and memNE where second comparand is a string constant.  The lengths
 * must match exactly (l against sizeof(s2)-1, i.e. without the literal's
 * trailing NUL) before any bytes are compared.  Wrapping s2 in "" ... "" only
 * compiles when s2 is a string literal. */
568a785a 658#define memEQs(s1, l, s2) \
777fa2cb 659 (((sizeof(s2)-1) == (l)) && memEQ((s1), ("" s2 ""), (sizeof(s2)-1)))
5f50c6c9 660#define memNEs(s1, l, s2) (! memEQs(s1, l, s2))
568a785a 661
fdbb9a7c
KW
662/* Keep these private until we decide it was a good idea */
663#if defined(PERL_CORE) || defined(PERL_EXT) || defined(PERL_EXT_POSIX)
664
/* Prefix/suffix tests against a string literal s2.  In every macro the length
 * of s2 is taken as sizeof(s2)-1 (the literal without its trailing NUL), and
 * the "" s2 "" wrapping makes a non-literal s2 a compile error.
 *
 *   strBEGINs(s1, s2)      s1 is a C string; true if it starts with s2.
 *   memBEGINs(s1, l, s2)   s1 holds l bytes; true if it starts with s2.
 *   memBEGINPs(s1, l, s2)  like memBEGINs, but s1 must also be strictly
 *                          longer than s2 (a "proper" initial substring).
 *   memENDs(s1, l, s2)     s1 holds l bytes; true if it ends with s2.
 *   memENDPs(s1, l, s2)    like memENDs, but s1 must also be strictly
 *                          longer than s2.
 *
 * The length check comes first so that memEQ() never reads outside s1.  The
 * casts to Ptrdiff_t make the length comparisons signed. */
#define strBEGINs(s1,s2) (strncmp(s1,"" s2 "", sizeof(s2)-1) == 0)

#define memBEGINs(s1, l, s2)                                                \
            (   (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1               \
             && memEQ(s1, "" s2 "", sizeof(s2)-1))
#define memBEGINPs(s1, l, s2)                                               \
            (   (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) - 1                \
             && memEQ(s1, "" s2 "", sizeof(s2)-1))
#define memENDs(s1, l, s2)                                                  \
            (   (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1               \
             && memEQ((s1) + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1))
/* "Strictly longer" means l > strlen(s2), i.e. l > sizeof(s2) - 1, exactly as
 * in memBEGINPs; comparing against sizeof(s2) wrongly rejected a buffer that
 * is one byte longer than s2. */
#define memENDPs(s1, l, s2)                                                 \
            (   (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) - 1                \
             && memEQ((s1) + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1))
fdbb9a7c 679#endif /* End of making macros private */
bdb7e3f0 680
062b6850
KW
/* Ordering tests: each compares the result of memcmp(s1,s2,l) against 0. */
681#define memLT(s1,s2,l) (memcmp(s1,s2,l) < 0)
682#define memLE(s1,s2,l) (memcmp(s1,s2,l) <= 0)
683#define memGT(s1,s2,l) (memcmp(s1,s2,l) > 0)
684#define memGE(s1,s2,l) (memcmp(s1,s2,l) >= 0)
685
4aada8b9
KW
/* Searches the string literal s1 for the byte c using memchr() over
 * sizeof(s1)-1 bytes, so the literal's trailing NUL is not part of the set.
 * Yields the memchr() result cast to const char * (NULL when c is absent). */
686#define memCHRs(s1,c) ((const char *) memchr("" s1 "" , c, sizeof(s1)-1))
687
bbce6d69
PP
688/*
689 * Character classes.
690 *
691 * Unfortunately, the introduction of locales means that we
692 * can't trust isupper(), etc. to tell the truth. And when
693 * it comes to /\w+/ with tainting enabled, we *must* be able
694 * to trust our character classes.
695 *
696 * Therefore, the default tests in the text of Perl will be
697 * independent of locale. Any code that wants to depend on
698 * the current locale will use the tests that begin with "lc".
699 */
700
2304df62
AD
701#ifdef HAS_SETLOCALE /* XXX Is there a better test for this? */
702# ifndef CTYPE256
703# define CTYPE256
704# endif
705#endif
706
954c1994 707/*
ccfc67b7 708
dcccc8ff 709=head1 Character classification
243effed
KW
710This section is about functions (really macros) that classify characters
711into types, such as punctuation versus alphabetic, etc. Most of these are
712analogous to regular expression character classes. (See
713L<perlrecharclass/POSIX Character Classes>.) There are several variants for
714each class. (Not all macros have all variants; each item below lists the
715ones valid for it.) None are affected by C<use bytes>, and only the ones
716with C<LC> in the name are affected by the current locale.
717
d713f9d9
KW
718The base function, e.g., C<isALPHA()>, takes any signed or unsigned value,
719treating it as a code point, and returns a boolean as to whether or not the
720character represented by it is (or on non-ASCII platforms, corresponds to) an
6aff1f14
KW
721ASCII character in the named class based on platform, Unicode, and Perl rules.
722If the input is a number that doesn't fit in an octet, FALSE is returned.
243effed 723
c98722a4 724Variant C<isI<FOO>_A> (e.g., C<isALPHA_A()>) is identical to the base function
550da823
KW
725with no suffix C<"_A">. This variant is used to emphasize by its name that
726only ASCII-range characters can return TRUE.
4b9734bf 727
d60679e1 728Variant C<isI<FOO>_L1> imposes the Latin-1 (or EBCDIC equivalent) character set
4b9734bf
KW
729onto the platform. That is, the code points that are ASCII are unaffected,
730since ASCII is a subset of Latin-1. But the non-ASCII code points are treated
731as if they are Latin-1 characters. For example, C<isWORDCHAR_L1()> will return
732true when called with the code point 0xDF, which is a word character in both
4650c663 733ASCII and EBCDIC (though it represents different characters in each).
d713f9d9
KW
734If the input is a number that doesn't fit in an octet, FALSE is returned.
735(Perl's documentation uses a colloquial definition of Latin-1, to include all
736code points below 256.)
243effed 737
d713f9d9
KW
738Variant C<isI<FOO>_uvchr> is exactly like the C<isI<FOO>_L1> variant, for
739inputs below 256, but if the code point is larger than 255, Unicode rules are
740used to determine if it is in the character class. For example,
d0da05db 741C<isWORDCHAR_uvchr(0x100)> returns TRUE, since 0x100 is LATIN CAPITAL LETTER A
6aff1f14 742WITH MACRON in Unicode, and is a word character.
243effed 743
059703b0
KW
744Variants C<isI<FOO>_utf8> and C<isI<FOO>_utf8_safe> are like C<isI<FOO>_uvchr>,
745but are used for UTF-8 encoded strings. The two forms are different names for
746the same thing. Each call to one of these classifies the first character of
747the string starting at C<p>. The second parameter, C<e>, points to anywhere in
748the string beyond the first character, up to one byte past the end of the
749entire string. Although both variants are identical, the suffix C<_safe> in
750one name emphasizes that it will not attempt to read beyond S<C<e - 1>>,
751provided that the constraint S<C<p E<lt> e>> is true (this is asserted for in
752C<-DDEBUGGING> builds). If the UTF-8 for the input character is malformed in
753some way, the program may croak, or the function may return FALSE, at the
754discretion of the implementation, and subject to change in future releases.
243effed 755
d713f9d9
KW
756Variant C<isI<FOO>_LC> is like the C<isI<FOO>_A> and C<isI<FOO>_L1> variants,
757but the result is based on the current locale, which is what C<LC> in the name
758stands for. If Perl can determine that the current locale is a UTF-8 locale,
759it uses the published Unicode rules; otherwise, it uses the C library function
760that gives the named classification. For example, C<isDIGIT_LC()> when not in
761a UTF-8 locale returns the result of calling C<isdigit()>. FALSE is always
1a83413c
KW
762returned if the input won't fit into an octet. On some platforms where the C
763library function is known to be defective, Perl changes its result to follow
764the POSIX standard's rules.
243effed 765
d713f9d9
KW
766Variant C<isI<FOO>_LC_uvchr> acts exactly like C<isI<FOO>_LC> for inputs less
767than 256, but for larger ones it returns the Unicode classification of the code
768point.
243effed 769
059703b0
KW
770Variants C<isI<FOO>_LC_utf8> and C<isI<FOO>_LC_utf8_safe> are like
771C<isI<FOO>_LC_uvchr>, but are used for UTF-8 encoded strings. The two forms
772are different names for the same thing. Each call to one of these classifies
773the first character of the string starting at C<p>. The second parameter,
774C<e>, points to anywhere in the string beyond the first character, up to one
775byte past the end of the entire string. Although both variants are identical,
776the suffix C<_safe> in one name emphasizes that it will not attempt to read
777beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is true (this
778is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the input
779character is malformed in some way, the program may croak, or the function may
780return FALSE, at the discretion of the implementation, and subject to change in
781future releases.
ccfc67b7 782
d713f9d9
KW
783=for apidoc Am|bool|isALPHA|int ch
784Returns a boolean indicating whether the specified input is one of C<[A-Za-z]>,
785analogous to C<m/[[:alpha:]]/>.
dcccc8ff
KW
786See the L<top of this section|/Character classification> for an explanation of
787variants
059703b0
KW
788C<isALPHA_A>, C<isALPHA_L1>, C<isALPHA_uvchr>, C<isALPHA_utf8>,
789C<isALPHA_utf8_safe>, C<isALPHA_LC>, C<isALPHA_LC_uvchr>, C<isALPHA_LC_utf8>,
790and C<isALPHA_LC_utf8_safe>.
8a58bdcf 791
f16858ed
KW
792=cut
793
794Here and below, we add the prototypes of these macros for downstream programs
795that would be interested in them, such as Devel::PPPort.
796
797=for apidoc Amh|bool|isALPHA_A|int ch
798=for apidoc Amh|bool|isALPHA_L1|int ch
799=for apidoc Amh|bool|isALPHA_uvchr|int ch
800=for apidoc Amh|bool|isALPHA_utf8_safe|U8 * s|U8 * end
d23c7e08 801=for apidoc Amh|bool|isALPHA_utf8|U8 * s|U8 * end
f16858ed
KW
802=for apidoc Amh|bool|isALPHA_LC|int ch
803=for apidoc Amh|bool|isALPHA_LC_uvchr|int ch
804=for apidoc Amh|bool|isALPHA_LC_utf8_safe|U8 * s| U8 *end
805
d713f9d9
KW
806=for apidoc Am|bool|isALPHANUMERIC|int ch
807Returns a boolean indicating whether the specified character is one of
808C<[A-Za-z0-9]>, analogous to C<m/[[:alnum:]]/>.
dcccc8ff
KW
809See the L<top of this section|/Character classification> for an explanation of
810variants
d0da05db 811C<isALPHANUMERIC_A>, C<isALPHANUMERIC_L1>, C<isALPHANUMERIC_uvchr>,
059703b0
KW
812C<isALPHANUMERIC_utf8>, C<isALPHANUMERIC_utf8_safe>, C<isALPHANUMERIC_LC>,
813C<isALPHANUMERIC_LC_uvchr>, C<isALPHANUMERIC_LC_utf8>, and
814C<isALPHANUMERIC_LC_utf8_safe>.
15861f94 815
255b632a
KW
816A (discouraged from use) synonym is C<isALNUMC> (where the C<C> suffix means
817this corresponds to the C language alphanumeric definition). Also
818there are the variants
819C<isALNUMC_A>, C<isALNUMC_L1>,
820C<isALNUMC_LC>, and C<isALNUMC_LC_uvchr>.
821
f16858ed
KW
822=for apidoc Amh|bool|isALPHANUMERIC_A|int ch
823=for apidoc Amh|bool|isALPHANUMERIC_L1|int ch
824=for apidoc Amh|bool|isALPHANUMERIC_uvchr|int ch
825=for apidoc Amh|bool|isALPHANUMERIC_utf8_safe|U8 * s|U8 * end
d23c7e08 826=for apidoc Amh|bool|isALPHANUMERIC_utf8|U8 * s|U8 * end
f16858ed
KW
827=for apidoc Amh|bool|isALPHANUMERIC_LC|int ch
828=for apidoc Amh|bool|isALPHANUMERIC_LC_uvchr|int ch
829=for apidoc Amh|bool|isALPHANUMERIC_LC_utf8_safe|U8 * s| U8 *end
830=for apidoc Amh|bool|isALNUMC|int ch
831=for apidoc Amh|bool|isALNUMC_A|int ch
832=for apidoc Amh|bool|isALNUMC_L1|int ch
833=for apidoc Amh|bool|isALNUMC_LC|int ch
834=for apidoc Amh|bool|isALNUMC_LC_uvchr|int ch
835
d713f9d9 836=for apidoc Am|bool|isASCII|int ch
8a58bdcf 837Returns a boolean indicating whether the specified character is one of the 128
243effed 838characters in the ASCII character set, analogous to C<m/[[:ascii:]]/>.
e5ad6aba 839On non-ASCII platforms, it returns TRUE iff this
8a58bdcf
KW
840character corresponds to an ASCII character. Variants C<isASCII_A()> and
841C<isASCII_L1()> are identical to C<isASCII()>.
dcccc8ff
KW
842See the L<top of this section|/Character classification> for an explanation of
843variants
059703b0
KW
844C<isASCII_uvchr>, C<isASCII_utf8>, C<isASCII_utf8_safe>, C<isASCII_LC>,
845C<isASCII_LC_uvchr>, C<isASCII_LC_utf8>, and C<isASCII_LC_utf8_safe>.
846Note, however, that some platforms do not have the C library routine
847C<isascii()>. In these cases, the variants whose names contain C<LC> are the
848same as the corresponding ones without.
243effed 849
f16858ed
KW
850=for apidoc Amh|bool|isASCII_A|int ch
851=for apidoc Amh|bool|isASCII_L1|int ch
852=for apidoc Amh|bool|isASCII_uvchr|int ch
853=for apidoc Amh|bool|isASCII_utf8_safe|U8 * s|U8 * end
d23c7e08 854=for apidoc Amh|bool|isASCII_utf8|U8 * s|U8 * end
f16858ed
KW
855=for apidoc Amh|bool|isASCII_LC|int ch
856=for apidoc Amh|bool|isASCII_LC_uvchr|int ch
857=for apidoc Amh|bool|isASCII_LC_utf8_safe|U8 * s| U8 *end
858
d98532ea
KW
859Also note, that because all ASCII characters are UTF-8 invariant (meaning they
860have the exact same representation (always a single byte) whether encoded in
861UTF-8 or not), C<isASCII> will give the correct results when called with any
059703b0
KW
862byte in any string encoded or not in UTF-8. And similarly C<isASCII_utf8> and
863C<isASCII_utf8_safe> will work properly on any string encoded or not in UTF-8.
d98532ea 864
243effed
KW
865=for apidoc Am|bool|isBLANK|char ch
866Returns a boolean indicating whether the specified character is a
6aff1f14 867character considered to be a blank, analogous to C<m/[[:blank:]]/>.
dcccc8ff
KW
868See the L<top of this section|/Character classification> for an explanation of
869variants
059703b0
KW
870C<isBLANK_A>, C<isBLANK_L1>, C<isBLANK_uvchr>, C<isBLANK_utf8>,
871C<isBLANK_utf8_safe>, C<isBLANK_LC>, C<isBLANK_LC_uvchr>, C<isBLANK_LC_utf8>,
872and C<isBLANK_LC_utf8_safe>. Note,
da8c1a98
KW
873however, that some platforms do not have the C library routine
874C<isblank()>. In these cases, the variants whose names contain C<LC> are
875the same as the corresponding ones without.
243effed 876
f16858ed
KW
877=for apidoc Amh|bool|isBLANK_A|int ch
878=for apidoc Amh|bool|isBLANK_L1|int ch
879=for apidoc Amh|bool|isBLANK_uvchr|int ch
880=for apidoc Amh|bool|isBLANK_utf8_safe|U8 * s|U8 * end
d23c7e08 881=for apidoc Amh|bool|isBLANK_utf8|U8 * s|U8 * end
f16858ed
KW
882=for apidoc Amh|bool|isBLANK_LC|int ch
883=for apidoc Amh|bool|isBLANK_LC_uvchr|int ch
884=for apidoc Amh|bool|isBLANK_LC_utf8_safe|U8 * s| U8 *end
885
243effed
KW
886=for apidoc Am|bool|isCNTRL|char ch
887Returns a boolean indicating whether the specified character is a
6aff1f14 888control character, analogous to C<m/[[:cntrl:]]/>.
dcccc8ff
KW
889See the L<top of this section|/Character classification> for an explanation of
890variants
059703b0
KW
891C<isCNTRL_A>, C<isCNTRL_L1>, C<isCNTRL_uvchr>, C<isCNTRL_utf8>,
892C<isCNTRL_utf8_safe>, C<isCNTRL_LC>, C<isCNTRL_LC_uvchr>, C<isCNTRL_LC_utf8>,
893and C<isCNTRL_LC_utf8_safe>. On EBCDIC
da8c1a98 894platforms, you almost always want to use the C<isCNTRL_L1> variant.
954c1994 895
f16858ed
KW
896=for apidoc Amh|bool|isCNTRL_A|int ch
897=for apidoc Amh|bool|isCNTRL_L1|int ch
898=for apidoc Amh|bool|isCNTRL_uvchr|int ch
899=for apidoc Amh|bool|isCNTRL_utf8_safe|U8 * s|U8 * end
d23c7e08 900=for apidoc Amh|bool|isCNTRL_utf8|U8 * s|U8 * end
f16858ed
KW
901=for apidoc Amh|bool|isCNTRL_LC|int ch
902=for apidoc Amh|bool|isCNTRL_LC_uvchr|int ch
903=for apidoc Amh|bool|isCNTRL_LC_utf8_safe|U8 * s| U8 *end
904
954c1994 905=for apidoc Am|bool|isDIGIT|char ch
2787a470 906Returns a boolean indicating whether the specified character is a
6aff1f14 907digit, analogous to C<m/[[:digit:]]/>.
8a58bdcf 908Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>.
dcccc8ff
KW
909See the L<top of this section|/Character classification> for an explanation of
910variants
059703b0
KW
911C<isDIGIT_uvchr>, C<isDIGIT_utf8>, C<isDIGIT_utf8_safe>, C<isDIGIT_LC>,
912C<isDIGIT_LC_uvchr>, C<isDIGIT_LC_utf8>, and C<isDIGIT_LC_utf8_safe>.
243effed 913
f16858ed
KW
914=for apidoc Amh|bool|isDIGIT_A|int ch
915=for apidoc Amh|bool|isDIGIT_L1|int ch
916=for apidoc Amh|bool|isDIGIT_uvchr|int ch
917=for apidoc Amh|bool|isDIGIT_utf8_safe|U8 * s|U8 * end
d23c7e08 918=for apidoc Amh|bool|isDIGIT_utf8|U8 * s|U8 * end
f16858ed
KW
919=for apidoc Amh|bool|isDIGIT_LC|int ch
920=for apidoc Amh|bool|isDIGIT_LC_uvchr|int ch
921=for apidoc Amh|bool|isDIGIT_LC_utf8_safe|U8 * s| U8 *end
922
243effed
KW
923=for apidoc Am|bool|isGRAPH|char ch
924Returns a boolean indicating whether the specified character is a
6aff1f14 925graphic character, analogous to C<m/[[:graph:]]/>.
dcccc8ff 926See the L<top of this section|/Character classification> for an explanation of
059703b0
KW
927variants C<isGRAPH_A>, C<isGRAPH_L1>, C<isGRAPH_uvchr>, C<isGRAPH_utf8>,
928C<isGRAPH_utf8_safe>, C<isGRAPH_LC>, C<isGRAPH_LC_uvchr>,
929C<isGRAPH_LC_utf8>, and C<isGRAPH_LC_utf8_safe>.
954c1994 930
f16858ed
KW
931=for apidoc Amh|bool|isGRAPH_A|int ch
932=for apidoc Amh|bool|isGRAPH_L1|int ch
933=for apidoc Amh|bool|isGRAPH_uvchr|int ch
934=for apidoc Amh|bool|isGRAPH_utf8_safe|U8 * s|U8 * end
d23c7e08 935=for apidoc Amh|bool|isGRAPH_utf8|U8 * s|U8 * end
f16858ed
KW
936=for apidoc Amh|bool|isGRAPH_LC|int ch
937=for apidoc Amh|bool|isGRAPH_LC_uvchr|int ch
938=for apidoc Amh|bool|isGRAPH_LC_utf8_safe|U8 * s| U8 *end
939
0c82b6df 940=for apidoc Am|bool|isLOWER|char ch
2787a470 941Returns a boolean indicating whether the specified character is a
6aff1f14 942lowercase character, analogous to C<m/[[:lower:]]/>.
dcccc8ff
KW
943See the L<top of this section|/Character classification> for an explanation of
944variants
059703b0
KW
945C<isLOWER_A>, C<isLOWER_L1>, C<isLOWER_uvchr>, C<isLOWER_utf8>,
946C<isLOWER_utf8_safe>, C<isLOWER_LC>, C<isLOWER_LC_uvchr>, C<isLOWER_LC_utf8>,
947and C<isLOWER_LC_utf8_safe>.
0c82b6df 948
f16858ed
KW
949=for apidoc Amh|bool|isLOWER_A|int ch
950=for apidoc Amh|bool|isLOWER_L1|int ch
951=for apidoc Amh|bool|isLOWER_uvchr|int ch
952=for apidoc Amh|bool|isLOWER_utf8_safe|U8 * s|U8 * end
d23c7e08 953=for apidoc Amh|bool|isLOWER_utf8|U8 * s|U8 * end
f16858ed
KW
954=for apidoc Amh|bool|isLOWER_LC|int ch
955=for apidoc Amh|bool|isLOWER_LC_uvchr|int ch
956=for apidoc Amh|bool|isLOWER_LC_utf8_safe|U8 * s| U8 *end
957
c99e91e9 958=for apidoc Am|bool|isOCTAL|char ch
2787a470 959Returns a boolean indicating whether the specified character is an
6aff1f14 960octal digit, [0-7].
243effed
KW
961The only two variants are C<isOCTAL_A> and C<isOCTAL_L1>; each is identical to
962C<isOCTAL>.
963
f16858ed
KW
964=for apidoc Amh|bool|isOCTAL_A|int ch
965=for apidoc Amh|bool|isOCTAL_L1|int ch
966
243effed
KW
967=for apidoc Am|bool|isPUNCT|char ch
968Returns a boolean indicating whether the specified character is a
6aff1f14
KW
969punctuation character, analogous to C<m/[[:punct:]]/>.
970Note that the definition of what is punctuation isn't as
243effed
KW
971straightforward as one might desire. See L<perlrecharclass/POSIX Character
972Classes> for details.
dcccc8ff 973See the L<top of this section|/Character classification> for an explanation of
059703b0
KW
974variants C<isPUNCT_A>, C<isPUNCT_L1>, C<isPUNCT_uvchr>, C<isPUNCT_utf8>,
975C<isPUNCT_utf8_safe>, C<isPUNCT_LC>, C<isPUNCT_LC_uvchr>, C<isPUNCT_LC_utf8>,
976and C<isPUNCT_LC_utf8_safe>.
c99e91e9 977
f16858ed
KW
978=for apidoc Amh|bool|isPUNCT_A|int ch
979=for apidoc Amh|bool|isPUNCT_L1|int ch
980=for apidoc Amh|bool|isPUNCT_uvchr|int ch
981=for apidoc Amh|bool|isPUNCT_utf8_safe|U8 * s|U8 * end
d23c7e08 982=for apidoc Amh|bool|isPUNCT_utf8|U8 * s|U8 * end
f16858ed
KW
983=for apidoc Amh|bool|isPUNCT_LC|int ch
984=for apidoc Amh|bool|isPUNCT_LC_uvchr|int ch
985=for apidoc Amh|bool|isPUNCT_LC_utf8_safe|U8 * s| U8 *end
986
0c82b6df 987=for apidoc Am|bool|isSPACE|char ch
2787a470 988Returns a boolean indicating whether the specified character is a
6aff1f14 989whitespace character. This is analogous
398d098a 990to what C<m/\s/> matches in a regular expression. Starting in Perl 5.18
779cf272 991this also matches what C<m/[[:space:]]/> does. Prior to 5.18, only the
398d098a
KW
992locale forms of this macro (the ones with C<LC> in their names) matched
993precisely what C<m/[[:space:]]/> does. In those releases, the only difference,
994in the non-locale variants, was that C<isSPACE()> did not match a vertical tab.
995(See L</isPSXSPC> for a macro that matches a vertical tab in all releases.)
dcccc8ff
KW
996See the L<top of this section|/Character classification> for an explanation of
997variants
059703b0
KW
998C<isSPACE_A>, C<isSPACE_L1>, C<isSPACE_uvchr>, C<isSPACE_utf8>,
999C<isSPACE_utf8_safe>, C<isSPACE_LC>, C<isSPACE_LC_uvchr>, C<isSPACE_LC_utf8>,
1000and C<isSPACE_LC_utf8_safe>.
0c82b6df 1001
f16858ed
KW
1002=for apidoc Amh|bool|isSPACE_A|int ch
1003=for apidoc Amh|bool|isSPACE_L1|int ch
1004=for apidoc Amh|bool|isSPACE_uvchr|int ch
1005=for apidoc Amh|bool|isSPACE_utf8_safe|U8 * s|U8 * end
d23c7e08 1006=for apidoc Amh|bool|isSPACE_utf8|U8 * s|U8 * end
f16858ed
KW
1007=for apidoc Amh|bool|isSPACE_LC|int ch
1008=for apidoc Amh|bool|isSPACE_LC_uvchr|int ch
1009=for apidoc Amh|bool|isSPACE_LC_utf8_safe|U8 * s| U8 *end
1010
398d098a
KW
1011=for apidoc Am|bool|isPSXSPC|char ch
1012(short for Posix Space)
779cf272
KW
1013Starting in 5.18, this is identical in all its forms to the
1014corresponding C<isSPACE()> macros.
398d098a
KW
1015The locale forms of this macro are identical to their corresponding
1016C<isSPACE()> forms in all Perl releases. In releases prior to 5.18, the
1017non-locale forms differ from their C<isSPACE()> forms only in that the
1018C<isSPACE()> forms don't match a Vertical Tab, and the C<isPSXSPC()> forms do.
1019Otherwise they are identical. Thus this macro is analogous to what
1020C<m/[[:space:]]/> matches in a regular expression.
dcccc8ff 1021See the L<top of this section|/Character classification> for an explanation of
059703b0
KW
1022variants C<isPSXSPC_A>, C<isPSXSPC_L1>, C<isPSXSPC_uvchr>, C<isPSXSPC_utf8>,
1023C<isPSXSPC_utf8_safe>, C<isPSXSPC_LC>, C<isPSXSPC_LC_uvchr>,
1024C<isPSXSPC_LC_utf8>, and C<isPSXSPC_LC_utf8_safe>.
398d098a 1025
f16858ed
KW
1026=for apidoc Amh|bool|isPSXSPC_A|int ch
1027=for apidoc Amh|bool|isPSXSPC_L1|int ch
1028=for apidoc Amh|bool|isPSXSPC_uvchr|int ch
1029=for apidoc Amh|bool|isPSXSPC_utf8_safe|U8 * s|U8 * end
d23c7e08 1030=for apidoc Amh|bool|isPSXSPC_utf8|U8 * s|U8 * end
f16858ed
KW
1031=for apidoc Amh|bool|isPSXSPC_LC|int ch
1032=for apidoc Amh|bool|isPSXSPC_LC_uvchr|int ch
1033=for apidoc Amh|bool|isPSXSPC_LC_utf8_safe|U8 * s| U8 *end
1034
954c1994 1035=for apidoc Am|bool|isUPPER|char ch
2787a470 1036Returns a boolean indicating whether the specified character is an
6aff1f14 1037uppercase character, analogous to C<m/[[:upper:]]/>.
dcccc8ff 1038See the L<top of this section|/Character classification> for an explanation of
059703b0
KW
1039variants C<isUPPER_A>, C<isUPPER_L1>, C<isUPPER_uvchr>, C<isUPPER_utf8>,
1040C<isUPPER_utf8_safe>, C<isUPPER_LC>, C<isUPPER_LC_uvchr>, C<isUPPER_LC_utf8>,
1041and C<isUPPER_LC_utf8_safe>.
954c1994 1042
f16858ed
KW
1043=for apidoc Amh|bool|isUPPER_A|int ch
1044=for apidoc Amh|bool|isUPPER_L1|int ch
1045=for apidoc Amh|bool|isUPPER_uvchr|int ch
1046=for apidoc Amh|bool|isUPPER_utf8_safe|U8 * s|U8 * end
d23c7e08 1047=for apidoc Amh|bool|isUPPER_utf8|U8 * s|U8 * end
f16858ed
KW
1048=for apidoc Amh|bool|isUPPER_LC|int ch
1049=for apidoc Amh|bool|isUPPER_LC_uvchr|int ch
1050=for apidoc Amh|bool|isUPPER_LC_utf8_safe|U8 * s| U8 *end
1051
243effed 1052=for apidoc Am|bool|isPRINT|char ch
8eea39dd 1053Returns a boolean indicating whether the specified character is a
6aff1f14 1054printable character, analogous to C<m/[[:print:]]/>.
dcccc8ff
KW
1055See the L<top of this section|/Character classification> for an explanation of
1056variants
059703b0
KW
1057C<isPRINT_A>, C<isPRINT_L1>, C<isPRINT_uvchr>, C<isPRINT_utf8>,
1058C<isPRINT_utf8_safe>, C<isPRINT_LC>, C<isPRINT_LC_uvchr>, C<isPRINT_LC_utf8>,
1059and C<isPRINT_LC_utf8_safe>.
243effed 1060
f16858ed
KW
1061=for apidoc Amh|bool|isPRINT_A|int ch
1062=for apidoc Amh|bool|isPRINT_L1|int ch
1063=for apidoc Amh|bool|isPRINT_uvchr|int ch
1064=for apidoc Amh|bool|isPRINT_utf8_safe|U8 * s|U8 * end
d23c7e08 1065=for apidoc Amh|bool|isPRINT_utf8|U8 * s|U8 * end
f16858ed
KW
1066=for apidoc Amh|bool|isPRINT_LC|int ch
1067=for apidoc Amh|bool|isPRINT_LC_uvchr|int ch
1068=for apidoc Amh|bool|isPRINT_LC_utf8_safe|U8 * s| U8 *end
1069
243effed
KW
1070=for apidoc Am|bool|isWORDCHAR|char ch
1071Returns a boolean indicating whether the specified character is a character
1072that is a word character, analogous to what C<m/\w/> and C<m/[[:word:]]/> match
1073in a regular expression. A word character is an alphabetic character, a
1074decimal digit, a connecting punctuation character (such as an underscore), or
1075a "mark" character that attaches to one of those (like some sort of accent).
1076C<isALNUM()> is a synonym provided for backward compatibility, even though a
1077word character includes more than the standard C language meaning of
1078alphanumeric.
dcccc8ff 1079See the L<top of this section|/Character classification> for an explanation of
059703b0
KW
1080variants C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>,
1081C<isWORDCHAR_utf8>, and C<isWORDCHAR_utf8_safe>. C<isWORDCHAR_LC>,
1082C<isWORDCHAR_LC_uvchr>, C<isWORDCHAR_LC_utf8>, and C<isWORDCHAR_LC_utf8_safe>
1083are also as described there, but additionally include the platform's native
1084underscore.
8a58bdcf 1085
f16858ed
KW
1086=for apidoc Amh|bool|isWORDCHAR_A|int ch
1087=for apidoc Amh|bool|isWORDCHAR_L1|int ch
1088=for apidoc Amh|bool|isWORDCHAR_uvchr|int ch
1089=for apidoc Amh|bool|isWORDCHAR_utf8_safe|U8 * s|U8 * end
d23c7e08 1090=for apidoc Amh|bool|isWORDCHAR_utf8|U8 * s|U8 * end
f16858ed
KW
1091=for apidoc Amh|bool|isWORDCHAR_LC|int ch
1092=for apidoc Amh|bool|isWORDCHAR_LC_uvchr|int ch
1093=for apidoc Amh|bool|isWORDCHAR_LC_utf8_safe|U8 * s| U8 *end
1094=for apidoc Amh|bool|isALNUM|int ch
1095=for apidoc Amh|bool|isALNUM_A|int ch
1096=for apidoc Amh|bool|isALNUM_LC|int ch
1097=for apidoc Amh|bool|isALNUM_LC_uvchr|int ch
1098
8a58bdcf
KW
1099=for apidoc Am|bool|isXDIGIT|char ch
1100Returns a boolean indicating whether the specified character is a hexadecimal
243effed
KW
1101digit. In the ASCII range these are C<[0-9A-Fa-f]>. Variants C<isXDIGIT_A()>
1102and C<isXDIGIT_L1()> are identical to C<isXDIGIT()>.
dcccc8ff
KW
1103See the L<top of this section|/Character classification> for an explanation of
1104variants
059703b0
KW
1105C<isXDIGIT_uvchr>, C<isXDIGIT_utf8>, C<isXDIGIT_utf8_safe>, C<isXDIGIT_LC>,
1106C<isXDIGIT_LC_uvchr>, C<isXDIGIT_LC_utf8>, and C<isXDIGIT_LC_utf8_safe>.
243effed 1107
f16858ed
KW
1108=for apidoc Amh|bool|isXDIGIT_A|int ch
1109=for apidoc Amh|bool|isXDIGIT_L1|int ch
1110=for apidoc Amh|bool|isXDIGIT_uvchr|int ch
1111=for apidoc Amh|bool|isXDIGIT_utf8_safe|U8 * s|U8 * end
d23c7e08 1112=for apidoc Amh|bool|isXDIGIT_utf8|U8 * s|U8 * end
f16858ed
KW
1113=for apidoc Amh|bool|isXDIGIT_LC|int ch
1114=for apidoc Amh|bool|isXDIGIT_LC_uvchr|int ch
1115=for apidoc Amh|bool|isXDIGIT_LC_utf8_safe|U8 * s| U8 *end
1116
3c3ecf18
KW
1117=for apidoc Am|bool|isIDFIRST|char ch
1118Returns a boolean indicating whether the specified character can be the first
1119character of an identifier. This is very close to, but not quite the same as
1120the official Unicode property C<XID_Start>. The difference is that this
1121returns true only if the input character also matches L</isWORDCHAR>.
dcccc8ff
KW
1122See the L<top of this section|/Character classification> for an explanation of
1123variants
059703b0
KW
1124C<isIDFIRST_A>, C<isIDFIRST_L1>, C<isIDFIRST_uvchr>, C<isIDFIRST_utf8>,
1125C<isIDFIRST_utf8_safe>, C<isIDFIRST_LC>, C<isIDFIRST_LC_uvchr>,
1126C<isIDFIRST_LC_utf8>, and C<isIDFIRST_LC_utf8_safe>.
3c3ecf18 1127
f16858ed
KW
1128=for apidoc Amh|bool|isIDFIRST_A|int ch
1129=for apidoc Amh|bool|isIDFIRST_L1|int ch
1130=for apidoc Amh|bool|isIDFIRST_uvchr|int ch
1131=for apidoc Amh|bool|isIDFIRST_utf8_safe|U8 * s|U8 * end
d23c7e08 1132=for apidoc Amh|bool|isIDFIRST_utf8|U8 * s|U8 * end
f16858ed
KW
1133=for apidoc Amh|bool|isIDFIRST_LC|int ch
1134=for apidoc Amh|bool|isIDFIRST_LC_uvchr|int ch
1135=for apidoc Amh|bool|isIDFIRST_LC_utf8_safe|U8 * s| U8 *end
1136
3c3ecf18
KW
1137=for apidoc Am|bool|isIDCONT|char ch
1138Returns a boolean indicating whether the specified character can be the
1139second or succeeding character of an identifier. This is very close to, but
1140not quite the same as the official Unicode property C<XID_Continue>. The
1141difference is that this returns true only if the input character also matches
dcccc8ff 1142L</isWORDCHAR>. See the L<top of this section|/Character classification> for
059703b0
KW
1143an explanation of variants C<isIDCONT_A>, C<isIDCONT_L1>, C<isIDCONT_uvchr>,
1144C<isIDCONT_utf8>, C<isIDCONT_utf8_safe>, C<isIDCONT_LC>, C<isIDCONT_LC_uvchr>,
1145C<isIDCONT_LC_utf8>, and C<isIDCONT_LC_utf8_safe>.
3c3ecf18 1146
f16858ed
KW
1147=for apidoc Amh|bool|isIDCONT_A|int ch
1148=for apidoc Amh|bool|isIDCONT_L1|int ch
1149=for apidoc Amh|bool|isIDCONT_uvchr|int ch
1150=for apidoc Amh|bool|isIDCONT_utf8_safe|U8 * s|U8 * end
d23c7e08 1151=for apidoc Amh|bool|isIDCONT_utf8|U8 * s|U8 * end
f16858ed
KW
1152=for apidoc Amh|bool|isIDCONT_LC|int ch
1153=for apidoc Amh|bool|isIDCONT_LC_uvchr|int ch
1154=for apidoc Amh|bool|isIDCONT_LC_utf8_safe|U8 * s| U8 *end
1155
243effed 1156=head1 Miscellaneous Functions
8eea39dd 1157
95a59cab 1158=for apidoc Am|U8|READ_XDIGIT|char* str
243effed 1159Returns the value of an ASCII-range hex digit and advances the string pointer.
95a59cab
YO
1160Behaviour is only well defined when isXDIGIT(*str) is true.
1161
e7c1e6c1 1162=head1 Character case changing
21da7284
KW
1163Perl uses "full" Unicode case mappings. This means that converting a single
1164character to another case may result in a sequence of more than one character.
1165For example, the uppercase of C<E<223>> (LATIN SMALL LETTER SHARP S) is the two
1166character sequence C<SS>. This presents some complications. The lowercase of
1167all characters in the range 0..255 is a single character, and thus
1168C<L</toLOWER_L1>> is furnished. But, C<toUPPER_L1> can't exist, as it couldn't
1169return a valid result for all legal inputs. Instead C<L</toUPPER_uvchr>> has
1170an API that does allow every possible legal result to be returned. Likewise
1171no other function that is crippled by not being able to give the correct
1172results for the full range of possible inputs has been implemented here.
e7c1e6c1 1173
d713f9d9 1174=for apidoc Am|U8|toUPPER|int ch
1f607577
KW
1175Converts the specified character to uppercase. If the input is anything but an
1176ASCII lowercase character, that input character itself is returned. Variant
c753c8d3 1177C<toUPPER_A> is equivalent.
954c1994 1178
d0da05db
KW
1179=for apidoc Am|UV|toUPPER_uvchr|UV cp|U8* s|STRLEN* lenp
1180Converts the code point C<cp> to its uppercase version, and
1181stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code
1182point is interpreted as native if less than 256; otherwise as Unicode. Note
1f607577
KW
1183that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
1184bytes since the uppercase version may be longer than the original character.
1185
1186The first code point of the uppercased version is returned
21da7284
KW
1187(but note, as explained at L<the top of this section|/Character case
1188changing>, that there may be more).
1f607577 1189
059703b0 1190=for apidoc Am|UV|toUPPER_utf8|U8* p|U8* e|U8* s|STRLEN* lenp
a239b1e2
KW
1191Converts the first UTF-8 encoded character in the sequence starting at C<p> and
1192extending no further than S<C<e - 1>> to its uppercase version, and
1f607577
KW
1193stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
1194that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
1195bytes since the uppercase version may be longer than the original character.
1196
1197The first code point of the uppercased version is returned
21da7284
KW
1198(but note, as explained at L<the top of this section|/Character case
1199changing>, that there may be more).
1f607577 1200
059703b0
KW
1201It will not attempt to read beyond S<C<e - 1>>, provided that the constraint
1202S<C<p E<lt> e>> is true (this is asserted for in C<-DDEBUGGING> builds). If
1203the UTF-8 for the input character is malformed in some way, the program may
1204croak, or the function may return the REPLACEMENT CHARACTER, at the discretion
1205of the implementation, and subject to change in future releases.
a239b1e2 1206
059703b0
KW
1207=for apidoc Am|UV|toUPPER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
1208Same as L</toUPPER_utf8>.
1f607577 1209
25200305
KW
1210=for apidoc Am|U8|toFOLD|U8 ch
1211Converts the specified character to foldcase. If the input is anything but an
1212ASCII uppercase character, that input character itself is returned. Variant
1213C<toFOLD_A> is equivalent. (There is no equivalent C<toFOLD_L1> for the full
d0da05db 1214Latin1 range, as the full generality of L</toFOLD_uvchr> is needed there.)
25200305 1215
d0da05db
KW
1216=for apidoc Am|UV|toFOLD_uvchr|UV cp|U8* s|STRLEN* lenp
1217Converts the code point C<cp> to its foldcase version, and
1218stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code
1219point is interpreted as native if less than 256; otherwise as Unicode. Note
1f607577
KW
1220that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
1221bytes since the foldcase version may be longer than the original character.
1222
1223The first code point of the foldcased version is returned
21da7284
KW
1224(but note, as explained at L<the top of this section|/Character case
1225changing>, that there may be more).
1f607577 1226
059703b0 1227=for apidoc Am|UV|toFOLD_utf8|U8* p|U8* e|U8* s|STRLEN* lenp
a239b1e2
KW
1228Converts the first UTF-8 encoded character in the sequence starting at C<p> and
1229extending no further than S<C<e - 1>> to its foldcase version, and
1f607577
KW
1230stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
1231that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
1232bytes since the foldcase version may be longer than the original character.
1233
1234The first code point of the foldcased version is returned
21da7284
KW
1235(but note, as explained at L<the top of this section|/Character case
1236changing>, that there may be more).
1f607577 1237
059703b0 1238It will not attempt
a239b1e2
KW
1239to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is
1240true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the
1241input character is malformed in some way, the program may croak, or the
1242function may return the REPLACEMENT CHARACTER, at the discretion of the
1243implementation, and subject to change in future releases.
1244
059703b0
KW
1245=for apidoc Am|UV|toFOLD_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
1246Same as L</toFOLD_utf8>.
1f607577
KW
1247
1248=for apidoc Am|U8|toLOWER|U8 ch
1249Converts the specified character to lowercase. If the input is anything but an
1250ASCII uppercase character, that input character itself is returned. Variant
c753c8d3 1251C<toLOWER_A> is equivalent.
954c1994 1252
1f607577 1253=for apidoc Am|U8|toLOWER_L1|U8 ch
b7d90381
KW
1254Converts the specified Latin1 character to lowercase. The results are
1255undefined if the input doesn't fit in a byte.
1f607577
KW
1256
1257=for apidoc Am|U8|toLOWER_LC|U8 ch
1258Converts the specified character to lowercase using the current locale's rules,
1259if possible; otherwise returns the input character itself.
1260
d0da05db
KW
1261=for apidoc Am|UV|toLOWER_uvchr|UV cp|U8* s|STRLEN* lenp
1262Converts the code point C<cp> to its lowercase version, and
1263stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code
1264point is interpreted as native if less than 256; otherwise as Unicode. Note
1f607577
KW
1265that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
1266bytes since the lowercase version may be longer than the original character.
1267
1268The first code point of the lowercased version is returned
21da7284
KW
1269(but note, as explained at L<the top of this section|/Character case
1270changing>, that there may be more).
1f607577 1271
bd350c85 1272=for apidoc Am|UV|toLOWER_utf8|U8* p|U8* e|U8* s|STRLEN* lenp
a239b1e2
KW
1273Converts the first UTF-8 encoded character in the sequence starting at C<p> and
1274extending no further than S<C<e - 1>> to its lowercase version, and
1f607577
KW
1275stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
1276that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
1277bytes since the lowercase version may be longer than the original character.
1278
1279The first code point of the lowercased version is returned
21da7284
KW
1280(but note, as explained at L<the top of this section|/Character case
1281changing>, that there may be more).
059703b0
KW
1282It will not attempt to read beyond S<C<e - 1>>, provided that the constraint
1283S<C<p E<lt> e>> is true (this is asserted for in C<-DDEBUGGING> builds). If
1284the UTF-8 for the input character is malformed in some way, the program may
1285croak, or the function may return the REPLACEMENT CHARACTER, at the discretion
1286of the implementation, and subject to change in future releases.
1f607577 1287
059703b0
KW
1288=for apidoc Am|UV|toLOWER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
1289Same as L</toLOWER_utf8>.
1f607577 1290
25200305
KW
1291=for apidoc Am|U8|toTITLE|U8 ch
1292Converts the specified character to titlecase. If the input is anything but an
1293ASCII lowercase character, that input character itself is returned. Variant
b7d90381 1294C<toTITLE_A> is equivalent. (There is no C<toTITLE_L1> for the full Latin1
d0da05db 1295range, as the full generality of L</toTITLE_uvchr> is needed there. Titlecase is
b7d90381 1296not a concept used in locale handling, so there is no functionality for that.)
25200305 1297
d0da05db
KW
1298=for apidoc Am|UV|toTITLE_uvchr|UV cp|U8* s|STRLEN* lenp
1299Converts the code point C<cp> to its titlecase version, and
1300stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code
1301point is interpreted as native if less than 256; otherwise as Unicode. Note
1f607577
KW
1302that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
1303bytes since the titlecase version may be longer than the original character.
1304
1305The first code point of the titlecased version is returned
21da7284
KW
1306(but note, as explained at L<the top of this section|/Character case
1307changing>, that there may be more).
1f607577 1308
059703b0 1309=for apidoc Am|UV|toTITLE_utf8|U8* p|U8* e|U8* s|STRLEN* lenp
a239b1e2
KW
1310Converts the first UTF-8 encoded character in the sequence starting at C<p> and
1311extending no further than S<C<e - 1>> to its titlecase version, and
1f607577
KW
1312stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note
1313that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
1314bytes since the titlecase version may be longer than the original character.
1315
1316The first code point of the titlecased version is returned
21da7284
KW
1317(but note, as explained at L<the top of this section|/Character case
1318changing>, that there may be more).
1f607577 1319
059703b0 1320It will not attempt
a239b1e2
KW
to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is
1322true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the
1323input character is malformed in some way, the program may croak, or the
1324function may return the REPLACEMENT CHARACTER, at the discretion of the
1325implementation, and subject to change in future releases.
1326
059703b0
KW
1327=for apidoc Am|UV|toTITLE_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
1328Same as L</toTITLE_utf8>.
1f607577 1329
954c1994 1330=cut
353c9b6f 1331
d0da05db 1332XXX Still undocumented isVERTWS_uvchr and _utf8; it's unclear what their names
1e222e4f
KW
1333really should be. Also toUPPER_LC and toFOLD_LC, which are subject to change,
1334and aren't general purpose as they don't work on U+DF, and assert against that.
243effed 1335
8a58bdcf 1336Note that these macros are repeated in Devel::PPPort, so should also be
62fa66b6
KW
1337patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
1338
954c1994
GS
1339*/
1340
8f5283f4
KW
1341/*
1342 void below because that's the best fit, and works for Devel::PPPort
1343=for apidoc AmnU|void|WIDEST_UTYPE
1344
1345Yields the widest unsigned integer type on the platform, currently either
C<U32> or C<U64>.  This can be used in declarations such as
1347
1348 WIDEST_UTYPE my_uv;
1349
1350or casts
1351
1352 my_uv = (WIDEST_UTYPE) val;
1353
1354=cut
1355
1356*/
de9e2639
KW
1357#ifdef QUADKIND
1358# define WIDEST_UTYPE U64
7c062697
KW
1359#else
1360# define WIDEST_UTYPE U32
1361#endif
1362
3912bc88
KW
1363/* FITS_IN_8_BITS(c) returns true if c doesn't have a bit set other than in
1364 * the lower 8. It is designed to be hopefully bomb-proof, making sure that no
1365 * bits of information are lost even on a 64-bit machine, but to get the
1366 * compiler to optimize it out if possible. This is because Configure makes
1367 * sure that the machine has an 8-bit byte, so if c is stored in a byte, the
1368 * sizeof() guarantees that this evaluates to a constant true at compile time.
7e75d1a1
JH
1369 *
1370 * For Coverity, be always true, because otherwise Coverity thinks
1371 * it finds several expressions that are always true, independent
1372 * of operands. Well, they are, but that is kind of the point.
220c71bf 1373 */
7e75d1a1 1374#ifndef __COVERITY__
6c5b02ac
KW
1375 /* The '| 0' part ensures a compiler error if c is not integer (like e.g., a
1376 * pointer) */
1377#define FITS_IN_8_BITS(c) ( (sizeof(c) == 1) \
ace3ad0f 1378 || !(((WIDEST_UTYPE)((c) | 0)) & ~0xFF))
7e75d1a1
JH
1379#else
1380#define FITS_IN_8_BITS(c) (1)
1381#endif
cf301eb7 1382
45f4bb73 1383/* Returns true if l <= c <= (l + n), where 'l' and 'n' are non-negative
833b0f46 1384 * Written this way so that after optimization, only one conditional test is
76d3ad4c
KW
1385 * needed. (The NV casts stop any warnings about comparison always being true
1386 * if called with an unsigned. The cast preserves the sign, which is all we
1387 * care about.) */
1388#define withinCOUNT(c, l, n) (__ASSERT_((NV) (l) >= 0) \
1389 __ASSERT_((NV) (n) >= 0) \
1390 (((WIDEST_UTYPE) (((c)) - ((l) | 0))) <= (((WIDEST_UTYPE) ((n) | 0)))))
833b0f46 1391
94250c4f
KW
1392/* Returns true if c is in the range l..u, where 'l' is non-negative
1393 * Written this way so that after optimization, only one conditional test is
4758c20d 1394 * needed. */
1eaefa6e 1395#define inRANGE(c, l, u) (__ASSERT_((u) >= (l)) \
4758c20d 1396 ( (sizeof(c) == sizeof(U8)) ? withinCOUNT(((U8) (c)), (l), ((u) - (l))) \
4758c20d
KW
1397 : (sizeof(c) == sizeof(U32)) ? withinCOUNT(((U32) (c)), (l), ((u) - (l))) \
1398 : (__ASSERT_(sizeof(c) == sizeof(WIDEST_UTYPE)) \
45f4bb73 1399 withinCOUNT(((WIDEST_UTYPE) (c)), (l), ((u) - (l))))))
305fe86e 1400
41f43cc2 1401#ifdef EBCDIC
b6340bd0 1402# ifndef _ALL_SOURCE
0852beac
KW
1403 /* The native libc isascii() et.al. functions return the wrong results
1404 * on at least z/OS unless this is defined. */
b6340bd0
KW
1405# error _ALL_SOURCE should probably be defined
1406# endif
41f43cc2 1407#else
0852beac
KW
1408 /* There is a simple definition of ASCII for ASCII platforms. But the
1409 * EBCDIC one isn't so simple, so is defined using table look-up like the
9c903d59 1410 * other macros below.
3f3c579d
KW
1411 *
1412 * The cast here is used instead of '(c) >= 0', because some compilers emit
1413 * a warning that that test is always true when the parameter is an
1414 * unsigned type. khw supposes that it could be written as
1415 * && ((c) == '\0' || (c) > 0)
1416 * to avoid the message, but the cast will likely avoid extra branches even
1417 * with stupid compilers.
1418 *
1419 * The '| 0' part ensures a compiler error if c is not integer (like e.g.,
1420 * a pointer) */
9c903d59 1421# define isASCII(c) ((WIDEST_UTYPE)((c) | 0) < 128)
41f43cc2
KW
1422#endif
1423
38694112
KW
1424/* Take the eight possible bit patterns of the lower 3 bits and you get the
1425 * lower 3 bits of the 8 octal digits, in both ASCII and EBCDIC, so those bits
1426 * can be ignored. If the rest match '0', we have an octal */
1427#define isOCTAL_A(c) (((WIDEST_UTYPE)((c) | 0) & ~7) == '0')
c2da0b36 1428
9fb1bf9d 1429#ifdef H_PERL /* If have access to perl.h, lookup in its table */
f4cdb42c 1430
a500dc72
KW
1431/* Character class numbers. For internal core Perl use only. The ones less
1432 * than 32 are used in PL_charclass[] and the ones up through the one that
1433 * corresponds to <_HIGHEST_REGCOMP_DOT_H_SYNC> are used by regcomp.h and
1434 * related files. PL_charclass ones use names used in l1_char_class_tab.h but
1435 * their actual definitions are here. If that file has a name not used here,
1436 * it won't compile.
1709d539
KW
1437 *
1438 * The first group of these is ordered in what I (khw) estimate to be the
31c7f561 1439 * frequency of their use. This gives a slight edge to exiting a loop earlier
58a3ba2c
KW
1440 * (in reginclass() in regexec.c). Except \v should be last, as it isn't a
1441 * real Posix character class, and some (small) inefficiencies in regular
1442 * expression handling would be introduced by putting it in the middle of those
1443 * that are. Also, cntrl and ascii come after the others as it may be useful
1444 * to group these which have no members that match above Latin1, (or above
1445 * ASCII in the latter case) */
1446
1709d539
KW
1447# define _CC_WORDCHAR 0 /* \w and [:word:] */
1448# define _CC_DIGIT 1 /* \d and [:digit:] */
1449# define _CC_ALPHA 2 /* [:alpha:] */
1450# define _CC_LOWER 3 /* [:lower:] */
1451# define _CC_UPPER 4 /* [:upper:] */
1452# define _CC_PUNCT 5 /* [:punct:] */
1453# define _CC_PRINT 6 /* [:print:] */
15861f94 1454# define _CC_ALPHANUMERIC 7 /* [:alnum:] */
1709d539 1455# define _CC_GRAPH 8 /* [:graph:] */
359b005e 1456# define _CC_CASED 9 /* [:lower:] or [:upper:] under /i */
779cf272 1457# define _CC_SPACE 10 /* \s, [:space:] */
b0d691b2
KW
1458# define _CC_BLANK 11 /* [:blank:] */
1459# define _CC_XDIGIT 12 /* [:xdigit:] */
779cf272
KW
1460# define _CC_CNTRL 13 /* [:cntrl:] */
1461# define _CC_ASCII 14 /* [:ascii:] */
1462# define _CC_VERTSPACE 15 /* \v */
1709d539 1463
a0947d7b
KW
1464# define _HIGHEST_REGCOMP_DOT_H_SYNC _CC_VERTSPACE
1465
1709d539 1466/* The members of the third group below do not need to be coordinated with data
3ffc8c70 1467 * structures in regcomp.[ch] and regexec.c. */
779cf272
KW
1468# define _CC_IDFIRST 16
1469# define _CC_CHARNAME_CONT 17
1470# define _CC_NONLATIN1_FOLD 18
1471# define _CC_NONLATIN1_SIMPLE_FOLD 19
1472# define _CC_QUOTEMETA 20
1473# define _CC_NON_FINAL_FOLD 21
1474# define _CC_IS_IN_SOME_FOLD 22
2ae9030c
KW
1475# define _CC_BINDIGIT 23
1476# define _CC_OCTDIGIT 24
1477# define _CC_MNEMONIC_CNTRL 25
073c22b3
KW
1478
1479/* This next group is only used on EBCDIC platforms, so theoretically could be
1480 * shared with something entirely different that's only on ASCII platforms */
abb8abf6 1481# define _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE 31
0654f0ab 1482/* Unused: 26-30
f4cdb42c
KW
1483 * If more bits are needed, one could add a second word for non-64bit
1484 * QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd
37ede926
KW
1485 * word or not. The IS_IN_SOME_FOLD bit is the most easily expendable, as it
1486 * is used only for optimization (as of this writing), and differs in the
1487 * Latin1 range from the ALPHA bit only in two relatively unimportant
a500dc72 1488 * characters: the masculine and feminine ordinal indicators, so removing it
073c22b3
KW
1489 * would just cause /i regexes which match them to run less efficiently.
1490 * Similarly the EBCDIC-only bits are used just for speed, and could be
1491 * replaced by other means */
96ac0975 1492
3a371f2f
KW
1493#if defined(PERL_CORE) || defined(PERL_EXT)
1494/* An enum version of the character class numbers, to help compilers
1495 * optimize */
1496typedef enum {
3a371f2f 1497 _CC_ENUM_ALPHA = _CC_ALPHA,
e8d596e0
KW
1498 _CC_ENUM_ALPHANUMERIC = _CC_ALPHANUMERIC,
1499 _CC_ENUM_ASCII = _CC_ASCII,
1500 _CC_ENUM_BLANK = _CC_BLANK,
b0d691b2 1501 _CC_ENUM_CASED = _CC_CASED,
e8d596e0 1502 _CC_ENUM_CNTRL = _CC_CNTRL,
3a371f2f
KW
1503 _CC_ENUM_DIGIT = _CC_DIGIT,
1504 _CC_ENUM_GRAPH = _CC_GRAPH,
1505 _CC_ENUM_LOWER = _CC_LOWER,
1506 _CC_ENUM_PRINT = _CC_PRINT,
1507 _CC_ENUM_PUNCT = _CC_PUNCT,
e8d596e0 1508 _CC_ENUM_SPACE = _CC_SPACE,
3a371f2f 1509 _CC_ENUM_UPPER = _CC_UPPER,
e8d596e0 1510 _CC_ENUM_VERTSPACE = _CC_VERTSPACE,
3a371f2f 1511 _CC_ENUM_WORDCHAR = _CC_WORDCHAR,
e8d596e0 1512 _CC_ENUM_XDIGIT = _CC_XDIGIT
3a371f2f
KW
1513} _char_class_number;
1514#endif
1515
86f72d56 1516#define POSIX_CC_COUNT (_HIGHEST_REGCOMP_DOT_H_SYNC + 1)
63c61c3f 1517
6635f04f 1518START_EXTERN_C
96ac0975
NC
1519# ifdef DOINIT
1520EXTCONST U32 PL_charclass[] = {
1521# include "l1_char_class_tab.h"
1522};
1523
1524# else /* ! DOINIT */
1525EXTCONST U32 PL_charclass[];
1526# endif
6635f04f 1527END_EXTERN_C
96ac0975 1528
265c1f46 1529 /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */
430b7c70 1530# define _CC_mask(classnum) (1U << (classnum))
4650c663
KW
1531
1532 /* For internal core Perl use only: the base macro for defining macros like
1533 * isALPHA */
ff7ecfc3 1534# define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \
f4cd282c 1535 && (PL_charclass[(U8) (c)] & _CC_mask(classnum)))
4eeeb416 1536
f4cdb42c
KW
1537 /* The mask for the _A versions of the macros; it just adds in the bit for
1538 * ASCII. */
1539# define _CC_mask_A(classnum) (_CC_mask(classnum) | _CC_mask(_CC_ASCII))
1540
4650c663
KW
1541 /* For internal core Perl use only: the base macro for defining macros like
1542 * isALPHA_A. The foo_A version makes sure that both the desired bit and
1543 * the ASCII bit are present */
b7d90381
KW
1544# define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \
1545 && ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \
1546 == _CC_mask_A(classnum)))
f4cdb42c 1547
26c1d9d8
KW
1548/* On ASCII platforms certain classes form a single range. It's faster to
1549 * special case these. isDIGIT is a single range on all platforms */
b877c1ff
KW
1550# ifdef EBCDIC
1551# define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA)
1552# define isGRAPH_A(c) _generic_isCC_A(c, _CC_GRAPH)
1553# define isLOWER_A(c) _generic_isCC_A(c, _CC_LOWER)
1554# define isPRINT_A(c) _generic_isCC_A(c, _CC_PRINT)
1555# define isUPPER_A(c) _generic_isCC_A(c, _CC_UPPER)
1556# else
26c1d9d8 1557 /* By folding the upper and lowercase, we can use a single range */
b877c1ff 1558# define isALPHA_A(c) inRANGE((~('A' ^ 'a') & (c)), 'A', 'Z')
26c1d9d8 1559# define isGRAPH_A(c) inRANGE(c, ' ' + 1, 0x7e)
b877c1ff
KW
1560# define isLOWER_A(c) inRANGE(c, 'a', 'z')
1561# define isPRINT_A(c) inRANGE(c, ' ', 0x7e)
1562# define isUPPER_A(c) inRANGE(c, 'A', 'Z')
1563# endif
15861f94 1564# define isALPHANUMERIC_A(c) _generic_isCC_A(c, _CC_ALPHANUMERIC)
f4cdb42c
KW
1565# define isBLANK_A(c) _generic_isCC_A(c, _CC_BLANK)
1566# define isCNTRL_A(c) _generic_isCC_A(c, _CC_CNTRL)
b877c1ff 1567# define isDIGIT_A(c) inRANGE(c, '0', '9')
f4cdb42c
KW
1568# define isPUNCT_A(c) _generic_isCC_A(c, _CC_PUNCT)
1569# define isSPACE_A(c) _generic_isCC_A(c, _CC_SPACE)
f4cdb42c 1570# define isWORDCHAR_A(c) _generic_isCC_A(c, _CC_WORDCHAR)
b7d90381
KW
1571# define isXDIGIT_A(c) _generic_isCC(c, _CC_XDIGIT) /* No non-ASCII xdigits
1572 */
d95f8b6a 1573# define isIDFIRST_A(c) _generic_isCC_A(c, _CC_IDFIRST)
3ded5eb0
KW
1574# define isALPHA_L1(c) _generic_isCC(c, _CC_ALPHA)
1575# define isALPHANUMERIC_L1(c) _generic_isCC(c, _CC_ALPHANUMERIC)
1576# define isBLANK_L1(c) _generic_isCC(c, _CC_BLANK)
1577
1578 /* continuation character for legal NAME in \N{NAME} */
1579# define isCHARNAME_CONT(c) _generic_isCC(c, _CC_CHARNAME_CONT)
1580
1581# define isCNTRL_L1(c) _generic_isCC(c, _CC_CNTRL)
1582# define isGRAPH_L1(c) _generic_isCC(c, _CC_GRAPH)
1583# define isLOWER_L1(c) _generic_isCC(c, _CC_LOWER)
1584# define isPRINT_L1(c) _generic_isCC(c, _CC_PRINT)
b7d90381 1585# define isPSXSPC_L1(c) isSPACE_L1(c)
3ded5eb0
KW
1586# define isPUNCT_L1(c) _generic_isCC(c, _CC_PUNCT)
1587# define isSPACE_L1(c) _generic_isCC(c, _CC_SPACE)
1588# define isUPPER_L1(c) _generic_isCC(c, _CC_UPPER)
1589# define isWORDCHAR_L1(c) _generic_isCC(c, _CC_WORDCHAR)
1590# define isIDFIRST_L1(c) _generic_isCC(c, _CC_IDFIRST)
f4cdb42c 1591
0852beac
KW
1592# ifdef EBCDIC
1593# define isASCII(c) _generic_isCC(c, _CC_ASCII)
1594# endif
1595
f12c0118
KW
1596 /* Participates in a single-character fold with a character above 255 */
1597# define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_SIMPLE_FOLD)))
1598
1599 /* Like the above, but also can be part of a multi-char fold */
f4cd282c 1600# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
430b7c70 1601
4eeeb416 1602# define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA)
26faadbd 1603# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \
b7d90381 1604 _generic_isCC(c, _CC_NON_FINAL_FOLD)
37ede926 1605# define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \
b7d90381 1606 _generic_isCC(c, _CC_IS_IN_SOME_FOLD)
5e6ebb12
KW
1607
1608/* is c a control character for which we have a mnemonic? */
1609# if defined(PERL_CORE) || defined(PERL_EXT)
1610# define isMNEMONIC_CNTRL(c) _generic_isCC(c, _CC_MNEMONIC_CNTRL)
1611# endif
687c8d01 1612#else /* else we don't have perl.h H_PERL */
3ded5eb0
KW
1613
1614 /* If we don't have perl.h, we are compiling a utility program. Below we
1615 * hard-code various macro definitions that wouldn't otherwise be available
fc273927 1616 * to it. Most are coded based on first principles. These are written to
74665a89 1617 * avoid EBCDIC vs. ASCII #ifdef's as much as possible. */
182c4ace 1618# define isDIGIT_A(c) inRANGE(c, '0', '9')
0852beac 1619# define isBLANK_A(c) ((c) == ' ' || (c) == '\t')
74665a89
KW
1620# define isSPACE_A(c) (isBLANK_A(c) \
1621 || (c) == '\n' \
1622 || (c) == '\r' \
1623 || (c) == '\v' \
0852beac 1624 || (c) == '\f')
74665a89
KW
1625 /* On EBCDIC, there are gaps between 'i' and 'j'; 'r' and 's'. Same for
1626 * uppercase. The tests for those aren't necessary on ASCII, but hurt only
1627 * performance (if optimization isn't on), and allow the same code to be
1628 * used for both platform types */
182c4ace
KW
/* True if 'c' is one of the lowercase letters 'a'..'z'.  The test is split
 * into three sub-ranges so the same code works on EBCDIC, where the letters
 * are not contiguous.  The whole expansion is wrapped in parentheses so that
 * the macro can be safely combined with operators that bind more tightly
 * than '||', such as '!' and '&&'. */
#   define isLOWER_A(c)  (   inRANGE((c), 'a', 'i')                         \
                          || inRANGE((c), 'j', 'r')                         \
                          || inRANGE((c), 's', 'z'))
/* True if 'c' is one of the uppercase letters 'A'..'Z'.  The test is split
 * into three sub-ranges so the same code works on EBCDIC, where the letters
 * are not contiguous.  The whole expansion is wrapped in parentheses so that
 * the macro can be safely combined with operators that bind more tightly
 * than '||', such as '!' and '&&'. */
#   define isUPPER_A(c)  (   inRANGE((c), 'A', 'I')                         \
                          || inRANGE((c), 'J', 'R')                         \
                          || inRANGE((c), 'S', 'Z'))
a4d7a999
KW
1635# define isALPHA_A(c) (isUPPER_A(c) || isLOWER_A(c))
1636# define isALPHANUMERIC_A(c) (isALPHA_A(c) || isDIGIT_A(c))
3ded5eb0 1637# define isWORDCHAR_A(c) (isALPHANUMERIC_A(c) || (c) == '_')
0852beac 1638# define isIDFIRST_A(c) (isALPHA_A(c) || (c) == '_')
182c4ace
KW
/* True if 'c' is a hexadecimal digit: '0'..'9', 'a'..'f' or 'A'..'F'.  (The
 * final closing parenthesis was previously missing, making any use of the
 * macro a syntax error.) */
#   define isXDIGIT_A(c) (   isDIGIT_A(c)                                   \
                          || inRANGE((c), 'a', 'f')                         \
                          || inRANGE((c), 'A', 'F'))
74665a89
KW
1642# define isPUNCT_A(c) ((c) == '-' || (c) == '!' || (c) == '"' \
1643 || (c) == '#' || (c) == '$' || (c) == '%' \
1644 || (c) == '&' || (c) == '\'' || (c) == '(' \
1645 || (c) == ')' || (c) == '*' || (c) == '+' \
1646 || (c) == ',' || (c) == '.' || (c) == '/' \
1647 || (c) == ':' || (c) == ';' || (c) == '<' \
1648 || (c) == '=' || (c) == '>' || (c) == '?' \
1649 || (c) == '@' || (c) == '[' || (c) == '\\' \
1650 || (c) == ']' || (c) == '^' || (c) == '_' \
1651 || (c) == '`' || (c) == '{' || (c) == '|' \
1652 || (c) == '}' || (c) == '~')
1653# define isGRAPH_A(c) (isALPHANUMERIC_A(c) || isPUNCT_A(c))
1654# define isPRINT_A(c) (isGRAPH_A(c) || (c) == ' ')
3ded5eb0 1655
0852beac 1656# ifdef EBCDIC
74665a89
KW
1657 /* The below is accurate for the 3 EBCDIC code pages traditionally
1658 * supported by perl. The only difference between them in the controls
1659 * is the position of \n, and that is represented symbolically below */
/* True if 'c' is a control character corresponding to one of the ASCII
 * controls.  The symbolic escapes cover the controls C has names for; the
 * remaining ones are given as numeric code points, with the equivalent
 * Unicode code point noted in the comment where the two differ.  DEL (U+7F)
 * is code point 0x07 on EBCDIC; it was previously written as the invalid
 * token '7F'. */
#       define isCNTRL_A(c)  ((c) == '\0' || (c) == '\a' || (c) == '\b'     \
                          ||  (c) == '\f' || (c) == '\n' || (c) == '\r'     \
                          ||  (c) == '\t' || (c) == '\v'                    \
                          || inRANGE((c), 1, 3)       /* SOH, STX, ETX */   \
                          ||  (c) == 0x07 /* U+7F DEL */                    \
                          || inRANGE((c), 0x0E, 0x13) /* SO SI DLE DC[1-3] */\
                          ||  (c) == 0x18 /* U+18 CAN */                    \
                          ||  (c) == 0x19 /* U+19 EOM */                    \
                          || inRANGE((c), 0x1C, 0x1F) /* [FGRU]S */         \
                          ||  (c) == 0x26 /* U+17 ETB */                    \
                          ||  (c) == 0x27 /* U+1B ESC */                    \
                          ||  (c) == 0x2D /* U+05 ENQ */                    \
                          ||  (c) == 0x2E /* U+06 ACK */                    \
                          ||  (c) == 0x32 /* U+16 SYN */                    \
                          ||  (c) == 0x37 /* U+04 EOT */                    \
                          ||  (c) == 0x3C /* U+14 DC4 */                    \
                          ||  (c) == 0x3D /* U+15 NAK */                    \
                          ||  (c) == 0x3F)/* U+1A SUB */
0852beac 1679# define isASCII(c) (isCNTRL_A(c) || isPRINT_A(c))
74665a89
KW
1680# else /* isASCII is already defined for ASCII platforms, so can use that to
1681 define isCNTRL */
1682# define isCNTRL_A(c) (isASCII(c) && ! isPRINT_A(c))
0852beac
KW
1683# endif
1684
3ffc8c70 1685 /* The _L1 macros may be unnecessary for the utilities; I (khw) added them
caa94d35
KW
1686 * during debugging, and it seems best to keep them. We may be called
1687 * without NATIVE_TO_LATIN1 being defined. On ASCII platforms, it doesn't
1688 * do anything anyway, so make it not a problem */
1689# if ! defined(EBCDIC) && ! defined(NATIVE_TO_LATIN1)
1690# define NATIVE_TO_LATIN1(ch) (ch)
1691# endif
3ded5eb0
KW
1692# define isALPHA_L1(c) (isUPPER_L1(c) || isLOWER_L1(c))
1693# define isALPHANUMERIC_L1(c) (isALPHA_L1(c) || isDIGIT_A(c))
/* True if 'c' is an ASCII blank, or fits in 8 bits and is Latin-1 0xA0.  The
 * argument is fully parenthesized inside the (U8) cast so that an expression
 * argument is converted as a whole rather than just its first operand. */
#   define isBLANK_L1(c)     (isBLANK_A(c)                                  \
                              || (FITS_IN_8_BITS(c)                         \
                                  && NATIVE_TO_LATIN1((U8) (c)) == 0xA0))
1697# define isCNTRL_L1(c) (FITS_IN_8_BITS(c) && (! isPRINT_L1(c)))
1698# define isGRAPH_L1(c) (isPRINT_L1(c) && (! isBLANK_L1(c)))
/* True if 'c' is an ASCII lowercase letter, or fits in 8 bits and its Latin-1
 * value is 0xDF or above (except 0xF7), or is one of 0xAA, 0xBA, 0xB5.  The
 * argument is fully parenthesized inside the (U8) cast so that an expression
 * argument is converted as a whole rather than just its first operand. */
#   define isLOWER_L1(c)     (isLOWER_A(c)                                  \
                              || (FITS_IN_8_BITS(c)                         \
                                  && ((   NATIVE_TO_LATIN1((U8) (c)) >= 0xDF \
                                       && NATIVE_TO_LATIN1((U8) (c)) != 0xF7)\
                                       || NATIVE_TO_LATIN1((U8) (c)) == 0xAA \
                                       || NATIVE_TO_LATIN1((U8) (c)) == 0xBA \
                                       || NATIVE_TO_LATIN1((U8) (c)) == 0xB5)))
/* True if 'c' is an ASCII printable, or fits in 8 bits and its Latin-1 value
 * is 0xA0 or above.  The argument is fully parenthesized inside the (U8) cast
 * so that an expression argument is converted as a whole rather than just its
 * first operand. */
#   define isPRINT_L1(c)     (isPRINT_A(c)                                  \
                              || (FITS_IN_8_BITS(c)                         \
                                  && NATIVE_TO_LATIN1((U8) (c)) >= 0xA0))
3ded5eb0
KW
/* True if 'c' is an ASCII punctuation character, or fits in 8 bits and its
 * Latin-1 value is one of the listed upper-half code points.  The argument is
 * fully parenthesized inside the (U8) cast so that an expression argument is
 * converted as a whole rather than just its first operand. */
#   define isPUNCT_L1(c)     (isPUNCT_A(c)                                  \
                              || (FITS_IN_8_BITS(c)                         \
                                  && (   NATIVE_TO_LATIN1((U8) (c)) == 0xA1  \
                                      || NATIVE_TO_LATIN1((U8) (c)) == 0xA7  \
                                      || NATIVE_TO_LATIN1((U8) (c)) == 0xAB  \
                                      || NATIVE_TO_LATIN1((U8) (c)) == 0xB6  \
                                      || NATIVE_TO_LATIN1((U8) (c)) == 0xB7  \
                                      || NATIVE_TO_LATIN1((U8) (c)) == 0xBB  \
                                      || NATIVE_TO_LATIN1((U8) (c)) == 0xBF)))
/* True if 'c' is an ASCII space character, or fits in 8 bits and its Latin-1
 * value is 0x85 or 0xA0.  The argument is fully parenthesized inside the (U8)
 * cast so that an expression argument is converted as a whole rather than
 * just its first operand. */
#   define isSPACE_L1(c)     (isSPACE_A(c)                                  \
                              || (FITS_IN_8_BITS(c)                         \
                                  && (   NATIVE_TO_LATIN1((U8) (c)) == 0x85  \
                                      || NATIVE_TO_LATIN1((U8) (c)) == 0xA0)))
/* True if 'c' is an ASCII uppercase letter, or fits in 8 bits and its Latin-1
 * value is in 0xC0..0xDE, excluding 0xD7.  Uses inRANGE(), the range macro
 * this header defines (the previous spelling, IN_RANGE, is not defined
 * here).  The argument is fully parenthesized inside the (U8) cast so that an
 * expression argument is converted as a whole. */
#   define isUPPER_L1(c)     (isUPPER_A(c)                                  \
                              || (FITS_IN_8_BITS(c)                         \
                                  && (   inRANGE(NATIVE_TO_LATIN1((U8) (c)), \
                                                 0xC0, 0xDE)                \
                                      && NATIVE_TO_LATIN1((U8) (c)) != 0xD7)))
1727# define isWORDCHAR_L1(c) (isIDFIRST_L1(c) || isDIGIT_A(c))
/* True if 'c' can start an identifier: a Latin-1 alphabetic or the underscore.
 * The underscore is compared in native form, as isIDFIRST_A and isWORDCHAR_A
 * do, because '_' is a native character literal; converting 'c' to Latin-1
 * first would compare values from two different encodings on platforms where
 * they differ. */
#   define isIDFIRST_L1(c)   (isALPHA_L1(c) || (c) == '_')
1729# define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) \
1730 || isBLANK_L1(c) \
1731 || (c) == '-' \
1732 || (c) == '(' \
1733 || (c) == ')')
1734 /* The following are not fully accurate in the above-ASCII range. I (khw)
1735 * don't think it's necessary to be so for the purposes where this gets
1736 * compiled */
1737# define _isQUOTEMETA(c) (FITS_IN_8_BITS(c) && ! isWORDCHAR_L1(c))
1738# define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) isALPHA_L1(c)
1739
1740 /* And these aren't accurate at all. They are useful only for above
1741 * Latin1, which utilities and bootstrapping don't deal with */
1742# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 0
6838b41e 1743# define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0
3ded5eb0
KW
1744# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0
1745
1746 /* Many of the macros later in this file are defined in terms of these. By
1747 * implementing them with a function, which converts the class number into
1748 * a call to the desired macro, all of the later ones work. However, that
1749 * function won't be actually defined when building a utility program (no
1750 * perl.h), and so a compiler error will be generated if one is attempted
1751 * to be used. And the above-Latin1 code points require Unicode tables to
1752 * be present, something unlikely to be the case when bootstrapping */
1753# define _generic_isCC(c, classnum) \
1754 (FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), TRUE))
1755# define _generic_isCC_A(c, classnum) \
1756 (FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), FALSE))
687c8d01 1757#endif /* End of no perl.h H_PERL */
8a58bdcf 1758
e66b99e9
KW
1759#define isALPHANUMERIC(c) isALPHANUMERIC_A(c)
1760#define isALPHA(c) isALPHA_A(c)
0852beac
KW
1761#define isASCII_A(c) isASCII(c)
1762#define isASCII_L1(c) isASCII(c)
e66b99e9
KW
1763#define isBLANK(c) isBLANK_A(c)
1764#define isCNTRL(c) isCNTRL_A(c)
1765#define isDIGIT(c) isDIGIT_A(c)
1766#define isGRAPH(c) isGRAPH_A(c)
1767#define isIDFIRST(c) isIDFIRST_A(c)
1768#define isLOWER(c) isLOWER_A(c)
1769#define isPRINT(c) isPRINT_A(c)
779cf272 1770#define isPSXSPC_A(c) isSPACE_A(c)
e66b99e9 1771#define isPSXSPC(c) isPSXSPC_A(c)
779cf272 1772#define isPSXSPC_L1(c) isSPACE_L1(c)
e66b99e9
KW
1773#define isPUNCT(c) isPUNCT_A(c)
1774#define isSPACE(c) isSPACE_A(c)
1775#define isUPPER(c) isUPPER_A(c)
1776#define isWORDCHAR(c) isWORDCHAR_A(c)
1777#define isXDIGIT(c) isXDIGIT_A(c)
1778
1779/* ASCII casing. These could also be written as
1780 #define toLOWER(c) (isASCII(c) ? toLOWER_LATIN1(c) : (c))
1781 #define toUPPER(c) (isASCII(c) ? toUPPER_LATIN1_MOD(c) : (c))
1782 which uses table lookup and mask instead of subtraction. (This would
c5e9991e
KW
1783 work because the _MOD does not apply in the ASCII range).
1784
1785 These actually are UTF-8 invariant casing, not just ASCII, as any non-ASCII
1786 UTF-8 invariants are neither upper nor lower. (Only on EBCDIC platforms are
1787 there non-ASCII invariants, and all of them are controls.) */
68067e4e
DM
1788#define toLOWER(c) (isUPPER(c) ? (U8)((c) + ('a' - 'A')) : (c))
1789#define toUPPER(c) (isLOWER(c) ? (U8)((c) - ('a' - 'A')) : (c))
bbce6d69 1790
25200305
KW
1791/* In the ASCII range, these are equivalent to what they're here defined to be.
1792 * But by creating these definitions, other code doesn't have to be aware of
c5e9991e
KW
1793 * this detail. Actually this works for all UTF-8 invariants, not just the
1794 * ASCII range. (EBCDIC platforms can have non-ASCII invariants.) */
25200305 1795#define toFOLD(c) toLOWER(c)
25200305
KW
1796#define toTITLE(c) toUPPER(c)
1797
c753c8d3
KW
1798#define toLOWER_A(c) toLOWER(c)
1799#define toUPPER_A(c) toUPPER(c)
25200305
KW
1800#define toFOLD_A(c) toFOLD(c)
1801#define toTITLE_A(c) toTITLE(c)
1a0901db 1802
4650c663 1803/* Use table lookup for speed; returns the input itself if is out-of-range */
b2bf251f 1804#define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \
8e7c6e7d 1805 ? (c) \
f4cd282c 1806 : PL_latin1_lc[ (U8) (c) ])
c753c8d3
KW
1807#define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */
1808
1a0901db 1809/* Modified uc. Is correct uc except for three non-ascii chars which are
4650c663
KW
1810 * all mapped to one of them, and these need special handling; returns the
1811 * input itself if is out-of-range */
b2bf251f 1812#define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \
8e7c6e7d 1813 ? (c) \
f4cd282c 1814 : PL_mod_latin1_uc[ (U8) (c) ])
31f05a37 1815#define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale
84061b6a 1816
beab9ebe
KW
1817/* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */
1818
1819/* For internal core Perl use only: the base macro for defining macros like
1820 * isALPHA_LC, which uses the current LC_CTYPE locale. 'c' is the code point
31f05a37
KW
1821 * (0-255) to check. In a UTF-8 locale, the result is the same as calling
1822 * isFOO_L1(); the 'utf8_locale_classnum' parameter is something like
1823 * _CC_UPPER, which gives the class number for doing this. For non-UTF-8
 * locales, the code to actually do the test is passed in 'non_utf8'.  If
1825 * 'c' is above 255, 0 is returned. For accessing the full range of possible
1826 * code points under locale rules, use the macros based on _generic_LC_uvchr
1827 * instead of this. */
beab9ebe
KW
1828#define _generic_LC_base(c, utf8_locale_classnum, non_utf8) \
1829 (! FITS_IN_8_BITS(c) \
1830 ? 0 \
31f05a37
KW
1831 : IN_UTF8_CTYPE_LOCALE \
1832 ? cBOOL(PL_charclass[(U8) (c)] & _CC_mask(utf8_locale_classnum)) \
beab9ebe
KW
1833 : cBOOL(non_utf8))
1834
1835/* For internal core Perl use only: a helper macro for defining macros like
1836 * isALPHA_LC. 'c' is the code point (0-255) to check. The function name to
1837 * actually do this test is passed in 'non_utf8_func', which is called on 'c',
1838 * casting 'c' to the macro _LC_CAST, which should not be parenthesized. See
1839 * _generic_LC_base for more info */
1840#define _generic_LC(c, utf8_locale_classnum, non_utf8_func) \
1841 _generic_LC_base(c,utf8_locale_classnum, \
1842 non_utf8_func( (_LC_CAST) (c)))
1843
1844/* For internal core Perl use only: like _generic_LC, but also returns TRUE if
1845 * 'c' is the platform's native underscore character */
1846#define _generic_LC_underscore(c,utf8_locale_classnum,non_utf8_func) \
1847 _generic_LC_base(c, utf8_locale_classnum, \
1848 (non_utf8_func( (_LC_CAST) (c)) \
1849 || (char)(c) == '_'))
1850
1851/* These next three are also for internal core Perl use only: case-change
247985d4
KW
1852 * helper macros. The reason for using the PL_latin arrays is in case the
1853 * system function is defective; it ensures uniform results that conform to the
 * Unicode standard.  It does not handle the anomalies in UTF-8 Turkic locales */
beab9ebe
KW
1855#define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \
1856 ? (c) \
31f05a37
KW
1857 : (IN_UTF8_CTYPE_LOCALE) \
1858 ? PL_latin1_lc[ (U8) (c) ] \
5a10328c 1859 : (cast)function((cast)(c)))
beab9ebe 1860
31f05a37
KW
1861/* Note that the result can be larger than a byte in a UTF-8 locale. It
1862 * returns a single value, so can't adequately return the upper case of LATIN
1863 * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two
1864 * values "SS"); instead it asserts against that under DEBUGGING, and
b257a28c
KW
1865 * otherwise returns its input. It does not handle the anomalies in UTF-8
1866 * Turkic locales. */
beab9ebe
KW
1867#define _generic_toUPPER_LC(c, function, cast) \
1868 (! FITS_IN_8_BITS(c) \
1869 ? (c) \
31f05a37 1870 : ((! IN_UTF8_CTYPE_LOCALE) \
b7d90381 1871 ? (cast)function((cast)(c)) \
31f05a37
KW
1872 : ((((U8)(c)) == MICRO_SIGN) \
1873 ? GREEK_CAPITAL_LETTER_MU \
1874 : ((((U8)(c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \
1875 ? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS \
1876 : ((((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S) \
1877 ? (__ASSERT_(0) (c)) \
1878 : PL_mod_latin1_uc[ (U8) (c) ])))))
1879
1880/* Note that the result can be larger than a byte in a UTF-8 locale. It
1881 * returns a single value, so can't adequately return the fold case of LATIN
1882 * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two
1883 * values "ss"); instead it asserts against that under DEBUGGING, and
b257a28c
KW
1884 * otherwise returns its input. It does not handle the anomalies in UTF-8
1885 * Turkic locales */
beab9ebe 1886#define _generic_toFOLD_LC(c, function, cast) \
4d7db1b9 1887 ((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \
31f05a37 1888 ? GREEK_SMALL_LETTER_MU \
4d7db1b9
KW
1889 : (__ASSERT_(! IN_UTF8_CTYPE_LOCALE \
1890 || (c) != LATIN_SMALL_LETTER_SHARP_S) \
1891 _generic_toLOWER_LC(c, function, cast)))
beab9ebe 1892
84061b6a 1893/* Use the libc versions for these if available. */
f05550c0 1894#if defined(HAS_ISASCII)
84061b6a
KW
1895# define isASCII_LC(c) (FITS_IN_8_BITS(c) && isascii( (U8) (c)))
1896#else
1897# define isASCII_LC(c) isASCII(c)
1898#endif
1899
f05550c0 1900#if defined(HAS_ISBLANK)
84061b6a 1901# define isBLANK_LC(c) _generic_LC(c, _CC_BLANK, isblank)
95f34b6f 1902#else /* Unlike isASCII, varies if in a UTF-8 locale */
278b2b56 1903# define isBLANK_LC(c) ((IN_UTF8_CTYPE_LOCALE) ? isBLANK_L1(c) : isBLANK(c))
84061b6a
KW
1904#endif
1905
f05550c0 1906#define _LC_CAST U8
bbce6d69 1907
f05550c0 1908#ifdef WIN32
375f5f06
KW
1909 /* The Windows functions don't bother to follow the POSIX standard, which
1910 * for example says that something can't both be a printable and a control.
1911 * But Windows treats the \t control as a printable, and does such things
1912 * as making superscripts into both digits and punctuation. This tames
1913 * these flaws by assuming that the definitions of both controls and space
1914 * are correct, and then making sure that other definitions don't have
1915 * weirdnesses, by making sure that isalnum() isn't also ispunct(), etc.
1916 * Not all possible weirdnesses are checked for, just the ones that were
1917 * detected on actual Microsoft code pages */
1918
b7d90381
KW
1919# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl)
1920# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace)
1921
1922# define isALPHA_LC(c) (_generic_LC(c, _CC_ALPHA, isalpha) \
1923 && isALPHANUMERIC_LC(c))
1924# define isALPHANUMERIC_LC(c) (_generic_LC(c, _CC_ALPHANUMERIC, isalnum) && \
1925 ! isPUNCT_LC(c))
1926# define isDIGIT_LC(c) (_generic_LC(c, _CC_DIGIT, isdigit) && \
1927 isALPHANUMERIC_LC(c))
1928# define isGRAPH_LC(c) (_generic_LC(c, _CC_GRAPH, isgraph) && isPRINT_LC(c))
1929# define isIDFIRST_LC(c) (((c) == '_') \
1930 || (_generic_LC(c, _CC_IDFIRST, isalpha) && ! isPUNCT_LC(c)))
1931# define isLOWER_LC(c) (_generic_LC(c, _CC_LOWER, islower) && isALPHA_LC(c))
1932# define isPRINT_LC(c) (_generic_LC(c, _CC_PRINT, isprint) && ! isCNTRL_LC(c))
1933# define isPUNCT_LC(c) (_generic_LC(c, _CC_PUNCT, ispunct) && ! isCNTRL_LC(c))
1934# define isUPPER_LC(c) (_generic_LC(c, _CC_UPPER, isupper) && isALPHA_LC(c))
f05550c0 1935# define isWORDCHAR_LC(c) (((c) == '_') || isALPHANUMERIC_LC(c))
b7d90381
KW
1936# define isXDIGIT_LC(c) (_generic_LC(c, _CC_XDIGIT, isxdigit) \
1937 && isALPHANUMERIC_LC(c))
f05550c0
BF
1938
1939# define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8)
1940# define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8)
1941# define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8)
1942
1943#elif defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII))
4650c663 1944 /* For most other platforms */
beab9ebe 1945
f05550c0
BF
1946# define isALPHA_LC(c) _generic_LC(c, _CC_ALPHA, isalpha)
1947# define isALPHANUMERIC_LC(c) _generic_LC(c, _CC_ALPHANUMERIC, isalnum)
1948# define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl)
1949# define isDIGIT_LC(c) _generic_LC(c, _CC_DIGIT, isdigit)
1950# define isGRAPH_LC(c) _generic_LC(c, _CC_GRAPH, isgraph)
1951# define isIDFIRST_LC(c) _generic_LC_underscore(c, _CC_IDFIRST, isalpha)
1952# define isLOWER_LC(c) _generic_LC(c, _CC_LOWER, islower)
1953# define isPRINT_LC(c) _generic_LC(c, _CC_PRINT, isprint)
1954# define isPUNCT_LC(c) _generic_LC(c, _CC_PUNCT, ispunct)
1955# define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace)
1956# define isUPPER_LC(c) _generic_LC(c, _CC_UPPER, isupper)
1957# define isWORDCHAR_LC(c) _generic_LC_underscore(c, _CC_WORDCHAR, isalnum)
1958# define isXDIGIT_LC(c) _generic_LC(c, _CC_XDIGIT, isxdigit)
1959
1960
1961# define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8)
1962# define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8)
1963# define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8)
1964
1965#else /* The final fallback position */
1966
b7d90381
KW
 /* Every test here first requires isascii(c) and only then consults the
  * corresponding <ctype.h> routine, so any non-ASCII 'c' yields false.
  * isIDFIRST_LC and isWORDCHAR_LC additionally accept '_'.  The case-changing
  * macros return 'c' unchanged when isascii(c) is false; toFOLD_LC maps via
  * tolower().  'c' appears more than once in each expansion, so it should be
  * free of side effects. */
1967# define isALPHA_LC(c) (isascii(c) && isalpha(c))
1968# define isALPHANUMERIC_LC(c) (isascii(c) && isalnum(c))
1969# define isCNTRL_LC(c) (isascii(c) && iscntrl(c))
1970# define isDIGIT_LC(c) (isascii(c) && isdigit(c))
1971# define isGRAPH_LC(c) (isascii(c) && isgraph(c))
f05550c0 1972# define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_'))
b7d90381
KW
1973# define isLOWER_LC(c) (isascii(c) && islower(c))
1974# define isPRINT_LC(c) (isascii(c) && isprint(c))
1975# define isPUNCT_LC(c) (isascii(c) && ispunct(c))
1976# define isSPACE_LC(c) (isascii(c) && isspace(c))
1977# define isUPPER_LC(c) (isascii(c) && isupper(c))
f05550c0 1978# define isWORDCHAR_LC(c) (isascii(c) && (isalnum(c) || (c) == '_'))
b7d90381 1979# define isXDIGIT_LC(c) (isascii(c) && isxdigit(c))
f05550c0
BF
1980
1981# define toLOWER_LC(c) (isascii(c) ? tolower(c) : (c))
1982# define toUPPER_LC(c) (isascii(c) ? toupper(c) : (c))
1983# define toFOLD_LC(c) (isascii(c) ? tolower(c) : (c))
bbce6d69 1984
f05550c0 1985#endif
55204971 1986
eba68aa0
KW
/* The IDCONT family is defined purely in terms of WORDCHAR: each isIDCONT*
 * macro expands to the isWORDCHAR* macro with the same suffix.  Likewise
 * isPSXSPC_LC is just another name for isSPACE_LC. */
1987#define isIDCONT(c) isWORDCHAR(c)
1988#define isIDCONT_A(c) isWORDCHAR_A(c)
1989#define isIDCONT_L1(c) isWORDCHAR_L1(c)
1990#define isIDCONT_LC(c) isWORDCHAR_LC(c)
13380643 1991#define isPSXSPC_LC(c) isSPACE_LC(c)
aaa51d5e 1992
4650c663 1993/* For internal core Perl use only: the base macros for defining macros like
d0da05db
KW
1994 * isALPHA_uvchr. 'c' is the code point to check. 'classnum' is the POSIX class
1995 * number defined earlier in this file. _generic_uvchr() is used for POSIX
4650c663
KW
1996 * classes where there is a macro or function 'above_latin1' that takes the
1997 * single argument 'c' and returns the desired value. These exist for those
2366ba44
KW
1998 * classes which have simple definitions, avoiding the overhead of an inversion
1999 * list binary search. _generic_invlist_uvchr() can be used
4650c663 2000 * for classes where that overhead is faster than a direct lookup.
d0da05db 2001 * _generic_uvchr() won't compile if 'c' isn't unsigned, as it won't match the
4650c663
KW
 * 'above_latin1' prototype.  The _generic_isCC() macro does its own bounds
 * checking, so the checks here are duplicates; versions of the macros without
 * them could be created, but experiments show that gcc optimizes them out
 * anyway. */
66c17564
KW
2005
2006/* Note that all ignore 'use bytes' */
1e222e4f
KW
2007#define _generic_uvchr(classnum, above_latin1, c) ((c) < 256 \
2008 ? _generic_isCC(c, classnum) \
cd500f2f 2009 : above_latin1(c))
2366ba44 2010#define _generic_invlist_uvchr(classnum, c) ((c) < 256 \
1e222e4f 2011 ? _generic_isCC(c, classnum) \
922e8cb4 2012 : _is_uni_FOO(classnum, c))
2366ba44
KW
2013#define isALPHA_uvchr(c) _generic_invlist_uvchr(_CC_ALPHA, c)
2014#define isALPHANUMERIC_uvchr(c) _generic_invlist_uvchr(_CC_ALPHANUMERIC, c)
d0da05db
KW
2015#define isASCII_uvchr(c) isASCII(c)
2016#define isBLANK_uvchr(c) _generic_uvchr(_CC_BLANK, is_HORIZWS_cp_high, c)
2017#define isCNTRL_uvchr(c) isCNTRL_L1(c) /* All controls are in Latin1 */
2366ba44
KW
2018#define isDIGIT_uvchr(c) _generic_invlist_uvchr(_CC_DIGIT, c)
2019#define isGRAPH_uvchr(c) _generic_invlist_uvchr(_CC_GRAPH, c)
1e222e4f
KW
2020#define isIDCONT_uvchr(c) \
2021 _generic_uvchr(_CC_WORDCHAR, _is_uni_perl_idcont, c)
2022#define isIDFIRST_uvchr(c) \
2023 _generic_uvchr(_CC_IDFIRST, _is_uni_perl_idstart, c)
2366ba44
KW
2024#define isLOWER_uvchr(c) _generic_invlist_uvchr(_CC_LOWER, c)
2025#define isPRINT_uvchr(c) _generic_invlist_uvchr(_CC_PRINT, c)
d0da05db 2026
2366ba44 2027#define isPUNCT_uvchr(c) _generic_invlist_uvchr(_CC_PUNCT, c)
d0da05db
KW
2028#define isSPACE_uvchr(c) _generic_uvchr(_CC_SPACE, is_XPERLSPACE_cp_high, c)
2029#define isPSXSPC_uvchr(c) isSPACE_uvchr(c)
2030
2366ba44 2031#define isUPPER_uvchr(c) _generic_invlist_uvchr(_CC_UPPER, c)
d0da05db 2032#define isVERTWS_uvchr(c) _generic_uvchr(_CC_VERTSPACE, is_VERTWS_cp_high, c)
2366ba44 2033#define isWORDCHAR_uvchr(c) _generic_invlist_uvchr(_CC_WORDCHAR, c)
d0da05db
KW
2034#define isXDIGIT_uvchr(c) _generic_uvchr(_CC_XDIGIT, is_XDIGIT_cp_high, c)
2035
/* Case mapping of a single code point: each macro forwards its three
 * arguments, unchanged and in the same order, to the matching to_uni_*()
 * routine.  NOTE(review): 's' and 'l' are presumably the output buffer and a
 * length out-parameter -- confirm against the to_uni_*() prototypes. */
2036#define toFOLD_uvchr(c,s,l) to_uni_fold(c,s,l)
2037#define toLOWER_uvchr(c,s,l) to_uni_lower(c,s,l)
2038#define toTITLE_uvchr(c,s,l) to_uni_title(c,s,l)
2039#define toUPPER_uvchr(c,s,l) to_uni_upper(c,s,l)
2040
2041/* For backwards compatibility, even though '_uni' should mean official Unicode
2042 * code points, in Perl it means native for those below 256 */
2043#define isALPHA_uni(c) isALPHA_uvchr(c)
2044#define isALPHANUMERIC_uni(c) isALPHANUMERIC_uvchr(c)
2045#define isASCII_uni(c) isASCII_uvchr(c)
2046#define isBLANK_uni(c) isBLANK_uvchr(c)
2047#define isCNTRL_uni(c) isCNTRL_uvchr(c)
2048#define isDIGIT_uni(c) isDIGIT_uvchr(c)
2049#define isGRAPH_uni(c) isGRAPH_uvchr(c)
2050#define isIDCONT_uni(c) isIDCONT_uvchr(c)
2051#define isIDFIRST_uni(c) isIDFIRST_uvchr(c)
2052#define isLOWER_uni(c) isLOWER_uvchr(c)
2053#define isPRINT_uni(c) isPRINT_uvchr(c)
2054#define isPUNCT_uni(c) isPUNCT_uvchr(c)
2055#define isSPACE_uni(c) isSPACE_uvchr(c)
2056#define isPSXSPC_uni(c) isPSXSPC_uvchr(c)
2057#define isUPPER_uni(c) isUPPER_uvchr(c)
2058#define isVERTWS_uni(c) isVERTWS_uvchr(c)
2059#define isWORDCHAR_uni(c) isWORDCHAR_uvchr(c)
2060#define isXDIGIT_uni(c) isXDIGIT_uvchr(c)
2061#define toFOLD_uni(c,s,l) toFOLD_uvchr(c,s,l)
2062#define toLOWER_uni(c,s,l) toLOWER_uvchr(c,s,l)
2063#define toTITLE_uni(c,s,l) toTITLE_uvchr(c,s,l)
2064#define toUPPER_uni(c,s,l) toUPPER_uvchr(c,s,l)
a0ed51b3 2065
4650c663
KW
2066/* For internal core Perl use only: the base macros for defining macros like
2067 * isALPHA_LC_uvchr. These are like isALPHA_LC, but the input can be any code
d0da05db 2068 * point, not just 0-255. Like _generic_uvchr, there are two versions, one for
4650c663 2069 * simple class definitions; the other for more complex. These are like
d0da05db 2070 * _generic_uvchr, so see it for more info. */
cd500f2f
KW
/* _generic_LC_uvchr(latin1, above_latin1, c): classify code point 'c' using
 * locale rules for the low range.  'latin1' names the macro or function that
 * is applied when 'c' is below 256; 'above_latin1' names the one applied
 * otherwise.  'c' is parenthesized in the range test so that an expression
 * argument such as 'a | b' or 'x ? y : z' is compared as a whole rather than
 * being split by operator precedence.  'c' appears more than once in the
 * expansion, so it must be free of side effects. */
#define _generic_LC_uvchr(latin1, above_latin1, c)                            \
                            ((c) < 256 ? latin1(c) : above_latin1(c))
/* _generic_LC_invlist_uvchr(latin1, classnum, c): like _generic_LC_uvchr, but
 * code points of 256 and above are looked up with _is_uni_FOO(classnum, c)
 * instead of a caller-supplied macro.  'latin1' names the macro or function
 * applied when 'c' is below 256.  'c' is parenthesized in the range test so
 * that an expression argument is compared as a whole; it appears more than
 * once in the expansion, so it must be free of side effects. */
#define _generic_LC_invlist_uvchr(latin1, classnum, c)                        \
                    ((c) < 256 ? latin1(c) : _is_uni_FOO(classnum, c))
2075
2366ba44
KW
2076#define isALPHA_LC_uvchr(c) _generic_LC_invlist_uvchr(isALPHA_LC, _CC_ALPHA, c)
2077#define isALPHANUMERIC_LC_uvchr(c) _generic_LC_invlist_uvchr(isALPHANUMERIC_LC, \
cd500f2f 2078 _CC_ALPHANUMERIC, c)
b7d90381
KW
2079#define isASCII_LC_uvchr(c) isASCII_LC(c)
2080#define isBLANK_LC_uvchr(c) _generic_LC_uvchr(isBLANK_LC, \
2081 is_HORIZWS_cp_high, c)
/* Code points of 256 and above always yield 0 here; below that, isCNTRL_LC()
 * decides.  'c' is parenthesized in the range test so that an expression
 * argument is compared as a whole; it appears twice in the expansion, so it
 * must be free of side effects. */
#define isCNTRL_LC_uvchr(c)  ((c) < 256 ? isCNTRL_LC(c) : 0)
2366ba44
KW
2083#define isDIGIT_LC_uvchr(c) _generic_LC_invlist_uvchr(isDIGIT_LC, _CC_DIGIT, c)
2084#define isGRAPH_LC_uvchr(c) _generic_LC_invlist_uvchr(isGRAPH_LC, _CC_GRAPH, c)
b7d90381 2085#define isIDCONT_LC_uvchr(c) _generic_LC_uvchr(isIDCONT_LC, \
eba68aa0 2086 _is_uni_perl_idcont, c)
b7d90381 2087#define isIDFIRST_LC_uvchr(c) _generic_LC_uvchr(isIDFIRST_LC, \
cd500f2f 2088 _is_uni_perl_idstart, c)
2366ba44
KW
2089#define isLOWER_LC_uvchr(c) _generic_LC_invlist_uvchr(isLOWER_LC, _CC_LOWER, c)
2090#define isPRINT_LC_uvchr(c) _generic_LC_invlist_uvchr(isPRINT_LC, _CC_PRINT, c)
b7d90381 2091#define isPSXSPC_LC_uvchr(c) isSPACE_LC_uvchr(c)
2366ba44 2092#define isPUNCT_LC_uvchr(c) _generic_LC_invlist_uvchr(isPUNCT_LC, _CC_PUNCT, c)
b7d90381 2093#define isSPACE_LC_uvchr(c) _generic_LC_uvchr(isSPACE_LC, \
509fb054 2094 is_XPERLSPACE_cp_high, c)
2366ba44
KW
2095#define isUPPER_LC_uvchr(c) _generic_LC_invlist_uvchr(isUPPER_LC, _CC_UPPER, c)
2096#define isWORDCHAR_LC_uvchr(c) _generic_LC_invlist_uvchr(isWORDCHAR_LC, \
cd500f2f 2097 _CC_WORDCHAR, c)
b7d90381
KW
2098#define isXDIGIT_LC_uvchr(c) _generic_LC_uvchr(isXDIGIT_LC, \
2099 is_XDIGIT_cp_high, c)
e712593e 2100
/* Converts 'c' with UNI_TO_NATIVE() and then applies isBLANK_LC_uvchr() to the
 * result (so, presumably, 'c' is expected to be a Unicode rather than a
 * native code point -- confirm with callers). */
b7d90381 2101#define isBLANK_LC_uni(c) isBLANK_LC_uvchr(UNI_TO_NATIVE(c))
aaa51d5e 2102
da8c1a98
KW
2103/* The "_safe" macros make sure that we don't attempt to read beyond 'e', but
2104 * they don't otherwise go out of their way to look for malformed UTF-8. If
2105 * they can return accurate results without knowing if the input is otherwise
2106 * malformed, they do so. For example isASCII is accurate in spite of any
2107 * non-length malformations because it looks only at a single byte. Likewise
2108 * isDIGIT looks just at the first byte for code points 0-255, as all UTF-8
2109 * variant ones return FALSE. But, if the input has to be well-formed in order
2110 * for the results to be accurate, the macros will test and if malformed will
2111 * call a routine to die
2112 *
2113 * Except for toke.c, the macros do assume that e > p, asserting that on
2114 * DEBUGGING builds. Much code that calls these depends on this being true,
2115 * for other reasons. toke.c is treated specially as using the regular
2116 * assertion breaks it in many ways. All strings that these operate on there
2117 * are supposed to have an extra NUL character at the end, so that *e = \0. A
2118 * bunch of code in toke.c assumes that this is true, so the assertion allows
2119 * for that */
2120#ifdef PERL_IN_TOKE_C
2121# define _utf8_safe_assert(p,e) ((e) > (p) || ((e) == (p) && *(p) == '\0'))
2122#else
2123# define _utf8_safe_assert(p,e) ((e) > (p))
2124#endif
2125
2126#define _generic_utf8_safe(classnum, p, e, above_latin1) \
c81b3562
KW
2127 ((! _utf8_safe_assert(p, e)) \
2128 ? (_force_out_malformed_utf8_message((U8 *) (p), (U8 *) (e), 0, 1), 0)\
2129 : (UTF8_IS_INVARIANT(*(p))) \
da8c1a98
KW
2130 ? _generic_isCC(*(p), classnum) \
2131 : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
2132 ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
2133 ? _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \
2134 classnum) \
2135 : (_force_out_malformed_utf8_message( \
2136 (U8 *) (p), (U8 *) (e), 0, 1), 0)) \
2137 : above_latin1))
b7d90381
KW
/* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value.
2139 * 'above_latin1' can be a macro */
da8c1a98
KW
2140#define _generic_func_utf8_safe(classnum, above_latin1, p, e) \
2141 _generic_utf8_safe(classnum, p, e, above_latin1(p, e))
2366ba44 2142#define _generic_non_invlist_utf8_safe(classnum, above_latin1, p, e) \
da8c1a98
KW
2143 _generic_utf8_safe(classnum, p, e, \
2144 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2145 ? (_force_out_malformed_utf8_message( \
2146 (U8 *) (p), (U8 *) (e), 0, 1), 0) \
2147 : above_latin1(p)))
2366ba44
KW
/* Like the above, but passes classnum to _is_utf8_FOO(), instead of having an
2149 * 'above_latin1' parameter */
2150#define _generic_invlist_utf8_safe(classnum, p, e) \
2151 _generic_utf8_safe(classnum, p, e, _is_utf8_FOO(classnum, p, e))
922e8cb4 2152
cc8ab7c0 2153/* Like the above, but should be used only when it is known that there are no
ff7ecfc3
KW
2154 * characters in the upper-Latin1 range (128-255 on ASCII platforms) which the
2155 * class is TRUE for. Hence it can skip the tests for this range.
2156 * 'above_latin1' should include its arguments */
da8c1a98
KW
2157#define _generic_utf8_safe_no_upper_latin1(classnum, p, e, above_latin1) \
2158 (__ASSERT_(_utf8_safe_assert(p, e)) \
2159 (UTF8_IS_INVARIANT(*(p))) \
2160 ? _generic_isCC(*(p), classnum) \
2161 : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \
2162 ? 0 /* Note that doesn't check validity for latin1 */ \
2163 : above_latin1)
2164
84238efa 2165
059703b0
KW
/* Each isFOO_utf8(p, e) is simply another name for isFOO_utf8_safe(p, e); both
 * arguments are passed through unchanged. */
2166#define isALPHA_utf8(p, e) isALPHA_utf8_safe(p, e)
2167#define isALPHANUMERIC_utf8(p, e) isALPHANUMERIC_utf8_safe(p, e)
2168#define isASCII_utf8(p, e) isASCII_utf8_safe(p, e)
2169#define isBLANK_utf8(p, e) isBLANK_utf8_safe(p, e)
2170#define isCNTRL_utf8(p, e) isCNTRL_utf8_safe(p, e)
2171#define isDIGIT_utf8(p, e) isDIGIT_utf8_safe(p, e)
2172#define isGRAPH_utf8(p, e) isGRAPH_utf8_safe(p, e)
2173#define isIDCONT_utf8(p, e) isIDCONT_utf8_safe(p, e)
2174#define isIDFIRST_utf8(p, e) isIDFIRST_utf8_safe(p, e)
2175#define isLOWER_utf8(p, e) isLOWER_utf8_safe(p, e)
2176#define isPRINT_utf8(p, e) isPRINT_utf8_safe(p, e)
2177#define isPSXSPC_utf8(p, e) isPSXSPC_utf8_safe(p, e)
2178#define isPUNCT_utf8(p, e) isPUNCT_utf8_safe(p, e)
2179#define isSPACE_utf8(p, e) isSPACE_utf8_safe(p, e)
2180#define isUPPER_utf8(p, e) isUPPER_utf8_safe(p, e)
2181#define isVERTWS_utf8(p, e) isVERTWS_utf8_safe(p, e)
2182#define isWORDCHAR_utf8(p, e) isWORDCHAR_utf8_safe(p, e)
2183#define isXDIGIT_utf8(p, e) isXDIGIT_utf8_safe(p, e)
e8fa43e2 2184
2366ba44 2185#define isALPHA_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_ALPHA, p, e)
da8c1a98 2186#define isALPHANUMERIC_utf8_safe(p, e) \
2366ba44 2187 _generic_invlist_utf8_safe(_CC_ALPHANUMERIC, p, e)
da8c1a98
KW
2188#define isASCII_utf8_safe(p, e) \
2189 /* Because ASCII is invariant under utf8, the non-utf8 macro \
2190 * works */ \
2191 (__ASSERT_(_utf8_safe_assert(p, e)) isASCII(*(p)))
2192#define isBLANK_utf8_safe(p, e) \
2366ba44 2193 _generic_non_invlist_utf8_safe(_CC_BLANK, is_HORIZWS_high, p, e)
da8c1a98 2194
e8fa43e2
KW
2195#ifdef EBCDIC
2196 /* Because all controls are UTF-8 invariants in EBCDIC, we can use this
2197 * more efficient macro instead of the more general one */
da8c1a98 2198# define isCNTRL_utf8_safe(p, e) \
56d02b8c 2199 (__ASSERT_(_utf8_safe_assert(p, e)) isCNTRL_L1(*(p)))
e8fa43e2 2200#else
da8c1a98 2201# define isCNTRL_utf8_safe(p, e) _generic_utf8_safe(_CC_CNTRL, p, e, 0)
e8fa43e2
KW
2202#endif
2203
da8c1a98
KW
2204#define isDIGIT_utf8_safe(p, e) \
2205 _generic_utf8_safe_no_upper_latin1(_CC_DIGIT, p, e, \
2366ba44
KW
2206 _is_utf8_FOO(_CC_DIGIT, p, e))
2207#define isGRAPH_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_GRAPH, p, e)
da8c1a98 2208#define isIDCONT_utf8_safe(p, e) _generic_func_utf8_safe(_CC_WORDCHAR, \
dd1a3ba7 2209 _is_utf8_perl_idcont, p, e)
e5dcd934 2210
c11ff943
KW
2211/* To prevent S_scan_word in toke.c from hanging, we have to make sure that
2212 * IDFIRST is an alnum. See
8034715d 2213 * https://github.com/Perl/perl5/issues/10275 for more detail than you
f91dcd13
KW
2214 * ever wanted to know about. (In the ASCII range, there isn't a difference.)
2215 * This used to be not the XID version, but we decided to go with the more
2216 * modern Unicode definition */
da8c1a98
KW
2217#define isIDFIRST_utf8_safe(p, e) \
2218 _generic_func_utf8_safe(_CC_IDFIRST, \
dd1a3ba7 2219 _is_utf8_perl_idstart, (U8 *) (p), (U8 *) (e))
da8c1a98 2220
2366ba44
KW
2221#define isLOWER_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_LOWER, p, e)
2222#define isPRINT_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_PRINT, p, e)
da8c1a98 2223#define isPSXSPC_utf8_safe(p, e) isSPACE_utf8_safe(p, e)
2366ba44 2224#define isPUNCT_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_PUNCT, p, e)
da8c1a98 2225#define isSPACE_utf8_safe(p, e) \
2366ba44
KW
2226 _generic_non_invlist_utf8_safe(_CC_SPACE, is_XPERLSPACE_high, p, e)
2227#define isUPPER_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_UPPER, p, e)
da8c1a98 2228#define isVERTWS_utf8_safe(p, e) \
2366ba44 2229 _generic_non_invlist_utf8_safe(_CC_VERTSPACE, is_VERTWS_high, p, e)
da8c1a98 2230#define isWORDCHAR_utf8_safe(p, e) \
2366ba44 2231 _generic_invlist_utf8_safe(_CC_WORDCHAR, p, e)
da8c1a98
KW
2232#define isXDIGIT_utf8_safe(p, e) \
2233 _generic_utf8_safe_no_upper_latin1(_CC_XDIGIT, p, e, \
2234 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2235 ? (_force_out_malformed_utf8_message( \
2236 (U8 *) (p), (U8 *) (e), 0, 1), 0) \
2237 : is_XDIGIT_high(p)))
a0ed51b3 2238
059703b0
KW
/* Each toFOO_utf8(p,e,s,l) is simply another name for toFOO_utf8_safe(); all
 * four arguments are passed through unchanged. */
2239#define toFOLD_utf8(p,e,s,l) toFOLD_utf8_safe(p,e,s,l)
2240#define toLOWER_utf8(p,e,s,l) toLOWER_utf8_safe(p,e,s,l)
2241#define toTITLE_utf8(p,e,s,l) toTITLE_utf8_safe(p,e,s,l)
2242#define toUPPER_utf8(p,e,s,l) toUPPER_utf8_safe(p,e,s,l)
2e8adce6 2243
567b353c 2244/* For internal core use only, subject to change */
059703b0
KW
2245#define _toFOLD_utf8_flags(p,e,s,l,f) _to_utf8_fold_flags (p,e,s,l,f)
2246#define _toLOWER_utf8_flags(p,e,s,l,f) _to_utf8_lower_flags(p,e,s,l,f)
2247#define _toTITLE_utf8_flags(p,e,s,l,f) _to_utf8_title_flags(p,e,s,l,f)
2248#define _toUPPER_utf8_flags(p,e,s,l,f) _to_utf8_upper_flags(p,e,s,l,f)
a1a5ec35
KW
2249
/* The "_safe" case-mapping macros call the corresponding _toFOO_utf8_flags()
 * macro with a fixed flags argument: FOLD_FLAGS_FULL for folding, and 0 for
 * lower-, title- and upper-casing. */
2250#define toFOLD_utf8_safe(p,e,s,l) _toFOLD_utf8_flags(p,e,s,l, FOLD_FLAGS_FULL)
2251#define toLOWER_utf8_safe(p,e,s,l) _toLOWER_utf8_flags(p,e,s,l, 0)
2252#define toTITLE_utf8_safe(p,e,s,l) _toTITLE_utf8_flags(p,e,s,l, 0)
2253#define toUPPER_utf8_safe(p,e,s,l) _toUPPER_utf8_flags(p,e,s,l, 0)
567b353c 2254
059703b0
KW
/* Each isFOO_LC_utf8(p, e) is simply another name for isFOO_LC_utf8_safe(p, e);
 * both arguments are passed through unchanged. */
2255#define isALPHA_LC_utf8(p, e) isALPHA_LC_utf8_safe(p, e)
2256#define isALPHANUMERIC_LC_utf8(p, e) isALPHANUMERIC_LC_utf8_safe(p, e)
2257#define isASCII_LC_utf8(p, e) isASCII_LC_utf8_safe(p, e)
2258#define isBLANK_LC_utf8(p, e) isBLANK_LC_utf8_safe(p, e)
2259#define isCNTRL_LC_utf8(p, e) isCNTRL_LC_utf8_safe(p, e)
2260#define isDIGIT_LC_utf8(p, e) isDIGIT_LC_utf8_safe(p, e)
2261#define isGRAPH_LC_utf8(p, e) isGRAPH_LC_utf8_safe(p, e)
2262#define isIDCONT_LC_utf8(p, e) isIDCONT_LC_utf8_safe(p, e)
2263#define isIDFIRST_LC_utf8(p, e) isIDFIRST_LC_utf8_safe(p, e)
2264#define isLOWER_LC_utf8(p, e) isLOWER_LC_utf8_safe(p, e)
2265#define isPRINT_LC_utf8(p, e) isPRINT_LC_utf8_safe(p, e)
2266#define isPSXSPC_LC_utf8(p, e) isPSXSPC_LC_utf8_safe(p, e)
2267#define isPUNCT_LC_utf8(p, e) isPUNCT_LC_utf8_safe(p, e)
2268#define isSPACE_LC_utf8(p, e) isSPACE_LC_utf8_safe(p, e)
2269#define isUPPER_LC_utf8(p, e) isUPPER_LC_utf8_safe(p, e)
2270#define isWORDCHAR_LC_utf8(p, e) isWORDCHAR_LC_utf8_safe(p, e)
2271#define isXDIGIT_LC_utf8(p, e) isXDIGIT_LC_utf8_safe(p, e)
34aeb2e9 2272
da8c1a98
KW
2273/* For internal core Perl use only: the base macros for defining macros like
 * isALPHA_LC_utf8_safe.  These are like _generic_utf8_safe, but if the first
 * code point in 'p' is within the 0-255 range, they use locale rules from the
 * passed-in 'macro' parameter */
2277#define _generic_LC_utf8_safe(macro, p, e, above_latin1) \
2278 (__ASSERT_(_utf8_safe_assert(p, e)) \
2279 (UTF8_IS_INVARIANT(*(p))) \
2280 ? macro(*(p)) \
2281 : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
2282 ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
2283 ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \
2284 : (_force_out_malformed_utf8_message( \
2285 (U8 *) (p), (U8 *) (e), 0, 1), 0)) \
2286 : above_latin1))
2287
2366ba44
KW
2288#define _generic_LC_invlist_utf8_safe(macro, classnum, p, e) \
2289 _generic_LC_utf8_safe(macro, p, e, \
2290 _is_utf8_FOO(classnum, p, e))
da8c1a98
KW
2291
2292#define _generic_LC_func_utf8_safe(macro, above_latin1, p, e) \
2293 _generic_LC_utf8_safe(macro, p, e, above_latin1(p, e))
2294
/* Wrapper around _generic_LC_utf8_safe() for classes whose above-Latin1 test
 * is a one-argument macro or function 'above_latin1(p)'.  'macro' is the
 * locale-aware test handed to _generic_LC_utf8_safe() for code points in the
 * 0-255 range (callers pass e.g. isBLANK_LC); it is a macro name, not a class
 * number.  For the above-Latin1 case this first checks that at least
 * UTF8SKIP(p) bytes are available before 'e'; if not, it calls
 * _force_out_malformed_utf8_message() and yields 0, otherwise it yields
 * above_latin1(p). */
#define _generic_LC_non_invlist_utf8_safe(macro, above_latin1, p, e)          \
          _generic_LC_utf8_safe(macro, p, e,                                  \
                               (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
                                ? (_force_out_malformed_utf8_message(         \
                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)       \
                                : above_latin1(p)))
2301
2302#define isALPHANUMERIC_LC_utf8_safe(p, e) \
2366ba44 2303 _generic_LC_invlist_utf8_safe(isALPHANUMERIC_LC, \
da8c1a98
KW
2304 _CC_ALPHANUMERIC, p, e)
2305#define isALPHA_LC_utf8_safe(p, e) \
2366ba44 2306 _generic_LC_invlist_utf8_safe(isALPHA_LC, _CC_ALPHA, p, e)
da8c1a98
KW
2307#define isASCII_LC_utf8_safe(p, e) \
2308 (__ASSERT_(_utf8_safe_assert(p, e)) isASCII_LC(*(p)))
2309#define isBLANK_LC_utf8_safe(p, e) \
2366ba44 2310 _generic_LC_non_invlist_utf8_safe(isBLANK_LC, is_HORIZWS_high, p, e)
da8c1a98
KW
2311#define isCNTRL_LC_utf8_safe(p, e) \
2312 _generic_LC_utf8_safe(isCNTRL_LC, p, e, 0)
2313#define isDIGIT_LC_utf8_safe(p, e) \
2366ba44 2314 _generic_LC_invlist_utf8_safe(isDIGIT_LC, _CC_DIGIT, p, e)
da8c1a98 2315#define isGRAPH_LC_utf8_safe(p, e) \
2366ba44 2316 _generic_LC_invlist_utf8_safe(isGRAPH_LC, _CC_GRAPH, p, e)
da8c1a98
KW
2317#define isIDCONT_LC_utf8_safe(p, e) \
2318 _generic_LC_func_utf8_safe(isIDCONT_LC, \
dd1a3ba7 2319 _is_utf8_perl_idcont, p, e)
da8c1a98
KW
2320#define isIDFIRST_LC_utf8_safe(p, e) \
2321 _generic_LC_func_utf8_safe(isIDFIRST_LC, \
dd1a3ba7 2322 _is_utf8_perl_idstart, p, e)
da8c1a98 2323#define isLOWER_LC_utf8_safe(p, e) \
2366ba44 2324 _generic_LC_invlist_utf8_safe(isLOWER_LC, _CC_LOWER, p, e)
da8c1a98 2325#define isPRINT_LC_utf8_safe(p, e) \
2366ba44 2326 _generic_LC_invlist_utf8_safe(isPRINT_LC, _CC_PRINT, p, e)
da8c1a98
KW
2327#define isPSXSPC_LC_utf8_safe(p, e) isSPACE_LC_utf8_safe(p, e)
2328#define isPUNCT_LC_utf8_safe(p, e) \
2366ba44 2329 _generic_LC_invlist_utf8_safe(isPUNCT_LC, _CC_PUNCT, p, e)
da8c1a98 2330#define isSPACE_LC_utf8_safe(p, e) \
2366ba44 2331 _generic_LC_non_invlist_utf8_safe(isSPACE_LC, is_XPERLSPACE_high, p, e)
da8c1a98 2332#define isUPPER_LC_utf8_safe(p, e) \
2366ba44 2333 _generic_LC_invlist_utf8_safe(isUPPER_LC, _CC_UPPER, p, e)
da8c1a98 2334#define isWORDCHAR_LC_utf8_safe(p, e) \
2366ba44 2335 _generic_LC_invlist_utf8_safe(isWORDCHAR_LC, _CC_WORDCHAR, p, e)
da8c1a98 2336#define isXDIGIT_LC_utf8_safe(p, e) \
2366ba44 2337 _generic_LC_non_invlist_utf8_safe(isXDIGIT_LC, is_XDIGIT_high, p, e)
aaa51d5e 2338
fbc19f27
KW
2339/* Macros for backwards compatibility and for completeness when the ASCII and
2340 * Latin1 values are identical */
b7d90381
KW
2341#define isALPHAU(c) isALPHA_L1(c)
2342#define isDIGIT_L1(c) isDIGIT_A(c)
2343#define isOCTAL(c) isOCTAL_A(c)
2344#define isOCTAL_L1(c) isOCTAL_A(c)
2345#define isXDIGIT_L1(c) isXDIGIT_A(c)
2346#define isALNUM(c) isWORDCHAR(c)
a377c856 2347#define isALNUM_A(c) isALNUM(c)
b7d90381
KW
2348#define isALNUMU(c) isWORDCHAR_L1(c)
2349#define isALNUM_LC(c) isWORDCHAR_LC(c)
2350#define isALNUM_uni(c) isWORDCHAR_uni(c)
2e28f0b9 2351#define isALNUM_LC_uvchr(c) isWORDCHAR_LC_uvchr(c)
059703b0 2352#define isALNUM_utf8(p,e) isWORDCHAR_utf8(p,e)
4c1d9526 2353#define isALNUM_utf8_safe(p,e) isWORDCHAR_utf8_safe(p,e)
059703b0 2354#define isALNUM_LC_utf8(p,e)isWORDCHAR_LC_utf8(p,e)
4c1d9526 2355#define isALNUM_LC_utf8_safe(p,e)isWORDCHAR_LC_utf8_safe(p,e)
b7d90381
KW
2356#define isALNUMC_A(c) isALPHANUMERIC_A(c) /* Mnemonic: "C's alnum" */
2357#define isALNUMC_L1(c) isALPHANUMERIC_L1(c)
2358#define isALNUMC(c) isALPHANUMERIC(c)
2359#define isALNUMC_LC(c) isALPHANUMERIC_LC(c)
2360#define isALNUMC_uni(c) isALPHANUMERIC_uni(c)
15861f94 2361#define isALNUMC_LC_uvchr(c) isALPHANUMERIC_LC_uvchr(c)
059703b0 2362#define isALNUMC_utf8(p,e) isALPHANUMERIC_utf8(p,e)
4c1d9526
KW
2363#define isALNUMC_utf8_safe(p,e) isALPHANUMERIC_utf8_safe(p,e)
2364#define isALNUMC_LC_utf8_safe(p,e) isALPHANUMERIC_LC_utf8_safe(p,e)
fbc19f27 2365
2bd1cbf6
KW
2366/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,
2367 * except that they don't necessarily mean the same characters, e.g. CTRL-D is
2368 * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC.
2369 * '?' is special-cased on EBCDIC to APC, which is the control there that is
2370 * the outlier from the block that contains the other controls, just like
2371 * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0
2372 * block. If it weren't special cased, it would yield a non-control.
88794300
KW
2373 * The conversion works both ways, so toCTRL('D') is 4, and toCTRL(4) is D,
2374 * etc. */
2bd1cbf6 2375#ifndef EBCDIC
75763b3a 2376# define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) toUPPER(((U8)(c))) ^ 64)
2bd1cbf6 2377#else
75763b3a
KW
2378# define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
2379 ((isPRINT_A(c)) \
2380 ? (UNLIKELY((c) == '?') \
2381 ? QUESTION_MARK_CTRL \
2382 : (NATIVE_TO_LATIN1(toUPPER((U8) (c))) ^ 64)) \
2383 : (UNLIKELY((c) == QUESTION_MARK_CTRL) \
2384 ? '?' \
2385 : (LATIN1_TO_NATIVE(((U8) (c)) ^ 64)))))
2bd1cbf6 2386#endif
bbce6d69 2387
dea28490
JJ
2388/* Line numbers are unsigned, 32 bits. */
2389typedef U32 line_t;
e5dcd934 2390#define NOLINE ((line_t) 4294967295UL) /* = FFFFFFFF */
378cc40b 2391
91152fc1
DG
2392/* Helpful alias for version prescan */
/* True when Perl_prescan_version(), called with FALSE as the argument that
 * follows the string, returns a pointer different from 'a'.  'b' is passed
 * through as the argument after that flag.  Both parameters are parenthesized
 * so that expression arguments (e.g. 'x ? p : q') are compared and passed as
 * a whole.  'a' appears twice in the expansion, so it must be free of side
 * effects. */
#define is_LAX_VERSION(a,b) \
        ((a) != Perl_prescan_version(aTHX_ (a), FALSE, (b), NULL, NULL, NULL, NULL))
2395
/* True when Perl_prescan_version(), called with TRUE as the argument that
 * follows the string, returns a pointer different from 'a'.  'b' is passed
 * through as the argument after that flag.  Both parameters are parenthesized
 * so that expression arguments (e.g. 'x ? p : q') are compared and passed as
 * a whole.  'a' appears twice in the expansion, so it must be free of side
 * effects. */
#define is_STRICT_VERSION(a,b) \
        ((a) != Perl_prescan_version(aTHX_ (a), TRUE, (b), NULL, NULL, NULL, NULL))
2398
/* Error-exit helper: if 'b' is non-NULL, stores 'c' through it; then returns
 * 'a' from the enclosing function.  The parameters are parenthesized so that
 * expression arguments (e.g. a pointer expression for 'b') expand correctly.
 *
 * NOTE: this expands to two statements (an 'if' followed by a 'return') and
 * already ends in a semicolon, so it is only safe inside a brace-enclosed
 * block, never as the unbraced body of an 'if', 'else' or loop.  'b' appears
 * twice in the expansion, so it must be free of side effects. */
#define BADVERSION(a,b,c) \
        if (b) { \
            *(b) = (c); \
        } \
        return (a);
8c52afec 2404
1ce77b7d
KW
2405/* Converts a character KNOWN to represent a hexadecimal digit (0-9, A-F, or
2406 * a-f) to its numeric value without using any branches. The input is
2407 * validated only by an assert() in DEBUGGING builds.
2408 *
2409 * It works by right shifting and isolating the bit that is 0 for the digits,
2410 * and 1 for at least the alphas A-F, a-f. The bit is shifted to the ones
2411 * position, and then to the eights position. Both are added together to form
2412 * 0 if the input is '0'-'9' and to form 9 if alpha. This is added to the
2413 * final four bits of the input to form the correct value. */
2414#define XDIGIT_VALUE(c) (__ASSERT_(isXDIGIT(c)) \
2415 ((NATIVE_TO_LATIN1(c) >> 6) & 1) /* 1 if alpha; 0 if not */ \
2416 + ((NATIVE_TO_LATIN1(c) >> 3) & 8) /* 8 if alpha; 0 if not */ \
2417 + ((c) & 0xF)) /* 0-9 if input valid hex digit */
2418
2419/* The argument is a string pointer, which is advanced. */
2420#define READ_XDIGIT(s) ((s)++, XDIGIT_VALUE(*((s) - 1)))
95a59cab 2421
cb27eebd
KW
2422/* Converts a character known to represent an octal digit (0-7) to its numeric
2423 * value. The input is validated only by an assert() in DEBUGGING builds. In
2424 * both ASCII and EBCDIC the last 3 bits of the octal digits range from 0-7. */
2425#define OCTAL_VALUE(c) (__ASSERT_(isOCTAL(c)) (7 & (c)))
2426
305b8651
KW
2427/* Efficiently returns a boolean as to if two native characters are equivalent
 * case-insensitively.  At least one of the characters must be one of [A-Za-z];
2429 * the ALPHA in the name is to remind you of that. This is asserted() in
2430 * DEBUGGING builds. Because [A-Za-z] are invariant under UTF-8, this macro
2431 * works (on valid input) for both non- and UTF-8-encoded bytes.
2432 *
2433 * When one of the inputs is a compile-time constant and gets folded by the
2434 * compiler, this reduces to an AND and a TEST. On both EBCDIC and ASCII
2435 * machines, 'A' and 'a' differ by a single bit; the same with the upper and
2436 * lower case of all other ASCII-range alphabetics. On ASCII platforms, they
96ca48da
KW
2437 * are 32 apart; on EBCDIC, they are 64. At compile time, this uses an
2438 * exclusive 'or' to find that bit and then inverts it to form a mask, with
2439 * just a single 0, in the bit position where the upper- and lowercase differ.
2440 * */
305b8651
KW
2441#define isALPHA_FOLD_EQ(c1, c2) \
2442 (__ASSERT_(isALPHA_A(c1) || isALPHA_A(c2)) \
2443 ((c1) & ~('A' ^ 'a')) == ((c2) & ~('A' ^ 'a')))
/* Logical negation of isALPHA_FOLD_EQ(c1, c2). */
2444#define isALPHA_FOLD_NE(c1, c2) (! isALPHA_FOLD_EQ((c1), (c2)))
2445
8e84507e 2446/*
ccfc67b7
JH
2447=head1 Memory Management
2448
a02a5408 2449=for apidoc Am|void|Newx|void* ptr|int nitems|type
954c1994
GS
2450The XSUB-writer's interface to the C C<malloc> function.
2451
596f7718 2452Memory obtained by this should B<ONLY> be freed with L</"Safefree">.
0d7b2759 2453
c5008215
JC
In 5.9.3, Newx() and friends replaced the older New() API, and dropped
2455the first parameter, I<x>, a debug aid which allowed callers to identify
37b8b4c9 2456themselves. This aid has been superseded by a new build option,
d10b4965 2457PERL_MEM_LOG (see L<perlhacktips/PERL_MEM_LOG>). The older API is still
c5008215
JC
2458there for use in XS modules supporting older perls.
2459
a02a5408 2460=for apidoc Am|void|Newxc|void* ptr|int nitems|type|cast
954c1994 2461The XSUB-writer's interface to the C C<malloc> function, with
fbe13c60 2462cast. See also C<L</Newx>>.
954c1994 2463
596f7718 2464Memory obtained by this should B<ONLY> be freed with L</"Safefree">.
0d7b2759 2465
a02a5408 2466=for apidoc Am|void|Newxz|void* ptr|int nitems|type
954c1994 2467The XSUB-writer's interface to the C C<malloc> function. The allocated
fbe13c60 2468memory is zeroed with C<memzero>. See also C<L</Newx>>.
a02a5408 2469
596f7718 2470Memory obtained by this should B<ONLY> be freed with L</"Safefree">.
0d7b2759 2471
954c1994
GS
2472=for apidoc Am|void|Renew|void* ptr|int nitems|type
2473The XSUB-writer's interface to the C C<realloc> function.
2474
596f7718 2475Memory obtained by this should B<ONLY> be freed with L</"Safefree">.
0d7b2759 2476
954c1994
GS
2477=for apidoc Am|void|Renewc|void* ptr|int nitems|type|cast
2478The XSUB-writer's interface to the C C<realloc> function, with
2479cast.
2480
596f7718 2481Memory obtained by this should B<ONLY> be freed with L</"Safefree">.
0d7b2759 2482
49b8b560 2483=for apidoc Am|void|Safefree|void* ptr
954c1994
GS
2484The XSUB-writer's interface to the C C<free> function.
2485
596f7718 2486This should B<ONLY> be used on memory obtained using L</"Newx"> and friends.
0d7b2759 2487
954c1994
GS
2488=for apidoc Am|void|Move|void* src|void* dest|int nitems|type
2489The XSUB-writer's interface to the C C<memmove> function. The C<src> is the
926bb54c 2490source, C<dest> is the destination, C<nitems> is the number of items, and
fbe13c60 2491C<type> is the type. Can do overlapping moves. See also C<L</Copy>>.
954c1994 2492
e90e2364 2493=for apidoc Am|void *|MoveD|void* src|void* dest|int nitems|type
796b6530 2494Like C<Move> but returns C<dest>. Useful
72d33970 2495for encouraging compilers to tail-call
e90e2364
NC
2496optimise.
2497
954c1994
GS
2498=for apidoc Am|void|Copy|void* src|void* dest|int nitems|type
2499The XSUB-writer's interface to the C C<memcpy> function. The C<src> is the
926bb54c 2500source, C<dest> is the destination, C<nitems> is the number of items, and
fbe13c60 2501C<type> is the type. May fail on overlapping copies. See also C<L</Move>>.
954c1994 2502
e90e2364
NC
2503=for apidoc Am|void *|CopyD|void* src|void* dest|int nitems|type
2504
796b6530 2505Like C<Copy> but returns C<dest>. Useful
72d33970 2506for encouraging compilers to tail-call
e90e2364
NC
2507optimise.
2508
954c1994
GS
2509=for apidoc Am|void|Zero|void* dest|int nitems|type
2510
2511The XSUB-writer's interface to the C C<memzero> function. The C<dest> is the
2512destination, C<nitems> is the number of items, and C<type> is the type.
2513
e90e2364
NC
2514=for apidoc Am|void *|ZeroD|void* dest|int nitems|type
2515
72d33970
FC
2516Like C<Zero> but returns dest. Useful
2517for encouraging compilers to tail-call
e90e2364
NC
2518optimise.
2519
da5d8dbb 2520=for apidoc Am|void|StructCopy|type *src|type *dest|type
4375e838 2521This is an architecture-independent macro to copy one structure to another.
954c1994 2522
7e337ee0
JH
2523=for apidoc Am|void|PoisonWith|void* dest|int nitems|type|U8 byte
2524
2525Fill up memory with a byte pattern (a byte repeated over and over
2526again) that hopefully catches attempts to access uninitialized memory.
2527
2528=for apidoc Am|void|PoisonNew|void* dest|int nitems|type
2529
2530PoisonWith(0xAB) for catching access to allocated but uninitialized memory.
2531
1c12ffb4 2532=for apidoc Am|void|PoisonFree|void* dest|int nitems|type
7e337ee0
JH
2533
2534PoisonWith(0xEF) for catching access to freed memory.
2535
=for apidoc Am|void|Poison|void* dest|int nitems|type

PoisonWith(0xEF) for catching access to freed memory.  This is the same as
C<PoisonFree>.
9965345d
JH
2539
2540=cut */
954c1994 2541
/* Maintained for backwards-compatibility only.  Use newSV() instead.
 * The first argument is ignored; only len is passed on to newSV(). */
#ifndef PERL_CORE
#define NEWSV(x,len) newSV(len)
#endif
ff06c60c 2546
/* -1 converted to MEM_SIZE; this is the largest representable value,
 * assuming MEM_SIZE is an unsigned type. */
#define MEM_SIZE_MAX ((MEM_SIZE)-1)
19a94d75 2548
/* Round n up to a multiple of PERL_STRLEN_ROUNDUP_QUANTUM without checking
 * for wrap-around.  The masking assumes the quantum is a power of two. */
#define _PERL_STRLEN_ROUNDUP_UNCHECKED(n) (((n) - 1 + PERL_STRLEN_ROUNDUP_QUANTUM) & ~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM - 1))
e6bdf523 2550
#ifdef PERL_MALLOC_WRAP

/* This expression will be constant-folded at compile time.  It checks
 * whether or not the type of the count n is so small (e.g. U8 or U16, or
 * U32 on 64-bit systems) that there's no way a wrap-around could occur.
 * As well as avoiding the need for a run-time check in some cases, it's
 * designed to avoid compiler warnings like:
 *     comparison is always false due to limited range of data type
 * It's mathematically equivalent to
 *    max(n) * sizeof(t) > MEM_SIZE_MAX
 */

#  define _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) \
    (  sizeof(MEM_SIZE) < sizeof(n) \
    || sizeof(t) > ((MEM_SIZE)1 << 8*(sizeof(MEM_SIZE) - sizeof(n))))

/* This is written in a slightly odd way to avoid various spurious
 * compiler warnings.  We *want* to write the expression as
 *    _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) && (n > C)
 * (for some compile-time constant C), but even when the LHS
 * constant-folds to false at compile-time, g++ insists on emitting
 * warnings about the RHS (e.g. "comparison is always false"), so instead
 * we write it as
 *
 *    (cond ? n : X) > C
 *
 * where X is a constant with X > C always false.  Choosing a value for X
 * is tricky.  If 0, some compilers will complain about 0 > C always being
 * false; if 1, Coverity complains when n happens to be the constant value
 * '1', that cond ? 1 : 1 has the same value on both branches; so use C
 * for X and hope that nothing else whines.
 */

#  define _MEM_WRAP_WILL_WRAP(n,t) \
      ((_MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) ? (MEM_SIZE)(n) : \
            MEM_SIZE_MAX/sizeof(t)) > MEM_SIZE_MAX/sizeof(t))

/* Croak via croak_memory_wrap() if n items of type t would need more than
 * MEM_SIZE_MAX bytes. */
#  define MEM_WRAP_CHECK(n,t) \
        (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
        && (croak_memory_wrap(),0))

/* As MEM_WRAP_CHECK, but croaks with the message string "a". */
#  define MEM_WRAP_CHECK_1(n,t,a) \
        (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
        && (Perl_croak_nocontext("%s",(a)),0))

/* "a" arg must be a string literal */
#  define MEM_WRAP_CHECK_s(n,t,a) \
        (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
        && (Perl_croak_nocontext("" a ""),0))

/* MEM_WRAP_CHECK with a trailing comma, for use as the first operand of a
 * comma expression. */
#  define MEM_WRAP_CHECK_(n,t) MEM_WRAP_CHECK(n,t),

/* Checked round-up: croaks when n is within two quanta of MEM_SIZE_MAX.
 * Note that n is evaluated more than once. */
#  define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (croak_memory_wrap(),0) : 0), _PERL_STRLEN_ROUNDUP_UNCHECKED(n))
#else

/* Without PERL_MALLOC_WRAP the checks expand to nothing. */
#  define MEM_WRAP_CHECK(n,t)
#  define MEM_WRAP_CHECK_1(n,t,a)
#  define MEM_WRAP_CHECK_s(n,t,a)
#  define MEM_WRAP_CHECK_(n,t)

#  define PERL_STRLEN_ROUNDUP(n) _PERL_STRLEN_ROUNDUP_UNCHECKED(n)

#endif
8b44ba4c 2614
#ifdef PERL_MEM_LOG
/*
 * If PERL_MEM_LOG is defined, all Newx()s, Renew()s, and Safefree()s
 * go through functions, which are handy for debugging breakpoints, but
 * which more importantly get the immediate calling environment (file and
 * line number, and C function name if available) passed in.  This info can
 * then be used for logging the calls, for which one gets a sample
 * implementation unless -DPERL_MEM_LOG_NOIMPL is also defined.
 *
 * Known problems:
 * - not all memory allocs get logged, only those
 *   that go through Newx() and derivatives (while all
 *   Safefrees do get logged)
 * - __FILE__ and __LINE__ do not work everywhere
 * - __func__ or __FUNCTION__ even less so
 * - I think more goes on after the perlio frees but
 *   the thing is that STDERR gets closed (as do all
 *   the file descriptors)
 * - no deeper calling stack than the caller of the Newx()
 *   or the like, but do I look like a C reflection/introspection
 *   utility to you?
 * - the function prototypes for the logging functions
 *   probably should maybe be somewhere else than handy.h
 * - one could consider inlining (macrofying) the logging
 *   for speed, but I am too lazy
 * - one could imagine recording the allocations in a hash,
 *   (keyed by the allocation address?), and maintain that
 *   through reallocs and frees, but how to do that without
 *   any News() happening...?
 * - lots of -Ddefines to get useful/controllable output
 * - lots of ENV reads
 */

# ifdef PERL_CORE
#  ifndef PERL_MEM_LOG_NOIMPL
/* Event kinds; only declared when the sample implementation is compiled in
 * (i.e. PERL_MEM_LOG_NOIMPL is not defined). */
enum mem_log_type {
    MLT_ALLOC,
    MLT_REALLOC,
    MLT_FREE,
    MLT_NEW_SV,
    MLT_DEL_SV
};
#  endif
#  if defined(PERL_IN_SV_C)  /* those are only used in sv.c */
void Perl_mem_log_new_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname);
void Perl_mem_log_del_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname);
#  endif
# endif

#endif
2665
/* Logging hooks wrapped around the allocator calls.  With PERL_MEM_LOG they
 * pass the item count, item size, stringified type, the wrapped allocator
 * result and the call site (__FILE__, __LINE__, FUNCTION__) to the
 * Perl_mem_log_*() functions; otherwise they just evaluate to their last
 * argument. */
#ifdef PERL_MEM_LOG
#define MEM_LOG_ALLOC(n,t,a) Perl_mem_log_alloc(n,sizeof(t),STRINGIFY(t),a,__FILE__,__LINE__,FUNCTION__)
#define MEM_LOG_REALLOC(n,t,v,a) Perl_mem_log_realloc(n,sizeof(t),STRINGIFY(t),v,a,__FILE__,__LINE__,FUNCTION__)
#define MEM_LOG_FREE(a) Perl_mem_log_free(a,__FILE__,__LINE__,FUNCTION__)
#endif

#ifndef MEM_LOG_ALLOC
#define MEM_LOG_ALLOC(n,t,a) (a)
#endif
#ifndef MEM_LOG_REALLOC
#define MEM_LOG_REALLOC(n,t,v,a) (a)
#endif
#ifndef MEM_LOG_FREE
#define MEM_LOG_FREE(a) (a)
#endif
2681
/* Allocate room for n items of type t and assign the pointer to v.
 * Newx casts the result to (t*), Newxc casts it to (c*), and Newxz uses
 * safecalloc() instead of safemalloc().  Each runs MEM_WRAP_CHECK_ on (n,t)
 * first and routes the allocator result through MEM_LOG_ALLOC. */
#define Newx(v,n,t) (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t))))))
#define Newxc(v,n,t,c) (v = (MEM_WRAP_CHECK_(n,t) (c*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t))))))
#define Newxz(v,n,t) (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safecalloc((n),sizeof(t)))))
#ifndef PERL_CORE
/* pre 5.9.x compatibility: the leading x argument is ignored and the rest
 * are forwarded to Newx()/Newxc()/Newxz(). */
#define New(x,v,n,t) Newx(v,n,t)
#define Newc(x,v,n,t,c) Newxc(v,n,t,c)
#define Newz(x,v,n,t) Newxz(v,n,t)
#endif
a02a5408 2692
/* Resize the buffer v to hold n items of type t via saferealloc() and assign
 * the result back to v.  Renew casts the result to (t*), Renewc to (c*).
 * Note that v is evaluated more than once. */
#define Renew(v,n,t) \
          (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t))))))
#define Renewc(v,n,t,c) \
          (v = (MEM_WRAP_CHECK_(n,t) (c*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t))))))
/* Release memory through safefree(), routing the pointer through
 * MEM_LOG_FREE.  Under PERL_POISON a non-NULL d is freed and then the
 * pointer variable d itself is overwritten with the poison pattern, so d
 * must be an lvalue there; a NULL d is left alone. */
#ifdef PERL_POISON
#define Safefree(d) \
  ((d) ? (void)(safefree(MEM_LOG_FREE((Malloc_t)(d))), Poison(&(d), 1, Malloc_t)) : (void) 0)
#else
#define Safefree(d) safefree(MEM_LOG_FREE((Malloc_t)(d)))
#endif
55497cff 2704
/* assert that a valid ptr has been supplied - use this instead of assert(ptr)
 * as it handles cases like constant string arguments without throwing
 * warnings.  The cast is required, as is the inequality check, to avoid
 * warnings. */
#define perl_assert_ptr(p) assert( ((void*)(p)) != 0 )
55497cff 2709
/* Move/Copy/Zero operate on n items of type t: Move uses memmove(), Copy
 * uses memcpy(), Zero uses memzero().  Note the argument order is
 * (source, destination).  Each runs MEM_WRAP_CHECK_ on (n,t) and asserts
 * that its pointer arguments are non-NULL; the result is cast to void. */
#define Move(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memmove((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define Copy(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memcpy((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define Zero(d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), (void)memzero((char*)(d), (n) * sizeof(t)))
2714
/* Like Move/Copy/Zero, but the value of the expression is whatever the
 * underlying memmove()/memcpy()/memzero() call returns (a pointer to 'd'). */
#define MoveD(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memmove((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define CopyD(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memcpy((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define ZeroD(d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), memzero((char*)(d), (n) * sizeof(t)))
e90e2364 2719
/* Fill n items of type t at d with the byte b.  PoisonNew uses 0xAB,
 * PoisonFree uses 0xEF, and Poison is an alias for PoisonFree. */
#define PoisonWith(d,n,t,b) (MEM_WRAP_CHECK_(n,t) (void)memset((char*)(d), (U8)(b), (n) * sizeof(t)))
#define PoisonNew(d,n,t) PoisonWith(d,n,t,0xAB)
#define PoisonFree(d,n,t) PoisonWith(d,n,t,0xEF)
#define Poison(d,n,t) PoisonFree(d,n,t)
27d5b266 2724
/* Expands to its argument only when PERL_POISON is defined; otherwise it
 * expands to nothing. */
#ifdef PERL_POISON
#  define PERL_POISON_EXPR(x) x
#else
#  define PERL_POISON_EXPR(x)
#endif
2730
/* Copy the structure at s to the structure at d by assignment; t is the
 * structure type.  Note the argument order is (source, destination).  The
 * source is read through a pointer-to-const so that a const source is not
 * silently cast to non-const. */
#define StructCopy(s,d,t) (*((t*)(d)) = *((t const *)(s)))
2cc61e15 2732
/*
=head1 Handy Values

=for apidoc Am|STRLEN|C_ARRAY_LENGTH|void *a

Returns the number of elements in the input C array (so valid zero-based
indices are strictly less than this value).

=for apidoc Am|void *|C_ARRAY_END|void *a

Returns a pointer to one element past the final element of the input C array.

=cut

C_ARRAY_END is one past the last: half-open/half-closed range, not
last-inclusive range.

Both rely on sizeof(a), so 'a' must be a real array, not a pointer to its
first element.
*/
#define C_ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0]))
#define C_ARRAY_END(a) ((a) + C_ARRAY_LENGTH(a))
622913ab 2752
/* Perl_va_copy(s, d) copies the va_list s into d.  Note the argument order
 * is (source, destination), the reverse of va_copy().  Only defined when
 * NEED_VA_COPY is set; it prefers va_copy, then __va_copy, and otherwise
 * falls back to a Copy() of one va_list. */
#ifdef NEED_VA_COPY
# ifdef va_copy
#  define Perl_va_copy(s, d) va_copy(d, s)
# elif defined(__va_copy)
#  define Perl_va_copy(s, d) __va_copy(d, s)
# else
#  define Perl_va_copy(s, d) Copy(s, d, 1, va_list)
# endif
#endif
2762
/* convenience debug macros: format-string fragments and the matching
 * argument fragments for printing the interpreter pointer (my_perl).
 * Without USE_ITHREADS they all expand to nothing.  The double-underscore
 * forms carry a leading ", " / comma; the trailing-underscore value forms
 * carry a trailing comma. */
#ifdef USE_ITHREADS
#define pTHX_FORMAT  "Perl interpreter: 0x%p"
#define pTHX__FORMAT ", Perl interpreter: 0x%p"
#define pTHX_VALUE_   (void *)my_perl,
#define pTHX_VALUE    (void *)my_perl
#define pTHX__VALUE_ ,(void *)my_perl,
#define pTHX__VALUE  ,(void *)my_perl
#else
#define pTHX_FORMAT
#define pTHX__FORMAT
#define pTHX_VALUE_
#define pTHX_VALUE
#define pTHX__VALUE_
#define pTHX__VALUE
#endif /* USE_ITHREADS */
3609ea0d 2779
/* Perl_deprecate was not part of the public API, and did not have a deprecate()
   shortcut macro defined without -DPERL_CORE. Neither codesearch.google.com nor
   CPAN::Unpack show any users outside the core. */
/* All three issue a WARN_DEPRECATED warning through Perl_ck_warner_d().
 * The s, when and message arguments are pasted next to string literals, so
 * they must themselves be string literals. */
#ifdef PERL_CORE
#  define deprecate(s) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
                                        "Use of " s " is deprecated")
#  define deprecate_disappears_in(when,message) \
              Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
                               message ", and will disappear in Perl " when)
#  define deprecate_fatal_in(when,message) \
              Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
                               message ". Its use will be fatal in Perl " when)
#endif
2793
/* Internal macros to deal with gids and uids */
/* The SV representation is picked at compile time: an id type wider than an
 * IV goes through NV; otherwise a type with *_sign <= 0 (presumably a signed
 * type) goes through IV, and anything else through UV. */
#ifdef PERL_CORE

#  if Uid_t_size > IVSIZE
#    define sv_setuid(sv, uid)       sv_setnv((sv), (NV)(uid))
#    define SvUID(sv)                SvNV(sv)
#  elif Uid_t_sign <= 0
#    define sv_setuid(sv, uid)       sv_setiv((sv), (IV)(uid))
#    define SvUID(sv)                SvIV(sv)
#  else
#    define sv_setuid(sv, uid)       sv_setuv((sv), (UV)(uid))
#    define SvUID(sv)                SvUV(sv)
#  endif /* Uid_t_size */

#  if Gid_t_size > IVSIZE
#    define sv_setgid(sv, gid)       sv_setnv((sv), (NV)(gid))
#    define SvGID(sv)                SvNV(sv)
#  elif Gid_t_sign <= 0
#    define sv_setgid(sv, gid)       sv_setiv((sv), (IV)(gid))
#    define SvGID(sv)                SvIV(sv)
#  else
#    define sv_setgid(sv, gid)       sv_setuv((sv), (UV)(gid))
#    define SvGID(sv)                SvUV(sv)
#  endif /* Gid_t_size */

#endif
2820
6a5bc5ac 2821#endif /* PERL_HANDY_H_ */
9d745869 2822
e9a8c099 2823/*
14d04a33 2824 * ex: set ts=8 sts=4 sw=4 et:
e9a8c099 2825 */