This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add tests for value magic
[perl5.git] / handy.h
CommitLineData
a0d0e21e 1/* handy.h
a687059c 2 *
1129b882 3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000,
da5d8dbb 4 * 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others
a687059c 5 *
6e21c824
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8d063cd8 8 *
8d063cd8
LW
9 */
10
4650c663
KW
11/* IMPORTANT NOTE: Everything whose name begins with an underscore is for
12 * internal core Perl use only. */
13
6a5bc5ac
KW
14#ifndef PERL_HANDY_H_ /* Guard against nested #inclusion */
15#define PERL_HANDY_H_
9d745869 16
24792b8d
NC
17#ifndef PERL_CORE
18# define Null(type) ((type)NULL)
954c1994
GS
19
20/*
3f620621 21=for apidoc_section $string
78342678 22=for apidoc AmnU||Nullch
72d33970
FC
23Null character pointer. (No longer available when C<PERL_CORE> is
24defined.)
2307c6d0 25
3f620621 26=for apidoc_section $SV
78342678 27=for apidoc AmnU||Nullsv
72d33970 28Null SV pointer. (No longer available when C<PERL_CORE> is defined.)
954c1994
GS
29
30=cut
51b56f5c
KW
31
32Below are signatures of functions from config.h which can't easily be gleaned
33from it, and are very unlikely to change
34
3f620621 35=for apidoc_section $signals
51b56f5c
KW
36=for apidoc Am|int|Sigsetjmp|jmp_buf env|int savesigs
37=for apidoc Am|void|Siglongjmp|jmp_buf env|int val
38
3f620621 39=for apidoc_section $filesystem
51b56f5c
KW
40=for apidoc Am|void *|FILE_ptr|FILE * f
41=for apidoc Am|Size_t|FILE_cnt|FILE * f
42=for apidoc Am|void *|FILE_base|FILE * f
b290de04 43=for apidoc Am|Size_t|FILE_bufsiz|FILE *f
51b56f5c 44
3f620621 45=for apidoc_section $string
b290de04
KW
46=for apidoc Amu|token|CAT2|token x|token y
47=for apidoc Amu|string|STRINGIFY|token x
51b56f5c 48
3f620621 49=for apidoc_section $numeric
b290de04 50=for apidoc Am|double|Drand01
51b56f5c
KW
51=for apidoc Am|void|seedDrand01|Rand_seed_t x
52=for apidoc Am|char *|Gconvert|double x|Size_t n|bool t|char * b
53
54=cut
954c1994
GS
55*/
56
24792b8d
NC
57# define Nullch Null(char*)
58# define Nullfp Null(PerlIO*)
59# define Nullsv Null(SV*)
60#endif
8d063cd8 61
641d3f0b 62#ifdef TRUE
63#undef TRUE
64#endif
65#ifdef FALSE
66#undef FALSE
67#endif
68#define TRUE (1)
69#define FALSE (0)
70
df87895c 71/*
3f620621 72=for apidoc_section $SV
1607e393
KW
73=for apidoc Am |AV * |MUTABLE_AV |AV * p
74=for apidoc_item |CV * |MUTABLE_CV |CV * p
75=for apidoc_item |GV * |MUTABLE_GV |GV * p
76=for apidoc_item |HV * |MUTABLE_HV |HV * p
77=for apidoc_item |IO * |MUTABLE_IO |IO * p
78=for apidoc_item |void *|MUTABLE_PTR|void * p
79=for apidoc_item |SV * |MUTABLE_SV |SV * p
df87895c
KW
80
81The C<MUTABLE_I<*>>() macros cast pointers to the types shown, in such a way
82(compiler permitting) that casting away const-ness will give a warning;
83e.g.:
84
85 const SV *sv = ...;
86 AV *av1 = (AV*)sv; <== BAD: the const has been silently
87 cast away
88 AV *av2 = MUTABLE_AV(sv); <== GOOD: it may warn
89
54ba56f3
KW
90C<MUTABLE_PTR> is the base macro used to derive new casts. The other
91already-built-in ones return pointers to what their names indicate.
df87895c
KW
92
93=cut
cf3f0ffb 94
ba04fd90
KW
95The brace group version will raise a diagnostic if 'p' is const; the other
96blindly casts away const.
97 */
041c1a23 98#if defined(PERL_USE_GCC_BRACE_GROUPS)
6c2255e0 99# define MUTABLE_PTR(p) ({ void *p_ = (p); p_; })
b1bc3f34
NC
100#else
101# define MUTABLE_PTR(p) ((void *) (p))
102#endif
103
a062e10d 104#define MUTABLE_AV(p) ((AV *)MUTABLE_PTR(p))
ea726b52 105#define MUTABLE_CV(p) ((CV *)MUTABLE_PTR(p))
159b6efe 106#define MUTABLE_GV(p) ((GV *)MUTABLE_PTR(p))
dbebbdb4 107#define MUTABLE_HV(p) ((HV *)MUTABLE_PTR(p))
a45c7426 108#define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p))
b1bc3f34 109#define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p))
27d4fb96 110
888c3878
PE
111/*
112=for apidoc_section $SV
113=for apidoc Am |AV *|AV_FROM_REF|SV * ref
114=for apidoc_item |CV *|CV_FROM_REF|SV * ref
115=for apidoc_item |HV *|HV_FROM_REF|SV * ref
116
117The C<I<*>V_FROM_REF> macros extract the C<SvRV()> from a given reference SV
118and return a suitably-cast pointer to the referenced SV. When running
119under C<-DDEBUGGING>, assertions are also applied that check that I<ref> is
120definitely a reference SV that refers to an SV of the right type.
121
122=cut
123*/
124
125#if defined(DEBUGGING) && defined(PERL_USE_GCC_BRACE_GROUPS)
126# define xV_FROM_REF(XV, ref) \
127 ({ SV *_ref = ref; \
128 assert(SvROK(_ref)); \
129 assert(SvTYPE(SvRV(_ref)) == SVt_PV ## XV); \
130 (XV *)(SvRV(_ref)); })
131#else
132# define xV_FROM_REF(XV, ref) ((XV *)(SvRV(ref)))
133#endif
134
135#define AV_FROM_REF(ref) xV_FROM_REF(AV, ref)
136#define CV_FROM_REF(ref) xV_FROM_REF(CV, ref)
137#define HV_FROM_REF(ref) xV_FROM_REF(HV, ref)
138
b1c011dc 139#ifndef __cplusplus
bd31be4b 140# include <stdbool.h>
bd31be4b
NC
141#endif
142
25ba28ce 143/*
3f620621 144=for apidoc_section $casting
25ba28ce
KW
145=for apidoc Am|bool|cBOOL|bool expr
146
9fa5fd96
KW
147Cast-to-bool. When Perl was able to be compiled on pre-C99 compilers, a
148C<(bool)> cast didn't necessarily do the right thing, so this macro was
149created (and made somewhat complicated to work around bugs in old
150compilers). Now, many years later, with C99 in use, this is no longer
151required, but is kept for backwards compatibility.
25ba28ce
KW
152
153=cut
154*/
9fa5fd96 155#define cBOOL(cbool) ((bool) (cbool))
f2338a2e 156
46c6c7e2 157/* Try to figure out __func__ or __FUNCTION__ equivalent, if any.
e352bcff
JH
158 * XXX Should really be a Configure probe, with HAS__FUNCTION__
159 * and FUNCTION__ as results.
160 * XXX Similarly, a Configure probe for __FILE__ and __LINE__ is needed. */
46c6c7e2
JH
161#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C)) /* C99 or close enough. */
162# define FUNCTION__ __func__
f31da5f4 163# define SAFE_FUNCTION__ __func__
7adf2470 164#elif (defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tur64, -c99 not known, only -std1). */
f31da5f4
YO
165# define FUNCTION__ ("")
166# define SAFE_FUNCTION__ ("UNKNOWN")
46c6c7e2 167#else
07798b17 168# define FUNCTION__ __FUNCTION__ /* Common extension. */
f31da5f4 169# define SAFE_FUNCTION__ __FUNCTION__ /* Common extension. */
46c6c7e2
JH
170#endif
171
27d4fb96 172/* XXX A note on the perl source internal type system. The
173 original intent was that I32 be *exactly* 32 bits.
174
175 Currently, we only guarantee that I32 is *at least* 32 bits.
176 Specifically, if int is 64 bits, then so is I32. (This is the case
177 for the Cray.) This has the advantage of meshing nicely with
178 standard library calls (where we pass an I32 and the library is
179 expecting an int), but the disadvantage that an I32 is not 32 bits.
180 Andy Dougherty August 1996
24fef2a7 181
dc45a647
MB
182 There is no guarantee that there is *any* integral type with
183 exactly 32 bits. It is perfectly legal for a system to have
184 sizeof(short) == sizeof(int) == sizeof(long) == 8.
693762b4 185
dc45a647
MB
186 Similarly, there is no guarantee that I16 and U16 have exactly 16
187 bits.
693762b4 188
8e84507e
NIS
189 For dealing with issues that may arise from various 32/64-bit
190 systems, we will ask Configure to check out
8175356b 191
1604cfb0
MS
192 SHORTSIZE == sizeof(short)
193 INTSIZE == sizeof(int)
194 LONGSIZE == sizeof(long)
195 LONGLONGSIZE == sizeof(long long) (if HAS_LONG_LONG)
196 PTRSIZE == sizeof(void *)
197 DOUBLESIZE == sizeof(double)
198 LONG_DOUBLESIZE == sizeof(long double) (if HAS_LONG_DOUBLE).
8175356b 199
27d4fb96 200*/
201
69512466
JH
202#ifdef I_INTTYPES /* e.g. Linux has int64_t without <inttypes.h> */
203# include <inttypes.h>
dd0eed91
JH
204# ifdef INT32_MIN_BROKEN
205# undef INT32_MIN
206# define INT32_MIN (-2147483647-1)
207# endif
208# ifdef INT64_MIN_BROKEN
209# undef INT64_MIN
210# define INT64_MIN (-9223372036854775807LL-1)
211# endif
69512466
JH
212#endif
213
8175356b
JH
214typedef I8TYPE I8;
215typedef U8TYPE U8;
216typedef I16TYPE I16;
217typedef U16TYPE U16;
218typedef I32TYPE I32;
219typedef U32TYPE U32;
16d89be8 220
74b807c7 221#ifdef QUADKIND
8175356b
JH
222typedef I64TYPE I64;
223typedef U64TYPE U64;
16d89be8 224#endif
8175356b 225
5ff3f7a4
GS
226/* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type.
227 Please search CHAR_MAX in perl.h for further details. */
26e4a9a9
KW
228#ifdef UINT8_MAX
229# define U8_MAX UINT8_MAX
0e983133 230#else
26e4a9a9 231# define U8_MAX PERL_UCHAR_MAX
0e983133 232#endif
26e4a9a9
KW
233#ifdef UINT8_MIN
234# define U8_MIN UINT8_MIN
5ff3f7a4 235#else
26e4a9a9
KW
236# define U8_MIN PERL_UCHAR_MIN
237#endif
5ff3f7a4 238
26e4a9a9
KW
239#ifdef INT16_MAX
240# define I16_MAX INT16_MAX
79072805 241#else
26e4a9a9
KW
242# define I16_MAX PERL_SHORT_MAX
243#endif
244#ifdef INT16_MIN
245# define I16_MIN INT16_MIN
246#else
247# define I16_MIN PERL_SHORT_MIN
248#endif
249#ifdef UINT16_MAX
250# define U16_MAX UINT16_MAX
251#else
252# define U16_MAX PERL_USHORT_MAX
253#endif
254#ifdef UINT16_MIN
255# define U16_MIN UINT16_MIN
256#else
257# define U16_MIN PERL_USHORT_MIN
79072805
LW
258#endif
259
26e4a9a9
KW
260#ifdef INT32_MAX
261# define I32_MAX INT32_MAX
262#elif LONGSIZE > 4
263# define I32_MAX PERL_INT_MAX
264#else
265# define I32_MAX PERL_LONG_MAX
266#endif
267#ifdef INT32_MIN
268# define I32_MIN INT32_MIN
269#elif LONGSIZE > 4
270# define I32_MIN PERL_INT_MIN
271#else
272# define I32_MIN PERL_LONG_MIN
273#endif
/* U32_MAX: the largest value of a 32-bit unsigned quantity.  Prefer the C99
 * <stdint.h> constant UINT32_MAX (the same way I32_MAX is taken from
 * INT32_MAX), substituting the literal value when the platform's definition
 * is flagged as broken.  Without UINT32_MAX, fall back to the PERL_* limit of
 * the native type chosen by LONGSIZE. */
#ifdef UINT32_MAX
#  ifndef UINT32_MAX_BROKEN /* e.g. HP-UX with gcc messes this up */
#    define U32_MAX UINT32_MAX
#  else
#    define U32_MAX 4294967295U
#  endif
#elif LONGSIZE > 4
#  define U32_MAX PERL_UINT_MAX
#else
#  define U32_MAX PERL_ULONG_MAX
#endif
285#ifdef UINT32_MIN
286# define U32_MIN UINT32_MIN
287#elif LONGSIZE > 4
288# define U32_MIN PERL_UINT_MIN
289#else
290# define U32_MIN PERL_ULONG_MIN
5ff3f7a4
GS
291#endif
292
23332c7d
KW
293/*
294=for apidoc_section $integer
295=for apidoc Ay|| PERL_INT_FAST8_T
296=for apidoc_item PERL_INT_FAST16_T
297=for apidoc_item PERL_UINT_FAST8_T
298=for apidoc_item PERL_UINT_FAST16_T
299
300These are equivalent to the correspondingly-named C99 typedefs on platforms
301that have those; they evaluate to C<int> and C<unsigned int> on platforms that
302don't, so that you can portably take advantage of this C99 feature.
303
304=cut
305*/
934902b8 306#ifdef I_STDINT
247cee9f
KW
307 typedef int_fast8_t PERL_INT_FAST8_T;
308 typedef uint_fast8_t PERL_UINT_FAST8_T;
309 typedef int_fast16_t PERL_INT_FAST16_T;
310 typedef uint_fast16_t PERL_UINT_FAST16_T;
934902b8 311#else
247cee9f
KW
312 typedef int PERL_INT_FAST8_T;
313 typedef unsigned int PERL_UINT_FAST8_T;
314 typedef int PERL_INT_FAST16_T;
315 typedef unsigned int PERL_UINT_FAST16_T;
934902b8 316#endif
247cee9f 317
464decb6 318/* log(2) (i.e., log base 10 of 2) is pretty close to 0.30103, just in case
ab350cbd
KW
319 * anyone is grepping for it. So BIT_DIGITS gives the number of decimal digits
320 * required to represent any possible unsigned number containing N bits.
321 * TYPE_DIGITS gives the number of decimal digits required to represent any
322 * possible unsigned number of type T. */
464decb6 323#define BIT_DIGITS(N) (((N)*146)/485 + 1) /* log10(2) =~ 146/485 */
fc36a67e 324#define TYPE_DIGITS(T) BIT_DIGITS(sizeof(T) * 8)
325#define TYPE_CHARS(T) (TYPE_DIGITS(T) + 2) /* sign, NUL */
326
88794300 327/* Unused by core; should be deprecated */
ff68c719 328#define Ctl(ch) ((ch) & 037)
8d063cd8 329
98fce2a4
KW
330#if defined(PERL_CORE) || defined(PERL_EXT)
331# ifndef MIN
332# define MIN(a,b) ((a) < (b) ? (a) : (b))
333# endif
334# ifndef MAX
335# define MAX(a,b) ((a) > (b) ? (a) : (b))
336# endif
337#endif
338
84ff4fa9
KW
339/* Returns a boolean as to whether the input unsigned number is a power of 2
340 * (2**0, 2**1, etc). In other words if it has just a single bit set.
341 * If not, subtracting 1 would leave the uppermost bit set, so the & would
342 * yield non-zero */
343#if defined(PERL_CORE) || defined(PERL_EXT)
011b1419 344# define isPOWER_OF_2(n) ((n) && ((n) & ((n)-1)) == 0)
84ff4fa9
KW
345#endif
346
d223e1ea 347/* Returns a mask with the lowest n bits set */
fae1e72b 348#define nBIT_MASK(n) ((UINTMAX_C(1) << (n)) - 1)
d223e1ea 349
1381ccb1
KW
350/* The largest unsigned number that will fit into n bits */
351#define nBIT_UMAX(n) nBIT_MASK(n)
352
8d9433eb 353/*
3f620621 354=for apidoc_section $directives
d23778e6 355=for apidoc Am||__ASSERT_|bool expr
8d9433eb
KW
356
357This is a helper macro to avoid preprocessor issues, replaced by nothing
358unless under DEBUGGING, where it expands to an assert of its argument,
359followed by a comma (hence the comma operator). If we just used a straight
360assert(), we would get a comma with nothing before it when not DEBUGGING.
361
362=cut
363
803e4935 364We also use empty definition under Coverity since the __ASSERT_
8d9433eb
KW
365checks often check for things that Really Cannot Happen, and Coverity
366detects that and gets all excited. */
3e94db23 367
e7ae132e
KW
368#if defined(DEBUGGING) && !defined(__COVERITY__) \
369 && ! defined(PERL_SMALL_MACRO_BUFFER)
0f092d08
KW
370# define __ASSERT_(statement) assert(statement),
371#else
372# define __ASSERT_(statement)
373#endif
374
3fe05580 375/*
3f620621 376=for apidoc_section $SV
3fe05580 377
3bb9fd01 378=for apidoc Ama|SV*|newSVpvs|"literal string"
1568d13a 379Like C<newSVpvn>, but takes a literal string instead of a
30a15352 380string/length pair.
3fe05580 381
3bb9fd01 382=for apidoc Ama|SV*|newSVpvs_flags|"literal string"|U32 flags
1568d13a 383Like C<newSVpvn_flags>, but takes a literal string instead of
30a15352 384a string/length pair.
84bafc02 385
3bb9fd01 386=for apidoc Ama|SV*|newSVpvs_share|"literal string"
1568d13a 387Like C<newSVpvn_share>, but takes a literal string instead of
30a15352 388a string/length pair and omits the hash parameter.
3fe05580 389
3bb9fd01 390=for apidoc Am|void|sv_catpvs_flags|SV* sv|"literal string"|I32 flags
1568d13a 391Like C<sv_catpvn_flags>, but takes a literal string instead
30a15352 392of a string/length pair.
9dcc53ea 393
3bb9fd01 394=for apidoc Am|void|sv_catpvs_nomg|SV* sv|"literal string"
1568d13a 395Like C<sv_catpvn_nomg>, but takes a literal string instead of
0c395ea5 396a string/length pair.
9dcc53ea 397
3bb9fd01 398=for apidoc Am|void|sv_catpvs|SV* sv|"literal string"
1568d13a 399Like C<sv_catpvn>, but takes a literal string instead of a
0c395ea5 400string/length pair.
3fe05580 401
3bb9fd01 402=for apidoc Am|void|sv_catpvs_mg|SV* sv|"literal string"
1568d13a 403Like C<sv_catpvn_mg>, but takes a literal string instead of a
9dcc53ea
Z
404string/length pair.
405
3bb9fd01 406=for apidoc Am|SV *|sv_setref_pvs|SV *const rv|const char *const classname|"literal string"
1568d13a 407Like C<sv_setref_pvn>, but takes a literal string instead of
0c395ea5 408a string/length pair.
9dcc53ea 409
3f620621 410=for apidoc_section $string
3fe05580 411
3bb9fd01 412=for apidoc Ama|char*|savepvs|"literal string"
1568d13a 413Like C<savepvn>, but takes a literal string instead of a
30a15352 414string/length pair.
3fe05580 415
3bb9fd01 416=for apidoc Ama|char*|savesharedpvs|"literal string"
9dcc53ea
Z
417A version of C<savepvs()> which allocates the duplicate string in memory
418which is shared between threads.
419
3f620621 420=for apidoc_section $GV
3fe05580 421
3bb9fd01 422=for apidoc Am|HV*|gv_stashpvs|"name"|I32 create
1568d13a 423Like C<gv_stashpvn>, but takes a literal string instead of a
0c395ea5 424string/length pair.
3fe05580 425
3f620621 426=for apidoc_section $HV
3fe05580 427
3bb9fd01 428=for apidoc Am|SV**|hv_fetchs|HV* tb|"key"|I32 lval
1568d13a 429Like C<hv_fetch>, but takes a literal string instead of a
0c395ea5 430string/length pair.
3f620621 431=for apidoc_section $lexer
510966aa 432
3bb9fd01 433=for apidoc Amx|void|lex_stuff_pvs|"pv"|U32 flags
510966aa 434
1568d13a 435Like L</lex_stuff_pvn>, but takes a literal string instead of
0c395ea5 436a string/length pair.
510966aa 437
3fe05580
MHM
438=cut
439*/
440
ca0572d7
KW
441#define ASSERT_IS_LITERAL(s) ("" s "")
442
a34e53fc 443/*
3f620621 444=for apidoc_section $string
2efa8cc7 445
a34e53fc
KW
446=for apidoc Amu|pair|STR_WITH_LEN|"literal string"
447
448Returns two comma separated tokens of the input literal string, and its length.
449This is a convenience macro which helps out in some API calls.
450Note that it can't be used as an argument to macros or functions that under
451some configurations might be macros, which means that it requires the full
452Perl_xxx(aTHX_ ...) form for any API calls where it's used.
453
454=cut
455*/
456
ca0572d7 457#define STR_WITH_LEN(s) ASSERT_IS_LITERAL(s), (sizeof(s)-1)
ba3a79e7
GA
458
459/* STR_WITH_LEN() shortcuts */
460#define newSVpvs(str) Perl_newSVpvn(aTHX_ STR_WITH_LEN(str))
84bafc02
NC
461#define newSVpvs_flags(str,flags) \
462 Perl_newSVpvn_flags(aTHX_ STR_WITH_LEN(str), flags)
ba3a79e7 463#define newSVpvs_share(str) Perl_newSVpvn_share(aTHX_ STR_WITH_LEN(str), 0)
9dcc53ea
Z
464#define sv_catpvs_flags(sv, str, flags) \
465 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), flags)
466#define sv_catpvs_nomg(sv, str) \
467 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), 0)
468#define sv_catpvs(sv, str) \
469 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC)
470#define sv_catpvs_mg(sv, str) \
471 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC|SV_SMAGIC)
3fe05580 472#define sv_setpvs(sv, str) Perl_sv_setpvn(aTHX_ sv, STR_WITH_LEN(str))
9dcc53ea
Z
473#define sv_setpvs_mg(sv, str) Perl_sv_setpvn_mg(aTHX_ sv, STR_WITH_LEN(str))
474#define sv_setref_pvs(rv, classname, str) \
475 Perl_sv_setref_pvn(aTHX_ rv, classname, STR_WITH_LEN(str))
ba3a79e7 476#define savepvs(str) Perl_savepvn(aTHX_ STR_WITH_LEN(str))
9dcc53ea
Z
477#define savesharedpvs(str) Perl_savesharedpvn(aTHX_ STR_WITH_LEN(str))
478#define gv_stashpvs(str, create) \
479 Perl_gv_stashpvn(aTHX_ STR_WITH_LEN(str), create)
3752a9fe
KW
480
481#define gv_fetchpvs(namebeg, flags, sv_type) \
482 Perl_gv_fetchpvn_flags(aTHX_ STR_WITH_LEN(namebeg), flags, sv_type)
72659597 483#define gv_fetchpvn gv_fetchpvn_flags
9dcc53ea
Z
484#define sv_catxmlpvs(dsv, str, utf8) \
485 Perl_sv_catxmlpvn(aTHX_ dsv, STR_WITH_LEN(str), utf8)
4ac46235 486
ba3a79e7 487
510966aa
Z
488#define lex_stuff_pvs(pv,flags) Perl_lex_stuff_pvn(aTHX_ STR_WITH_LEN(pv), flags)
489
b96d8cd9 490#define get_cvs(str, flags) \
1604cfb0 491 Perl_get_cvn_flags(aTHX_ STR_WITH_LEN(str), (flags))
5c1737d1 492
9b6e9510 493/* internal helpers */
4a1bbd3d 494/* Transitional */
d67f622b
N
495#ifndef PERL_VERSION_MAJOR
496# define PERL_VERSION_MAJOR PERL_REVISION
4a1bbd3d
KW
497#else
498# undef PERL_REVISION /* We don't want code to be using these */
499#endif
d67f622b
N
500#ifndef PERL_VERSION_MINOR
501# define PERL_VERSION_MINOR PERL_VERSION
4a1bbd3d
KW
502#else
503# undef PERL_VERSION
504#endif
d67f622b
N
505#ifndef PERL_VERSION_PATCH
506# define PERL_VERSION_PATCH PERL_SUBVERSION
4a1bbd3d
KW
507#else
508# undef PERL_SUBVERSION
509#endif
510
511#define PERL_JNP_TO_DECIMAL_(maJor,miNor,Patch) \
512 /* '10*' leaves room for things like alpha, beta, releases */ \
513 (10 * ((maJor) * 1000000) + ((miNor) * 1000) + (Patch))
9b6e9510 514#define PERL_DECIMAL_VERSION_ \
d67f622b
N
515 PERL_JNP_TO_DECIMAL_(PERL_VERSION_MAJOR, PERL_VERSION_MINOR, \
516 PERL_VERSION_PATCH)
9b6e9510
KW
517
518/*
3f620621 519=for apidoc_section $versioning
4a1bbd3d 520=for apidoc AmR|bool|PERL_VERSION_EQ|const U8 major|const U8 minor|const U8 patch
9de44d19 521=for apidoc_item PERL_VERSION_GE
1607e393
KW
522=for apidoc_item PERL_VERSION_GT
523=for apidoc_item PERL_VERSION_LE
524=for apidoc_item PERL_VERSION_LT
525=for apidoc_item PERL_VERSION_NE
9b6e9510 526
4a1bbd3d 527Returns whether or not the perl currently being compiled has the specified
9b6e9510
KW
528relationship to the perl given by the parameters. For example,
529
530 #if PERL_VERSION_GT(5,24,2)
531 code that will only be compiled on perls after v5.24.2
532 #else
533 fallback code
534 #endif
535
536Note that this is usable in making compile-time decisions
537
4a1bbd3d
KW
538You may use the special value '*' for the final number to mean ALL possible
539values for it. Thus,
540
541 #if PERL_VERSION_EQ(5,31,'*')
542
543means all perls in the 5.31 series. And
544
545 #if PERL_VERSION_NE(5,24,'*')
546
547means all perls EXCEPT 5.24 ones. And
548
549 #if PERL_VERSION_LE(5,9,'*')
550
551is effectively
552
553 #if PERL_VERSION_LT(5,10,0)
554
555This means you don't have to think so much when converting from the existing
556deprecated C<PERL_VERSION> to using this macro:
557
558 #if PERL_VERSION <= 9
559
560becomes
561
562 #if PERL_VERSION_LE(5,9,'*')
563
9b6e9510
KW
564=cut
565*/
566
4a1bbd3d
KW
567/* N.B. These don't work if the patch version is 42 or 92, as those are what
568 * '*' is in ASCII and EBCDIC respectively */
/* Version-comparison predicates, usable in #if.  Each one compares the perl
 * being compiled (PERL_DECIMAL_VERSION_) with the version whose major, minor
 * and patch numbers are 'j', 'n' and 'p'.  A patch of '*' is a wildcard that
 * stands for every patch level of j.n. */
# define PERL_VERSION_EQ(j,n,p)                                             \
    (((p) == '*')                                                           \
     ? (   (j) == PERL_VERSION_MAJOR                                        \
        && (n) == PERL_VERSION_MINOR)                                       \
     : (PERL_DECIMAL_VERSION_ == PERL_JNP_TO_DECIMAL_(j,n,p)))
# define PERL_VERSION_NE(j,n,p) (! PERL_VERSION_EQ(j,n,p))

# define PERL_VERSION_LT(j,n,p) /* < '*' effectively means < 0 */           \
    (PERL_DECIMAL_VERSION_ < PERL_JNP_TO_DECIMAL_( (j),                     \
                                                   (n),                     \
                                    (((p) == '*') ? 0 : (p))))
# define PERL_VERSION_GE(j,n,p) (! PERL_VERSION_LT(j,n,p))

/* The versions are integers, so "<= j.n.p" is evaluated as "< j.n.(p+1)";
 * with the wildcard, "<= j.n.*" is evaluated as "< j.(n+1).0". */
# define PERL_VERSION_LE(j,n,p) /* <= '*' effectively means < n+1 */        \
    (PERL_DECIMAL_VERSION_ < PERL_JNP_TO_DECIMAL_( (j),                     \
                                    (((p) == '*') ? ((n)+1) : (n)),         \
                                    (((p) == '*') ? 0 : (p) + 1)))
# define PERL_VERSION_GT(j,n,p) (! PERL_VERSION_LE(j,n,p))
9b6e9510 587
954c1994 588/*
3f620621 589=for apidoc_section $string
ccfc67b7 590
954c1994 591=for apidoc Am|bool|strNE|char* s1|char* s2
dc6b0978
KW
592Test two C<NUL>-terminated strings to see if they are different. Returns true
593or false.
954c1994
GS
594
595=for apidoc Am|bool|strEQ|char* s1|char* s2
dc6b0978
KW
596Test two C<NUL>-terminated strings to see if they are equal. Returns true or
597false.
954c1994
GS
598
599=for apidoc Am|bool|strLT|char* s1|char* s2
dc6b0978
KW
600Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than the
601second, C<s2>. Returns true or false.
954c1994
GS
602
603=for apidoc Am|bool|strLE|char* s1|char* s2
dc6b0978
KW
604Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than or
605equal to the second, C<s2>. Returns true or false.
954c1994
GS
606
607=for apidoc Am|bool|strGT|char* s1|char* s2
dc6b0978
KW
608Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
609the second, C<s2>. Returns true or false.
954c1994
GS
610
611=for apidoc Am|bool|strGE|char* s1|char* s2
dc6b0978
KW
612Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
613or equal to the second, C<s2>. Returns true or false.
954c1994
GS
614
615=for apidoc Am|bool|strnNE|char* s1|char* s2|STRLEN len
dc6b0978
KW
616Test two C<NUL>-terminated strings to see if they are different. The C<len>
617parameter indicates the number of bytes to compare. Returns true or false. (A
954c1994
GS
618wrapper for C<strncmp>).
619
620=for apidoc Am|bool|strnEQ|char* s1|char* s2|STRLEN len
dc6b0978
KW
621Test two C<NUL>-terminated strings to see if they are equal. The C<len>
622parameter indicates the number of bytes to compare. Returns true or false. (A
623wrapper for C<strncmp>).
954c1994 624
bd18bd40
KW
625=for apidoc Am|bool|memEQ|char* s1|char* s2|STRLEN len
626Test two buffers (which may contain embedded C<NUL> characters) to see if they
627are equal. The C<len> parameter indicates the number of bytes to compare.
b96bd7bf
KW
628Returns true or false. It is undefined behavior if either of the buffers
629doesn't contain at least C<len> bytes.
bd18bd40 630
3bb9fd01 631=for apidoc Am|bool|memEQs|char* s1|STRLEN l1|"s2"
2d8eeddb
KW
632Like L</memEQ>, but the second string is a literal enclosed in double quotes,
633C<l1> gives the number of bytes in C<s1>.
b96bd7bf 634Returns true or false.
2d8eeddb 635
bd18bd40
KW
636=for apidoc Am|bool|memNE|char* s1|char* s2|STRLEN len
637Test two buffers (which may contain embedded C<NUL> characters) to see if they
638are not equal. The C<len> parameter indicates the number of bytes to compare.
b96bd7bf
KW
639Returns true or false. It is undefined behavior if either of the buffers
640doesn't contain at least C<len> bytes.
bd18bd40 641
3bb9fd01 642=for apidoc Am|bool|memNEs|char* s1|STRLEN l1|"s2"
2d8eeddb
KW
643Like L</memNE>, but the second string is a literal enclosed in double quotes,
644C<l1> gives the number of bytes in C<s1>.
b96bd7bf 645Returns true or false.
2d8eeddb 646
4aada8b9
KW
647=for apidoc Am|bool|memCHRs|"list"|char c
648Returns the position of the first occurrence of the byte C<c> in the literal
649string C<"list">, or NULL if C<c> doesn't appear in C<"list">. All bytes are
650treated as unsigned char. Thus this macro can be used to determine if C<c> is
651in a set of particular characters. Unlike L<strchr(3)>, it works even if C<c>
652is C<NUL> (and the set doesn't include C<NUL>).
653
954c1994 654=cut
fc169e00
KW
655
656New macros should use the following conventions for their names (which are
657based on the underlying C library functions):
658
659 (mem | str n? ) (EQ | NE | LT | LE | GT | GE | (( BEGIN | END ) P? )) l? s?
660
661 Each has two main parameters, string-like operands that are compared
662 against each other, as specified by the macro name. Some macros may
663 additionally have one or potentially even two length parameters. If a length
664 parameter applies to both string parameters, it will be positioned third;
665 otherwise any length parameter immediately follows the string parameter it
666 applies to.
667
668 If the prefix to the name is 'str', the string parameter is a pointer to a C
669 language string. Such a string does not contain embedded NUL bytes; its
670 length may be unknown, but can be calculated by C<strlen()>, since it is
671 terminated by a NUL, which isn't included in its length.
672
a3815e44 673 The optional 'n' following 'str' means that there is a third parameter,
fc169e00
KW
674 giving the maximum number of bytes to look at in each string. Even if both
675 strings are longer than the length parameter, those extra bytes will be
676 unexamined.
677
678 The 's' suffix means that the 2nd byte string parameter is a literal C
679 double-quoted string. Its length will automatically be calculated by the
680 macro, so no length parameter will ever be needed for it.
681
682 If the prefix is 'mem', the string parameters don't have to be C strings;
683 they may contain embedded NUL bytes, do not necessarily have a terminating
684 NUL, and their lengths can be known only through other means, which in
685 practice are additional parameter(s) passed to the function. All 'mem'
686 functions have at least one length parameter. Barring any 'l' or 's' suffix,
687 there is a single length parameter, in position 3, which applies to both
688 string parameters. The 's' suffix means, as described above, that the 2nd
689 string is a literal double-quoted C string (hence its length is calculated by
690 the macro, and the length parameter to the function applies just to the first
691 string parameter, and hence is positioned just after it). An 'l' suffix
692 means that the 2nd string parameter has its own length parameter, and the
693 signature will look like memFOOl(s1, l1, s2, l2).
694
695 BEGIN (and END) are for testing if the 2nd string is an initial (or final)
696 substring of the 1st string. 'P' if present indicates that the substring
697 must be a "proper" one in the mathematical sense that the first one must be
698 strictly larger than the 2nd.
699
954c1994
GS
700*/
701
62946e08 702
75400963
KW
703#define strNE(s1,s2) (strcmp(s1,s2) != 0)
704#define strEQ(s1,s2) (strcmp(s1,s2) == 0)
8d063cd8
LW
705#define strLT(s1,s2) (strcmp(s1,s2) < 0)
706#define strLE(s1,s2) (strcmp(s1,s2) <= 0)
707#define strGT(s1,s2) (strcmp(s1,s2) > 0)
708#define strGE(s1,s2) (strcmp(s1,s2) >= 0)
62946e08 709
75400963
KW
710#define strnNE(s1,s2,l) (strncmp(s1,s2,l) != 0)
711#define strnEQ(s1,s2,l) (strncmp(s1,s2,l) == 0)
378cc40b 712
9d3980bc
KW
713#define memEQ(s1,s2,l) (memcmp(((const void *) (s1)), ((const void *) (s2)), l) == 0)
714#define memNE(s1,s2,l) (! memEQ(s1,s2,l))
36477c24 715
085b7534 716/* memEQ and memNE where second comparand is a string constant */
568a785a 717#define memEQs(s1, l, s2) \
ca0572d7 718 (((sizeof(s2)-1) == (l)) && memEQ((s1), ASSERT_IS_LITERAL(s2), (sizeof(s2)-1)))
5f50c6c9 719#define memNEs(s1, l, s2) (! memEQs(s1, l, s2))
568a785a 720
fdbb9a7c
KW
721/* Keep these private until we decide it was a good idea */
722#if defined(PERL_CORE) || defined(PERL_EXT) || defined(PERL_EXT_POSIX)
723
ca0572d7 724#define strBEGINs(s1,s2) (strncmp(s1,ASSERT_IS_LITERAL(s2), sizeof(s2)-1) == 0)
fdbb9a7c 725
bdb7e3f0 726#define memBEGINs(s1, l, s2) \
30a6480c 727 ( (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1 \
ca0572d7 728 && memEQ(s1, ASSERT_IS_LITERAL(s2), sizeof(s2)-1))
de627158 729#define memBEGINPs(s1, l, s2) \
30a6480c 730 ( (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) - 1 \
ca0572d7 731 && memEQ(s1, ASSERT_IS_LITERAL(s2), sizeof(s2)-1))
/* memENDs:  true if the 'l'-byte buffer 's1' ends with the literal 's2'.
 * memENDPs: the same, except that 's2' must be a proper suffix, so 'l' has to
 *           be strictly greater than the length of 's2'.  That length is
 *           sizeof(s2) - 1, because the literal's trailing NUL isn't counted.
 * 's1' is parenthesized so that an expression passed for it is grouped
 * before the pointer arithmetic is applied. */
#define memENDs(s1, l, s2)                                                  \
            (   (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1               \
             && memEQ((s1) + (l) - (sizeof(s2) - 1),                        \
                      ASSERT_IS_LITERAL(s2), sizeof(s2)-1))
#define memENDPs(s1, l, s2)                                                 \
            (   (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) - 1                \
             && memEQ((s1) + (l) - (sizeof(s2) - 1),                        \
                      ASSERT_IS_LITERAL(s2), sizeof(s2)-1))
fdbb9a7c 738#endif /* End of making macros private */
bdb7e3f0 739
062b6850
KW
740#define memLT(s1,s2,l) (memcmp(s1,s2,l) < 0)
741#define memLE(s1,s2,l) (memcmp(s1,s2,l) <= 0)
742#define memGT(s1,s2,l) (memcmp(s1,s2,l) > 0)
743#define memGE(s1,s2,l) (memcmp(s1,s2,l) >= 0)
744
ca0572d7 745#define memCHRs(s1,c) ((const char *) memchr(ASSERT_IS_LITERAL(s1) , c, sizeof(s1)-1))
4aada8b9 746
bbce6d69 747/*
748 * Character classes.
749 *
750 * Unfortunately, the introduction of locales means that we
751 * can't trust isupper(), etc. to tell the truth. And when
752 * it comes to /\w+/ with tainting enabled, we *must* be able
753 * to trust our character classes.
754 *
81d43abf
KW
755 * Therefore, the default tests in the text of Perl will be independent of
756 * locale. Any code that wants to depend on the current locale will use the
757 * macros that contain _LC in their names
bbce6d69 758 */
759
5bf5e40b 760#ifdef USE_LOCALE_CTYPE
2304df62
AD
761# ifndef CTYPE256
762# define CTYPE256
763# endif
764#endif
765
954c1994 766/*
ccfc67b7 767
dcccc8ff 768=head1 Character classification
243effed
KW
769This section is about functions (really macros) that classify characters
770into types, such as punctuation versus alphabetic, etc. Most of these are
771analogous to regular expression character classes. (See
772L<perlrecharclass/POSIX Character Classes>.) There are several variants for
773each class. (Not all macros have all variants; each item below lists the
774ones valid for it.) None are affected by C<use bytes>, and only the ones
775with C<LC> in the name are affected by the current locale.
776
d713f9d9
KW
777The base function, e.g., C<isALPHA()>, takes any signed or unsigned value,
778treating it as a code point, and returns a boolean as to whether or not the
779character represented by it is (or on non-ASCII platforms, corresponds to) an
6aff1f14
KW
780ASCII character in the named class based on platform, Unicode, and Perl rules.
781If the input is a number that doesn't fit in an octet, FALSE is returned.
243effed 782
c98722a4 783Variant C<isI<FOO>_A> (e.g., C<isALPHA_A()>) is identical to the base function
550da823
KW
784with no suffix C<"_A">. This variant is used to emphasize by its name that
785only ASCII-range characters can return TRUE.
4b9734bf 786
d60679e1 787Variant C<isI<FOO>_L1> imposes the Latin-1 (or EBCDIC equivalent) character set
4b9734bf
KW
788onto the platform. That is, the code points that are ASCII are unaffected,
789since ASCII is a subset of Latin-1. But the non-ASCII code points are treated
790as if they are Latin-1 characters. For example, C<isWORDCHAR_L1()> will return
791true when called with the code point 0xDF, which is a word character in both
4650c663 792ASCII and EBCDIC (though it represents different characters in each).
d713f9d9
KW
793If the input is a number that doesn't fit in an octet, FALSE is returned.
794(Perl's documentation uses a colloquial definition of Latin-1, to include all
795code points below 256.)
243effed 796
d713f9d9
KW
797Variant C<isI<FOO>_uvchr> is exactly like the C<isI<FOO>_L1> variant, for
798inputs below 256, but if the code point is larger than 255, Unicode rules are
799used to determine if it is in the character class. For example,
d0da05db 800C<isWORDCHAR_uvchr(0x100)> returns TRUE, since 0x100 is LATIN CAPITAL LETTER A
6aff1f14 801WITH MACRON in Unicode, and is a word character.
243effed 802
059703b0
KW
803Variants C<isI<FOO>_utf8> and C<isI<FOO>_utf8_safe> are like C<isI<FOO>_uvchr>,
804but are used for UTF-8 encoded strings. The two forms are different names for
805the same thing. Each call to one of these classifies the first character of
806the string starting at C<p>. The second parameter, C<e>, points to anywhere in
807the string beyond the first character, up to one byte past the end of the
808entire string. Although both variants are identical, the suffix C<_safe> in
809one name emphasizes that it will not attempt to read beyond S<C<e - 1>>,
810provided that the constraint S<C<p E<lt> e>> is true (this is asserted for in
811C<-DDEBUGGING> builds). If the UTF-8 for the input character is malformed in
812some way, the program may croak, or the function may return FALSE, at the
813discretion of the implementation, and subject to change in future releases.
243effed 814
d713f9d9
KW
815Variant C<isI<FOO>_LC> is like the C<isI<FOO>_A> and C<isI<FOO>_L1> variants,
816but the result is based on the current locale, which is what C<LC> in the name
817stands for. If Perl can determine that the current locale is a UTF-8 locale,
818it uses the published Unicode rules; otherwise, it uses the C library function
819that gives the named classification. For example, C<isDIGIT_LC()> when not in
820a UTF-8 locale returns the result of calling C<isdigit()>. FALSE is always
1a83413c
KW
821returned if the input won't fit into an octet. On some platforms where the C
822library function is known to be defective, Perl changes its result to follow
823the POSIX standard's rules.
243effed 824
d713f9d9
KW
825Variant C<isI<FOO>_LC_uvchr> acts exactly like C<isI<FOO>_LC> for inputs less
826than 256, but for larger ones it returns the Unicode classification of the code
827point.
243effed 828
059703b0
KW
829Variants C<isI<FOO>_LC_utf8> and C<isI<FOO>_LC_utf8_safe> are like
830C<isI<FOO>_LC_uvchr>, but are used for UTF-8 encoded strings. The two forms
831are different names for the same thing. Each call to one of these classifies
832the first character of the string starting at C<p>. The second parameter,
833C<e>, points to anywhere in the string beyond the first character, up to one
834byte past the end of the entire string. Although both variants are identical,
835the suffix C<_safe> in one name emphasizes that it will not attempt to read
836beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is true (this
837is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the input
838character is malformed in some way, the program may croak, or the function may
839return FALSE, at the discretion of the implementation, and subject to change in
840future releases.
ccfc67b7 841
6fdd32c3
KW
842=for apidoc Am|bool|isALPHA|UV ch
843=for apidoc_item ||isALPHA_A|UV ch
6fdd32c3 844=for apidoc_item ||isALPHA_LC|UV ch
fa470d81 845=for apidoc_item ||isALPHA_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
846=for apidoc_item ||isALPHA_LC_uvchr|UV ch
847=for apidoc_item ||isALPHA_L1|UV ch
848=for apidoc_item ||isALPHA_utf8|U8 * s|U8 * end
849=for apidoc_item ||isALPHA_utf8_safe|U8 * s|U8 * end
850=for apidoc_item ||isALPHA_uvchr|UV ch
d713f9d9
KW
851Returns a boolean indicating whether the specified input is one of C<[A-Za-z]>,
852analogous to C<m/[[:alpha:]]/>.
dcccc8ff 853See the L<top of this section|/Character classification> for an explanation of
fa470d81 854the variants.
8a58bdcf 855
f16858ed
KW
856=cut
857
f1460a66 858Here and below, we add the prototypes of these macros for downstream programs
f16858ed
KW
859that would be interested in them, such as Devel::PPPort
860
6fdd32c3
KW
861=for apidoc Am|bool|isALPHANUMERIC|UV ch
862=for apidoc_item ||isALPHANUMERIC_A|UV ch
6fdd32c3 863=for apidoc_item ||isALPHANUMERIC_LC|UV ch
fa470d81 864=for apidoc_item ||isALPHANUMERIC_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
865=for apidoc_item ||isALPHANUMERIC_LC_uvchr|UV ch
866=for apidoc_item ||isALPHANUMERIC_L1|UV ch
867=for apidoc_item ||isALPHANUMERIC_utf8|U8 * s|U8 * end
868=for apidoc_item ||isALPHANUMERIC_utf8_safe|U8 * s|U8 * end
869=for apidoc_item ||isALPHANUMERIC_uvchr|UV ch
d713f9d9
KW
870Returns a boolean indicating whether the specified character is one of
871C<[A-Za-z0-9]>, analogous to C<m/[[:alnum:]]/>.
dcccc8ff 872See the L<top of this section|/Character classification> for an explanation of
fa470d81 873the variants.
15861f94 874
c1ef4981
KW
875=for apidoc Am|bool|isALNUMC|UV ch
876=for apidoc_item ||isALNUMC_A|UV ch
877=for apidoc_item ||isALNUMC_LC|UV ch
878=for apidoc_item ||isALNUMC_LC_uvchr|UV ch
879=for apidoc_item ||isALNUMC_L1|UV ch
880These are discouraged, backward compatibility macros for L</C<isALPHANUMERIC>>.
881That is, each returns a boolean indicating whether the specified character is
882one of C<[A-Za-z0-9]>, analogous to C<m/[[:alnum:]]/>.
883
884The C<C> suffix in the names was meant to indicate that they correspond to the
885C language L<C<isalnum(3)>>.
255b632a 886
6fdd32c3
KW
887=for apidoc Am|bool|isASCII|UV ch
888=for apidoc_item ||isASCII_A|UV ch
6fdd32c3 889=for apidoc_item ||isASCII_LC|UV ch
fa470d81 890=for apidoc_item ||isASCII_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
891=for apidoc_item ||isASCII_LC_uvchr|UV ch
892=for apidoc_item ||isASCII_L1|UV ch
893=for apidoc_item ||isASCII_utf8|U8 * s|U8 * end
894=for apidoc_item ||isASCII_utf8_safe|U8 * s|U8 * end
895=for apidoc_item ||isASCII_uvchr|UV ch
8a58bdcf 896Returns a boolean indicating whether the specified character is one of the 128
243effed 897characters in the ASCII character set, analogous to C<m/[[:ascii:]]/>.
e5ad6aba 898On non-ASCII platforms, it returns TRUE iff this
8a58bdcf
KW
899character corresponds to an ASCII character. Variants C<isASCII_A()> and
900C<isASCII_L1()> are identical to C<isASCII()>.
dcccc8ff 901See the L<top of this section|/Character classification> for an explanation of
fa470d81 902the variants.
059703b0
KW
903Note, however, that some platforms do not have the C library routine
904C<isascii()>. In these cases, the variants whose names contain C<LC> are the
905same as the corresponding ones without.
243effed 906
d98532ea
KW
907Also note, that because all ASCII characters are UTF-8 invariant (meaning they
908have the exact same representation (always a single byte) whether encoded in
909UTF-8 or not), C<isASCII> will give the correct results when called with any
059703b0
KW
910byte in any string encoded or not in UTF-8. And similarly C<isASCII_utf8> and
911C<isASCII_utf8_safe> will work properly on any string encoded or not in UTF-8.
d98532ea 912
6fdd32c3
KW
913=for apidoc Am|bool|isBLANK|UV ch
914=for apidoc_item ||isBLANK_A|UV ch
6fdd32c3 915=for apidoc_item ||isBLANK_LC|UV ch
fa470d81 916=for apidoc_item ||isBLANK_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
917=for apidoc_item ||isBLANK_LC_uvchr|UV ch
918=for apidoc_item ||isBLANK_L1|UV ch
919=for apidoc_item ||isBLANK_utf8|U8 * s|U8 * end
920=for apidoc_item ||isBLANK_utf8_safe|U8 * s|U8 * end
921=for apidoc_item ||isBLANK_uvchr|UV ch
243effed 922Returns a boolean indicating whether the specified character is a
6aff1f14 923character considered to be a blank, analogous to C<m/[[:blank:]]/>.
dcccc8ff 924See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
925the variants.
926Note,
da8c1a98
KW
927however, that some platforms do not have the C library routine
928C<isblank()>. In these cases, the variants whose names contain C<LC> are
929the same as the corresponding ones without.
243effed 930
6fdd32c3
KW
931=for apidoc Am|bool|isCNTRL|UV ch
932=for apidoc_item ||isCNTRL_A|UV ch
6fdd32c3 933=for apidoc_item ||isCNTRL_LC|UV ch
fa470d81 934=for apidoc_item ||isCNTRL_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
935=for apidoc_item ||isCNTRL_LC_uvchr|UV ch
936=for apidoc_item ||isCNTRL_L1|UV ch
937=for apidoc_item ||isCNTRL_utf8|U8 * s|U8 * end
938=for apidoc_item ||isCNTRL_utf8_safe|U8 * s|U8 * end
939=for apidoc_item ||isCNTRL_uvchr|UV ch
f16858ed 940
243effed 941Returns a boolean indicating whether the specified character is a
6aff1f14 942control character, analogous to C<m/[[:cntrl:]]/>.
dcccc8ff 943See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
944the variants.
945On EBCDIC platforms, you almost always want to use the C<isCNTRL_L1> variant.
946
6fdd32c3
KW
947=for apidoc Am|bool|isDIGIT|UV ch
948=for apidoc_item ||isDIGIT_A|UV ch
6fdd32c3 949=for apidoc_item ||isDIGIT_LC|UV ch
fa470d81 950=for apidoc_item ||isDIGIT_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
951=for apidoc_item ||isDIGIT_LC_uvchr|UV ch
952=for apidoc_item ||isDIGIT_L1|UV ch
953=for apidoc_item ||isDIGIT_utf8|U8 * s|U8 * end
954=for apidoc_item ||isDIGIT_utf8_safe|U8 * s|U8 * end
955=for apidoc_item ||isDIGIT_uvchr|UV ch
fa470d81 956
2787a470 957Returns a boolean indicating whether the specified character is a
6aff1f14 958digit, analogous to C<m/[[:digit:]]/>.
8a58bdcf 959Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>.
dcccc8ff 960See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
961the variants.
962
6fdd32c3
KW
963=for apidoc Am|bool|isGRAPH|UV ch
964=for apidoc_item ||isGRAPH_A|UV ch
6fdd32c3 965=for apidoc_item ||isGRAPH_LC|UV ch
fa470d81 966=for apidoc_item ||isGRAPH_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
967=for apidoc_item ||isGRAPH_LC_uvchr|UV ch
968=for apidoc_item ||isGRAPH_L1|UV ch
969=for apidoc_item ||isGRAPH_utf8|U8 * s|U8 * end
970=for apidoc_item ||isGRAPH_utf8_safe|U8 * s|U8 * end
971=for apidoc_item ||isGRAPH_uvchr|UV ch
243effed 972Returns a boolean indicating whether the specified character is a
6aff1f14 973graphic character, analogous to C<m/[[:graph:]]/>.
dcccc8ff 974See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
975the variants.
976
6fdd32c3
KW
977=for apidoc Am|bool|isLOWER|UV ch
978=for apidoc_item ||isLOWER_A|UV ch
6fdd32c3 979=for apidoc_item ||isLOWER_LC|UV ch
fa470d81 980=for apidoc_item ||isLOWER_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
981=for apidoc_item ||isLOWER_LC_uvchr|UV ch
982=for apidoc_item ||isLOWER_L1|UV ch
983=for apidoc_item ||isLOWER_utf8|U8 * s|U8 * end
984=for apidoc_item ||isLOWER_utf8_safe|U8 * s|U8 * end
985=for apidoc_item ||isLOWER_uvchr|UV ch
2787a470 986Returns a boolean indicating whether the specified character is a
6aff1f14 987lowercase character, analogous to C<m/[[:lower:]]/>.
dcccc8ff 988See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
989the variants.
990
6fdd32c3
KW
991=for apidoc Am|bool|isOCTAL|UV ch
992=for apidoc_item ||isOCTAL_A|UV ch
993=for apidoc_item ||isOCTAL_L1|UV ch
2787a470 994Returns a boolean indicating whether the specified character is an
6aff1f14 995octal digit, [0-7].
243effed
KW
996The only two variants are C<isOCTAL_A> and C<isOCTAL_L1>; each is identical to
997C<isOCTAL>.
998
6fdd32c3
KW
999=for apidoc Am|bool|isPUNCT|UV ch
1000=for apidoc_item ||isPUNCT_A|UV ch
6fdd32c3 1001=for apidoc_item ||isPUNCT_LC|UV ch
fa470d81 1002=for apidoc_item ||isPUNCT_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
1003=for apidoc_item ||isPUNCT_LC_uvchr|UV ch
1004=for apidoc_item ||isPUNCT_L1|UV ch
1005=for apidoc_item ||isPUNCT_utf8|U8 * s|U8 * end
1006=for apidoc_item ||isPUNCT_utf8_safe|U8 * s|U8 * end
1007=for apidoc_item ||isPUNCT_uvchr|UV ch
243effed 1008Returns a boolean indicating whether the specified character is a
6aff1f14
KW
1009punctuation character, analogous to C<m/[[:punct:]]/>.
1010Note that the definition of what is punctuation isn't as
243effed
KW
1011straightforward as one might desire. See L<perlrecharclass/POSIX Character
1012Classes> for details.
dcccc8ff 1013See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
1014the variants.
1015
6fdd32c3
KW
1016=for apidoc Am|bool|isSPACE|UV ch
1017=for apidoc_item ||isSPACE_A|UV ch
6fdd32c3 1018=for apidoc_item ||isSPACE_LC|UV ch
fa470d81 1019=for apidoc_item ||isSPACE_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
1020=for apidoc_item ||isSPACE_LC_uvchr|UV ch
1021=for apidoc_item ||isSPACE_L1|UV ch
1022=for apidoc_item ||isSPACE_utf8|U8 * s|U8 * end
1023=for apidoc_item ||isSPACE_utf8_safe|U8 * s|U8 * end
1024=for apidoc_item ||isSPACE_uvchr|UV ch
2787a470 1025Returns a boolean indicating whether the specified character is a
6aff1f14 1026whitespace character. This is analogous
398d098a 1027to what C<m/\s/> matches in a regular expression. Starting in Perl 5.18
779cf272 1028this also matches what C<m/[[:space:]]/> does. Prior to 5.18, only the
398d098a
KW
1029locale forms of this macro (the ones with C<LC> in their names) matched
1030precisely what C<m/[[:space:]]/> does. In those releases, the only difference,
1031in the non-locale variants, was that C<isSPACE()> did not match a vertical tab.
1032(See L</isPSXSPC> for a macro that matches a vertical tab in all releases.)
dcccc8ff 1033See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
1034the variants.
1035
6fdd32c3
KW
1036=for apidoc Am|bool|isPSXSPC|UV ch
1037=for apidoc_item ||isPSXSPC_A|UV ch
6fdd32c3 1038=for apidoc_item ||isPSXSPC_LC|UV ch
fa470d81 1039=for apidoc_item ||isPSXSPC_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
1040=for apidoc_item ||isPSXSPC_LC_uvchr|UV ch
1041=for apidoc_item ||isPSXSPC_L1|UV ch
1042=for apidoc_item ||isPSXSPC_utf8|U8 * s|U8 * end
1043=for apidoc_item ||isPSXSPC_utf8_safe|U8 * s|U8 * end
1044=for apidoc_item ||isPSXSPC_uvchr|UV ch
398d098a 1045(short for Posix Space)
779cf272
KW
1046Starting in 5.18, this is identical in all its forms to the
1047corresponding C<isSPACE()> macros.
398d098a
KW
1048The locale forms of this macro are identical to their corresponding
1049C<isSPACE()> forms in all Perl releases. In releases prior to 5.18, the
1050non-locale forms differ from their C<isSPACE()> forms only in that the
1051C<isSPACE()> forms don't match a Vertical Tab, and the C<isPSXSPC()> forms do.
1052Otherwise they are identical. Thus this macro is analogous to what
1053C<m/[[:space:]]/> matches in a regular expression.
dcccc8ff 1054See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
1055the variants.
1056
6fdd32c3
KW
1057=for apidoc Am|bool|isUPPER|UV ch
1058=for apidoc_item ||isUPPER_A|UV ch
6fdd32c3 1059=for apidoc_item ||isUPPER_LC|UV ch
fa470d81 1060=for apidoc_item ||isUPPER_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
1061=for apidoc_item ||isUPPER_LC_uvchr|UV ch
1062=for apidoc_item ||isUPPER_L1|UV ch
1063=for apidoc_item ||isUPPER_utf8|U8 * s|U8 * end
1064=for apidoc_item ||isUPPER_utf8_safe|U8 * s|U8 * end
1065=for apidoc_item ||isUPPER_uvchr|UV ch
2787a470 1066Returns a boolean indicating whether the specified character is an
6aff1f14 1067uppercase character, analogous to C<m/[[:upper:]]/>.
dcccc8ff 1068See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
1069the variants.
1070
6fdd32c3
KW
1071=for apidoc Am|bool|isPRINT|UV ch
1072=for apidoc_item ||isPRINT_A|UV ch
6fdd32c3 1073=for apidoc_item ||isPRINT_LC|UV ch
fa470d81 1074=for apidoc_item ||isPRINT_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
1075=for apidoc_item ||isPRINT_LC_uvchr|UV ch
1076=for apidoc_item ||isPRINT_L1|UV ch
1077=for apidoc_item ||isPRINT_utf8|U8 * s|U8 * end
1078=for apidoc_item ||isPRINT_utf8_safe|U8 * s|U8 * end
1079=for apidoc_item ||isPRINT_uvchr|UV ch
8eea39dd 1080Returns a boolean indicating whether the specified character is a
6aff1f14 1081printable character, analogous to C<m/[[:print:]]/>.
dcccc8ff 1082See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
1083the variants.
1084
6fdd32c3
KW
1085=for apidoc Am|bool|isWORDCHAR|UV ch
1086=for apidoc_item ||isWORDCHAR_A|UV ch
6fdd32c3 1087=for apidoc_item ||isWORDCHAR_LC|UV ch
fa470d81 1088=for apidoc_item ||isWORDCHAR_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
1089=for apidoc_item ||isWORDCHAR_LC_uvchr|UV ch
1090=for apidoc_item ||isWORDCHAR_L1|UV ch
1091=for apidoc_item ||isWORDCHAR_utf8|U8 * s|U8 * end
1092=for apidoc_item ||isWORDCHAR_utf8_safe|U8 * s|U8 * end
1093=for apidoc_item ||isWORDCHAR_uvchr|UV ch
243effed
KW
1094Returns a boolean indicating whether the specified character is a word
1095character, analogous to what C<m/\w/> and C<m/[[:word:]]/> match
1096in a regular expression. A word character is an alphabetic character, a
1097decimal digit, a connecting punctuation character (such as an underscore), or
1098a "mark" character that attaches to one of those (like some sort of accent).
c1ef4981 1099
dcccc8ff 1100See the L<top of this section|/Character classification> for an explanation of
fa470d81 1101the variants.
1607e393 1102
fa470d81
KW
1103C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>,
1104C<isWORDCHAR_LC>, C<isWORDCHAR_LC_uvchr>, C<isWORDCHAR_LC_utf8>, and
1105C<isWORDCHAR_LC_utf8_safe> are also as described there, but additionally
1106include the platform's native underscore.
1107
c1ef4981
KW
1108=for apidoc Am|bool|isALNUM |UV ch
1109=for apidoc_item ||isALNUM_A |UV ch
1110=for apidoc_item ||isALNUM_LC |UV ch
1111=for apidoc_item ||isALNUM_LC_uvchr|UV ch
1112These are each a synonym for their respectively named L</C<isWORDCHAR>>
1113variant.
1114
1115They are provided for backward compatibility, even though a word character
1116includes more than the standard C language meaning of alphanumeric.
1117To get the C language definition, use the corresponding L</C<isALPHANUMERIC>>
1118variant.
1119
6fdd32c3
KW
1120=for apidoc Am|bool|isXDIGIT|UV ch
1121=for apidoc_item ||isXDIGIT_A|UV ch
6fdd32c3 1122=for apidoc_item ||isXDIGIT_LC|UV ch
fa470d81 1123=for apidoc_item ||isXDIGIT_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
1124=for apidoc_item ||isXDIGIT_LC_uvchr|UV ch
1125=for apidoc_item ||isXDIGIT_L1|UV ch
1126=for apidoc_item ||isXDIGIT_utf8|U8 * s|U8 * end
1127=for apidoc_item ||isXDIGIT_utf8_safe|U8 * s|U8 * end
1128=for apidoc_item ||isXDIGIT_uvchr|UV ch
8a58bdcf 1129Returns a boolean indicating whether the specified character is a hexadecimal
243effed
KW
1130digit. In the ASCII range these are C<[0-9A-Fa-f]>. Variants C<isXDIGIT_A()>
1131and C<isXDIGIT_L1()> are identical to C<isXDIGIT()>.
dcccc8ff 1132See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
1133the variants.
1134
6fdd32c3
KW
1135=for apidoc Am|bool|isIDFIRST|UV ch
1136=for apidoc_item ||isIDFIRST_A|UV ch
6fdd32c3 1137=for apidoc_item ||isIDFIRST_LC|UV ch
fa470d81 1138=for apidoc_item ||isIDFIRST_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
1139=for apidoc_item ||isIDFIRST_LC_uvchr|UV ch
1140=for apidoc_item ||isIDFIRST_L1|UV ch
1141=for apidoc_item ||isIDFIRST_utf8|U8 * s|U8 * end
1142=for apidoc_item ||isIDFIRST_utf8_safe|U8 * s|U8 * end
1143=for apidoc_item ||isIDFIRST_uvchr|UV ch
3c3ecf18
KW
1144Returns a boolean indicating whether the specified character can be the first
1145character of an identifier. This is very close to, but not quite the same as
1146the official Unicode property C<XID_Start>. The difference is that this
1147returns true only if the input character also matches L</isWORDCHAR>.
dcccc8ff 1148See the L<top of this section|/Character classification> for an explanation of
fa470d81
KW
1149the variants.
1150
6fdd32c3
KW
1151=for apidoc Am|bool|isIDCONT|UV ch
1152=for apidoc_item ||isIDCONT_A|UV ch
6fdd32c3 1153=for apidoc_item ||isIDCONT_LC|UV ch
fa470d81 1154=for apidoc_item ||isIDCONT_LC_utf8_safe|U8 * s| U8 *end
1607e393
KW
1155=for apidoc_item ||isIDCONT_LC_uvchr|UV ch
1156=for apidoc_item ||isIDCONT_L1|UV ch
1157=for apidoc_item ||isIDCONT_utf8|U8 * s|U8 * end
1158=for apidoc_item ||isIDCONT_utf8_safe|U8 * s|U8 * end
1159=for apidoc_item ||isIDCONT_uvchr|UV ch
3c3ecf18
KW
1160Returns a boolean indicating whether the specified character can be the
1161second or succeeding character of an identifier. This is very close to, but
1162not quite the same as the official Unicode property C<XID_Continue>. The
1163difference is that this returns true only if the input character also matches
dcccc8ff 1164L</isWORDCHAR>. See the L<top of this section|/Character classification> for
fa470d81 1165an explanation of the variants.
f16858ed 1166
3f620621 1167=for apidoc_section $numeric
8eea39dd 1168
95a59cab 1169=for apidoc Am|U8|READ_XDIGIT|char str*
243effed 1170Returns the value of an ASCII-range hex digit and advances the string pointer.
95a59cab
YO
1171Behaviour is only well defined when isXDIGIT(*str) is true.
1172
e7c1e6c1 1173=head1 Character case changing
21da7284
KW
1174Perl uses "full" Unicode case mappings. This means that converting a single
1175character to another case may result in a sequence of more than one character.
1176For example, the uppercase of C<E<223>> (LATIN SMALL LETTER SHARP S) is the two
1177character sequence C<SS>. This presents some complications. The lowercase of
1178all characters in the range 0..255 is a single character, and thus
1179C<L</toLOWER_L1>> is furnished. But, C<toUPPER_L1> can't exist, as it couldn't
1180return a valid result for all legal inputs. Instead C<L</toUPPER_uvchr>> has
1181an API that does allow every possible legal result to be returned. Likewise
1182no other function that is crippled by not being able to give the correct
1183results for the full range of possible inputs has been implemented here.
e7c1e6c1 1184
45d6cb5e
KW
1185=for apidoc Am|UV|toUPPER|UV cp
1186=for apidoc_item |UV|toUPPER_A|UV cp
45d6cb5e
KW
1187=for apidoc_item |UV|toUPPER_utf8|U8* p|U8* e|U8* s|STRLEN* lenp
1188=for apidoc_item |UV|toUPPER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
1607e393 1189=for apidoc_item |UV|toUPPER_uvchr|UV cp|U8* s|STRLEN* lenp
45d6cb5e
KW
1190
1191These all return the uppercase of a character. The differences are what domain
1192they operate on, and whether the input is specified as a code point (those
1193forms with a C<cp> parameter) or as a UTF-8 string (the others). In the latter
1194case, the code point to use is the first one in the buffer of UTF-8 encoded
1195code points, delineated by the arguments S<C<p .. e - 1>>.
1196
1197C<toUPPER> and C<toUPPER_A> are synonyms of each other. They return the
1198uppercase of any lowercase ASCII-range code point. All other inputs are
1199returned unchanged. Since these are macros, the input type may be any integral
1200one, and the output will occupy the same number of bits as the input.
1201
1202There is no C<toUPPER_L1> nor C<toUPPER_LATIN1> as the uppercase of some code
1203points in the 0..255 range is above that range or consists of multiple
1204characters. Instead use C<toUPPER_uvchr>.
1205
1206C<toUPPER_uvchr> returns the uppercase of any Unicode code point. The return
1207value is identical to that of C<toUPPER_A> for input code points in the ASCII
1208range. The uppercase of the vast majority of Unicode code points is the same
1209as the code point itself. For these, and for code points above the legal
1210Unicode maximum, this returns the input code point unchanged. It additionally
1211stores the UTF-8 of the result into the buffer beginning at C<s>, and its
1212length in bytes into C<*lenp>. The caller must have made C<s> large enough to
1213contain at least C<UTF8_MAXBYTES_CASE+1> bytes to avoid possible overflow.
1214
1215NOTE: the uppercase of a code point may be more than one code point. The
1216return value of this function is only the first of these. The entire uppercase
1217is returned in C<s>. To determine if the result is more than a single code
1218point, you can do something like this:
1219
1220 uc = toUPPER_uvchr(cp, s, &len);
1221 if (len > UTF8SKIP(s)) { is multiple code points }
1222 else { is a single code point }
1223
1224C<toUPPER_utf8> and C<toUPPER_utf8_safe> are synonyms of each other. The only
1225difference between these and C<toUPPER_uvchr> is that the source for these is
1226encoded in UTF-8, instead of being a code point. It is passed as a buffer
1227starting at C<p>, with C<e> pointing to one byte beyond its end. The C<p>
1228buffer may certainly contain more than one code point; but only the first one
1229(up through S<C<e - 1>>) is examined. If the UTF-8 for the input character is
1230malformed in some way, the program may croak, or the function may return the
1231REPLACEMENT CHARACTER, at the discretion of the implementation, and subject to
1232change in future releases.
1233
1234=for apidoc Am|UV|toFOLD|UV cp
1235=for apidoc_item |UV|toFOLD_A|UV cp
45d6cb5e
KW
1236=for apidoc_item |UV|toFOLD_utf8|U8* p|U8* e|U8* s|STRLEN* lenp
1237=for apidoc_item |UV|toFOLD_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
1607e393 1238=for apidoc_item |UV|toFOLD_uvchr|UV cp|U8* s|STRLEN* lenp
45d6cb5e
KW
1239
1240These all return the foldcase of a character. "foldcase" is an internal case
1241for C</i> pattern matching. If the foldcase of character A and the foldcase of
1242character B are the same, they match caselessly; otherwise they don't.
1243
1244The differences in the forms are what domain they operate on, and whether the
1245input is specified as a code point (those forms with a C<cp> parameter) or as a
1246UTF-8 string (the others). In the latter case, the code point to use is the
1247first one in the buffer of UTF-8 encoded code points, delineated by the
1248arguments S<C<p .. e - 1>>.
1249
1250C<toFOLD> and C<toFOLD_A> are synonyms of each other. They return the
1251foldcase of any ASCII-range code point. In this range, the foldcase is
1252identical to the lowercase. All other inputs are returned unchanged. Since
1253these are macros, the input type may be any integral one, and the output will
1254occupy the same number of bits as the input.
1255
1256There is no C<toFOLD_L1> nor C<toFOLD_LATIN1> as the foldcase of some code
1257points in the 0..255 range is above that range or consists of multiple
1258characters. Instead use C<toFOLD_uvchr>.
1259
1260C<toFOLD_uvchr> returns the foldcase of any Unicode code point. The return
1261value is identical to that of C<toFOLD_A> for input code points in the ASCII
1262range. The foldcase of the vast majority of Unicode code points is the same
1263as the code point itself. For these, and for code points above the legal
1264Unicode maximum, this returns the input code point unchanged. It additionally
1265stores the UTF-8 of the result into the buffer beginning at C<s>, and its
1266length in bytes into C<*lenp>. The caller must have made C<s> large enough to
1267contain at least C<UTF8_MAXBYTES_CASE+1> bytes to avoid possible overflow.
1268
1269NOTE: the foldcase of a code point may be more than one code point. The
1270return value of this function is only the first of these. The entire foldcase
1271is returned in C<s>. To determine if the result is more than a single code
1272point, you can do something like this:
1273
1274 fc = toFOLD_uvchr(cp, s, &len);
1275 if (len > UTF8SKIP(s)) { is multiple code points }
1276 else { is a single code point }
1277
1278C<toFOLD_utf8> and C<toFOLD_utf8_safe> are synonyms of each other. The only
1279difference between these and C<toFOLD_uvchr> is that the source for these is
1280encoded in UTF-8, instead of being a code point. It is passed as a buffer
1281starting at C<p>, with C<e> pointing to one byte beyond its end. The C<p>
1282buffer may certainly contain more than one code point; but only the first one
1283(up through S<C<e - 1>>) is examined. If the UTF-8 for the input character is
1284malformed in some way, the program may croak, or the function may return the
1285REPLACEMENT CHARACTER, at the discretion of the implementation, and subject to
1286change in future releases.
1f607577 1287
3cb048e5
KW
1288=for apidoc Am|UV|toLOWER|UV cp
1289=for apidoc_item |UV|toLOWER_A|UV cp
3cb048e5
KW
1290=for apidoc_item |UV|toLOWER_LATIN1|UV cp
1291=for apidoc_item |UV|toLOWER_LC|UV cp
1607e393 1292=for apidoc_item |UV|toLOWER_L1|UV cp
3cb048e5
KW
1293=for apidoc_item |UV|toLOWER_utf8|U8* p|U8* e|U8* s|STRLEN* lenp
1294=for apidoc_item |UV|toLOWER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
1607e393 1295=for apidoc_item |UV|toLOWER_uvchr|UV cp|U8* s|STRLEN* lenp
3cb048e5
KW
1296
1297These all return the lowercase of a character. The differences are what domain
1298they operate on, and whether the input is specified as a code point (those
1299forms with a C<cp> parameter) or as a UTF-8 string (the others). In the latter
1300case, the code point to use is the first one in the buffer of UTF-8 encoded
1301code points, delineated by the arguments S<C<p .. e - 1>>.
1302
1303C<toLOWER> and C<toLOWER_A> are synonyms of each other. They return the
1304lowercase of any uppercase ASCII-range code point. All other inputs are
1305returned unchanged. Since these are macros, the input type may be any integral
1306one, and the output will occupy the same number of bits as the input.
1307
1308C<toLOWER_L1> and C<toLOWER_LATIN1> are synonyms of each other. They behave
1309identically to C<toLOWER> for ASCII-range input, but additionally return
1310the lowercase of any uppercase code point in the entire 0..255 range, assuming
1311a Latin-1 encoding (or the EBCDIC equivalent on such platforms).
1312
1313C<toLOWER_LC> returns the lowercase of the input code point according to the
1314rules of the current POSIX locale. Input code points outside the range 0..255
1315are returned unchanged.
1316
1317C<toLOWER_uvchr> returns the lowercase of any Unicode code point. The return
1318value is identical to that of C<toLOWER_L1> for input code points in the 0..255
1319range. The lowercase of the vast majority of Unicode code points is the same
1320as the code point itself. For these, and for code points above the legal
1321Unicode maximum, this returns the input code point unchanged. It additionally
1322stores the UTF-8 of the result into the buffer beginning at C<s>, and its
1323length in bytes into C<*lenp>. The caller must have made C<s> large enough to
1324contain at least C<UTF8_MAXBYTES_CASE+1> bytes to avoid possible overflow.
1325
1326NOTE: the lowercase of a code point may be more than one code point. The
1327return value of this function is only the first of these. The entire lowercase
1328is returned in C<s>. To determine if the result is more than a single code
1329point, you can do something like this:
1330
1331 lc = toLOWER_uvchr(cp, s, &len);
1332 if (len > UTF8SKIP(s)) { is multiple code points }
1333 else { is a single code point }
1334
1335C<toLOWER_utf8> and C<toLOWER_utf8_safe> are synonyms of each other. The only
1336difference between these and C<toLOWER_uvchr> is that the source for these is
1337encoded in UTF-8, instead of being a code point. It is passed as a buffer
1338starting at C<p>, with C<e> pointing to one byte beyond its end. The C<p>
1339buffer may certainly contain more than one code point; but only the first one
1340(up through S<C<e - 1>>) is examined. If the UTF-8 for the input character is
1341malformed in some way, the program may croak, or the function may return the
1342REPLACEMENT CHARACTER, at the discretion of the implementation, and subject to
1343change in future releases.
1f607577 1344
45d6cb5e
KW
1345=for apidoc Am|UV|toTITLE|UV cp
1346=for apidoc_item |UV|toTITLE_A|UV cp
45d6cb5e
KW
1347=for apidoc_item |UV|toTITLE_utf8|U8* p|U8* e|U8* s|STRLEN* lenp
1348=for apidoc_item |UV|toTITLE_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
1607e393 1349=for apidoc_item |UV|toTITLE_uvchr|UV cp|U8* s|STRLEN* lenp
45d6cb5e
KW
1350
1351These all return the titlecase of a character. The differences are what domain
1352they operate on, and whether the input is specified as a code point (those
1353forms with a C<cp> parameter) or as a UTF-8 string (the others). In the latter
1354case, the code point to use is the first one in the buffer of UTF-8 encoded
1355code points, delineated by the arguments S<C<p .. e - 1>>.
1356
1357C<toTITLE> and C<toTITLE_A> are synonyms of each other. They return the
1358titlecase of any lowercase ASCII-range code point. In this range, the
1359titlecase is identical to the uppercase. All other inputs are returned
1360unchanged. Since these are macros, the input type may be any integral one, and
1361the output will occupy the same number of bits as the input.
1362
1363There is no C<toTITLE_L1> nor C<toTITLE_LATIN1> as the titlecase of some code
1364points in the 0..255 range is above that range or consists of multiple
1365characters. Instead use C<toTITLE_uvchr>.
1366
1367C<toTITLE_uvchr> returns the titlecase of any Unicode code point. The return
1368value is identical to that of C<toTITLE_A> for input code points in the ASCII
1369range. The titlecase of the vast majority of Unicode code points is the same
1370as the code point itself. For these, and for code points above the legal
1371Unicode maximum, this returns the input code point unchanged. It additionally
1372stores the UTF-8 of the result into the buffer beginning at C<s>, and its
1373length in bytes into C<*lenp>. The caller must have made C<s> large enough to
1374contain at least C<UTF8_MAXBYTES_CASE+1> bytes to avoid possible overflow.
1375
1376NOTE: the titlecase of a code point may be more than one code point. The
1377return value of this function is only the first of these. The entire titlecase
1378is returned in C<s>. To determine if the result is more than a single code
1379point, you can do something like this:
1380
1381 uc = toTITLE_uvchr(cp, s, &len);
1382 if (len > UTF8SKIP(s)) { is multiple code points }
1383 else { is a single code point }
1384
1385C<toTITLE_utf8> and C<toTITLE_utf8_safe> are synonyms of each other. The only
1386difference between these and C<toTITLE_uvchr> is that the source for these is
1387encoded in UTF-8, instead of being a code point. It is passed as a buffer
1388starting at C<p>, with C<e> pointing to one byte beyond its end. The C<p>
1389buffer may certainly contain more than one code point; but only the first one
1390(up through S<C<e - 1>>) is examined. If the UTF-8 for the input character is
1391malformed in some way, the program may croak, or the function may return the
1392REPLACEMENT CHARACTER, at the discretion of the implementation, and subject to
1393change in future releases.
1f607577 1394
954c1994 1395=cut
353c9b6f 1396
d0da05db 1397XXX Still undocumented isVERTWS_uvchr and _utf8; it's unclear what their names
1e222e4f
KW
1398really should be. Also toUPPER_LC and toFOLD_LC, which are subject to change,
1399and aren't general purpose as they don't work on U+DF, and assert against that;
8fd8ea43 1400and isCASED_LC, as it really is more of an internal thing.
243effed 1401
8a58bdcf 1402Note that these macros are repeated in Devel::PPPort, so should also be
62fa66b6
KW
1403patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
1404
954c1994
GS
1405*/
1406
8f5283f4
KW
1407/*
1408 void below because that's the best fit, and works for Devel::PPPort
3f620621 1409=for apidoc_section $integer
545bca17 1410=for apidoc AyT||WIDEST_UTYPE
8f5283f4
KW
1411
1412Yields the widest unsigned integer type on the platform, currently either
326c768d 1413C<U32> or C<U64>. This can be used in declarations such as
8f5283f4
KW
1414
1415 WIDEST_UTYPE my_uv;
1416
1417or casts
1418
1419 my_uv = (WIDEST_UTYPE) val;
1420
1421=cut
1422
1423*/
bbe73871 1424#define WIDEST_UTYPE PERL_UINTMAX_T
7c062697 1425
de40ad3f
KW
1426/* Where there could be some confusion, use this as a static assert in macros
1427 * to make sure that a parameter isn't a pointer. But some compilers can't
1428 * handle this. The only one known so far that doesn't is gcc 3.3.6; the check
1429 * below isn't thorough for such an old compiler, so may have to be revised if
1430 * experience so dictates. */
1431#if ! PERL_IS_GCC || PERL_GCC_VERSION_GT(3,3,6)
1432# define ASSERT_NOT_PTR(x) ((x) | 0)
1433#else
1434# define ASSERT_NOT_PTR(x) (x)
1435#endif
296969d3 1436
08e4e7bf 1437/* Likewise, this is effectively a static assert to be used to guarantee the
43131e10
KW
1438 * parameter is a pointer
1439 *
1440 * NOT suitable for void*
1441 */
08e4e7bf
KW
1442#define ASSERT_IS_PTR(x) (__ASSERT_(sizeof(*(x))) (x))
1443
3912bc88
KW
1444/* FITS_IN_8_BITS(c) returns true if c doesn't have a bit set other than in
1445 * the lower 8. It is designed to be hopefully bomb-proof, making sure that no
1446 * bits of information are lost even on a 64-bit machine, but to get the
1447 * compiler to optimize it out if possible. This is because Configure makes
1448 * sure that the machine has an 8-bit byte, so if c is stored in a byte, the
1449 * sizeof() guarantees that this evaluates to a constant true at compile time.
7e75d1a1
JH
1450 *
1451 * For Coverity, be always true, because otherwise Coverity thinks
1452 * it finds several expressions that are always true, independent
1453 * of operands. Well, they are, but that is kind of the point.
220c71bf 1454 */
#ifdef __COVERITY__
   /* Constant true under Coverity, which otherwise reports the real
    * expression as being independent of its operands */
#  define FITS_IN_8_BITS(c)  (1)
#else
   /* A one-byte operand fits trivially.  Otherwise nothing may be left once
    * the low 8 bits have been shifted away.  The '| 0' part in
    * ASSERT_NOT_PTR ensures a compiler error if c is not integer (like e.g.,
    * a pointer) */
#  define FITS_IN_8_BITS(c)                                                 \
        (   (sizeof(c) == 1)                                                \
         || ! (((WIDEST_UTYPE) ASSERT_NOT_PTR(c)) >> 8))
#endif
cf301eb7 1463
45f4bb73 1464/* Returns true if l <= c <= (l + n), where 'l' and 'n' are non-negative
833b0f46 1465 * Written this way so that after optimization, only one conditional test is
76d3ad4c
KW
1466 * needed. (The NV casts stop any warnings about comparison always being true
1467 * if called with an unsigned. The cast preserves the sign, which is all we
1468 * care about.) */
92a0bb24
KW
/* For internal use only: the guts of withinCOUNT() without the argument
 * asserts, for places where 'l' and 'n' are already known to be valid.
 * inRANGE(), for example, expands this several times but does the necessary
 * asserts itself, once; the duplicated asserts were exceeding the internal
 * limits of some compilers.  Because the subtraction is done in the unsigned
 * WIDEST_UTYPE, any c below l wraps to a huge value, so the single '<='
 * covers both ends of the range. */
#define withinCOUNT_KNOWN_VALID_(c, l, n)                                   \
    (   (((WIDEST_UTYPE) (c)) - ASSERT_NOT_PTR(l))                          \
     <= ((WIDEST_UTYPE) ASSERT_NOT_PTR(n)))

/* The checked form: true iff l <= c <= (l + n), after asserting that neither
 * 'l' nor 'n' is negative. */
#define withinCOUNT(c, l, n)                                                \
    (__ASSERT_((NV) (l) >= 0)                                               \
     __ASSERT_((NV) (n) >= 0)                                               \
     withinCOUNT_KNOWN_VALID_((c), (l), (n)))
833b0f46 1481
94250c4f
KW
1482/* Returns true if c is in the range l..u, where 'l' is non-negative
1483 * Written this way so that after optimization, only one conditional test is
4758c20d 1484 * needed. */
92a0bb24
KW
/* For internal use: does the range test once 'c' has been cast to 'cast'.
 * It is also used by machine-generated code which generates known valid
 * calls, with a known sizeof(); that avoids the extra code and asserts that
 * were exceeding internal limits of some compilers. */
#define inRANGE_helper_(cast, c, l, u)                                      \
    withinCOUNT_KNOWN_VALID_(((cast) (c)), (l), ((u) - (l)))

/* True iff c is in the range l..u.  'l' is asserted to be non-negative and
 * 'u' to be no smaller than 'l'.  'c' is cast to the U8/U16/U32/WIDEST_UTYPE
 * whose size matches sizeof(c) before the comparison. */
#define inRANGE(c, l, u)                                                    \
    (__ASSERT_((NV) (l) >= 0) __ASSERT_((u) >= (l))                         \
     (  (sizeof(c) == sizeof(U8))  ? inRANGE_helper_(U8,  (c), (l), (u))    \
      : (sizeof(c) == sizeof(U16)) ? inRANGE_helper_(U16, (c), (l), (u))    \
      : (sizeof(c) == sizeof(U32)) ? inRANGE_helper_(U32, (c), (l), (u))    \
      : (__ASSERT_(sizeof(c) == sizeof(WIDEST_UTYPE))                       \
         inRANGE_helper_(WIDEST_UTYPE, (c), (l), (u)))))
305fe86e 1497
41f43cc2 1498#ifdef EBCDIC
b6340bd0 1499# ifndef _ALL_SOURCE
0852beac
KW
1500 /* The native libc isascii() et al. functions return the wrong results
1501 * on at least z/OS unless this is defined. */
b6340bd0
KW
1502# error _ALL_SOURCE should probably be defined
1503# endif
41f43cc2 1504#else
0852beac
KW
1505 /* There is a simple definition of ASCII for ASCII platforms. But the
1506 * EBCDIC one isn't so simple, so is defined using table look-up like the
9c903d59 1507 * other macros below.
3f3c579d
KW
1508 *
1509 * The cast here is used instead of '(c) >= 0', because some compilers emit
1510 * a warning that that test is always true when the parameter is an
1511 * unsigned type. khw supposes that it could be written as
1512 * && ((c) == '\0' || (c) > 0)
1513 * to avoid the message, but the cast will likely avoid extra branches even
296969d3
KW
1514 * with stupid compilers. */
1515# define isASCII(c) (((WIDEST_UTYPE) ASSERT_NOT_PTR(c)) < 128)
41f43cc2
KW
1516#endif
1517
38694112
KW
1518/* Take the eight possible bit patterns of the lower 3 bits and you get the
1519 * lower 3 bits of the 8 octal digits, in both ASCII and EBCDIC, so those bits
1520 * can be ignored. If the rest match '0', we have an octal */
296969d3 1521#define isOCTAL_A(c) ((((WIDEST_UTYPE) ASSERT_NOT_PTR(c)) & ~7) == '0')
c2da0b36 1522
9fb1bf9d 1523#ifdef H_PERL /* If have access to perl.h, lookup in its table */
f4cdb42c 1524
a500dc72
KW
1525/* Character class numbers. For internal core Perl use only. The ones less
1526 * than 32 are used in PL_charclass[] and the ones up through the one that
91456fff 1527 * corresponds to <HIGHEST_REGCOMP_DOT_H_SYNC_> are used by regcomp.h and
a500dc72
KW
1528 * related files. PL_charclass ones use names used in l1_char_class_tab.h but
1529 * their actual definitions are here. If that file has a name not used here,
1530 * it won't compile.
1709d539
KW
1531 *
1532 * The first group of these is ordered in what I (khw) estimate to be the
31c7f561 1533 * frequency of their use. This gives a slight edge to exiting a loop earlier
58a3ba2c
KW
1534 * (in reginclass() in regexec.c). Except \v should be last, as it isn't a
1535 * real Posix character class, and some (small) inefficiencies in regular
1536 * expression handling would be introduced by putting it in the middle of those
1537 * that are. Also, cntrl and ascii come after the others as it may be useful
1538 * to group these which have no members that match above Latin1, (or above
1539 * ASCII in the latter case) */
1540
91456fff
KW
1541# define CC_WORDCHAR_ 0 /* \w and [:word:] */
1542# define CC_DIGIT_ 1 /* \d and [:digit:] */
1543# define CC_ALPHA_ 2 /* [:alpha:] */
1544# define CC_LOWER_ 3 /* [:lower:] */
1545# define CC_UPPER_ 4 /* [:upper:] */
1546# define CC_PUNCT_ 5 /* [:punct:] */
1547# define CC_PRINT_ 6 /* [:print:] */
1548# define CC_ALPHANUMERIC_ 7 /* [:alnum:] */
1549# define CC_GRAPH_ 8 /* [:graph:] */
1550# define CC_CASED_ 9 /* [:lower:] or [:upper:] under /i */
1551# define CC_SPACE_ 10 /* \s, [:space:] */
1552# define CC_BLANK_ 11 /* [:blank:] */
1553# define CC_XDIGIT_ 12 /* [:xdigit:] */
1554# define CC_CNTRL_ 13 /* [:cntrl:] */
1555# define CC_ASCII_ 14 /* [:ascii:] */
1556# define CC_VERTSPACE_ 15 /* \v */
1557
1558# define HIGHEST_REGCOMP_DOT_H_SYNC_ CC_VERTSPACE_
a0947d7b 1559
1709d539 1560/* The members of the third group below do not need to be coordinated with data
3ffc8c70 1561 * structures in regcomp.[ch] and regexec.c. */
91456fff
KW
1562# define CC_IDFIRST_ 16
1563# define CC_CHARNAME_CONT_ 17
1564# define CC_NONLATIN1_FOLD_ 18
1565# define CC_NONLATIN1_SIMPLE_FOLD_ 19
1566# define CC_QUOTEMETA_ 20
1567# define CC_NON_FINAL_FOLD_ 21
1568# define CC_IS_IN_SOME_FOLD_ 22
1569# define CC_BINDIGIT_ 23
1570# define CC_OCTDIGIT_ 24
1571# define CC_MNEMONIC_CNTRL_ 25
073c22b3 1572
51b58dba 1573/* Unused: 26-31
f4cdb42c
KW
1574 * If more bits are needed, one could add a second word for non-64bit
1575 * QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd
37ede926
KW
1576 * word or not. The IS_IN_SOME_FOLD bit is the most easily expendable, as it
1577 * is used only for optimization (as of this writing), and differs in the
1578 * Latin1 range from the ALPHA bit only in two relatively unimportant
a500dc72 1579 * characters: the masculine and feminine ordinal indicators, so removing it
073c22b3
KW
1580 * would just cause /i regexes which match them to run less efficiently.
1581 * Similarly the EBCDIC-only bits are used just for speed, and could be
1582 * replaced by other means */
96ac0975 1583
3a371f2f
KW
#if defined(PERL_CORE) || defined(PERL_EXT)
/* The character class numbers again, this time as an enum, to help compilers
 * optimize.  Each enumerator takes the value of the like-named CC_foo_
 * constant; they are listed here in ascending order of those values. */
typedef enum {
    CC_ENUM_WORDCHAR_     = CC_WORDCHAR_,
    CC_ENUM_DIGIT_        = CC_DIGIT_,
    CC_ENUM_ALPHA_        = CC_ALPHA_,
    CC_ENUM_LOWER_        = CC_LOWER_,
    CC_ENUM_UPPER_        = CC_UPPER_,
    CC_ENUM_PUNCT_        = CC_PUNCT_,
    CC_ENUM_PRINT_        = CC_PRINT_,
    CC_ENUM_ALPHANUMERIC_ = CC_ALPHANUMERIC_,
    CC_ENUM_GRAPH_        = CC_GRAPH_,
    CC_ENUM_CASED_        = CC_CASED_,
    CC_ENUM_SPACE_        = CC_SPACE_,
    CC_ENUM_BLANK_        = CC_BLANK_,
    CC_ENUM_XDIGIT_       = CC_XDIGIT_,
    CC_ENUM_CNTRL_        = CC_CNTRL_,
    CC_ENUM_ASCII_        = CC_ASCII_,
    CC_ENUM_VERTSPACE_    = CC_VERTSPACE_
} char_class_number_;
#endif
1606
91456fff 1607#define POSIX_CC_COUNT (HIGHEST_REGCOMP_DOT_H_SYNC_ + 1)
63c61c3f 1608
6635f04f 1609START_EXTERN_C
96ac0975
NC
1610# ifdef DOINIT
1611EXTCONST U32 PL_charclass[] = {
1612# include "l1_char_class_tab.h"
1613};
1614
1615# else /* ! DOINIT */
1616EXTCONST U32 PL_charclass[];
1617# endif
6635f04f 1618END_EXTERN_C
96ac0975 1619
265c1f46 1620 /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */
91456fff 1621# define CC_mask_(classnum) (1U << (classnum))
4650c663
KW
1622
1623 /* For internal core Perl use only: the base macro for defining macros like
1624 * isALPHA */
6eb62d23 1625# define generic_isCC_(c, classnum) cBOOL(FITS_IN_8_BITS(c) \
91456fff 1626 && (PL_charclass[(U8) (c)] & CC_mask_(classnum)))
4eeeb416 1627
f4cdb42c
KW
1628 /* The mask for the _A versions of the macros; it just adds in the bit for
1629 * ASCII. */
91456fff 1630# define CC_mask_A_(classnum) (CC_mask_(classnum) | CC_mask_(CC_ASCII_))
f4cdb42c 1631
4650c663
KW
1632 /* For internal core Perl use only: the base macro for defining macros like
1633 * isALPHA_A. The foo_A version makes sure that both the desired bit and
1634 * the ASCII bit are present */
6eb62d23 1635# define generic_isCC_A_(c, classnum) (FITS_IN_8_BITS(c) \
91456fff
KW
1636 && ((PL_charclass[(U8) (c)] & CC_mask_A_(classnum)) \
1637 == CC_mask_A_(classnum)))
f4cdb42c 1638
26c1d9d8
KW
1639/* On ASCII platforms certain classes form a single range. It's faster to
1640 * special case these. isDIGIT is a single range on all platforms */
#   ifndef EBCDIC
        /* On ASCII platforms each of these is a single range.  For isALPHA_A,
         * masking off the one bit in which 'A' and 'a' differ folds the
         * lowercase onto the uppercase, so one range test serves for both */
#       define isALPHA_A(c)  inRANGE((~('A' ^ 'a') & (c)), 'A', 'Z')
#       define isGRAPH_A(c)  inRANGE(c, ' ' + 1, 0x7e)
#       define isLOWER_A(c)  inRANGE(c, 'a', 'z')
#       define isPRINT_A(c)  inRANGE(c, ' ', 0x7e)
#       define isUPPER_A(c)  inRANGE(c, 'A', 'Z')
#   else
        /* On EBCDIC these go through the PL_charclass[] based macro */
#       define isALPHA_A(c)  generic_isCC_A_(c, CC_ALPHA_)
#       define isGRAPH_A(c)  generic_isCC_A_(c, CC_GRAPH_)
#       define isLOWER_A(c)  generic_isCC_A_(c, CC_LOWER_)
#       define isPRINT_A(c)  generic_isCC_A_(c, CC_PRINT_)
#       define isUPPER_A(c)  generic_isCC_A_(c, CC_UPPER_)
#   endif
91456fff
KW
1655# define isALPHANUMERIC_A(c) generic_isCC_A_(c, CC_ALPHANUMERIC_)
1656# define isBLANK_A(c) generic_isCC_A_(c, CC_BLANK_)
1657# define isCNTRL_A(c) generic_isCC_A_(c, CC_CNTRL_)
b877c1ff 1658# define isDIGIT_A(c) inRANGE(c, '0', '9')
91456fff
KW
1659# define isPUNCT_A(c) generic_isCC_A_(c, CC_PUNCT_)
1660# define isSPACE_A(c) generic_isCC_A_(c, CC_SPACE_)
1661# define isWORDCHAR_A(c) generic_isCC_A_(c, CC_WORDCHAR_)
1662# define isXDIGIT_A(c) generic_isCC_(c, CC_XDIGIT_) /* No non-ASCII xdigits
b7d90381 1663 */
91456fff
KW
1664# define isIDFIRST_A(c) generic_isCC_A_(c, CC_IDFIRST_)
1665# define isALPHA_L1(c) generic_isCC_(c, CC_ALPHA_)
1666# define isALPHANUMERIC_L1(c) generic_isCC_(c, CC_ALPHANUMERIC_)
1667# define isBLANK_L1(c) generic_isCC_(c, CC_BLANK_)
3ded5eb0
KW
1668
1669 /* continuation character for legal NAME in \N{NAME} */
91456fff 1670# define isCHARNAME_CONT(c) generic_isCC_(c, CC_CHARNAME_CONT_)
3ded5eb0 1671
91456fff
KW
1672# define isCNTRL_L1(c) generic_isCC_(c, CC_CNTRL_)
1673# define isGRAPH_L1(c) generic_isCC_(c, CC_GRAPH_)
1674# define isLOWER_L1(c) generic_isCC_(c, CC_LOWER_)
1675# define isPRINT_L1(c) generic_isCC_(c, CC_PRINT_)
b7d90381 1676# define isPSXSPC_L1(c) isSPACE_L1(c)
91456fff
KW
1677# define isPUNCT_L1(c) generic_isCC_(c, CC_PUNCT_)
1678# define isSPACE_L1(c) generic_isCC_(c, CC_SPACE_)
1679# define isUPPER_L1(c) generic_isCC_(c, CC_UPPER_)
1680# define isWORDCHAR_L1(c) generic_isCC_(c, CC_WORDCHAR_)
1681# define isIDFIRST_L1(c) generic_isCC_(c, CC_IDFIRST_)
f4cdb42c 1682
0852beac 1683# ifdef EBCDIC
91456fff 1684# define isASCII(c) generic_isCC_(c, CC_ASCII_)
0852beac
KW
1685# endif
1686
f12c0118 1687 /* Participates in a single-character fold with a character above 255 */
c62fdeb7 1688# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
81d43abf 1689# define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(c) \
c62fdeb7 1690 (( ! cBOOL(FITS_IN_8_BITS(c))) \
91456fff 1691 || (PL_charclass[(U8) (c)] & CC_mask_(CC_NONLATIN1_SIMPLE_FOLD_)))
c62fdeb7 1692
91456fff
KW
1693# define IS_NON_FINAL_FOLD(c) generic_isCC_(c, CC_NON_FINAL_FOLD_)
1694# define IS_IN_SOME_FOLD_L1(c) generic_isCC_(c, CC_IS_IN_SOME_FOLD_)
c62fdeb7 1695# endif
f12c0118
KW
1696
1697 /* Like the above, but also can be part of a multi-char fold */
c62fdeb7
KW
1698# define HAS_NONLATIN1_FOLD_CLOSURE(c) \
1699 ( (! cBOOL(FITS_IN_8_BITS(c))) \
91456fff 1700 || (PL_charclass[(U8) (c)] & CC_mask_(CC_NONLATIN1_FOLD_)))
430b7c70 1701
91456fff 1702# define _isQUOTEMETA(c) generic_isCC_(c, CC_QUOTEMETA_)
5e6ebb12
KW
1703
1704/* is c a control character for which we have a mnemonic? */
1705# if defined(PERL_CORE) || defined(PERL_EXT)
91456fff 1706# define isMNEMONIC_CNTRL(c) generic_isCC_(c, CC_MNEMONIC_CNTRL_)
5e6ebb12 1707# endif
687c8d01 1708#else /* else we don't have perl.h H_PERL */
3ded5eb0
KW
1709
1710 /* If we don't have perl.h, we are compiling a utility program. Below we
1711 * hard-code various macro definitions that wouldn't otherwise be available
fc273927 1712 * to it. Most are coded based on first principles. These are written to
74665a89 1713 * avoid EBCDIC vs. ASCII #ifdef's as much as possible. */
#   define isDIGIT_A(c)  inRANGE(c, '0', '9')
#   define isBLANK_A(c)  ((c) == ' ' || (c) == '\t')
#   define isSPACE_A(c)  (   isBLANK_A(c)                                   \
                          || (c) == '\n'                                    \
                          || (c) == '\r'                                    \
                          || (c) == '\v'                                    \
                          || (c) == '\f')

    /* On EBCDIC, there are gaps between 'i' and 'j'; 'r' and 's'.  Same for
     * uppercase.  The tests for those aren't necessary on ASCII, but hurt only
     * performance (if optimization isn't on), and allow the same code to be
     * used for both platform types.
     *
     * Each expansion is wrapped in its own parentheses so that it behaves as
     * a single operand: without them '! isLOWER_A(c)' negated only the first
     * range test, and 'isUPPER_A(c) && x' bound the '&&' to only the last
     * one. */
#   define isLOWER_A(c)  (   inRANGE((c), 'a', 'i')                         \
                          || inRANGE((c), 'j', 'r')                         \
                          || inRANGE((c), 's', 'z'))
#   define isUPPER_A(c)  (   inRANGE((c), 'A', 'I')                         \
                          || inRANGE((c), 'J', 'R')                         \
                          || inRANGE((c), 'S', 'Z'))
#   define isALPHA_A(c)  (isUPPER_A(c) || isLOWER_A(c))
#   define isALPHANUMERIC_A(c) (isALPHA_A(c) || isDIGIT_A(c))
#   define isWORDCHAR_A(c)   (isALPHANUMERIC_A(c) || (c) == '_')
#   define isIDFIRST_A(c)    (isALPHA_A(c) || (c) == '_')

    /* 'a'..'f' and 'A'..'F' are contiguous on both ASCII and EBCDIC.  (The
     * closing parenthesis of this expansion used to be missing.) */
#   define isXDIGIT_A(c) (   isDIGIT_A(c)                                   \
                          || inRANGE((c), 'a', 'f')                         \
                          || inRANGE((c), 'A', 'F'))
#   define isPUNCT_A(c)  ((c) == '-' || (c) == '!' || (c) == '"'            \
                       || (c) == '#' || (c) == '$' || (c) == '%'            \
                       || (c) == '&' || (c) == '\'' || (c) == '('           \
                       || (c) == ')' || (c) == '*' || (c) == '+'            \
                       || (c) == ',' || (c) == '.' || (c) == '/'            \
                       || (c) == ':' || (c) == ';' || (c) == '<'            \
                       || (c) == '=' || (c) == '>' || (c) == '?'            \
                       || (c) == '@' || (c) == '[' || (c) == '\\'           \
                       || (c) == ']' || (c) == '^' || (c) == '_'            \
                       || (c) == '`' || (c) == '{' || (c) == '|'            \
                       || (c) == '}' || (c) == '~')
#   define isGRAPH_A(c)  (isALPHANUMERIC_A(c) || isPUNCT_A(c))
#   define isPRINT_A(c)  (isGRAPH_A(c) || (c) == ' ')
3ded5eb0 1751
0852beac 1752# ifdef EBCDIC
74665a89
KW
1753 /* The below is accurate for the 3 EBCDIC code pages traditionally
1754 * supported by perl. The only difference between them in the controls
1755 * is the position of \n, and that is represented symbolically below */
1756# define isCNTRL_A(c) ((c) == '\0' || (c) == '\a' || (c) == '\b' \
1757 || (c) == '\f' || (c) == '\n' || (c) == '\r' \
1758 || (c) == '\t' || (c) == '\v' \
182c4ace 1759 || inRANGE((c), 1, 3) /* SOH, STX, ETX */ \
8ec0a736 1760 || (c) == 7F /* U+7F DEL */ \
182c4ace
KW
1761 || inRANGE((c), 0x0E, 0x13) /* SO SI DLE \
1762 DC[1-3] */ \
74665a89
KW
1763 || (c) == 0x18 /* U+18 CAN */ \
1764 || (c) == 0x19 /* U+19 EOM */ \
182c4ace 1765 || inRANGE((c), 0x1C, 0x1F) /* [FGRU]S */ \
74665a89
KW
1766 || (c) == 0x26 /* U+17 ETB */ \
1767 || (c) == 0x27 /* U+1B ESC */ \
1768 || (c) == 0x2D /* U+05 ENQ */ \
1769 || (c) == 0x2E /* U+06 ACK */ \
1770 || (c) == 0x32 /* U+16 SYN */ \
1771 || (c) == 0x37 /* U+04 EOT */ \
1772 || (c) == 0x3C /* U+14 DC4 */ \
1773 || (c) == 0x3D /* U+15 NAK */ \
1774 || (c) == 0x3F)/* U+1A SUB */
0852beac 1775# define isASCII(c) (isCNTRL_A(c) || isPRINT_A(c))
74665a89
KW
1776# else /* isASCII is already defined for ASCII platforms, so can use that to
1777 define isCNTRL */
1778# define isCNTRL_A(c) (isASCII(c) && ! isPRINT_A(c))
0852beac
KW
1779# endif
1780
3ffc8c70 1781 /* The _L1 macros may be unnecessary for the utilities; I (khw) added them
caa94d35
KW
1782 * during debugging, and it seems best to keep them. We may be called
1783 * without NATIVE_TO_LATIN1 being defined. On ASCII platforms, it doesn't
1784 * do anything anyway, so make it not a problem */
1785# if ! defined(EBCDIC) && ! defined(NATIVE_TO_LATIN1)
1786# define NATIVE_TO_LATIN1(ch) (ch)
1787# endif
3ded5eb0
KW
1788# define isALPHA_L1(c) (isUPPER_L1(c) || isLOWER_L1(c))
1789# define isALPHANUMERIC_L1(c) (isALPHA_L1(c) || isDIGIT_A(c))
1790# define isBLANK_L1(c) (isBLANK_A(c) \
1791 || (FITS_IN_8_BITS(c) \
1792 && NATIVE_TO_LATIN1((U8) c) == 0xA0))
1793# define isCNTRL_L1(c) (FITS_IN_8_BITS(c) && (! isPRINT_L1(c)))
1794# define isGRAPH_L1(c) (isPRINT_L1(c) && (! isBLANK_L1(c)))
1795# define isLOWER_L1(c) (isLOWER_A(c) \
1796 || (FITS_IN_8_BITS(c) \
ae683a5f 1797 && (( NATIVE_TO_LATIN1((U8) c) >= 0xDF \
3ded5eb0
KW
1798 && NATIVE_TO_LATIN1((U8) c) != 0xF7) \
1799 || NATIVE_TO_LATIN1((U8) c) == 0xAA \
1800 || NATIVE_TO_LATIN1((U8) c) == 0xBA \
1801 || NATIVE_TO_LATIN1((U8) c) == 0xB5)))
1802# define isPRINT_L1(c) (isPRINT_A(c) \
1803 || (FITS_IN_8_BITS(c) \
1804 && NATIVE_TO_LATIN1((U8) c) >= 0xA0))
3ded5eb0
KW
1805# define isPUNCT_L1(c) (isPUNCT_A(c) \
1806 || (FITS_IN_8_BITS(c) \
ae683a5f 1807 && ( NATIVE_TO_LATIN1((U8) c) == 0xA1 \
3ded5eb0
KW
1808 || NATIVE_TO_LATIN1((U8) c) == 0xA7 \
1809 || NATIVE_TO_LATIN1((U8) c) == 0xAB \
1810 || NATIVE_TO_LATIN1((U8) c) == 0xB6 \
1811 || NATIVE_TO_LATIN1((U8) c) == 0xB7 \
1812 || NATIVE_TO_LATIN1((U8) c) == 0xBB \
1813 || NATIVE_TO_LATIN1((U8) c) == 0xBF)))
1814# define isSPACE_L1(c) (isSPACE_A(c) \
1815 || (FITS_IN_8_BITS(c) \
ae683a5f 1816 && ( NATIVE_TO_LATIN1((U8) c) == 0x85 \
3ded5eb0
KW
1817 || NATIVE_TO_LATIN1((U8) c) == 0xA0)))
1818# define isUPPER_L1(c) (isUPPER_A(c) \
1819 || (FITS_IN_8_BITS(c) \
182c4ace
KW
1820 && ( IN_RANGE(NATIVE_TO_LATIN1((U8) c), \
1821 0xC0, 0xDE) \
3ded5eb0
KW
1822 && NATIVE_TO_LATIN1((U8) c) != 0xD7)))
1823# define isWORDCHAR_L1(c) (isIDFIRST_L1(c) || isDIGIT_A(c))
1824# define isIDFIRST_L1(c) (isALPHA_L1(c) || NATIVE_TO_LATIN1(c) == '_')
1825# define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) \
1826 || isBLANK_L1(c) \
1827 || (c) == '-' \
1828 || (c) == '(' \
1829 || (c) == ')')
1830 /* The following are not fully accurate in the above-ASCII range. I (khw)
1831 * don't think it's necessary to be so for the purposes where this gets
1832 * compiled */
6eb62d23 1833# define isQUOTEMETA_(c) (FITS_IN_8_BITS(c) && ! isWORDCHAR_L1(c))
3ded5eb0
KW
1834# define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) isALPHA_L1(c)
1835
1836 /* And these aren't accurate at all. They are useful only for above
1837 * Latin1, which utilities and bootstrapping don't deal with */
1838# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 0
6838b41e 1839# define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0
3ded5eb0
KW
1840# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0
1841
1842 /* Many of the macros later in this file are defined in terms of these. By
1843 * implementing them with a function, which converts the class number into
1844 * a call to the desired macro, all of the later ones work. However, that
1845 * function won't be actually defined when building a utility program (no
1846 * perl.h), and so a compiler error will be generated if one is attempted
1847 * to be used. And the above-Latin1 code points require Unicode tables to
1848 * be present, something unlikely to be the case when bootstrapping */
6eb62d23 1849# define generic_isCC_(c, classnum) \
3ded5eb0 1850 (FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), TRUE))
6eb62d23 1851# define generic_isCC_A_(c, classnum) \
3ded5eb0 1852 (FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), FALSE))
687c8d01 1853#endif /* End of no perl.h H_PERL */
8a58bdcf 1854
e66b99e9
KW
1855#define isALPHANUMERIC(c) isALPHANUMERIC_A(c)
1856#define isALPHA(c) isALPHA_A(c)
0852beac
KW
1857#define isASCII_A(c) isASCII(c)
1858#define isASCII_L1(c) isASCII(c)
e66b99e9
KW
1859#define isBLANK(c) isBLANK_A(c)
1860#define isCNTRL(c) isCNTRL_A(c)
1861#define isDIGIT(c) isDIGIT_A(c)
1862#define isGRAPH(c) isGRAPH_A(c)
1863#define isIDFIRST(c) isIDFIRST_A(c)
1864#define isLOWER(c) isLOWER_A(c)
1865#define isPRINT(c) isPRINT_A(c)
779cf272 1866#define isPSXSPC_A(c) isSPACE_A(c)
e66b99e9 1867#define isPSXSPC(c) isPSXSPC_A(c)
779cf272 1868#define isPSXSPC_L1(c) isSPACE_L1(c)
e66b99e9
KW
1869#define isPUNCT(c) isPUNCT_A(c)
1870#define isSPACE(c) isSPACE_A(c)
1871#define isUPPER(c) isUPPER_A(c)
1872#define isWORDCHAR(c) isWORDCHAR_A(c)
1873#define isXDIGIT(c) isXDIGIT_A(c)
1874
1875/* ASCII casing. These could also be written as
1876 #define toLOWER(c) (isASCII(c) ? toLOWER_LATIN1(c) : (c))
1877 #define toUPPER(c) (isASCII(c) ? toUPPER_LATIN1_MOD(c) : (c))
1878 which uses table lookup and mask instead of subtraction. (This would
c5e9991e
KW
1879 work because the _MOD does not apply in the ASCII range).
1880
1881 These actually are UTF-8 invariant casing, not just ASCII, as any non-ASCII
1882 UTF-8 invariants are neither upper nor lower. (Only on EBCDIC platforms are
1883 there non-ASCII invariants, and all of them are controls.) */
68067e4e
DM
1884#define toLOWER(c) (isUPPER(c) ? (U8)((c) + ('a' - 'A')) : (c))
1885#define toUPPER(c) (isLOWER(c) ? (U8)((c) - ('a' - 'A')) : (c))
bbce6d69 1886
25200305
KW
1887/* In the ASCII range, these are equivalent to what they're here defined to be.
1888 * But by creating these definitions, other code doesn't have to be aware of
c5e9991e
KW
1889 * this detail. Actually this works for all UTF-8 invariants, not just the
1890 * ASCII range. (EBCDIC platforms can have non-ASCII invariants.) */
25200305 1891#define toFOLD(c) toLOWER(c)
25200305
KW
1892#define toTITLE(c) toUPPER(c)
1893
c753c8d3
KW
1894#define toLOWER_A(c) toLOWER(c)
1895#define toUPPER_A(c) toUPPER(c)
25200305
KW
1896#define toFOLD_A(c) toFOLD(c)
1897#define toTITLE_A(c) toTITLE(c)
1a0901db 1898
4650c663 1899/* Use table lookup for speed; returns the input itself if is out-of-range */
b2bf251f 1900#define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \
8e7c6e7d 1901 ? (c) \
f4cd282c 1902 : PL_latin1_lc[ (U8) (c) ])
c753c8d3
KW
1903#define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */
1904
1a0901db 1905/* Modified uc. Is correct uc except for three non-ascii chars which are
4650c663
KW
1906 * all mapped to one of them, and these need special handling; returns the
1907 * input itself if is out-of-range */
b2bf251f 1908#define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \
8e7c6e7d 1909 ? (c) \
f4cd282c 1910 : PL_mod_latin1_uc[ (U8) (c) ])
f41910bf
KW
1911#ifdef USE_LOCALE_CTYPE
1912# define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale
1913# define IN_UTF8_TURKIC_LOCALE PL_in_utf8_turkic_locale
1914#else
1915# define IN_UTF8_CTYPE_LOCALE false
1916# define IN_UTF8_TURKIC_LOCALE false
1917#endif
84061b6a 1918
beab9ebe
KW
1919/* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */
1920
1921/* For internal core Perl use only: the base macro for defining macros like
1922 * isALPHA_LC, which uses the current LC_CTYPE locale. 'c' is the code point
31f05a37 1923 * (0-255) to check. In a UTF-8 locale, the result is the same as calling
53049083
KW
1924 * isFOO_L1(); 'classnum' is something like CC_UPPER_, which gives the class
1925 * number for doing this. For non-UTF-8 locales, the code to actually do the
1926 * test is passed in 'non_utf8_func'. If 'c' is above 255, 0 is returned. For
1927 * accessing the full range of possible code points under locale rules, use the
1928 * macros based on generic_LC_uvchr_ instead of this. */
1929#define generic_LC_base_(c, classnum, non_utf8_func) \
81d43abf
KW
1930 (! FITS_IN_8_BITS(c) \
1931 ? 0 \
1932 : IN_UTF8_CTYPE_LOCALE \
53049083
KW
1933 ? cBOOL(PL_charclass[(U8) (c)] & CC_mask_(classnum)) \
1934 : cBOOL(non_utf8_func(c)))
beab9ebe 1935
ef620431
KW
1936/* A helper macro for defining macros like isALPHA_LC. On systems without
1937 * proper locales, these reduce to, e.g., isALPHA_A */
1938#ifdef CTYPE256
1939# define generic_LC_(c, classnum, non_utf8_func) \
1940 generic_LC_base_(c, classnum, non_utf8_func)
1941#else
1942# define generic_LC_(c, classnum, non_utf8_func) \
1943 generic_isCC_A_(c, classnum)
1944#endif
beab9ebe 1945
33bdb9d3
KW
1946/* Below are the definitions for the locale-sensitive character classification
1947 * macros whose input domain is a byte, and the locale isn't UTF-8. These are
1948 * as close as possible to the bare versions on the platform and still yield
1949 * POSIX Standard-compliant results.
1950 *
1951 * There is currently only one place these definitions should be used, in
1952 * certain function calls like Perl_iswordchar_() in inline.h.
1953 *
1954 * Most likely you want to use the macros a ways below with names like
1955 * isALPHA_LC(). Rarely, you may want isU8_ALPHA_LC(), somewhat below.
1956 *
1957 * The first two aren't in C89, so the fallback is to use the non-locale
1958 * sensitive versions; these are the same for all platforms */
f05550c0 1959#if defined(HAS_ISASCII)
cbc5b6f1 1960# define is_base_ASCII(c) isascii((U8) (c))
84061b6a 1961#else
cbc5b6f1 1962# define is_base_ASCII(c) isASCII(c)
84061b6a
KW
1963#endif
1964
f05550c0 1965#if defined(HAS_ISBLANK)
cbc5b6f1 1966# define is_base_BLANK(c) isblank((U8) (c))
6d432bcf 1967#else
cbc5b6f1 1968# define is_base_BLANK(c) isBLANK(c)
84061b6a
KW
1969#endif
1970
6d432bcf 1971/* The next few are the same in all platforms. */
cbc5b6f1
KW
1972#define is_base_CNTRL(c) iscntrl((U8) (c))
1973#define is_base_IDFIRST(c) (UNLIKELY((c) == '_') || is_base_ALPHA(c))
1974#define is_base_SPACE(c) isspace((U8) (c))
1975#define is_base_WORDCHAR(c) (UNLIKELY((c) == '_') || is_base_ALPHANUMERIC(c))
3f0486a3 1976
33bdb9d3 1977/* The base-level case changing macros are also the same in all platforms */
cbc5b6f1
KW
1978#define to_base_LOWER(c) tolower((U8) (c))
1979#define to_base_UPPER(c) toupper((U8) (c))
1980#define to_base_FOLD(c) to_base_LOWER(c)
33bdb9d3
KW
1981
1982#ifdef WIN32
81d43abf
KW
1983
1984/* The Windows functions don't bother to follow the POSIX standard, which for
1985 * example says that something can't both be a printable and a control. But
6d432bcf
KW
1986 * Windows treats \t as both a control and a printable, and does such things as
1987 * making superscripts into both digits and punctuation. These #defines tame
1988 * these flaws by assuming that the definitions of controls (and the other few
1989 * ones defined above) are correct, and then making sure that other definitions
1990 * don't have weirdnesses, by adding a check that \w and its subsets aren't
1991 * ispunct(), and things that are \W, like ispunct(), aren't controls. Not
1992 * all possible weirdnesses are checked for, just ones that were detected on
1993 * actual Microsoft code pages */
cbc5b6f1
KW
1994# define is_base_ALPHA(c) \
1995 (isalpha((U8) (c)) && ! is_base_PUNCT(c))
1996# define is_base_ALPHANUMERIC(c) \
1997 (isalnum((U8) (c)) && ! is_base_PUNCT(c))
1998# define is_base_CASED(c) \
1999 ((isupper((U8) (c)) || islower((U8) (c))) && ! is_base_PUNCT(c))
2000# define is_base_DIGIT(c) \
2001 (isdigit((U8) (c)) && ! is_base_PUNCT(c))
2002# define is_base_GRAPH(c) \
2003 (isgraph((U8) (c)) && ! is_base_CNTRL(c))
2004# define is_base_LOWER(c) \
2005 (islower((U8) (c)) && ! is_base_PUNCT(c))
2006# define is_base_PRINT(c) \
2007 (isprint((U8) (c)) && ! is_base_CNTRL(c))
2008# define is_base_PUNCT(c) \
2009 (ispunct((U8) (c)) && ! is_base_CNTRL(c))
2010# define is_base_UPPER(c) \
2011 (isupper((U8) (c)) && ! is_base_PUNCT(c))
2012# define is_base_XDIGIT(c) \
2013 (isxdigit((U8) (c)) && ! is_base_PUNCT(c))
6d432bcf
KW
2014#else
2015
33bdb9d3
KW
2016/* For all other platforms, as far as we know, isdigit(), etc. work sanely
2017 * enough */
cbc5b6f1
KW
2018# define is_base_ALPHA(c) isalpha((U8) (c))
2019# define is_base_ALPHANUMERIC(c) isalnum((U8) (c))
2020# define is_base_CASED(c) (islower((U8) (c)) || isupper((U8) (c)))
2021# define is_base_DIGIT(c) isdigit((U8) (c))
6d432bcf
KW
2022
2023 /* ... But it seems that IBM products treat NBSP as both a space and a
2024 * graphic; these are the two platforms that we have active test beds for.
2025 */
2026# if defined(OS390) || defined(_AIX)
cbc5b6f1 2027# define is_base_GRAPH(c) (isgraph((U8) (c)) && ! isspace((U8) (c)))
6d432bcf 2028# else
cbc5b6f1 2029# define is_base_GRAPH(c) isgraph((U8) (c))
3f0486a3 2030# endif
cbc5b6f1
KW
2031# define is_base_LOWER(c) islower((U8) (c))
2032# define is_base_PRINT(c) isprint((U8) (c))
2033# define is_base_PUNCT(c) ispunct((U8) (c))
2034# define is_base_UPPER(c) isupper((U8) (c))
2035# define is_base_XDIGIT(c) isxdigit((U8) (c))
6d432bcf
KW
2036#endif
2037
33bdb9d3
KW
2038/* Below is the next level up, which currently expands to nothing more
2039 * than the previous layer. These are the macros to use if you really need
2040 * something whose input domain is a byte, and the locale isn't UTF-8; that is,
2041 * where you normally would have to use things like bare isalnum().
2042 *
2043 * But most likely you should instead use the layer defined further below which
2044 * has names like isALPHA_LC. They deal with larger-than-byte inputs, and
2045 * UTF-8 locales.
2046 *
2047 * (Note, proper general operation of the bare libc functions requires you to
2048 * cast to U8. These do that for you automatically.) */
2049
cbc5b6f1 2050# define WRAP_U8_LC_(c, classnum, base) base(c)
33bdb9d3
KW
2051
2052#define isU8_ALPHANUMERIC_LC(c) \
cbc5b6f1
KW
2053 WRAP_U8_LC_((c), CC_ALPHANUMERIC_, is_base_ALPHANUMERIC)
2054#define isU8_ALPHA_LC(c) WRAP_U8_LC_((c), CC_ALPHA_, is_base_ALPHA)
2055#define isU8_ASCII_LC(c) WRAP_U8_LC_((c), CC_ASCII_, is_base_ASCII)
2056#define isU8_BLANK_LC(c) WRAP_U8_LC_((c), CC_BLANK_, is_base_BLANK)
2057#define isU8_CASED_LC(c) WRAP_U8_LC_((c), CC_CASED_, is_base_CASED)
2058#define isU8_CNTRL_LC(c) WRAP_U8_LC_((c), CC_CNTRL_, is_base_CNTRL)
2059#define isU8_DIGIT_LC(c) WRAP_U8_LC_((c), CC_DIGIT_, is_base_DIGIT)
2060#define isU8_GRAPH_LC(c) WRAP_U8_LC_((c), CC_GRAPH_, is_base_GRAPH)
2061#define isU8_IDFIRST_LC(c) WRAP_U8_LC_((c), CC_IDFIRST_, is_base_IDFIRST)
2062#define isU8_LOWER_LC(c) WRAP_U8_LC_((c), CC_LOWER_, is_base_LOWER)
2063#define isU8_PRINT_LC(c) WRAP_U8_LC_((c), CC_PRINT_, is_base_PRINT)
2064#define isU8_PUNCT_LC(c) WRAP_U8_LC_((c), CC_PUNCT_, is_base_PUNCT)
2065#define isU8_SPACE_LC(c) WRAP_U8_LC_((c), CC_SPACE_, is_base_SPACE)
2066#define isU8_UPPER_LC(c) WRAP_U8_LC_((c), CC_UPPER_, is_base_UPPER)
2067#define isU8_WORDCHAR_LC(c) WRAP_U8_LC_((c), CC_WORDCHAR_, is_base_WORDCHAR)
2068#define isU8_XDIGIT_LC(c) WRAP_U8_LC_((c), CC_XDIGIT_, is_base_XDIGIT)
2069
2070#define toU8_LOWER_LC(c) WRAP_U8_LC_((c), CC_TOLOWER_, to_base_LOWER)
2071#define toU8_UPPER_LC(c) WRAP_U8_LC_((c), CC_TOUPPER_, to_base_UPPER)
33bdb9d3
KW
2072#define toU8_FOLD_LC(c) toU8_LOWER_LC(c)
2073
6d432bcf
KW
2074/* The definitions below use the ones above to create versions in which the
2075 * input domain isn't restricted to bytes (though always returning false if the
2076 * input doesn't fit in a byte), and to behave properly should the locale be
33bdb9d3
KW
2077 * UTF-8. These are the documented ones, suitable for general use (though
2078 * toUPPER_LC and toFOLD_LC aren't documented because they need special
2079 * handling to deal with SHARP S expanding to two characters). */
2080
6d432bcf
KW
2081#define isASCII_LC(c) (FITS_IN_8_BITS(c) && isU8_ASCII_LC(c))
2082#define isALPHA_LC(c) generic_LC_(c, CC_ALPHA_, isU8_ALPHA_LC)
2083#define isALPHANUMERIC_LC(c) \
2084 generic_LC_(c, CC_ALPHANUMERIC_, isU8_ALPHANUMERIC_LC)
2085#define isBLANK_LC(c) generic_LC_(c, CC_BLANK_, isU8_BLANK_LC)
8fd8ea43 2086#define isCASED_LC(c) generic_LC_(c, CC_CASED_, isU8_CASED_LC)
6d432bcf
KW
2087#define isCNTRL_LC(c) generic_LC_(c, CC_CNTRL_, isU8_CNTRL_LC)
2088#define isDIGIT_LC(c) generic_LC_(c, CC_DIGIT_, isU8_DIGIT_LC)
2089#define isGRAPH_LC(c) generic_LC_(c, CC_GRAPH_, isU8_GRAPH_LC)
2090#define isIDFIRST_LC(c) generic_LC_(c, CC_IDFIRST_, isU8_IDFIRST_LC)
2091#define isLOWER_LC(c) generic_LC_(c, CC_LOWER_, isU8_LOWER_LC)
2092#define isPRINT_LC(c) generic_LC_(c, CC_PRINT_, isU8_PRINT_LC)
2093#define isPUNCT_LC(c) generic_LC_(c, CC_PUNCT_, isU8_PUNCT_LC)
2094#define isSPACE_LC(c) generic_LC_(c, CC_SPACE_, isU8_SPACE_LC)
2095#define isUPPER_LC(c) generic_LC_(c, CC_UPPER_, isU8_UPPER_LC)
2096#define isWORDCHAR_LC(c) generic_LC_(c, CC_WORDCHAR_, isU8_WORDCHAR_LC)
2097#define isXDIGIT_LC(c) generic_LC_(c, CC_XDIGIT_, isU8_XDIGIT_LC)
4a283f4f 2098
ef620431 2099#ifndef CTYPE256
d277535a
KW
2100# define toLOWER_LC(c) toLOWER_A(c)
2101# define toUPPER_LC(c) toUPPER_A(c)
2102# define toFOLD_LC(c) toFOLD_A(c)
ef620431
KW
2103#else
2104
4a283f4f
KW
2105/* In the next three macros, the reason for using the PL_latin arrays is in
2106 * case the system function is defective; it ensures uniform results that
2107 * conform to the Unicode standard. */
2108
2109/* This does not handle the anomalies in UTF-8 Turkic locales. */
31e89ad7 2110# define toLOWER_LC(c) ((! FITS_IN_8_BITS(c)) \
4a283f4f
KW
2111 ? (c) \
2112 : ((IN_UTF8_CTYPE_LOCALE) \
2113 ? PL_latin1_lc[ (U8) (c) ] \
6d432bcf 2114 : ((U8) toU8_LOWER_LC(c))))
4a283f4f
KW
2115
2116/* In this macro, note that the result can be larger than a byte in a UTF-8
2117 * locale. It returns a single value, so can't adequately return the upper
2118 * case of LATIN SMALL LETTER SHARP S in a UTF-8 locale (which should be a
2119 * string of two values "SS"); instead it asserts against that under
2120 * DEBUGGING, and otherwise returns its input. It does not handle the
2121 * anomalies in UTF-8 Turkic locales. */
31e89ad7 2122# define toUPPER_LC(c) \
4a283f4f
KW
2123 ((! FITS_IN_8_BITS(c)) \
2124 ? (c) \
2125 : ((! IN_UTF8_CTYPE_LOCALE) \
6d432bcf 2126 ? ((U8) toU8_UPPER_LC(c)) \
4a283f4f
KW
2127 : (UNLIKELY(((U8)(c)) == MICRO_SIGN) \
2128 ? GREEK_CAPITAL_LETTER_MU \
2129 : ((UNLIKELY(((U8) (c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \
2130 ? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS \
2131 : (UNLIKELY(((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S) \
2132 ? (__ASSERT_(0) (c)) /* Fail on Sharp S in DEBUGGING */ \
2133 : PL_mod_latin1_uc[ (U8) (c) ]))))))
2134
2135/* In this macro, note that the result can be larger than a byte in a UTF-8
2136 * locale. It returns a single value, so can't adequately return the fold case
2137 * of LATIN SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of
2138 * two values "ss"); instead it asserts against that under DEBUGGING, and
2139 * otherwise returns its input. It does not handle the anomalies in UTF-8
2140 * Turkic locales */
31e89ad7 2141# define toFOLD_LC(c) \
4a283f4f
KW
2142 ((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \
2143 ? GREEK_SMALL_LETTER_MU \
2144 : (__ASSERT_( ! IN_UTF8_CTYPE_LOCALE \
2145 || LIKELY((c) != LATIN_SMALL_LETTER_SHARP_S)) \
31e89ad7 2146 toLOWER_LC(c)))
f05550c0 2147#endif
55204971 2148
eba68aa0
KW
2149#define isIDCONT(c) isWORDCHAR(c)
2150#define isIDCONT_A(c) isWORDCHAR_A(c)
2151#define isIDCONT_L1(c) isWORDCHAR_L1(c)
2152#define isIDCONT_LC(c) isWORDCHAR_LC(c)
13380643 2153#define isPSXSPC_LC(c) isSPACE_LC(c)
aaa51d5e 2154
4650c663 2155/* For internal core Perl use only: the base macros for defining macros like
d0da05db 2156 * isALPHA_uvchr. 'c' is the code point to check. 'classnum' is the POSIX class
6eb62d23 2157 * number defined earlier in this file. generic_uvchr_() is used for POSIX
4650c663
KW
2158 * classes where there is a macro or function 'above_latin1' that takes the
2159 * single argument 'c' and returns the desired value. These exist for those
2366ba44 2160 * classes which have simple definitions, avoiding the overhead of an inversion
6eb62d23 2161 * list binary search. generic_invlist_uvchr_() can be used
4650c663 2162 * for classes where that overhead is faster than a direct lookup.
6eb62d23
KW
2163 * generic_uvchr_() won't compile if 'c' isn't unsigned, as it won't match the
2164 * 'above_latin1' prototype. The generic_isCC_() macro does bounds checking, so
4650c663
KW
2165 * the checks here are duplicates; versions of these macros without them could
2166 * be created, but experiments show that gcc optimizes them out anyway. */
66c17564
KW
2167
2168/* Note that all ignore 'use bytes' */
6eb62d23
KW
2169#define generic_uvchr_(classnum, above_latin1, c) ((c) < 256 \
2170 ? generic_isCC_(c, classnum) \
cd500f2f 2171 : above_latin1(c))
81d43abf 2172#define generic_invlist_uvchr_(classnum, c) ((c) < 256 \
6eb62d23 2173 ? generic_isCC_(c, classnum) \
922e8cb4 2174 : _is_uni_FOO(classnum, c))
91456fff
KW
2175#define isALPHA_uvchr(c) generic_invlist_uvchr_(CC_ALPHA_, c)
2176#define isALPHANUMERIC_uvchr(c) generic_invlist_uvchr_(CC_ALPHANUMERIC_, c)
d0da05db 2177#define isASCII_uvchr(c) isASCII(c)
91456fff 2178#define isBLANK_uvchr(c) generic_uvchr_(CC_BLANK_, is_HORIZWS_cp_high, c)
d0da05db 2179#define isCNTRL_uvchr(c) isCNTRL_L1(c) /* All controls are in Latin1 */
91456fff
KW
2180#define isDIGIT_uvchr(c) generic_invlist_uvchr_(CC_DIGIT_, c)
2181#define isGRAPH_uvchr(c) generic_invlist_uvchr_(CC_GRAPH_, c)
1e222e4f 2182#define isIDCONT_uvchr(c) \
91456fff 2183 generic_uvchr_(CC_WORDCHAR_, _is_uni_perl_idcont, c)
1e222e4f 2184#define isIDFIRST_uvchr(c) \
91456fff
KW
2185 generic_uvchr_(CC_IDFIRST_, _is_uni_perl_idstart, c)
2186#define isLOWER_uvchr(c) generic_invlist_uvchr_(CC_LOWER_, c)
2187#define isPRINT_uvchr(c) generic_invlist_uvchr_(CC_PRINT_, c)
d0da05db 2188
91456fff
KW
2189#define isPUNCT_uvchr(c) generic_invlist_uvchr_(CC_PUNCT_, c)
2190#define isSPACE_uvchr(c) generic_uvchr_(CC_SPACE_, is_XPERLSPACE_cp_high, c)
d0da05db
KW
2191#define isPSXSPC_uvchr(c) isSPACE_uvchr(c)
2192
91456fff
KW
2193#define isUPPER_uvchr(c) generic_invlist_uvchr_(CC_UPPER_, c)
2194#define isVERTWS_uvchr(c) generic_uvchr_(CC_VERTSPACE_, is_VERTWS_cp_high, c)
2195#define isWORDCHAR_uvchr(c) generic_invlist_uvchr_(CC_WORDCHAR_, c)
2196#define isXDIGIT_uvchr(c) generic_uvchr_(CC_XDIGIT_, is_XDIGIT_cp_high, c)
d0da05db
KW
2197
2198#define toFOLD_uvchr(c,s,l) to_uni_fold(c,s,l)
2199#define toLOWER_uvchr(c,s,l) to_uni_lower(c,s,l)
2200#define toTITLE_uvchr(c,s,l) to_uni_title(c,s,l)
2201#define toUPPER_uvchr(c,s,l) to_uni_upper(c,s,l)
2202
2203/* For backwards compatibility, even though '_uni' should mean official Unicode
2204 * code points, in Perl it means native for those below 256 */
2205#define isALPHA_uni(c) isALPHA_uvchr(c)
2206#define isALPHANUMERIC_uni(c) isALPHANUMERIC_uvchr(c)
2207#define isASCII_uni(c) isASCII_uvchr(c)
2208#define isBLANK_uni(c) isBLANK_uvchr(c)
2209#define isCNTRL_uni(c) isCNTRL_uvchr(c)
2210#define isDIGIT_uni(c) isDIGIT_uvchr(c)
2211#define isGRAPH_uni(c) isGRAPH_uvchr(c)
2212#define isIDCONT_uni(c) isIDCONT_uvchr(c)
2213#define isIDFIRST_uni(c) isIDFIRST_uvchr(c)
2214#define isLOWER_uni(c) isLOWER_uvchr(c)
2215#define isPRINT_uni(c) isPRINT_uvchr(c)
2216#define isPUNCT_uni(c) isPUNCT_uvchr(c)
2217#define isSPACE_uni(c) isSPACE_uvchr(c)
2218#define isPSXSPC_uni(c) isPSXSPC_uvchr(c)
2219#define isUPPER_uni(c) isUPPER_uvchr(c)
2220#define isVERTWS_uni(c) isVERTWS_uvchr(c)
2221#define isWORDCHAR_uni(c) isWORDCHAR_uvchr(c)
2222#define isXDIGIT_uni(c) isXDIGIT_uvchr(c)
2223#define toFOLD_uni(c,s,l) toFOLD_uvchr(c,s,l)
2224#define toLOWER_uni(c,s,l) toLOWER_uvchr(c,s,l)
2225#define toTITLE_uni(c,s,l) toTITLE_uvchr(c,s,l)
2226#define toUPPER_uni(c,s,l) toUPPER_uvchr(c,s,l)
a0ed51b3 2227
4650c663
KW
2228/* For internal core Perl use only: the base macros for defining macros like
2229 * isALPHA_LC_uvchr. These are like isALPHA_LC, but the input can be any code
6eb62d23 2230 * point, not just 0-255. Like generic_uvchr_, there are two versions, one for
4650c663 2231 * simple class definitions; the other for more complex. These are like
6eb62d23
KW
2232 * generic_uvchr_, so see it for more info. */
/* Dispatch on the code point 'c': when it is below 256 the result comes from
 * the macro 'latin1', otherwise from 'above_latin1'.  'c' is parenthesized in
 * the comparison so that an argument containing an operator that binds less
 * tightly than '<' (e.g. 'x | y') is compared as a whole.  'c' is expanded
 * more than once, so it must be free of side effects. */
#define generic_LC_uvchr_(latin1, above_latin1, c)                            \
                            ((c) < 256 ? latin1(c) : above_latin1(c))
/* Dispatch on the code point 'c': when it is below 256 the result comes from
 * the macro 'latin1', otherwise from _is_uni_FOO(classnum, c).  'c' is
 * parenthesized in the comparison so that an argument containing an operator
 * that binds less tightly than '<' is compared as a whole.  'c' is expanded
 * more than once, so it must be free of side effects. */
#define generic_LC_invlist_uvchr_(latin1, classnum, c)                        \
                            ((c) < 256 ? latin1(c) : _is_uni_FOO(classnum, c))
2237
91456fff 2238#define isALPHA_LC_uvchr(c) generic_LC_invlist_uvchr_(isALPHA_LC, CC_ALPHA_, c)
6eb62d23 2239#define isALPHANUMERIC_LC_uvchr(c) generic_LC_invlist_uvchr_(isALPHANUMERIC_LC, \
91456fff 2240 CC_ALPHANUMERIC_, c)
b7d90381 2241#define isASCII_LC_uvchr(c) isASCII_LC(c)
6eb62d23 2242#define isBLANK_LC_uvchr(c) generic_LC_uvchr_(isBLANK_LC, \
b7d90381 2243 is_HORIZWS_cp_high, c)
/* Code points below 256 are classified by isCNTRL_LC(); anything larger yields
 * 0.  'c' is parenthesized in the comparison so that an argument containing an
 * operator that binds less tightly than '<' is compared as a whole. */
#define isCNTRL_LC_uvchr(c)  ((c) < 256 ? isCNTRL_LC(c) : 0)
91456fff
KW
2245#define isDIGIT_LC_uvchr(c) generic_LC_invlist_uvchr_(isDIGIT_LC, CC_DIGIT_, c)
2246#define isGRAPH_LC_uvchr(c) generic_LC_invlist_uvchr_(isGRAPH_LC, CC_GRAPH_, c)
6eb62d23 2247#define isIDCONT_LC_uvchr(c) generic_LC_uvchr_(isIDCONT_LC, \
eba68aa0 2248 _is_uni_perl_idcont, c)
6eb62d23 2249#define isIDFIRST_LC_uvchr(c) generic_LC_uvchr_(isIDFIRST_LC, \
cd500f2f 2250 _is_uni_perl_idstart, c)
91456fff
KW
2251#define isLOWER_LC_uvchr(c) generic_LC_invlist_uvchr_(isLOWER_LC, CC_LOWER_, c)
2252#define isPRINT_LC_uvchr(c) generic_LC_invlist_uvchr_(isPRINT_LC, CC_PRINT_, c)
b7d90381 2253#define isPSXSPC_LC_uvchr(c) isSPACE_LC_uvchr(c)
91456fff 2254#define isPUNCT_LC_uvchr(c) generic_LC_invlist_uvchr_(isPUNCT_LC, CC_PUNCT_, c)
6eb62d23 2255#define isSPACE_LC_uvchr(c) generic_LC_uvchr_(isSPACE_LC, \
509fb054 2256 is_XPERLSPACE_cp_high, c)
91456fff 2257#define isUPPER_LC_uvchr(c) generic_LC_invlist_uvchr_(isUPPER_LC, CC_UPPER_, c)
81d43abf 2258#define isWORDCHAR_LC_uvchr(c) generic_LC_invlist_uvchr_(isWORDCHAR_LC, \
91456fff 2259 CC_WORDCHAR_, c)
81d43abf 2260#define isXDIGIT_LC_uvchr(c) generic_LC_uvchr_(isXDIGIT_LC, \
b7d90381 2261 is_XDIGIT_cp_high, c)
e712593e 2262
b7d90381 2263#define isBLANK_LC_uni(c) isBLANK_LC_uvchr(UNI_TO_NATIVE(c))
aaa51d5e 2264
da8c1a98
KW
2265/* The "_safe" macros make sure that we don't attempt to read beyond 'e', but
2266 * they don't otherwise go out of their way to look for malformed UTF-8. If
2267 * they can return accurate results without knowing if the input is otherwise
2268 * malformed, they do so. For example isASCII is accurate in spite of any
2269 * non-length malformations because it looks only at a single byte. Likewise
2270 * isDIGIT looks just at the first byte for code points 0-255, as all UTF-8
2271 * variant ones return FALSE. But, if the input has to be well-formed in order
2272 * for the results to be accurate, the macros will test and if malformed will
2273 * call a routine to die
2274 *
2275 * Except for toke.c, the macros do assume that e > p, asserting that on
2276 * DEBUGGING builds. Much code that calls these depends on this being true,
2277 * for other reasons. toke.c is treated specially as using the regular
2278 * assertion breaks it in many ways. All strings that these operate on there
2279 * are supposed to have an extra NUL character at the end, so that *e = \0. A
2280 * bunch of code in toke.c assumes that this is true, so the assertion allows
2281 * for that */
2282#ifdef PERL_IN_TOKE_C
2283# define _utf8_safe_assert(p,e) ((e) > (p) || ((e) == (p) && *(p) == '\0'))
2284#else
2285# define _utf8_safe_assert(p,e) ((e) > (p))
2286#endif
2287
6eb62d23 2288#define generic_utf8_safe_(classnum, p, e, above_latin1) \
c81b3562
KW
2289 ((! _utf8_safe_assert(p, e)) \
2290 ? (_force_out_malformed_utf8_message((U8 *) (p), (U8 *) (e), 0, 1), 0)\
2291 : (UTF8_IS_INVARIANT(*(p))) \
6eb62d23 2292 ? generic_isCC_(*(p), classnum) \
da8c1a98
KW
2293 : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
2294 ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
6eb62d23 2295 ? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \
da8c1a98
KW
2296 classnum) \
2297 : (_force_out_malformed_utf8_message( \
2298 (U8 *) (p), (U8 *) (e), 0, 1), 0)) \
2299 : above_latin1))
b7d90381
KW
2300/* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value.
2301 * 'above_latin1' can be a macro */
6eb62d23
KW
2302#define generic_func_utf8_safe_(classnum, above_latin1, p, e) \
2303 generic_utf8_safe_(classnum, p, e, above_latin1(p, e))
81d43abf 2304#define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \
6eb62d23 2305 generic_utf8_safe_(classnum, p, e, \
da8c1a98
KW
2306 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2307 ? (_force_out_malformed_utf8_message( \
2308 (U8 *) (p), (U8 *) (e), 0, 1), 0) \
2309 : above_latin1(p)))
2366ba44
KW
2310/* Like the above, but passes classnum to _is_utf8_FOO(), instead of having an
2311 * 'above_latin1' parameter */
81d43abf 2312#define generic_invlist_utf8_safe_(classnum, p, e) \
6eb62d23 2313 generic_utf8_safe_(classnum, p, e, _is_utf8_FOO(classnum, p, e))
922e8cb4 2314
cc8ab7c0 2315/* Like the above, but should be used only when it is known that there are no
ff7ecfc3
KW
2316 * characters in the upper-Latin1 range (128-255 on ASCII platforms) which the
2317 * class is TRUE for. Hence it can skip the tests for this range.
2318 * 'above_latin1' should include its arguments */
6eb62d23 2319#define generic_utf8_safe_no_upper_latin1_(classnum, p, e, above_latin1) \
da8c1a98 2320 (__ASSERT_(_utf8_safe_assert(p, e)) \
2d8dd9eb 2321 (isASCII(*(p))) \
6eb62d23 2322 ? generic_isCC_(*(p), classnum) \
da8c1a98
KW
2323 : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \
2324 ? 0 /* Note that doesn't check validity for latin1 */ \
2325 : above_latin1)
2326
84238efa 2327
059703b0
KW
2328#define isALPHA_utf8(p, e) isALPHA_utf8_safe(p, e)
2329#define isALPHANUMERIC_utf8(p, e) isALPHANUMERIC_utf8_safe(p, e)
2330#define isASCII_utf8(p, e) isASCII_utf8_safe(p, e)
2331#define isBLANK_utf8(p, e) isBLANK_utf8_safe(p, e)
2332#define isCNTRL_utf8(p, e) isCNTRL_utf8_safe(p, e)
2333#define isDIGIT_utf8(p, e) isDIGIT_utf8_safe(p, e)
2334#define isGRAPH_utf8(p, e) isGRAPH_utf8_safe(p, e)
2335#define isIDCONT_utf8(p, e) isIDCONT_utf8_safe(p, e)
2336#define isIDFIRST_utf8(p, e) isIDFIRST_utf8_safe(p, e)
2337#define isLOWER_utf8(p, e) isLOWER_utf8_safe(p, e)
2338#define isPRINT_utf8(p, e) isPRINT_utf8_safe(p, e)
2339#define isPSXSPC_utf8(p, e) isPSXSPC_utf8_safe(p, e)
2340#define isPUNCT_utf8(p, e) isPUNCT_utf8_safe(p, e)
2341#define isSPACE_utf8(p, e) isSPACE_utf8_safe(p, e)
2342#define isUPPER_utf8(p, e) isUPPER_utf8_safe(p, e)
2343#define isVERTWS_utf8(p, e) isVERTWS_utf8_safe(p, e)
2344#define isWORDCHAR_utf8(p, e) isWORDCHAR_utf8_safe(p, e)
2345#define isXDIGIT_utf8(p, e) isXDIGIT_utf8_safe(p, e)
e8fa43e2 2346
91456fff 2347#define isALPHA_utf8_safe(p, e) generic_invlist_utf8_safe_(CC_ALPHA_, p, e)
da8c1a98 2348#define isALPHANUMERIC_utf8_safe(p, e) \
91456fff 2349 generic_invlist_utf8_safe_(CC_ALPHANUMERIC_, p, e)
da8c1a98
KW
2350#define isASCII_utf8_safe(p, e) \
2351 /* Because ASCII is invariant under utf8, the non-utf8 macro \
2352 * works */ \
2353 (__ASSERT_(_utf8_safe_assert(p, e)) isASCII(*(p)))
2354#define isBLANK_utf8_safe(p, e) \
91456fff 2355 generic_non_invlist_utf8_safe_(CC_BLANK_, is_HORIZWS_high, p, e)
da8c1a98 2356
e8fa43e2
KW
2357#ifdef EBCDIC
2358 /* Because all controls are UTF-8 invariants in EBCDIC, we can use this
2359 * more efficient macro instead of the more general one */
da8c1a98 2360# define isCNTRL_utf8_safe(p, e) \
56d02b8c 2361 (__ASSERT_(_utf8_safe_assert(p, e)) isCNTRL_L1(*(p)))
e8fa43e2 2362#else
91456fff 2363# define isCNTRL_utf8_safe(p, e) generic_utf8_safe_(CC_CNTRL_, p, e, 0)
e8fa43e2
KW
2364#endif
2365
da8c1a98 2366#define isDIGIT_utf8_safe(p, e) \
91456fff
KW
2367 generic_utf8_safe_no_upper_latin1_(CC_DIGIT_, p, e, \
2368 _is_utf8_FOO(CC_DIGIT_, p, e))
2369#define isGRAPH_utf8_safe(p, e) generic_invlist_utf8_safe_(CC_GRAPH_, p, e)
2370#define isIDCONT_utf8_safe(p, e) generic_func_utf8_safe_(CC_WORDCHAR_, \
dd1a3ba7 2371 _is_utf8_perl_idcont, p, e)
e5dcd934 2372
c11ff943
KW
2373/* To prevent S_scan_word in toke.c from hanging, we have to make sure that
2374 * IDFIRST is an alnum. See
8034715d 2375 * https://github.com/Perl/perl5/issues/10275 for more detail than you
f91dcd13
KW
2376 * ever wanted to know about. (In the ASCII range, there isn't a difference.)
2377 * This used to be not the XID version, but we decided to go with the more
2378 * modern Unicode definition */
da8c1a98 2379#define isIDFIRST_utf8_safe(p, e) \
91456fff 2380 generic_func_utf8_safe_(CC_IDFIRST_, \
dd1a3ba7 2381 _is_utf8_perl_idstart, (U8 *) (p), (U8 *) (e))
da8c1a98 2382
91456fff
KW
2383#define isLOWER_utf8_safe(p, e) generic_invlist_utf8_safe_(CC_LOWER_, p, e)
2384#define isPRINT_utf8_safe(p, e) generic_invlist_utf8_safe_(CC_PRINT_, p, e)
da8c1a98 2385#define isPSXSPC_utf8_safe(p, e) isSPACE_utf8_safe(p, e)
91456fff 2386#define isPUNCT_utf8_safe(p, e) generic_invlist_utf8_safe_(CC_PUNCT_, p, e)
da8c1a98 2387#define isSPACE_utf8_safe(p, e) \
91456fff
KW
2388 generic_non_invlist_utf8_safe_(CC_SPACE_, is_XPERLSPACE_high, p, e)
2389#define isUPPER_utf8_safe(p, e) generic_invlist_utf8_safe_(CC_UPPER_, p, e)
da8c1a98 2390#define isVERTWS_utf8_safe(p, e) \
91456fff 2391 generic_non_invlist_utf8_safe_(CC_VERTSPACE_, is_VERTWS_high, p, e)
da8c1a98 2392#define isWORDCHAR_utf8_safe(p, e) \
91456fff 2393 generic_invlist_utf8_safe_(CC_WORDCHAR_, p, e)
da8c1a98 2394#define isXDIGIT_utf8_safe(p, e) \
91456fff 2395 generic_utf8_safe_no_upper_latin1_(CC_XDIGIT_, p, e, \
da8c1a98
KW
2396 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2397 ? (_force_out_malformed_utf8_message( \
2398 (U8 *) (p), (U8 *) (e), 0, 1), 0) \
2399 : is_XDIGIT_high(p)))
a0ed51b3 2400
059703b0
KW
2401#define toFOLD_utf8(p,e,s,l) toFOLD_utf8_safe(p,e,s,l)
2402#define toLOWER_utf8(p,e,s,l) toLOWER_utf8_safe(p,e,s,l)
2403#define toTITLE_utf8(p,e,s,l) toTITLE_utf8_safe(p,e,s,l)
2404#define toUPPER_utf8(p,e,s,l) toUPPER_utf8_safe(p,e,s,l)
2e8adce6 2405
567b353c 2406/* For internal core use only, subject to change */
059703b0
KW
2407#define _toFOLD_utf8_flags(p,e,s,l,f) _to_utf8_fold_flags (p,e,s,l,f)
2408#define _toLOWER_utf8_flags(p,e,s,l,f) _to_utf8_lower_flags(p,e,s,l,f)
2409#define _toTITLE_utf8_flags(p,e,s,l,f) _to_utf8_title_flags(p,e,s,l,f)
2410#define _toUPPER_utf8_flags(p,e,s,l,f) _to_utf8_upper_flags(p,e,s,l,f)
a1a5ec35
KW
2411
2412#define toFOLD_utf8_safe(p,e,s,l) _toFOLD_utf8_flags(p,e,s,l, FOLD_FLAGS_FULL)
2413#define toLOWER_utf8_safe(p,e,s,l) _toLOWER_utf8_flags(p,e,s,l, 0)
2414#define toTITLE_utf8_safe(p,e,s,l) _toTITLE_utf8_flags(p,e,s,l, 0)
2415#define toUPPER_utf8_safe(p,e,s,l) _toUPPER_utf8_flags(p,e,s,l, 0)
567b353c 2416
059703b0
KW
2417#define isALPHA_LC_utf8(p, e) isALPHA_LC_utf8_safe(p, e)
2418#define isALPHANUMERIC_LC_utf8(p, e) isALPHANUMERIC_LC_utf8_safe(p, e)
2419#define isASCII_LC_utf8(p, e) isASCII_LC_utf8_safe(p, e)
2420#define isBLANK_LC_utf8(p, e) isBLANK_LC_utf8_safe(p, e)
2421#define isCNTRL_LC_utf8(p, e) isCNTRL_LC_utf8_safe(p, e)
2422#define isDIGIT_LC_utf8(p, e) isDIGIT_LC_utf8_safe(p, e)
2423#define isGRAPH_LC_utf8(p, e) isGRAPH_LC_utf8_safe(p, e)
2424#define isIDCONT_LC_utf8(p, e) isIDCONT_LC_utf8_safe(p, e)
2425#define isIDFIRST_LC_utf8(p, e) isIDFIRST_LC_utf8_safe(p, e)
2426#define isLOWER_LC_utf8(p, e) isLOWER_LC_utf8_safe(p, e)
2427#define isPRINT_LC_utf8(p, e) isPRINT_LC_utf8_safe(p, e)
2428#define isPSXSPC_LC_utf8(p, e) isPSXSPC_LC_utf8_safe(p, e)
2429#define isPUNCT_LC_utf8(p, e) isPUNCT_LC_utf8_safe(p, e)
2430#define isSPACE_LC_utf8(p, e) isSPACE_LC_utf8_safe(p, e)
2431#define isUPPER_LC_utf8(p, e) isUPPER_LC_utf8_safe(p, e)
2432#define isWORDCHAR_LC_utf8(p, e) isWORDCHAR_LC_utf8_safe(p, e)
2433#define isXDIGIT_LC_utf8(p, e) isXDIGIT_LC_utf8_safe(p, e)
34aeb2e9 2434
da8c1a98 2435/* For internal core Perl use only: the base macros for defining macros like
6eb62d23 2436 * isALPHA_LC_utf8_safe. These are like generic_utf8_safe_, but if the first code
da8c1a98
KW
2437 * point in 'p' is within the 0-255 range, it uses locale rules from the
2438 * passed-in 'macro' parameter */
6eb62d23 2439#define generic_LC_utf8_safe_(macro, p, e, above_latin1) \
da8c1a98
KW
2440 (__ASSERT_(_utf8_safe_assert(p, e)) \
2441 (UTF8_IS_INVARIANT(*(p))) \
2442 ? macro(*(p)) \
2443 : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
2444 ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
2445 ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \
2446 : (_force_out_malformed_utf8_message( \
2447 (U8 *) (p), (U8 *) (e), 0, 1), 0)) \
2448 : above_latin1))
2449
6eb62d23
KW
2450#define generic_LC_invlist_utf8_safe_(macro, classnum, p, e) \
2451 generic_LC_utf8_safe_(macro, p, e, \
2366ba44 2452 _is_utf8_FOO(classnum, p, e))
da8c1a98 2453
6eb62d23
KW
2454#define generic_LC_func_utf8_safe_(macro, above_latin1, p, e) \
2455 generic_LC_utf8_safe_(macro, p, e, above_latin1(p, e))
da8c1a98 2456
6eb62d23
KW
/* Like generic_LC_utf8_safe_(), for classes whose above-Latin1 test
 * 'above_latin1' is called with just the pointer 'p'.  Before calling it, this
 * checks that at least UTF8SKIP(p) bytes lie between 'p' and 'e'; if not,
 * _force_out_malformed_utf8_message() is called and the result is 0.
 * 'macro' is forwarded unchanged as the 'macro' argument of
 * generic_LC_utf8_safe_() (callers pass things like isBLANK_LC); it is not a
 * class number. */
#define generic_LC_non_invlist_utf8_safe_(macro, above_latin1, p, e)          \
          generic_LC_utf8_safe_(macro, p, e,                                  \
                             (UNLIKELY((e) - (p) < UTF8SKIP(p))               \
                              ? (_force_out_malformed_utf8_message(           \
                                      (U8 *) (p), (U8 *) (e), 0, 1), 0)       \
                              : above_latin1(p)))
2463
2464#define isALPHANUMERIC_LC_utf8_safe(p, e) \
81d43abf 2465 generic_LC_invlist_utf8_safe_(isALPHANUMERIC_LC, \
91456fff 2466 CC_ALPHANUMERIC_, p, e)
da8c1a98 2467#define isALPHA_LC_utf8_safe(p, e) \
91456fff 2468 generic_LC_invlist_utf8_safe_(isALPHA_LC, CC_ALPHA_, p, e)
da8c1a98
KW
2469#define isASCII_LC_utf8_safe(p, e) \
2470 (__ASSERT_(_utf8_safe_assert(p, e)) isASCII_LC(*(p)))
2471#define isBLANK_LC_utf8_safe(p, e) \
6eb62d23 2472 generic_LC_non_invlist_utf8_safe_(isBLANK_LC, is_HORIZWS_high, p, e)
da8c1a98 2473#define isCNTRL_LC_utf8_safe(p, e) \
6eb62d23 2474 generic_LC_utf8_safe_(isCNTRL_LC, p, e, 0)
da8c1a98 2475#define isDIGIT_LC_utf8_safe(p, e) \
91456fff 2476 generic_LC_invlist_utf8_safe_(isDIGIT_LC, CC_DIGIT_, p, e)
da8c1a98 2477#define isGRAPH_LC_utf8_safe(p, e) \
91456fff 2478 generic_LC_invlist_utf8_safe_(isGRAPH_LC, CC_GRAPH_, p, e)
da8c1a98 2479#define isIDCONT_LC_utf8_safe(p, e) \
6eb62d23 2480 generic_LC_func_utf8_safe_(isIDCONT_LC, \
dd1a3ba7 2481 _is_utf8_perl_idcont, p, e)
da8c1a98 2482#define isIDFIRST_LC_utf8_safe(p, e) \
6eb62d23 2483 generic_LC_func_utf8_safe_(isIDFIRST_LC, \
dd1a3ba7 2484 _is_utf8_perl_idstart, p, e)
da8c1a98 2485#define isLOWER_LC_utf8_safe(p, e) \
91456fff 2486 generic_LC_invlist_utf8_safe_(isLOWER_LC, CC_LOWER_, p, e)
da8c1a98 2487#define isPRINT_LC_utf8_safe(p, e) \
91456fff 2488 generic_LC_invlist_utf8_safe_(isPRINT_LC, CC_PRINT_, p, e)
da8c1a98
KW
2489#define isPSXSPC_LC_utf8_safe(p, e) isSPACE_LC_utf8_safe(p, e)
2490#define isPUNCT_LC_utf8_safe(p, e) \
91456fff 2491 generic_LC_invlist_utf8_safe_(isPUNCT_LC, CC_PUNCT_, p, e)
da8c1a98 2492#define isSPACE_LC_utf8_safe(p, e) \
6eb62d23 2493 generic_LC_non_invlist_utf8_safe_(isSPACE_LC, is_XPERLSPACE_high, p, e)
da8c1a98 2494#define isUPPER_LC_utf8_safe(p, e) \
91456fff 2495 generic_LC_invlist_utf8_safe_(isUPPER_LC, CC_UPPER_, p, e)
da8c1a98 2496#define isWORDCHAR_LC_utf8_safe(p, e) \
91456fff 2497 generic_LC_invlist_utf8_safe_(isWORDCHAR_LC, CC_WORDCHAR_, p, e)
da8c1a98 2498#define isXDIGIT_LC_utf8_safe(p, e) \
6eb62d23 2499 generic_LC_non_invlist_utf8_safe_(isXDIGIT_LC, is_XDIGIT_high, p, e)
aaa51d5e 2500
fbc19f27
KW
2501/* Macros for backwards compatibility and for completeness when the ASCII and
2502 * Latin1 values are identical */
b7d90381
KW
2503#define isALPHAU(c) isALPHA_L1(c)
2504#define isDIGIT_L1(c) isDIGIT_A(c)
2505#define isOCTAL(c) isOCTAL_A(c)
2506#define isOCTAL_L1(c) isOCTAL_A(c)
2507#define isXDIGIT_L1(c) isXDIGIT_A(c)
2508#define isALNUM(c) isWORDCHAR(c)
a377c856 2509#define isALNUM_A(c) isALNUM(c)
b7d90381
KW
2510#define isALNUMU(c) isWORDCHAR_L1(c)
2511#define isALNUM_LC(c) isWORDCHAR_LC(c)
2512#define isALNUM_uni(c) isWORDCHAR_uni(c)
2e28f0b9 2513#define isALNUM_LC_uvchr(c) isWORDCHAR_LC_uvchr(c)
059703b0 2514#define isALNUM_utf8(p,e) isWORDCHAR_utf8(p,e)
4c1d9526 2515#define isALNUM_utf8_safe(p,e) isWORDCHAR_utf8_safe(p,e)
059703b0 2516#define isALNUM_LC_utf8(p,e)isWORDCHAR_LC_utf8(p,e)
4c1d9526 2517#define isALNUM_LC_utf8_safe(p,e)isWORDCHAR_LC_utf8_safe(p,e)
b7d90381
KW
2518#define isALNUMC_A(c) isALPHANUMERIC_A(c) /* Mnemonic: "C's alnum" */
2519#define isALNUMC_L1(c) isALPHANUMERIC_L1(c)
2520#define isALNUMC(c) isALPHANUMERIC(c)
2521#define isALNUMC_LC(c) isALPHANUMERIC_LC(c)
2522#define isALNUMC_uni(c) isALPHANUMERIC_uni(c)
15861f94 2523#define isALNUMC_LC_uvchr(c) isALPHANUMERIC_LC_uvchr(c)
059703b0 2524#define isALNUMC_utf8(p,e) isALPHANUMERIC_utf8(p,e)
4c1d9526
KW
2525#define isALNUMC_utf8_safe(p,e) isALPHANUMERIC_utf8_safe(p,e)
2526#define isALNUMC_LC_utf8_safe(p,e) isALPHANUMERIC_LC_utf8_safe(p,e)
fbc19f27 2527
/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,
 * except that they don't necessarily mean the same characters, e.g. CTRL-D is
 * 4 on both systems, but that is EOT on ASCII;  ST on EBCDIC.
 * '?' is special-cased on EBCDIC to APC, which is the control there that is
 * the outlier from the block that contains the other controls, just like
 * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0
 * block.  If it weren't special cased, it would yield a non-control.
 * The conversion works both ways, so toCTRL('D') is 4, and toCTRL(4) is D,
 * etc.
 *
 * The argument must fit in 8 bits (checked via __ASSERT_) and is evaluated
 * more than once in the EBCDIC form. */
#ifndef EBCDIC
#  define toCTRL(c)    (__ASSERT_(FITS_IN_8_BITS(c)) toUPPER(((U8)(c))) ^ 64)
#else
#  define toCTRL(c)   (__ASSERT_(FITS_IN_8_BITS(c))                         \
                       ((isPRINT_A(c))                                      \
                        ? (UNLIKELY((c) == '?')                             \
                          ? QUESTION_MARK_CTRL                              \
                          : (NATIVE_TO_LATIN1(toUPPER((U8) (c))) ^ 64))     \
                        : (UNLIKELY((c) == QUESTION_MARK_CTRL)              \
                          ? '?'                                             \
                          : (LATIN1_TO_NATIVE(((U8) (c)) ^ 64)))))
#endif

837781cc
KW
2550/*
2551=for apidoc Ay||line_t
2552The typedef to use to declare variables that are to hold line numbers.
2553
2554=cut
2555
2556 Line numbers are unsigned, 32 bits.
2557*/
dea28490 2558typedef U32 line_t;
ee178617 2559#define LINE_Tf U32uf
e5dcd934 2560#define NOLINE ((line_t) 4294967295UL) /* = FFFFFFFF */
378cc40b 2561
/* Helpful alias for version prescan.
 *
 * Each macro is true when Perl_prescan_version() returns a pointer different
 * from the start pointer 'a'; 'b' is handed through as the third argument.
 * The two differ only in the flag passed (FALSE for LAX, TRUE for STRICT).
 * The arguments are parenthesized so that expression arguments expand
 * safely; 'a' is evaluated twice. */
#define is_LAX_VERSION(a,b) \
        ((a) != Perl_prescan_version(aTHX_ (a), FALSE, (b), NULL, NULL, NULL, NULL))

#define is_STRICT_VERSION(a,b) \
        ((a) != Perl_prescan_version(aTHX_ (a), TRUE, (b), NULL, NULL, NULL, NULL))

/* Record the message 'c' through 'b' (when 'b' is non-NULL) and return 'a'
 * from the enclosing function.  Wrapped as a single statement so that the
 * 'return' stays under the control of an unbraced 'if'/'else' at the call
 * site.  Invoke it with a trailing semicolon. */
#define BADVERSION(a,b,c) \
        STMT_START {                    \
            if (b) {                    \
                *(b) = (c);             \
            }                           \
            return (a);                 \
        } STMT_END

/* Converts a character KNOWN to represent a hexadecimal digit (0-9, A-F, or
 * a-f) to its numeric value without using any branches.  The input is
 * validated only by an assert() in DEBUGGING builds.
 *
 * It works by right shifting and isolating the bit that is 0 for the digits,
 * and 1 for at least the alphas A-F, a-f.  The bit is shifted to the ones
 * position, and then to the eights position.  Both are added together to form
 * 0 if the input is '0'-'9' and to form 9 if alpha.  This is added to the
 * final four bits of the input to form the correct value.
 *
 * Note that the argument is evaluated several times. */
#define XDIGIT_VALUE(c) (__ASSERT_(isXDIGIT(c))                             \
        ((NATIVE_TO_LATIN1(c) >> 6) & 1)  /* 1 if alpha; 0 if not */        \
      + ((NATIVE_TO_LATIN1(c) >> 3) & 8)  /* 8 if alpha; 0 if not */        \
      + ((c) & 0xF))   /* 0-9 if input valid hex digit */

/* The argument is a string pointer, which is advanced.  The value of the
 * character it pointed to before the advance is returned. */
#define READ_XDIGIT(s)  ((s)++, XDIGIT_VALUE(*((s) - 1)))

/* Converts a character known to represent an octal digit (0-7) to its numeric
 * value.  The input is validated only by an assert() in DEBUGGING builds.  In
 * both ASCII and EBCDIC the last 3 bits of the octal digits range from 0-7. */
#define OCTAL_VALUE(c) (__ASSERT_(isOCTAL(c)) (7 & (c)))

/* Efficiently returns a boolean as to if two native characters are equivalent
 * case-insensitively.  At least one of the characters must be one of [A-Za-z];
 * the ALPHA in the name is to remind you of that.  This is asserted() in
 * DEBUGGING builds.  Because [A-Za-z] are invariant under UTF-8, this macro
 * works (on valid input) for both non- and UTF-8-encoded bytes.
 *
 * When one of the inputs is a compile-time constant and gets folded by the
 * compiler, this reduces to an AND and a TEST.  On both EBCDIC and ASCII
 * machines, 'A' and 'a' differ by a single bit; the same with the upper and
 * lower case of all other ASCII-range alphabetics.  On ASCII platforms, they
 * are 32 apart; on EBCDIC, they are 64.  At compile time, this uses an
 * exclusive 'or' to find that bit and then inverts it to form a mask, with
 * just a single 0, in the bit position where the upper- and lowercase differ.
 * */
#define isALPHA_FOLD_EQ(c1, c2)                                         \
                      (__ASSERT_(isALPHA_A(c1) || isALPHA_A(c2))        \
                      ((c1) & ~('A' ^ 'a')) ==  ((c2) & ~('A' ^ 'a')))
#define isALPHA_FOLD_NE(c1, c2) (! isALPHA_FOLD_EQ((c1), (c2)))

/*
=for apidoc_section $memory

=for apidoc Am|void|Newx|void* ptr|int nitems|type
=for apidoc_item |void*|safemalloc|size_t size

The XSUB-writer's interface to the C C<malloc> function.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

In 5.9.3, Newx() and friends replace the older New() API, and drop
the first parameter, I<x>, a debug aid which allowed callers to identify
themselves.  This aid has been superseded by a new build option,
PERL_MEM_LOG (see L<perlhacktips/PERL_MEM_LOG>).  The older API is still
there for use in XS modules supporting older perls.

=for apidoc Am|void|Newxc|void* ptr|int nitems|type|cast
The XSUB-writer's interface to the C C<malloc> function, with
cast.  See also C<L</Newx>>.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

=for apidoc Am|void|Newxz|void* ptr|int nitems|type
=for apidoc_item |void*|safecalloc|size_t nitems|size_t item_size

The XSUB-writer's interface to the C C<malloc> function.  The allocated
memory is zeroed with C<memzero>.  See also C<L</Newx>>.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

=for apidoc Am|void|Renew|void* ptr|int nitems|type
=for apidoc_item |void*|saferealloc|void *ptr|size_t size

The XSUB-writer's interface to the C C<realloc> function.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

=for apidoc Am|void|Renewc|void* ptr|int nitems|type|cast
The XSUB-writer's interface to the C C<realloc> function, with
cast.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

=for apidoc Am|void|Safefree|void* ptr
The XSUB-writer's interface to the C C<free> function.

This should B<ONLY> be used on memory obtained using L</"Newx"> and friends.

=for apidoc_section $string
=for apidoc Am|void |Move |void* src|void* dest|int nitems|type
=for apidoc_item |void *|MoveD|void* src|void* dest|int nitems|type
The XSUB-writer's interface to the C C<memmove> function.  The C<src> is the
source, C<dest> is the destination, C<nitems> is the number of items, and
C<type> is the type.  Can do overlapping moves.  See also C<L</Copy>>.

C<MoveD> is like C<Move> but returns C<dest>.  Useful
for encouraging compilers to tail-call
optimise.

=for apidoc Am|void |Copy |void* src|void* dest|int nitems|type
=for apidoc_item |void *|CopyD|void* src|void* dest|int nitems|type
The XSUB-writer's interface to the C C<memcpy> function.  The C<src> is the
source, C<dest> is the destination, C<nitems> is the number of items, and
C<type> is the type.  May fail on overlapping copies.  See also C<L</Move>>.

C<CopyD> is like C<Copy> but returns C<dest>.  Useful
for encouraging compilers to tail-call
optimise.

=for apidoc Am|void |Zero |void* dest|int nitems|type
=for apidoc_item |void *|ZeroD|void* dest|int nitems|type

The XSUB-writer's interface to the C C<memzero> function.  The C<dest> is the
destination, C<nitems> is the number of items, and C<type> is the type.

C<ZeroD> is like C<Zero> but returns C<dest>.  Useful
for encouraging compilers to tail-call
optimise.

=for apidoc_section $utility
=for apidoc Amu|void|StructCopy|type *src|type *dest|type
This is an architecture-independent macro to copy one structure to another.

=for apidoc Am|void|PoisonWith|void* dest|int nitems|type|U8 byte

Fill up memory with a byte pattern (a byte repeated over and over
again) that hopefully catches attempts to access uninitialized memory.

=for apidoc Am|void|PoisonNew|void* dest|int nitems|type

PoisonWith(0xAB) for catching access to allocated but uninitialized memory.

=for apidoc Am|void|PoisonFree|void* dest|int nitems|type

PoisonWith(0xEF) for catching access to freed memory.

=for apidoc Am|void|Poison|void* dest|int nitems|type

PoisonWith(0xEF) for catching access to freed memory.

=cut */

/* Maintained for backwards-compatibility only. Use newSV() instead. */
#ifndef PERL_CORE
#define NEWSV(x,len)    newSV(len)      /* the first argument is ignored */
#endif

/* All-ones value of MEM_SIZE */
#define MEM_SIZE_MAX ((MEM_SIZE)-1)

/* Rounds n up to a multiple of PERL_STRLEN_ROUNDUP_QUANTUM by adding
 * (quantum - 1) and masking off the low bits; the mask form assumes the
 * quantum is a power of two.  No overflow check is made on the addition,
 * hence the name. */
#define _PERL_STRLEN_ROUNDUP_UNCHECKED(n) (((n) - 1 + PERL_STRLEN_ROUNDUP_QUANTUM) & ~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM - 1))

#ifdef PERL_MALLOC_WRAP

/* This expression will be constant-folded at compile time.  It checks
 * whether or not the type of the count n is so small (e.g. U8 or U16, or
 * U32 on 64-bit systems) that there's no way a wrap-around could occur.
 * As well as avoiding the need for a run-time check in some cases, it's
 * designed to avoid compiler warnings like:
 *     comparison is always false due to limited range of data type
 * It's mathematically equivalent to
 *    max(n) * sizeof(t) > MEM_SIZE_MAX
 */

#  define _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) \
    (  sizeof(MEM_SIZE) < sizeof(n) \
    || sizeof(t) > ((MEM_SIZE)1 << 8*(sizeof(MEM_SIZE) - sizeof(n))))

/* This is written in a slightly odd way to avoid various spurious
 * compiler warnings. We *want* to write the expression as
 *    _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) && (n > C)
 * (for some compile-time constant C), but even when the LHS
 * constant-folds to false at compile-time, g++ insists on emitting
 * warnings about the RHS (e.g. "comparison is always false"), so instead
 * we write it as
 *
 *    (cond ? n : X) > C
 *
 * where X is a constant with X > C always false. Choosing a value for X
 * is tricky. If 0, some compilers will complain about 0 > C always being
 * false; if 1, Coverity complains when n happens to be the constant value
 * '1', that cond ? 1 : 1 has the same value on both branches; so use C
 * for X and hope that nothing else whines.
 */

#  define _MEM_WRAP_WILL_WRAP(n,t) \
      ((_MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) ? (MEM_SIZE)(n) : \
            MEM_SIZE_MAX/sizeof(t)) > MEM_SIZE_MAX/sizeof(t))

/* Calls croak_memory_wrap() when n items of type t would wrap */
#  define MEM_WRAP_CHECK(n,t) \
        (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
        && (croak_memory_wrap(),0))

/* As above, but croaks with the caller-supplied message 'a' */
#  define MEM_WRAP_CHECK_1(n,t,a) \
        (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \
        && (Perl_croak_nocontext("%s",(a)),0))

/* "a" arg must be a string literal */
#  define MEM_WRAP_CHECK_s(n,t,a) \
        (   (void) (UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t))          \
         && (Perl_croak_nocontext(ASSERT_IS_LITERAL(a)), 0)))

/* Form with a trailing comma, for use as the first operand of a comma
 * expression */
#  define MEM_WRAP_CHECK_(n,t) MEM_WRAP_CHECK(n,t),

/* Checked round-up: croaks before rounding if n is within two quanta of
 * MEM_SIZE_MAX */
#  define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (croak_memory_wrap(),0) : 0), _PERL_STRLEN_ROUNDUP_UNCHECKED(n))
#else

/* Without PERL_MALLOC_WRAP the checks expand to nothing */
#  define MEM_WRAP_CHECK(n,t)
#  define MEM_WRAP_CHECK_1(n,t,a)
#  define MEM_WRAP_CHECK_s(n,t,a)
#  define MEM_WRAP_CHECK_(n,t)

#  define PERL_STRLEN_ROUNDUP(n) _PERL_STRLEN_ROUNDUP_UNCHECKED(n)

#endif

#ifdef PERL_MEM_LOG
/*
 * If PERL_MEM_LOG is defined, all Newx()s, Renew()s, and Safefree()s
 * go through functions, which are handy for debugging breakpoints, but
 * which more importantly get the immediate calling environment (file and
 * line number, and C function name if available) passed in.  This info can
 * then be used for logging the calls, for which one gets a sample
 * implementation unless -DPERL_MEM_LOG_NOIMPL is also defined.
 *
 * Known problems:
 * - not all memory allocs get logged, only those
 *   that go through Newx() and derivatives (while all
 *   Safefrees do get logged)
 * - __FILE__ and __LINE__ do not work everywhere
 * - __func__ or __FUNCTION__ even less so
 * - I think more goes on after the perlio frees but
 *   the thing is that STDERR gets closed (as do all
 *   the file descriptors)
 * - no deeper calling stack than the caller of the Newx()
 *   or the kind, but do I look like a C reflection/introspection
 *   utility to you?
 * - the function prototypes for the logging functions
 *   probably should maybe be somewhere else than handy.h
 * - one could consider inlining (macrofying) the logging
 *   for speed, but I am too lazy
 * - one could imagine recording the allocations in a hash,
 *   (keyed by the allocation address?), and maintain that
 *   through reallocs and frees, but how to do that without
 *   any News() happening...?
 * - lots of -Ddefines to get useful/controllable output
 * - lots of ENV reads
 */

#  ifdef PERL_CORE
#    ifndef PERL_MEM_LOG_NOIMPL
/* Event tags; declared only for the core, and only when the sample
 * implementation has not been disabled with PERL_MEM_LOG_NOIMPL. */
enum mem_log_type {
  MLT_ALLOC,
  MLT_REALLOC,
  MLT_FREE,
  MLT_NEW_SV,
  MLT_DEL_SV
};
#    endif
#  endif

#endif

/* Logging hooks wrapped around the result of an allocation call 'a'.  With
 * PERL_MEM_LOG they route through the Perl_mem_log_*() functions, passing the
 * item count, item size, stringified type and call-site location; without it
 * each macro is just its last argument. */
#ifdef PERL_MEM_LOG
#define MEM_LOG_ALLOC(n,t,a)     Perl_mem_log_alloc(n,sizeof(t),STRINGIFY(t),a,__FILE__,__LINE__,FUNCTION__)
#define MEM_LOG_REALLOC(n,t,v,a) Perl_mem_log_realloc(n,sizeof(t),STRINGIFY(t),v,a,__FILE__,__LINE__,FUNCTION__)
#define MEM_LOG_FREE(a)          Perl_mem_log_free(a,__FILE__,__LINE__,FUNCTION__)
#endif

#ifndef MEM_LOG_ALLOC
#define MEM_LOG_ALLOC(n,t,a)     (a)
#endif
#ifndef MEM_LOG_REALLOC
#define MEM_LOG_REALLOC(n,t,v,a) (a)
#endif
#ifndef MEM_LOG_FREE
#define MEM_LOG_FREE(a)          (a)
#endif

/* Allocation macros.  Each assigns to 'v' the result of the underlying
 * safemalloc()/safecalloc() call for 'n' items of type 't', after the
 * MEM_WRAP_CHECK_ overflow check and through the MEM_LOG_ALLOC hook.  Newx and
 * Newxz cast the result to t*; Newxc casts it to c*. */
#define Newx(v,n,t)	(v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t))))))
#define Newxc(v,n,t,c)	(v = (MEM_WRAP_CHECK_(n,t) (c*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t))))))
#define Newxz(v,n,t)	(v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safecalloc((n),sizeof(t)))))

#ifndef PERL_CORE
/* pre 5.9.x compatibility */
#define New(x,v,n,t)	Newx(v,n,t)
#define Newc(x,v,n,t,c)	Newxc(v,n,t,c)
#define Newz(x,v,n,t)	Newxz(v,n,t)
#endif

/* Resize 'v' to hold 'n' items of type 't' via saferealloc(), assigning the
 * result back to 'v' (cast to t* or, for Renewc, c*). */
#define Renew(v,n,t) \
	  (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t))))))
#define Renewc(v,n,t,c) \
	  (v = (MEM_WRAP_CHECK_(n,t) (c*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t))))))

/* Release 'd' through safefree() (via the MEM_LOG_FREE hook).  Under
 * PERL_POISON a null 'd' is skipped, and after freeing, the pointer variable
 * itself is overwritten with the Poison() pattern -- so 'd' must be an lvalue
 * in that configuration. */
#ifdef PERL_POISON
#define Safefree(d) \
  ((d) ? (void)(safefree(MEM_LOG_FREE((Malloc_t)(d))), Poison(&(d), 1, Malloc_t)) : (void) 0)
#else
#define Safefree(d)	safefree(MEM_LOG_FREE((Malloc_t)(d)))
#endif

/* assert that a valid ptr has been supplied - use this instead of assert(ptr)  *
 * as it handles cases like constant string arguments without throwing warnings *
 * the cast is required, as is the inequality check, to avoid warnings          */
#define perl_assert_ptr(p) assert( ((void*)(p)) != 0 )


/* Move/Copy/Zero 'n' items of type 't'.  Each runs the MEM_WRAP_CHECK_
 * overflow check, asserts that its pointer arguments are non-null, and then
 * calls memmove/memcpy/memzero with a byte count of (n) * sizeof(t).  These
 * forms yield no value. */
#define Move(s,d,n,t)	(MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memmove((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define Copy(s,d,n,t)	(MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memcpy((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define Zero(d,n,t)	(MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), (void)memzero((char*)(d), (n) * sizeof(t)))

/* Like above, but returns a pointer to 'd' */
#define MoveD(s,d,n,t)	(MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memmove((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define CopyD(s,d,n,t)	(MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memcpy((char*)(d),(const char*)(s), (n) * sizeof(t)))
#define ZeroD(d,n,t)	(MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), memzero((char*)(d), (n) * sizeof(t)))

/* Fill 'n' items of type 't' at 'd' with the byte 'b' (after the
 * MEM_WRAP_CHECK_ overflow check).  PoisonNew uses 0xAB, PoisonFree and its
 * alias Poison use 0xEF. */
#define PoisonWith(d,n,t,b)	(MEM_WRAP_CHECK_(n,t) (void)memset((char*)(d), (U8)(b), (n) * sizeof(t)))
#define PoisonNew(d,n,t)	PoisonWith(d,n,t,0xAB)
#define PoisonFree(d,n,t)	PoisonWith(d,n,t,0xEF)
#define Poison(d,n,t)		PoisonFree(d,n,t)

/* Expands to its argument only in PERL_POISON builds; otherwise to nothing */
#ifdef PERL_POISON
#  define PERL_POISON_EXPR(x) x
#else
#  define PERL_POISON_EXPR(x)
#endif

/* Shallow copy: assigns the object of type 't' at 's' to the one at 'd'.
 * The source is only read, so it is accessed through a pointer-to-const; the
 * qualifier is written after the type so that it also applies correctly when
 * 't' is itself a pointer type. */
#define StructCopy(s,d,t) (*((t*)(d)) = *((t const *)(s)))

/*
=for apidoc_section $utility

=for apidoc Am|STRLEN|C_ARRAY_LENGTH|void *a

Returns the number of elements in the input C array (so you want your
zero-based indices to be less than but not equal to).

=for apidoc Am|void *|C_ARRAY_END|void *a

Returns a pointer to one element past the final element of the input C array.

=cut

C_ARRAY_END is one past the last: half-open/half-closed range, not
last-inclusive range.

Both rely on sizeof(a), so 'a' must be a real array, not a pointer to its
first element.
*/
#define C_ARRAY_LENGTH(a)	(sizeof(a)/sizeof((a)[0]))
#define C_ARRAY_END(a)		((a) + C_ARRAY_LENGTH(a))

#if defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD)
/* strlen() of a literal string constant.  Restricting this to core, in part
 * because it can generate compiler warnings about comparing unlike signs */
#  define STRLENs(s)  (sizeof("" s "") - 1)
#endif

/* Perl_va_copy(s, d): copy the va_list 's' into 'd' -- note the argument
 * order, source first.  Uses va_copy or __va_copy when either is available
 * as a macro, and otherwise falls back to a plain Copy() of one va_list. */
#ifdef NEED_VA_COPY
# ifdef va_copy
#  define Perl_va_copy(s, d) va_copy(d, s)
# elif defined(__va_copy)
#  define Perl_va_copy(s, d) __va_copy(d, s)
# else
#  define Perl_va_copy(s, d) Copy(s, d, 1, va_list)
# endif
#endif

/* convenience debug macros
 *
 * With USE_ITHREADS the *_FORMAT macros are printf-style format fragments
 * containing one %p, and the *_VALUE macros supply my_perl (cast to void *)
 * as the matching argument.  The extra/trailing underscore variants carry a
 * leading and/or trailing comma.  Without USE_ITHREADS they all expand to
 * nothing. */
#ifdef USE_ITHREADS
#define pTHX_FORMAT  "Perl interpreter: 0x%p"
#define pTHX__FORMAT ", Perl interpreter: 0x%p"
#define pTHX_VALUE_   (void *)my_perl,
#define pTHX_VALUE    (void *)my_perl
#define pTHX__VALUE_ ,(void *)my_perl,
#define pTHX__VALUE  ,(void *)my_perl
#else
#define pTHX_FORMAT
#define pTHX__FORMAT
#define pTHX_VALUE_
#define pTHX_VALUE
#define pTHX__VALUE_
#define pTHX__VALUE
#endif /* USE_ITHREADS */

/* Perl_deprecate was not part of the public API, and did not have a deprecate()
   shortcut macro defined without -DPERL_CORE. Neither codesearch.google.com nor
   CPAN::Unpack show any users outside the core.

   All three macros build their message by string-literal concatenation, so
   their arguments must be string literals.  */
#ifdef PERL_CORE
#  define deprecate(s) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),    \
                                            "Use of " s " is deprecated")
#  define deprecate_disappears_in(when,message) \
              Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),    \
                               message " is deprecated, and will disappear in Perl " when)
#  define deprecate_fatal_in(when,message) \
              Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),    \
                               message " is deprecated, and will become fatal in Perl " when)
#endif

/* Internal macros to deal with gids and uids
 *
 * The SV slot used is chosen at compile time: NV when the id type is wider
 * than an IV, otherwise IV when the type's *_sign value is <= 0 (presumably
 * meaning a signed type -- confirm against Configure), otherwise UV. */
#ifdef PERL_CORE

#  if Uid_t_size > IVSIZE
#    define sv_setuid(sv, uid)       sv_setnv((sv), (NV)(uid))
#    define SvUID(sv)                SvNV(sv)
#  elif Uid_t_sign <= 0
#    define sv_setuid(sv, uid)       sv_setiv((sv), (IV)(uid))
#    define SvUID(sv)                SvIV(sv)
#  else
#    define sv_setuid(sv, uid)       sv_setuv((sv), (UV)(uid))
#    define SvUID(sv)                SvUV(sv)
#  endif /* Uid_t_size */

#  if Gid_t_size > IVSIZE
#    define sv_setgid(sv, gid)       sv_setnv((sv), (NV)(gid))
#    define SvGID(sv)                SvNV(sv)
#  elif Gid_t_sign <= 0
#    define sv_setgid(sv, gid)       sv_setiv((sv), (IV)(gid))
#    define SvGID(sv)                SvIV(sv)
#  else
#    define sv_setgid(sv, gid)       sv_setuv((sv), (UV)(gid))
#    define SvGID(sv)                SvUV(sv)
#  endif /* Gid_t_size */

#endif

/* These are simple Marsaglia XOR-SHIFT RNG's for 64 and 32 bits. These
 * RNG's are of reasonable quality, very fast, and have the interesting
 * property that provided 'x' is non-zero they create a cycle of 2^32-1
 * or 2^64-1 "random" like numbers, with the exception of 0. Thus they
 * are very useful when you want an integer to "dance" in a random way,
 * but you also never want it to become 0 and thus false.
 *
 * Obviously they leave x unchanged if it starts out as 0.
 *
 * We have two variants just because that can be helpful in certain
 * places. There is no advantage to either, they are equally bad as each
 * other as far as RNG's go. Sufficiently random for many purposes, but
 * insufficiently random for serious use as they fail important tests in
 * the TestU01 BigCrush RNG test suite by L'Ecuyer and Simard. (Note
 * that Drand48 also fails BigCrush). The main point is they produce
 * different sequences and in places where we want some randomlike
 * behavior they are cheap and easy.
 *
 * Marsaglia was one of the early researchers into RNG testing and wrote
 * the Diehard RNG test suite, which after his death became the
 * Dieharder RNG suite, and was generally supplanted by the TestU01 suite
 * by L'Ecuyer and associates.
 *
 * There are dozens of shift parameters that create a pseudo random ring
 * of integers 1..2^N-1, if you need a different sequence just read the
 * paper and select a set of parameters. In fact, simply reversing the
 * shift order from L/R/L to R/L/R should result in another valid
 * example, but read the paper before you do that.
 *
 * PDF of the original paper:
 *  https://www.jstatsoft.org/article/download/v008i14/916
 * Wikipedia:
 *  https://en.wikipedia.org/wiki/Xorshift
 * Criticism:
 *  https://www.iro.umontreal.ca/~lecuyer/myftp/papers/xorshift.pdf
 * TestU01:
 *  http://simul.iro.umontreal.ca/testu01/tu01.html
 * Diehard:
 *  https://en.wikipedia.org/wiki/Diehard_tests
 * Dieharder:
 *  https://webhome.phy.duke.edu/~rgb/General/rand_rate/rand_rate.abs
 *
 */

/* Each macro below updates the lvalue 'x' in place with three xor-shift
 * steps (left, right, left); 'x' is evaluated several times.  The right
 * shift is only a logical shift if 'x' is unsigned -- callers are presumably
 * expected to pass an unsigned 32- or 64-bit variable as appropriate. */

/* 32 bit version */
#define PERL_XORSHIFT32_A(x)    \
STMT_START {                    \
    (x) ^= ((x) << 13);         \
    (x) ^= ((x) >> 17);         \
    (x) ^= ((x) << 5);          \
} STMT_END

/* 64 bit version */
#define PERL_XORSHIFT64_A(x)    \
STMT_START {                    \
    (x) ^= ((x) << 13);         \
    (x) ^= ((x) >> 7);          \
    (x) ^= ((x) << 17);         \
} STMT_END

/* 32 bit version */
#define PERL_XORSHIFT32_B(x)    \
STMT_START {                    \
    (x) ^= ((x) << 5);          \
    (x) ^= ((x) >> 27);         \
    (x) ^= ((x) << 8);          \
} STMT_END

/* 64 bit version - currently this is unused,
 * it is provided here to complement the 32 bit _B
 * variant which IS used. */
#define PERL_XORSHIFT64_B(x)    \
STMT_START {                    \
    (x) ^= ((x) << 15);         \
    (x) ^= ((x) >> 49);         \
    (x) ^= ((x) << 26);         \
} STMT_END


6a5bc5ac 3079#endif /* PERL_HANDY_H_ */
9d745869 3080
/*
 * ex: set ts=8 sts=4 sw=4 et:
 */