perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	/*
	35	* pregcomp and pregexec -- regsub and regerror are not used in perl
	36	*
	37	* Copyright (c) 1986 by University of Toronto.
	38	* Written by Henry Spencer. Not derived from licensed software.
	39	*
	40	* Permission is granted to anyone to use this software for any
	41	* purpose on any computer system, and to redistribute it freely,
	42	* subject to the following restrictions:
	43	*
	44	* 1. The author is not responsible for the consequences of use of
	45	* this software, no matter how awful, even if they arise
	46	* from defects in it.
	47	*
	48	* 2. The origin of this software must not be misrepresented, either
	49	* by explicit claim or by omission.
	50	*
	51	* 3. Altered versions must be plainly marked as such, and must not
	52	* be misrepresented as being the original software.
	53	*
	54	*
	55	**** Alterations to Henry's code are...
	56	****
	57	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	58	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	59	**** by Larry Wall and others
	60	****
	61	**** You may distribute under the terms of either the GNU General Public
	62	**** License or the Artistic License, as specified in the README file.
	63
	64	*
	65	* Beware that some of this code is subtly aware of the way operator
	66	* precedence is structured in regular expressions. Serious changes in
	67	* regular-expression syntax might require a total rethink.
	68	*/
	69
	70	/* Note on debug output:
	71	*
	72	* This is set up so that -Dr turns on debugging like all other flags that are
	73	* enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
	74	* all regular expressions encountered in a program, and gives a huge amount of
	75	* output for all but the shortest programs.
	76	*
	77	* The ability to output pattern debugging information lexically, and with much
	78	* finer grained control was added, with 'use re qw(Debug ....);' available even
	79	* in non-DEBUGGING builds. This is accomplished by copying the contents of
	80	* regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
	81	* Those files are compiled and linked into the perl executable, and they are
	82	* compiled essentially as if DEBUGGING were enabled, and controlled by calls
	83	* to re.pm.
	84	*
	85	* That would normally mean linking errors when two functions of the same name
	86	* are attempted to be placed into the same executable. That is solved in one
	87	* of four ways:
	88	* 1) Static functions aren't known outside the file they are in, so for the
	89	* many functions of that type in this file, it just isn't a problem.
	90	* 2) Most externally known functions are enclosed in
	91	* #ifndef PERL_IN_XSUB_RE
	92	* ...
	93	* #endif
	94	* blocks, so there is only one definition for them in the whole
	95	* executable, the one in regcomp.c (or regexec.c). The implication of
	96	* that is any debugging info that comes from them is controlled only by
	97	* -Dr. Further, any static function they call will also be the version
	98	* in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
	99	* 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
	100	* have different names, so that what gets loaded in the executable is
	101	* 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
	102	* from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
	103	* in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
	104	* versions and their callees are under control of re.pm. The catch is
	105	* that references to all these go through the regexp_engine structure,
	106	* which is initialized in regcomp.h to the Perl_foo versions, and
	107	* substituted out in lexical scopes where 'use re' is in effect to the
	108	* 'my_foo' ones. That structure is public API, so it would be a hard
	109	* sell to add any additional members.
	110	* 4) For functions in regcomp.c and re_comp.c that are called only from,
	111	* respectively, regexec.c and re_exec.c, they can have two different
	112	* names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
	113	* embed.fnc.
	114	*
	115	* The bottom line is that if you add code to one of the public functions
	116	* listed in ext/re/re_top.h, debugging automagically works. But if you write
	117	* a new function that needs to do debugging or there is a chain of calls from
	118	* it that need to do debugging, all functions in the chain should use options
	119	* 2) or 4) above.
	120	*
	121	* A function may have to be split so that debugging stuff is static, but it
	122	* calls out to some other function that only gets compiled in regcomp.c to
	123	* access data that we don't want to duplicate.
	124	*/
	125
	126	#ifdef PERL_EXT_RE_BUILD
	127	#include "re_top.h"
	128	#endif
	129
	130	#include "EXTERN.h"
	131	#define PERL_IN_REGEX_ENGINE
	132	#define PERL_IN_REGCOMP_ANY
	133	#define PERL_IN_REGCOMP_C
	134	#include "perl.h"
	135
	136	#ifdef PERL_IN_XSUB_RE
	137	# include "re_comp.h"
	138	EXTERN_C const struct regexp_engine my_reg_engine;
	139	EXTERN_C const struct regexp_engine wild_reg_engine;
	140	#else
	141	# include "regcomp.h"
	142	#endif
	143
	144	#include "invlist_inline.h"
	145	#include "unicode_constants.h"
	146	#include "regcomp_internal.h"
	147
	148	/* =========================================================
	149	* BEGIN edit_distance stuff.
	150	*
	151	* This calculates how many single character changes of any type are needed to
	152	* transform a string into another one. It is taken from version 3.1 of
	153	*
	154	* https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
	155	*/
	156
	157	/* Our unsorted dictionary linked list. */
	158	/* Note we use UVs, not chars. */
	159
	160	struct dictionary{
	161	UV key;
	162	UV value;
	163	struct dictionary* next;
	164	};
	165	typedef struct dictionary item;
	166
	167
	168	PERL_STATIC_INLINE item*
	169	push(UV key, item* curr)
	170	{
	171	item* head;
	172	Newx(head, 1, item);
	173	head->key = key;
	174	head->value = 0;
	175	head->next = curr;
	176	return head;
	177	}
	178
	179
	180	PERL_STATIC_INLINE item*
	181	find(item* head, UV key)
	182	{
	183	item* iterator = head;
	184	while (iterator){
	185	if (iterator->key == key){
	186	return iterator;
	187	}
	188	iterator = iterator->next;
	189	}
	190
	191	return NULL;
	192	}
	193
	194	PERL_STATIC_INLINE item*
	195	uniquePush(item* head, UV key)
	196	{
	197	item* iterator = head;
	198
	199	while (iterator){
	200	if (iterator->key == key) {
	201	return head;
	202	}
	203	iterator = iterator->next;
	204	}
	205
	206	return push(key, head);
	207	}
	208
	209	PERL_STATIC_INLINE void
	210	dict_free(item* head)
	211	{
	212	item* iterator = head;
	213
	214	while (iterator) {
	215	item* temp = iterator;
	216	iterator = iterator->next;
	217	Safefree(temp);
	218	}
	219
	220	head = NULL;
	221	}
	222
	223	/* End of Dictionary Stuff */
	224
	225	/* All calculations/work are done here */
	226	STATIC int
	227	S_edit_distance(const UV* src,
	228	const UV* tgt,
	229	const STRLEN x, /* length of src[] */
	230	const STRLEN y, /* length of tgt[] */
	231	const SSize_t maxDistance
	232	)
	233	{
	234	item *head = NULL;
	235	UV swapCount, swapScore, targetCharCount, i, j;
	236	UV *scores;
	237	UV score_ceil = x + y;
	238
	239	PERL_ARGS_ASSERT_EDIT_DISTANCE;
	240
	241	/* initialize matrix start values */
	242	Newx(scores, ( (x + 2) * (y + 2)), UV);
	243	scores[0] = score_ceil;
	244	scores[1 * (y + 2) + 0] = score_ceil;
	245	scores[0 * (y + 2) + 1] = score_ceil;
	246	scores[1 * (y + 2) + 1] = 0;
	247	head = uniquePush(uniquePush(head, src[0]), tgt[0]);
	248
	249	/* work loops */
	250	/* i = src index */
	251	/* j = tgt index */
	252	for (i=1;i<=x;i++) {
	253	if (i < x)
	254	head = uniquePush(head, src[i]);
	255	scores[(i+1) * (y + 2) + 1] = i;
	256	scores[(i+1) * (y + 2) + 0] = score_ceil;
	257	swapCount = 0;
	258
	259	for (j=1;j<=y;j++) {
	260	if (i == 1) {
	261	if(j < y)
	262	head = uniquePush(head, tgt[j]);
	263	scores[1 * (y + 2) + (j + 1)] = j;
	264	scores[0 * (y + 2) + (j + 1)] = score_ceil;
	265	}
	266
	267	targetCharCount = find(head, tgt[j-1])->value;
	268	swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
	269
	270	if (src[i-1] != tgt[j-1]){
	271	scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
	272	}
	273	else {
	274	swapCount = j;
	275	scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
	276	}
	277	}
	278
	279	find(head, src[i-1])->value = i;
	280	}
	281
	282	{
	283	IV score = scores[(x+1) * (y + 2) + (y + 1)];
	284	dict_free(head);
	285	Safefree(scores);
	286	return (maxDistance != 0 && maxDistance < score)?(-1):score;
	287	}
	288	}
	289
	290	/* END of edit_distance() stuff
	291	* ========================================================= */
	292
	293	#ifdef PERL_RE_BUILD_AUX
	294	/* add a data member to the struct reg_data attached to this regex, it should
	295	* always return a non-zero return. the 's' argument is the type of the items
	296	* being added and the n is the number of items. The length of 's' should match
	297	* the number of items. */
	298	U32
	299	Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
	300	{
	301	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
	302
	303	PERL_ARGS_ASSERT_REG_ADD_DATA;
	304
	305	/* in the below expression we have (count + n - 1), the minus one is there
	306	* because the struct that we allocate already contains a slot for 1 data
	307	* item, so we do not need to allocate it the first time. IOW, the
	308	* sizeof(*RExC_rxi->data) already accounts for one of the elements we need
	309	* to allocate. See struct reg_data in regcomp.h
	310	*/
	311	Renewc(RExC_rxi->data,
	312	sizeof(RExC_rxi->data) + (sizeof(void) * (count + n - 1)),
	313	char, struct reg_data);
	314	/* however in the data->what expression we use (count + n) and do not
	315	* subtract one from the result because the data structure contains a
	316	* pointer to an array, and does not allocate the first element as part of
	317	* the data struct. */
	318	if (count > 1)
	319	Renew(RExC_rxi->data->what, (count + n), U8);
	320	else {
	321	/* when count == 1 it means we have not initialized anything.
	322	* we always fill the 0 slot of the data array with a '%' entry, which
	323	* means "zero" (all the other types are letters) which exists purely
	324	* so the return from reg_add_data is ALWAYS true, so we can tell it apart
	325	* from a "no value" idx=0 in places where we would return an index
	326	* into reg_add_data. This is particularly important with the new "single
	327	* pass, usually, but not always" strategy that we use, where the code
	328	* will use a 0 to represent "not able to compute this yet".
	329	*/
	330	Newx(RExC_rxi->data->what, n+1, U8);
	331	/* fill in the placeholder slot of 0 with a what of '%', we use
	332	* this because it sorta looks like a zero (0/0) and it is not a letter
	333	* like any of the other "whats", this type should never be created
	334	* any other way but here. '%' happens to also not appear in this
	335	* file for any other reason (at the time of writing this comment)*/
	336	RExC_rxi->data->what[0]= '%';
	337	RExC_rxi->data->data[0]= NULL;
	338	}
	339	RExC_rxi->data->count = count + n;
	340	Copy(s, RExC_rxi->data->what + count, n, U8);
	341	assert(count>0);
	342	return count;
	343	}
	344	#endif /* PERL_RE_BUILD_AUX */
	345
	346	/*XXX: todo make this not included in a non debugging perl, but appears to be
	347	* used anyway there, in 'use re' */
	348	#ifndef PERL_IN_XSUB_RE
	349	void
	350	Perl_reginitcolors(pTHX)
	351	{
	352	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	353	if (s) {
	354	char *t = savepv(s);
	355	int i = 0;
	356	PL_colors[0] = t;
	357	while (++i < 6) {
	358	t = strchr(t, '\t');
	359	if (t) {
	360	*t = '\0';
	361	PL_colors[i] = ++t;
	362	}
	363	else
	364	PL_colors[i] = t = (char *)"";
	365	}
	366	} else {
	367	int i = 0;
	368	while (i < 6)
	369	PL_colors[i++] = (char *)"";
	370	}
	371	PL_colorset = 1;
	372	}
	373	#endif
	374
	375
	376	#ifdef TRIE_STUDY_OPT
	377	/* search for "restudy" in this file for a detailed explanation */
	378	#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
	379	STMT_START { \
	380	if ( \
	381	(data.flags & SCF_TRIE_RESTUDY) \
	382	&& ! restudied++ \
	383	) { \
	384	dOsomething; \
	385	goto reStudy; \
	386	} \
	387	} STMT_END
	388	#else
	389	#define CHECK_RESTUDY_GOTO_butfirst
	390	#endif
	391
	392	/*
	393	* pregcomp - compile a regular expression into internal code
	394	*
	395	* Decides which engine's compiler to call based on the hint currently in
	396	* scope
	397	*/
	398
	399	#ifndef PERL_IN_XSUB_RE
	400
	401	/* return the currently in-scope regex engine (or the default if none) */
	402
	403	regexp_engine const *
	404	Perl_current_re_engine(pTHX)
	405	{
	406	if (IN_PERL_COMPILETIME) {
	407	HV * const table = GvHV(PL_hintgv);
	408	SV **ptr;
	409
	410	if (!table \|\| !(PL_hints & HINT_LOCALIZE_HH))
	411	return &PL_core_reg_engine;
	412	ptr = hv_fetchs(table, "regcomp", FALSE);
	413	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	414	return &PL_core_reg_engine;
	415	return INT2PTR(regexp_engine, SvIV(ptr));
	416	}
	417	else {
	418	SV *ptr;
	419	if (!PL_curcop->cop_hints_hash)
	420	return &PL_core_reg_engine;
	421	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
	422	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	423	return &PL_core_reg_engine;
	424	return INT2PTR(regexp_engine*, SvIV(ptr));
	425	}
	426	}
	427
	428
	429	REGEXP *
	430	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	431	{
	432	regexp_engine const *eng = current_re_engine();
	433	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	434
	435	PERL_ARGS_ASSERT_PREGCOMP;
	436
	437	/* Dispatch a request to compile a regexp to correct regexp engine. */
	438	DEBUG_COMPILE_r({
	439	Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
	440	PTR2UV(eng));
	441	});
	442	return CALLREGCOMP_ENG(eng, pattern, flags);
	443	}
	444	#endif
	445
	446	/*
	447	=for apidoc re_compile
	448
	449	Compile the regular expression pattern C<pattern>, returning a pointer to the
	450	compiled object for later matching with the internal regex engine.
	451
	452	This function is typically used by a custom regexp engine C<.comp()> function
	453	to hand off to the core regexp engine those patterns it doesn't want to handle
	454	itself (typically passing through the same flags it was called with). In
	455	almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
	456	to compile using the currently active regexp engine.
	457
	458	If C<pattern> is already a C<REGEXP>, this function does nothing but return a
	459	pointer to the input. Otherwise the PV is extracted and treated like a string
	460	representing a pattern. See L<perlre>.
	461
	462	The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names
	463	all begin with C<RXf_>.
	464
	465	=cut
	466
	467	* public entry point for the perl core's own regex compiling code.
	468	* It's actually a wrapper for Perl_re_op_compile that only takes an SV
	469	* pattern rather than a list of OPs, and uses the internal engine rather
	470	* than the current one */
	471
	472	REGEXP *
	473	Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
	474	{
	475	SV pat = pattern; / defeat constness! */
	476
	477	PERL_ARGS_ASSERT_RE_COMPILE;
	478
	479	return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
	480	#ifdef PERL_IN_XSUB_RE
	481	&my_reg_engine,
	482	#else
	483	&PL_core_reg_engine,
	484	#endif
	485	NULL, NULL, rx_flags, 0);
	486	}
	487
	488	static void
	489	S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
	490	{
	491	int n;
	492
	493	if (--cbs->refcnt > 0)
	494	return;
	495	for (n = 0; n < cbs->count; n++) {
	496	REGEXP *rx = cbs->cb[n].src_regex;
	497	if (rx) {
	498	cbs->cb[n].src_regex = NULL;
	499	SvREFCNT_dec_NN(rx);
	500	}
	501	}
	502	Safefree(cbs->cb);
	503	Safefree(cbs);
	504	}
	505
	506
	507	static struct reg_code_blocks *
	508	S_alloc_code_blocks(pTHX_ int ncode)
	509	{
	510	struct reg_code_blocks *cbs;
	511	Newx(cbs, 1, struct reg_code_blocks);
	512	cbs->count = ncode;
	513	cbs->refcnt = 1;
	514	SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
	515	if (ncode)
	516	Newx(cbs->cb, ncode, struct reg_code_block);
	517	else
	518	cbs->cb = NULL;
	519	return cbs;
	520	}
	521
	522
	523	/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
	524	* blocks, recalculate the indices. Update pat_p and plen_p in-place to
	525	* point to the realloced string and length.
	526	*
	527	* This is essentially a copy of Perl_bytes_to_utf8() with the code index
	528	* stuff added */
	529
	530	static void
	531	S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
	532	char *pat_p, STRLEN plen_p, int num_code_blocks)
	533	{
	534	U8 const src = (U8)*pat_p;
	535	U8 dst, d;
	536	int n=0;
	537	STRLEN s = 0;
	538	bool do_end = 0;
	539	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	540
	541	DEBUG_PARSE_r(Perl_re_printf( aTHX_
	542	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	543
	544	/* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
	545	Newx(dst, plen_p + variant_under_utf8_count(src, src + plen_p) + 1, U8);
	546	d = dst;
	547
	548	while (s < *plen_p) {
	549	append_utf8_from_native_byte(src[s], &d);
	550
	551	if (n < num_code_blocks) {
	552	assert(pRExC_state->code_blocks);
	553	if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
	554	pRExC_state->code_blocks->cb[n].start = d - dst - 1;
	555	assert(*(d - 1) == '(');
	556	do_end = 1;
	557	}
	558	else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
	559	pRExC_state->code_blocks->cb[n].end = d - dst - 1;
	560	assert(*(d - 1) == ')');
	561	do_end = 0;
	562	n++;
	563	}
	564	}
	565	s++;
	566	}
	567	*d = '\0';
	568	*plen_p = d - dst;
	569	pat_p = (char) dst;
	570	SAVEFREEPV(*pat_p);
	571	RExC_orig_utf8 = RExC_utf8 = 1;
	572	}
	573
	574
	575
	576	/* S_concat_pat(): concatenate a list of args to the pattern string pat,
	577	* while recording any code block indices, and handling overloading,
	578	* nested qr// objects etc. If pat is null, it will allocate a new
	579	* string, or just return the first arg, if there's only one.
	580	*
	581	* Returns the malloced/updated pat.
	582	* patternp and pat_count is the array of SVs to be concatted;
	583	* oplist is the optional list of ops that generated the SVs;
	584	* recompile_p is a pointer to a boolean that will be set if
	585	* the regex will need to be recompiled.
	586	* delim, if non-null is an SV that will be inserted between each element
	587	*/
	588
	589	static SV*
	590	S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
	591	SV pat, SV * const patternp, int pat_count,
	592	OP oplist, bool recompile_p, SV *delim)
	593	{
	594	SV **svp;
	595	int n = 0;
	596	bool use_delim = FALSE;
	597	bool alloced = FALSE;
	598
	599	/* if we know we have at least two args, create an empty string,
	600	* then concatenate args to that. For no args, return an empty string */
	601	if (!pat && pat_count != 1) {
	602	pat = newSVpvs("");
	603	SAVEFREESV(pat);
	604	alloced = TRUE;
	605	}
	606
	607	for (svp = patternp; svp < patternp + pat_count; svp++) {
	608	SV *sv;
	609	SV *rx = NULL;
	610	STRLEN orig_patlen = 0;
	611	bool code = 0;
	612	SV msv = use_delim ? delim : svp;
	613	if (!msv) msv = &PL_sv_undef;
	614
	615	/* if we've got a delimiter, we go round the loop twice for each
	616	* svp slot (except the last), using the delimiter the second
	617	* time round */
	618	if (use_delim) {
	619	svp--;
	620	use_delim = FALSE;
	621	}
	622	else if (delim)
	623	use_delim = TRUE;
	624
	625	if (SvTYPE(msv) == SVt_PVAV) {
	626	/* we've encountered an interpolated array within
	627	* the pattern, e.g. /...@a..../. Expand the list of elements,
	628	* then recursively append elements.
	629	* The code in this block is based on S_pushav() */
	630
	631	AV const av = (AV)msv;
	632	const SSize_t maxarg = AvFILL(av) + 1;
	633	SV **array;
	634
	635	if (oplist) {
	636	assert(oplist->op_type == OP_PADAV
	637	\|\| oplist->op_type == OP_RV2AV);
	638	oplist = OpSIBLING(oplist);
	639	}
	640
	641	if (SvRMAGICAL(av)) {
	642	SSize_t i;
	643
	644	Newx(array, maxarg, SV*);
	645	SAVEFREEPV(array);
	646	for (i=0; i < maxarg; i++) {
	647	SV ** const svp = av_fetch(av, i, FALSE);
	648	array[i] = svp ? *svp : &PL_sv_undef;
	649	}
	650	}
	651	else
	652	array = AvARRAY(av);
	653
	654	if (maxarg > 0) {
	655	pat = S_concat_pat(aTHX_ pRExC_state, pat,
	656	array, maxarg, NULL, recompile_p,
	657	/* $" */
	658	GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
	659	}
	660	else if (!pat) {
	661	pat = newSVpvs_flags("", SVs_TEMP);
	662	}
	663
	664	continue;
	665	}
	666
	667
	668	/* we make the assumption here that each op in the list of
	669	* op_siblings maps to one SV pushed onto the stack,
	670	* except for code blocks, with have both an OP_NULL and
	671	* an OP_CONST.
	672	* This allows us to match up the list of SVs against the
	673	* list of OPs to find the next code block.
	674	*
	675	* Note that PUSHMARK PADSV PADSV ..
	676	* is optimised to
	677	* PADRANGE PADSV PADSV ..
	678	* so the alignment still works. */
	679
	680	if (oplist) {
	681	if (oplist->op_type == OP_NULL
	682	&& (oplist->op_flags & OPf_SPECIAL))
	683	{
	684	assert(n < pRExC_state->code_blocks->count);
	685	pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
	686	pRExC_state->code_blocks->cb[n].block = oplist;
	687	pRExC_state->code_blocks->cb[n].src_regex = NULL;
	688	n++;
	689	code = 1;
	690	oplist = OpSIBLING(oplist); /* skip CONST */
	691	assert(oplist);
	692	}
	693	oplist = OpSIBLING(oplist);;
	694	}
	695
	696	/* apply magic and QR overloading to arg */
	697
	698	SvGETMAGIC(msv);
	699	if (SvROK(msv) && SvAMAGIC(msv)) {
	700	SV *sv = AMG_CALLunary(msv, regexp_amg);
	701	if (sv) {
	702	if (SvROK(sv))
	703	sv = SvRV(sv);
	704	if (SvTYPE(sv) != SVt_REGEXP)
	705	Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
	706	msv = sv;
	707	}
	708	}
	709
	710	/* try concatenation overload ... */
	711	if (pat && (SvAMAGIC(pat) \|\| SvAMAGIC(msv)) &&
	712	(sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
	713	{
	714	sv_setsv(pat, sv);
	715	/* overloading involved: all bets are off over literal
	716	* code. Pretend we haven't seen it */
	717	if (n)
	718	pRExC_state->code_blocks->count -= n;
	719	n = 0;
	720	}
	721	else {
	722	/* ... or failing that, try "" overload */
	723	while (SvAMAGIC(msv)
	724	&& (sv = AMG_CALLunary(msv, string_amg))
	725	&& sv != msv
	726	&& !( SvROK(msv)
	727	&& SvROK(sv)
	728	&& SvRV(msv) == SvRV(sv))
	729	) {
	730	msv = sv;
	731	SvGETMAGIC(msv);
	732	}
	733	if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
	734	msv = SvRV(msv);
	735
	736	if (pat) {
	737	/* this is a partially unrolled
	738	* sv_catsv_nomg(pat, msv);
	739	* that allows us to adjust code block indices if
	740	* needed */
	741	STRLEN dlen;
	742	char *dst = SvPV_force_nomg(pat, dlen);
	743	orig_patlen = dlen;
	744	if (SvUTF8(msv) && !SvUTF8(pat)) {
	745	S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
	746	sv_setpvn(pat, dst, dlen);
	747	SvUTF8_on(pat);
	748	}
	749	sv_catsv_nomg(pat, msv);
	750	rx = msv;
	751	}
	752	else {
	753	/* We have only one SV to process, but we need to verify
	754	* it is properly null terminated or we will fail asserts
	755	* later. In theory we probably shouldn't get such SV's,
	756	* but if we do we should handle it gracefully. */
	757	if ( SvTYPE(msv) != SVt_PV \|\| (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) \|\| SvIsCOW_shared_hash(msv) ) {
	758	/* not a string, or a string with a trailing null */
	759	pat = msv;
	760	} else {
	761	/* a string with no trailing null, we need to copy it
	762	* so it has a trailing null */
	763	pat = sv_2mortal(newSVsv(msv));
	764	}
	765	}
	766
	767	if (code)
	768	pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
	769	}
	770
	771	/* extract any code blocks within any embedded qr//'s */
	772	if (rx && SvTYPE(rx) == SVt_REGEXP
	773	&& RX_ENGINE((REGEXP*)rx)->op_comp)
	774	{
	775
	776	RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
	777	if (ri->code_blocks && ri->code_blocks->count) {
	778	int i;
	779	/* the presence of an embedded qr// with code means
	780	* we should always recompile: the text of the
	781	* qr// may not have changed, but it may be a
	782	* different closure than last time */
	783	*recompile_p = 1;
	784	if (pRExC_state->code_blocks) {
	785	int new_count = pRExC_state->code_blocks->count
	786	+ ri->code_blocks->count;
	787	Renew(pRExC_state->code_blocks->cb,
	788	new_count, struct reg_code_block);
	789	pRExC_state->code_blocks->count = new_count;
	790	}
	791	else
	792	pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
	793	ri->code_blocks->count);
	794
	795	for (i=0; i < ri->code_blocks->count; i++) {
	796	struct reg_code_block src, dst;
	797	STRLEN offset = orig_patlen
	798	+ ReANY((REGEXP *)rx)->pre_prefix;
	799	assert(n < pRExC_state->code_blocks->count);
	800	src = &ri->code_blocks->cb[i];
	801	dst = &pRExC_state->code_blocks->cb[n];
	802	dst->start = src->start + offset;
	803	dst->end = src->end + offset;
	804	dst->block = src->block;
	805	dst->src_regex = (REGEXP) SvREFCNT_inc( (SV)
	806	src->src_regex
	807	? src->src_regex
	808	: (REGEXP*)rx);
	809	n++;
	810	}
	811	}
	812	}
	813	}
	814	/* avoid calling magic multiple times on a single element e.g. =~ $qr */
	815	if (alloced)
	816	SvSETMAGIC(pat);
	817
	818	return pat;
	819	}
	820
	821
	822
	823	/* see if there are any run-time code blocks in the pattern.
	824	* False positives are allowed */
	825
	826	static bool
	827	S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
	828	char *pat, STRLEN plen)
	829	{
	830	int n = 0;
	831	STRLEN s;
	832
	833	PERL_UNUSED_CONTEXT;
	834
	835	for (s = 0; s < plen; s++) {
	836	if ( pRExC_state->code_blocks
	837	&& n < pRExC_state->code_blocks->count
	838	&& s == pRExC_state->code_blocks->cb[n].start)
	839	{
	840	s = pRExC_state->code_blocks->cb[n].end;
	841	n++;
	842	continue;
	843	}
	844	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
	845	* positives here */
	846	if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
	847	(pat[s+2] == '{'
	848	\|\| (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
	849	)
	850	return 1;
	851	}
	852	return 0;
	853	}
	854
	855	/* Handle run-time code blocks. We will already have compiled any direct
	856	* or indirect literal code blocks. Now, take the pattern 'pat' and make a
	857	* copy of it, but with any literal code blocks blanked out and
	858	* appropriate chars escaped; then feed it into
	859	*
	860	* eval "qr'modified_pattern'"
	861	*
	862	* For example,
	863	*
	864	* a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
	865	*
	866	* becomes
	867	*
	868	* qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
	869	*
	870	* After eval_sv()-ing that, grab any new code blocks from the returned qr
	871	* and merge them with any code blocks of the original regexp.
	872	*
	873	* If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
	874	* instead, just save the qr and return FALSE; this tells our caller that
	875	* the original pattern needs upgrading to utf8.
	876	*/
	877
	878	static bool
	879	S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
	880	char *pat, STRLEN plen)
	881	{
	882	SV *qr;
	883
	884	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	885
	886	if (pRExC_state->runtime_code_qr) {
	887	/* this is the second time we've been called; this should
	888	* only happen if the main pattern got upgraded to utf8
	889	* during compilation; re-use the qr we compiled first time
	890	* round (which should be utf8 too)
	891	*/
	892	qr = pRExC_state->runtime_code_qr;
	893	pRExC_state->runtime_code_qr = NULL;
	894	assert(RExC_utf8 && SvUTF8(qr));
	895	}
	896	else {
	897	int n = 0;
	898	STRLEN s;
	899	char p, newpat;
	900	int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
	901	SV sv, qr_ref;
	902	dSP;
	903
	904	/* determine how many extra chars we need for ' and \ escaping */
	905	for (s = 0; s < plen; s++) {
	906	if (pat[s] == '\'' \|\| pat[s] == '\\')
	907	newlen++;
	908	}
	909
	910	Newx(newpat, newlen, char);
	911	p = newpat;
	912	p++ = 'q'; p++ = 'r'; *p++ = '\'';
	913
	914	for (s = 0; s < plen; s++) {
	915	if ( pRExC_state->code_blocks
	916	&& n < pRExC_state->code_blocks->count
	917	&& s == pRExC_state->code_blocks->cb[n].start)
	918	{
	919	/* blank out literal code block so that they aren't
	920	* recompiled: eg change from/to:
	921	* /(?{xyz})/
	922	* /(?=====)/
	923	* and
	924	* /(??{xyz})/
	925	* /(?======)/
	926	* and
	927	* /(?(?{xyz}))/
	928	* /(?(?=====))/
	929	*/
	930	assert(pat[s] == '(');
	931	assert(pat[s+1] == '?');
	932	*p++ = '(';
	933	*p++ = '?';
	934	s += 2;
	935	while (s < pRExC_state->code_blocks->cb[n].end) {
	936	*p++ = '=';
	937	s++;
	938	}
	939	*p++ = ')';
	940	n++;
	941	continue;
	942	}
	943	if (pat[s] == '\'' \|\| pat[s] == '\\')
	944	*p++ = '\\';
	945	*p++ = pat[s];
	946	}
	947	*p++ = '\'';
	948	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
	949	*p++ = 'x';
	950	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
	951	*p++ = 'x';
	952	}
	953	}
	954	*p++ = '\0';
	955	DEBUG_COMPILE_r({
	956	Perl_re_printf( aTHX_
	957	"%sre-parsing pattern for runtime code:%s %s\n",
	958	PL_colors[4], PL_colors[5], newpat);
	959	});
	960
	961	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
	962	Safefree(newpat);
	963
	964	ENTER;
	965	SAVETMPS;
	966	save_re_context();
	967	PUSHSTACKi(PERLSI_REQUIRE);
	968	/* G_RE_REPARSING causes the toker to collapse \\ into \ when
	969	* parsing qr''; normally only q'' does this. It also alters
	970	* hints handling */
	971	eval_sv(sv, G_SCALAR\|G_RE_REPARSING);
	972	SvREFCNT_dec_NN(sv);
	973	SPAGAIN;
	974	qr_ref = POPs;
	975	PUTBACK;
	976	{
	977	SV * const errsv = ERRSV;
	978	if (SvTRUE_NN(errsv))
	979	/* use croak_sv ? */
	980	Perl_croak_nocontext("%" SVf, SVfARG(errsv));
	981	}
	982	assert(SvROK(qr_ref));
	983	qr = SvRV(qr_ref);
	984	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
	985	/* the leaving below frees the tmp qr_ref.
	986	* Give qr a life of its own */
	987	SvREFCNT_inc(qr);
	988	POPSTACK;
	989	FREETMPS;
	990	LEAVE;
	991
	992	}
	993
	994	if (!RExC_utf8 && SvUTF8(qr)) {
	995	/* first time through; the pattern got upgraded; save the
	996	* qr for the next time through */
	997	assert(!pRExC_state->runtime_code_qr);
	998	pRExC_state->runtime_code_qr = qr;
	999	return 0;
	1000	}
	1001
	1002
	1003	/* extract any code blocks within the returned qr// */
	1004
	1005
	1006	/* merge the main (r1) and run-time (r2) code blocks into one */
	1007	{
	1008	RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
	1009	struct reg_code_block new_block, dst;
	1010	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
	1011	int i1 = 0, i2 = 0;
	1012	int r1c, r2c;
	1013
	1014	if (!r2->code_blocks \|\| !r2->code_blocks->count) /* we guessed wrong */
	1015	{
	1016	SvREFCNT_dec_NN(qr);
	1017	return 1;
	1018	}
	1019
	1020	if (!r1->code_blocks)
	1021	r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
	1022
	1023	r1c = r1->code_blocks->count;
	1024	r2c = r2->code_blocks->count;
	1025
	1026	Newx(new_block, r1c + r2c, struct reg_code_block);
	1027
	1028	dst = new_block;
	1029
	1030	while (i1 < r1c \|\| i2 < r2c) {
	1031	struct reg_code_block *src;
	1032	bool is_qr = 0;
	1033
	1034	if (i1 == r1c) {
	1035	src = &r2->code_blocks->cb[i2++];
	1036	is_qr = 1;
	1037	}
	1038	else if (i2 == r2c)
	1039	src = &r1->code_blocks->cb[i1++];
	1040	else if ( r1->code_blocks->cb[i1].start
	1041	< r2->code_blocks->cb[i2].start)
	1042	{
	1043	src = &r1->code_blocks->cb[i1++];
	1044	assert(src->end < r2->code_blocks->cb[i2].start);
	1045	}
	1046	else {
	1047	assert( r1->code_blocks->cb[i1].start
	1048	> r2->code_blocks->cb[i2].start);
	1049	src = &r2->code_blocks->cb[i2++];
	1050	is_qr = 1;
	1051	assert(src->end < r1->code_blocks->cb[i1].start);
	1052	}
	1053
	1054	assert(pat[src->start] == '(');
	1055	assert(pat[src->end] == ')');
	1056	dst->start = src->start;
	1057	dst->end = src->end;
	1058	dst->block = src->block;
	1059	dst->src_regex = is_qr ? (REGEXP) SvREFCNT_inc( (SV) qr)
	1060	: src->src_regex;
	1061	dst++;
	1062	}
	1063	r1->code_blocks->count += r2c;
	1064	Safefree(r1->code_blocks->cb);
	1065	r1->code_blocks->cb = new_block;
	1066	}
	1067
	1068	SvREFCNT_dec_NN(qr);
	1069	return 1;
	1070	}
	1071
	1072
	1073	STATIC bool
	1074	S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
	1075	struct reg_substr_datum *rsd,
	1076	struct scan_data_substrs *sub,
	1077	STRLEN longest_length)
	1078	{
	1079	/* This is the common code for setting up the floating and fixed length
	1080	* string data extracted from Perl_re_op_compile() below. Returns a boolean
	1081	* as to whether succeeded or not */
	1082
	1083	I32 t;
	1084	SSize_t ml;
	1085	bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
	1086	bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
	1087
	1088	if (! (longest_length
	1089	\|\| (eol /* Can't have SEOL and MULTI */
	1090	&& (! meol \|\| (RExC_flags & RXf_PMf_MULTILINE)))
	1091	)
	1092	/* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
	1093	\|\| (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
	1094	{
	1095	return FALSE;
	1096	}
	1097
	1098	/* copy the information about the longest from the reg_scan_data
	1099	over to the program. */
	1100	if (SvUTF8(sub->str)) {
	1101	rsd->substr = NULL;
	1102	rsd->utf8_substr = sub->str;
	1103	} else {
	1104	rsd->substr = sub->str;
	1105	rsd->utf8_substr = NULL;
	1106	}
	1107	/* end_shift is how many chars that must be matched that
	1108	follow this item. We calculate it ahead of time as once the
	1109	lookbehind offset is added in we lose the ability to correctly
	1110	calculate it.*/
	1111	ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
	1112	rsd->end_shift = ml - sub->min_offset
	1113	- longest_length
	1114	/* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
	1115	* intead? - DAPM
	1116	+ (SvTAIL(sub->str) != 0)
	1117	*/
	1118	+ sub->lookbehind;
	1119
	1120	t = (eol/* Can't have SEOL and MULTI */
	1121	&& (! meol \|\| (RExC_flags & RXf_PMf_MULTILINE)));
	1122	fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
	1123
	1124	return TRUE;
	1125	}
	1126
	1127	STATIC void
	1128	S_set_regex_pv(pTHX_ RExC_state_t pRExC_state, REGEXP Rx)
	1129	{
	1130	/* Calculates and sets in the compiled pattern 'Rx' the string to compile,
	1131	* properly wrapped with the right modifiers */
	1132
	1133	bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	1134	bool has_charset = RExC_utf8 \|\| (get_regex_charset(RExC_rx->extflags)
	1135	!= REGEX_DEPENDS_CHARSET);
	1136
	1137	/* The caret is output if there are any defaults: if not all the STD
	1138	* flags are set, or if no character set specifier is needed */
	1139	bool has_default =
	1140	(((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	1141	\|\| ! has_charset);
	1142	bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
	1143	== REG_RUN_ON_COMMENT_SEEN);
	1144	U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
	1145	>> RXf_PMf_STD_PMMOD_SHIFT);
	1146	const char fptr = STD_PAT_MODS; /"msixxn"*/
	1147	char *p;
	1148	STRLEN pat_len = RExC_precomp_end - RExC_precomp;
	1149
	1150	/* We output all the necessary flags; we never output a minus, as all
	1151	* those are defaults, so are
	1152	* covered by the caret */
	1153	const STRLEN wraplen = pat_len + has_p + has_runon
	1154	+ has_default /* If needs a caret */
	1155	+ PL_bitcount[reganch] /* 1 char for each set standard flag */
	1156
	1157	/* If needs a character set specifier */
	1158	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	1159	+ (sizeof("(?:)") - 1);
	1160
	1161	PERL_ARGS_ASSERT_SET_REGEX_PV;
	1162
	1163	/* make sure PL_bitcount bounds not exceeded */
	1164	STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
	1165
	1166	p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
	1167	SvPOK_on(Rx);
	1168	if (RExC_utf8)
	1169	SvFLAGS(Rx) \|= SVf_UTF8;
	1170	p++='('; p++='?';
	1171
	1172	/* If a default, cover it using the caret */
	1173	if (has_default) {
	1174	*p++= DEFAULT_PAT_MOD;
	1175	}
	1176	if (has_charset) {
	1177	STRLEN len;
	1178	const char* name;
	1179
	1180	name = get_regex_charset_name(RExC_rx->extflags, &len);
	1181	if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
	1182	assert(RExC_utf8);
	1183	name = UNICODE_PAT_MODS;
	1184	len = sizeof(UNICODE_PAT_MODS) - 1;
	1185	}
	1186	Copy(name, p, len, char);
	1187	p += len;
	1188	}
	1189	if (has_p)
	1190	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	1191	{
	1192	char ch;
	1193	while((ch = *fptr++)) {
	1194	if(reganch & 1)
	1195	*p++ = ch;
	1196	reganch >>= 1;
	1197	}
	1198	}
	1199
	1200	*p++ = ':';
	1201	Copy(RExC_precomp, p, pat_len, char);
	1202	assert ((RX_WRAPPED(Rx) - p) < 16);
	1203	RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
	1204	p += pat_len;
	1205
	1206	/* Adding a trailing \n causes this to compile properly:
	1207	my $R = qr / A B C # D E/x; /($R)/
	1208	Otherwise the parens are considered part of the comment */
	1209	if (has_runon)
	1210	*p++ = '\n';
	1211	*p++ = ')';
	1212	*p = 0;
	1213	SvCUR_set(Rx, p - RX_WRAPPED(Rx));
	1214	}
	1215
	1216	/*
	1217	* Perl_re_op_compile - the perl internal RE engine's function to compile a
	1218	* regular expression into internal code.
	1219	* The pattern may be passed either as:
	1220	* a list of SVs (patternp plus pat_count)
	1221	* a list of OPs (expr)
	1222	* If both are passed, the SV list is used, but the OP list indicates
	1223	* which SVs are actually pre-compiled code blocks
	1224	*
	1225	* The SVs in the list have magic and qr overloading applied to them (and
	1226	* the list may be modified in-place with replacement SVs in the latter
	1227	* case).
	1228	*
	1229	* If the pattern hasn't changed from old_re, then old_re will be
	1230	* returned.
	1231	*
	1232	* eng is the current engine. If that engine has an op_comp method, then
	1233	* handle directly (i.e. we assume that op_comp was us); otherwise, just
	1234	* do the initial concatenation of arguments and pass on to the external
	1235	* engine.
	1236	*
	1237	* If is_bare_re is not null, set it to a boolean indicating whether the
	1238	* arg list reduced (after overloading) to a single bare regex which has
	1239	* been returned (i.e. /$qr/).
	1240	*
	1241	* orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
	1242	*
	1243	* pm_flags contains the PMf_* flags, typically based on those from the
	1244	* pm_flags field of the related PMOP. Currently we're only interested in
	1245	* PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
	1246	*
	1247	* For many years this code had an initial sizing pass that calculated
	1248	* (sometimes incorrectly, leading to security holes) the size needed for the
	1249	* compiled pattern. That was changed by commit
	1250	* 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
	1251	* node at a time, as parsing goes along. Patches welcome to fix any obsolete
	1252	* references to this sizing pass.
	1253	*
	1254	* Now, an initial crude guess as to the size needed is made, based on the
	1255	* length of the pattern. Patches welcome to improve that guess. That amount
	1256	* of space is malloc'd and then immediately freed, and then clawed back node
	1257	* by node. This design is to minimize, to the extent possible, memory churn
	1258	* when doing the reallocs.
	1259	*
	1260	* A separate parentheses counting pass may be needed in some cases.
	1261	* (Previously the sizing pass did this.) Patches welcome to reduce the number
	1262	* of these cases.
	1263	*
	1264	* The existence of a sizing pass necessitated design decisions that are no
	1265	* longer needed. There are potential areas of simplification.
	1266	*
	1267	* Beware that the optimization-preparation code in here knows about some
	1268	* of the structure of the compiled regexp. [I'll say.]
	1269	*/
	1270
	1271	REGEXP *
	1272	Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
	1273	OP expr, const regexp_engine eng, REGEXP *old_re,
	1274	bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
	1275	{
	1276	REGEXP Rx; / Capital 'R' means points to a REGEXP */
	1277	STRLEN plen;
	1278	char *exp;
	1279	regnode *scan;
	1280	I32 flags;
	1281	SSize_t minlen = 0;
	1282	U32 rx_flags;
	1283	SV *pat;
	1284	SV** new_patternp = patternp;
	1285
	1286	/* these are all flags - maybe they should be turned
	1287	* into a single int with different bit masks */
	1288	I32 sawlookahead = 0;
	1289	I32 sawplus = 0;
	1290	I32 sawopen = 0;
	1291	I32 sawminmod = 0;
	1292
	1293	regex_charset initial_charset = get_regex_charset(orig_rx_flags);
	1294	bool recompile = 0;
	1295	bool runtime_code = 0;
	1296	scan_data_t data;
	1297	RExC_state_t RExC_state;
	1298	RExC_state_t * const pRExC_state = &RExC_state;
	1299	#ifdef TRIE_STUDY_OPT
	1300	/* search for "restudy" in this file for a detailed explanation */
	1301	int restudied = 0;
	1302	RExC_state_t copyRExC_state;
	1303	#endif
	1304	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	1305
	1306	PERL_ARGS_ASSERT_RE_OP_COMPILE;
	1307
	1308	DEBUG_r(if (!PL_colorset) reginitcolors());
	1309
	1310
	1311	pRExC_state->warn_text = NULL;
	1312	pRExC_state->unlexed_names = NULL;
	1313	pRExC_state->code_blocks = NULL;
	1314
	1315	if (is_bare_re)
	1316	*is_bare_re = FALSE;
	1317
	1318	if (expr && (expr->op_type == OP_LIST \|\|
	1319	(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
	1320	/* allocate code_blocks if needed */
	1321	OP *o;
	1322	int ncode = 0;
	1323
	1324	for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
	1325	if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
	1326	ncode++; /* count of DO blocks */
	1327
	1328	if (ncode)
	1329	pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
	1330	}
	1331
	1332	if (!pat_count) {
	1333	/* compile-time pattern with just OP_CONSTs and DO blocks */
	1334
	1335	int n;
	1336	OP *o;
	1337
	1338	/* find how many CONSTs there are */
	1339	assert(expr);
	1340	n = 0;
	1341	if (expr->op_type == OP_CONST)
	1342	n = 1;
	1343	else
	1344	for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
	1345	if (o->op_type == OP_CONST)
	1346	n++;
	1347	}
	1348
	1349	/* fake up an SV array */
	1350
	1351	assert(!new_patternp);
	1352	Newx(new_patternp, n, SV*);
	1353	SAVEFREEPV(new_patternp);
	1354	pat_count = n;
	1355
	1356	n = 0;
	1357	if (expr->op_type == OP_CONST)
	1358	new_patternp[n] = cSVOPx_sv(expr);
	1359	else
	1360	for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
	1361	if (o->op_type == OP_CONST)
	1362	new_patternp[n++] = cSVOPo_sv;
	1363	}
	1364
	1365	}
	1366
	1367	DEBUG_PARSE_r(Perl_re_printf( aTHX_
	1368	"Assembling pattern from %d elements%s\n", pat_count,
	1369	orig_rx_flags & RXf_SPLIT ? " for split" : ""));
	1370
	1371	/* set expr to the first arg op */
	1372
	1373	if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
	1374	&& expr->op_type != OP_CONST)
	1375	{
	1376	expr = cLISTOPx(expr)->op_first;
	1377	assert( expr->op_type == OP_PUSHMARK
	1378	\|\| (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
	1379	\|\| expr->op_type == OP_PADRANGE);
	1380	expr = OpSIBLING(expr);
	1381	}
	1382
	1383	pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
	1384	expr, &recompile, NULL);
	1385
	1386	/* handle bare (possibly after overloading) regex: foo =~ $re */
	1387	{
	1388	SV *re = pat;
	1389	if (SvROK(re))
	1390	re = SvRV(re);
	1391	if (SvTYPE(re) == SVt_REGEXP) {
	1392	if (is_bare_re)
	1393	*is_bare_re = TRUE;
	1394	SvREFCNT_inc(re);
	1395	DEBUG_PARSE_r(Perl_re_printf( aTHX_
	1396	"Precompiled pattern%s\n",
	1397	orig_rx_flags & RXf_SPLIT ? " for split" : ""));
	1398
	1399	return (REGEXP*)re;
	1400	}
	1401	}
	1402
	1403	exp = SvPV_nomg(pat, plen);
	1404
	1405	if (!eng->op_comp) {
	1406	if ((SvUTF8(pat) && IN_BYTES)
	1407	\|\| SvGMAGICAL(pat) \|\| SvAMAGIC(pat))
	1408	{
	1409	/* make a temporary copy; either to convert to bytes,
	1410	* or to avoid repeating get-magic / overloaded stringify */
	1411	pat = newSVpvn_flags(exp, plen, SVs_TEMP \|
	1412	(IN_BYTES ? 0 : SvUTF8(pat)));
	1413	}
	1414	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
	1415	}
	1416
	1417	/* ignore the utf8ness if the pattern is 0 length */
	1418	RExC_utf8 = RExC_orig_utf8 = (plen == 0 \|\| IN_BYTES) ? 0 : SvUTF8(pat);
	1419	RExC_uni_semantics = 0;
	1420	RExC_contains_locale = 0;
	1421	RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
	1422	RExC_in_script_run = 0;
	1423	RExC_study_started = 0;
	1424	pRExC_state->runtime_code_qr = NULL;
	1425	RExC_frame_head= NULL;
	1426	RExC_frame_last= NULL;
	1427	RExC_frame_count= 0;
	1428	RExC_latest_warn_offset = 0;
	1429	RExC_use_BRANCHJ = 0;
	1430	RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
	1431	RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
	1432	RExC_logical_total_parens = 0;
	1433	RExC_total_parens = 0;
	1434	RExC_logical_to_parno = NULL;
	1435	RExC_parno_to_logical = NULL;
	1436	RExC_open_parens = NULL;
	1437	RExC_close_parens = NULL;
	1438	RExC_paren_names = NULL;
	1439	RExC_size = 0;
	1440	RExC_seen_d_op = FALSE;
	1441	#ifdef DEBUGGING
	1442	RExC_paren_name_list = NULL;
	1443	#endif
	1444
	1445	DEBUG_r({
	1446	RExC_mysv1= sv_newmortal();
	1447	RExC_mysv2= sv_newmortal();
	1448	});
	1449
	1450	DEBUG_COMPILE_r({
	1451	SV *dsv= sv_newmortal();
	1452	RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
	1453	Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
	1454	PL_colors[4], PL_colors[5], s);
	1455	});
	1456
	1457	/* we jump here if we have to recompile, e.g., from upgrading the pattern
	1458	* to utf8 */
	1459
	1460	if ((pm_flags & PMf_USE_RE_EVAL)
	1461	/* this second condition covers the non-regex literal case,
	1462	* i.e. $foo =~ '(?{})'. */
	1463	\|\| (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
	1464	)
	1465	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
	1466
	1467	redo_parse:
	1468	/* return old regex if pattern hasn't changed */
	1469	/* XXX: note in the below we have to check the flags as well as the
	1470	* pattern.
	1471	*
	1472	* Things get a touch tricky as we have to compare the utf8 flag
	1473	* independently from the compile flags. */
	1474
	1475	if ( old_re
	1476	&& !recompile
	1477	&& cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
	1478	&& ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
	1479	&& RX_PRELEN(old_re) == plen
	1480	&& memEQ(RX_PRECOMP(old_re), exp, plen)
	1481	&& !runtime_code /* with runtime code, always recompile */ )
	1482	{
	1483	DEBUG_COMPILE_r({
	1484	SV *dsv= sv_newmortal();
	1485	RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
	1486	Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
	1487	PL_colors[4], PL_colors[5], s);
	1488	});
	1489	return old_re;
	1490	}
	1491
	1492	/* Allocate the pattern's SV */
	1493	RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
	1494	RExC_rx = ReANY(Rx);
	1495	if ( RExC_rx == NULL )
	1496	FAIL("Regexp out of space");
	1497
	1498	rx_flags = orig_rx_flags;
	1499
	1500	if ( toUSE_UNI_CHARSET_NOT_DEPENDS
	1501	&& initial_charset == REGEX_DEPENDS_CHARSET)
	1502	{
	1503
	1504	/* Set to use unicode semantics if the pattern is in utf8 and has the
	1505	* 'depends' charset specified, as it means unicode when utf8 */
	1506	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
	1507	RExC_uni_semantics = 1;
	1508	}
	1509
	1510	RExC_pm_flags = pm_flags;
	1511
	1512	if (runtime_code) {
	1513	assert(TAINTING_get \|\| !TAINT_get);
	1514	if (TAINT_get)
	1515	Perl_croak(aTHX_ "Eval-group in insecure regular expression");
	1516
	1517	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
	1518	/* whoops, we have a non-utf8 pattern, whilst run-time code
	1519	* got compiled as utf8. Try again with a utf8 pattern */
	1520	S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
	1521	pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
	1522	goto redo_parse;
	1523	}
	1524	}
	1525	assert(!pRExC_state->runtime_code_qr);
	1526
	1527	RExC_sawback = 0;
	1528
	1529	RExC_seen = 0;
	1530	RExC_maxlen = 0;
	1531	RExC_in_lookaround = 0;
	1532	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	1533	RExC_recode_x_to_native = 0;
	1534	RExC_in_multi_char_class = 0;
	1535
	1536	RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
	1537	RExC_precomp_end = RExC_end = exp + plen;
	1538	RExC_nestroot = 0;
	1539	RExC_whilem_seen = 0;
	1540	RExC_end_op = NULL;
	1541	RExC_recurse = NULL;
	1542	RExC_study_chunk_recursed = NULL;
	1543	RExC_study_chunk_recursed_bytes= 0;
	1544	RExC_recurse_count = 0;
	1545	RExC_sets_depth = 0;
	1546	pRExC_state->code_index = 0;
	1547
	1548	/* Initialize the string in the compiled pattern. This is so that there is
	1549	* something to output if necessary */
	1550	set_regex_pv(pRExC_state, Rx);
	1551
	1552	DEBUG_PARSE_r({
	1553	Perl_re_printf( aTHX_
	1554	"Starting parse and generation\n");
	1555	RExC_lastnum=0;
	1556	RExC_lastparse=NULL;
	1557	});
	1558
	1559	/* Allocate space and zero-initialize. Note, the two step process
	1560	of zeroing when in debug mode, thus anything assigned has to
	1561	happen after that */
	1562	if (! RExC_size) {
	1563
	1564	/* On the first pass of the parse, we guess how big this will be. Then
	1565	* we grow in one operation to that amount and then give it back. As
	1566	* we go along, we re-allocate what we need.
	1567	*
	1568	* XXX Currently the guess is essentially that the pattern will be an
	1569	* EXACT node with one byte input, one byte output. This is crude, and
	1570	* better heuristics are welcome.
	1571	*
	1572	* On any subsequent passes, we guess what we actually computed in the
	1573	* latest earlier pass. Such a pass probably didn't complete so is
	1574	* missing stuff. We could improve those guesses by knowing where the
	1575	* parse stopped, and use the length so far plus apply the above
	1576	* assumption to what's left. */
	1577	RExC_size = STR_SZ(RExC_end - RExC_start);
	1578	}
	1579
	1580	Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
	1581	if ( RExC_rxi == NULL )
	1582	FAIL("Regexp out of space");
	1583
	1584	Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
	1585	RXi_SET( RExC_rx, RExC_rxi );
	1586
	1587	/* We start from 0 (over from 0 in the case this is a reparse. The first
	1588	* node parsed will give back any excess memory we have allocated so far).
	1589	* */
	1590	RExC_size = 0;
	1591
	1592	/* non-zero initialization begins here */
	1593	RExC_rx->engine= eng;
	1594	RExC_rx->extflags = rx_flags;
	1595	RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
	1596
	1597	if (pm_flags & PMf_IS_QR) {
	1598	RExC_rxi->code_blocks = pRExC_state->code_blocks;
	1599	if (RExC_rxi->code_blocks) {
	1600	RExC_rxi->code_blocks->refcnt++;
	1601	}
	1602	}
	1603
	1604	RExC_rx->intflags = 0;
	1605
	1606	RExC_flags = rx_flags; /* don't let top level (?i) bleed */
	1607	RExC_parse_set(exp);
	1608
	1609	/* This NUL is guaranteed because the pattern comes from an SV*, and the sv
	1610	* code makes sure the final byte is an uncounted NUL. But should this
	1611	* ever not be the case, lots of things could read beyond the end of the
	1612	* buffer: loops like
	1613	* while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
	1614	* strchr(RExC_parse, "foo");
	1615	* etc. So it is worth noting. */
	1616	assert(*RExC_end == '\0');
	1617
	1618	RExC_naughty = 0;
	1619	RExC_npar = 1;
	1620	RExC_logical_npar = 1;
	1621	RExC_parens_buf_size = 0;
	1622	RExC_emit_start = RExC_rxi->program;
	1623	pRExC_state->code_index = 0;
	1624
	1625	((char) RExC_emit_start) = (char) REG_MAGIC;
	1626	RExC_emit = NODE_STEP_REGNODE;
	1627
	1628	/* Do the parse */
	1629	if (reg(pRExC_state, 0, &flags, 1)) {
	1630
	1631	/* Success!, But we may need to redo the parse knowing how many parens
	1632	* there actually are */
	1633	if (IN_PARENS_PASS) {
	1634	flags \|= RESTART_PARSE;
	1635	}
	1636
	1637	/* We have that number in RExC_npar */
	1638	RExC_total_parens = RExC_npar;
	1639	RExC_logical_total_parens = RExC_logical_npar;
	1640	}
	1641	else if (! MUST_RESTART(flags)) {
	1642	ReREFCNT_dec(Rx);
	1643	Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
	1644	}
	1645
	1646	/* Here, we either have success, or we have to redo the parse for some reason */
	1647	if (MUST_RESTART(flags)) {
	1648
	1649	/* It's possible to write a regexp in ascii that represents Unicode
	1650	codepoints outside of the byte range, such as via \x{100}. If we
	1651	detect such a sequence we have to convert the entire pattern to utf8
	1652	and then recompile, as our sizing calculation will have been based
	1653	on 1 byte == 1 character, but we will need to use utf8 to encode
	1654	at least some part of the pattern, and therefore must convert the whole
	1655	thing.
	1656	-- dmq */
	1657	if (flags & NEED_UTF8) {
	1658
	1659	/* We have stored the offset of the final warning output so far.
	1660	* That must be adjusted. Any variant characters between the start
	1661	* of the pattern and this warning count for 2 bytes in the final,
	1662	* so just add them again */
	1663	if (UNLIKELY(RExC_latest_warn_offset > 0)) {
	1664	RExC_latest_warn_offset +=
	1665	variant_under_utf8_count((U8 ) exp, (U8 ) exp
	1666	+ RExC_latest_warn_offset);
	1667	}
	1668	S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
	1669	pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
	1670	DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
	1671	}
	1672	else {
	1673	DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
	1674	}
	1675
	1676	if (ALL_PARENS_COUNTED) {
	1677	/* Make enough room for all the known parens, and zero it */
	1678	Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
	1679	Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
	1680	RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
	1681
	1682	Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
	1683	Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
	1684	/* we do NOT reinitialize RExC_logical_to_parno and
	1685	* RExC_parno_to_logical here. We need their data on the second
	1686	* pass */
	1687	}
	1688	else { /* Parse did not complete. Reinitialize the parentheses
	1689	structures */
	1690	RExC_total_parens = 0;
	1691	if (RExC_open_parens) {
	1692	Safefree(RExC_open_parens);
	1693	RExC_open_parens = NULL;
	1694	}
	1695	if (RExC_close_parens) {
	1696	Safefree(RExC_close_parens);
	1697	RExC_close_parens = NULL;
	1698	}
	1699	if (RExC_logical_to_parno) {
	1700	Safefree(RExC_logical_to_parno);
	1701	RExC_logical_to_parno = NULL;
	1702	}
	1703	if (RExC_parno_to_logical) {
	1704	Safefree(RExC_parno_to_logical);
	1705	RExC_parno_to_logical = NULL;
	1706	}
	1707	}
	1708
	1709	/* Clean up what we did in this parse */
	1710	SvREFCNT_dec_NN(RExC_rx_sv);
	1711
	1712	goto redo_parse;
	1713	}
	1714
	1715	/* Here, we have successfully parsed and generated the pattern's program
	1716	* for the regex engine. We are ready to finish things up and look for
	1717	* optimizations. */
	1718
	1719	/* Update the string to compile, with correct modifiers, etc */
	1720	set_regex_pv(pRExC_state, Rx);
	1721
	1722	RExC_rx->nparens = RExC_total_parens - 1;
	1723	RExC_rx->logical_nparens = RExC_logical_total_parens - 1;
	1724
	1725	/* Uses the upper 4 bits of the FLAGS field, so keep within that size */
	1726	if (RExC_whilem_seen > 15)
	1727	RExC_whilem_seen = 15;
	1728
	1729	DEBUG_PARSE_r({
	1730	Perl_re_printf( aTHX_
	1731	"Required size %" IVdf " nodes\n", (IV)RExC_size);
	1732	RExC_lastnum=0;
	1733	RExC_lastparse=NULL;
	1734	});
	1735
	1736	SetProgLen(RExC_rxi,RExC_size);
	1737
	1738	DEBUG_DUMP_PRE_OPTIMIZE_r({
	1739	SV * const sv = sv_newmortal();
	1740	RXi_GET_DECL(RExC_rx, ri);
	1741	DEBUG_RExC_seen();
	1742	Perl_re_printf( aTHX_ "Program before optimization:\n");
	1743
	1744	(void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
	1745	sv, 0, 0);
	1746	});
	1747
	1748	DEBUG_OPTIMISE_r(
	1749	Perl_re_printf( aTHX_ "Starting post parse optimization\n");
	1750	);
	1751
	1752	/* XXXX To minimize changes to RE engine we always allocate
	1753	3-units-long substrs field. */
	1754	Newx(RExC_rx->substrs, 1, struct reg_substr_data);
	1755	if (RExC_recurse_count) {
	1756	Newx(RExC_recurse, RExC_recurse_count, regnode *);
	1757	SAVEFREEPV(RExC_recurse);
	1758	}
	1759
	1760	if (RExC_seen & REG_RECURSE_SEEN) {
	1761	/* Note, RExC_total_parens is 1 + the number of parens in a pattern.
	1762	* So its 1 if there are no parens. */
	1763	RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
	1764	((RExC_total_parens & 0x07) != 0);
	1765	Newx(RExC_study_chunk_recursed,
	1766	RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
	1767	SAVEFREEPV(RExC_study_chunk_recursed);
	1768	}
	1769
	1770	reStudy:
	1771	RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
	1772	DEBUG_r(
	1773	RExC_study_chunk_recursed_count= 0;
	1774	);
	1775	Zero(RExC_rx->substrs, 1, struct reg_substr_data);
	1776	if (RExC_study_chunk_recursed) {
	1777	Zero(RExC_study_chunk_recursed,
	1778	RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
	1779	}
	1780
	1781
	1782	#ifdef TRIE_STUDY_OPT
	1783	/* search for "restudy" in this file for a detailed explanation */
	1784	if (!restudied) {
	1785	StructCopy(&zero_scan_data, &data, scan_data_t);
	1786	copyRExC_state = RExC_state;
	1787	} else {
	1788	U32 seen=RExC_seen;
	1789	DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
	1790
	1791	RExC_state = copyRExC_state;
	1792	if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
	1793	RExC_seen \|= REG_TOP_LEVEL_BRANCHES_SEEN;
	1794	else
	1795	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
	1796	StructCopy(&zero_scan_data, &data, scan_data_t);
	1797	}
	1798	#else
	1799	StructCopy(&zero_scan_data, &data, scan_data_t);
	1800	#endif
	1801
	1802	/* Dig out information for optimizations. */
	1803	RExC_rx->extflags = RExC_flags; /* was pm_op */
	1804	/dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; /
	1805
	1806	if (UTF)
	1807	SvUTF8_on(Rx); /* Unicode in it? */
	1808	RExC_rxi->regstclass = NULL;
	1809	if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
	1810	RExC_rx->intflags \|= PREGf_NAUGHTY;
	1811	scan = RExC_rxi->program + 1; /* First BRANCH. */
	1812
	1813	/* testing for BRANCH here tells us whether there is "must appear"
	1814	data in the pattern. If there is then we can use it for optimisations */
	1815	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
	1816	*/
	1817	SSize_t fake_deltap;
	1818	STRLEN longest_length[2];
	1819	regnode_ssc ch_class; /* pointed to by data */
	1820	int stclass_flag;
	1821	SSize_t last_close = 0; /* pointed to by data */
	1822	regnode *first= scan;
	1823	regnode *first_next= regnext(first);
	1824	regnode *last_close_op= NULL;
	1825	int i;
	1826
	1827	/*
	1828	* Skip introductions and multiplicators >= 1
	1829	* so that we can extract the 'meat' of the pattern that must
	1830	* match in the large if() sequence following.
	1831	* NOTE that EXACT is NOT covered here, as it is normally
	1832	* picked up by the optimiser separately.
	1833	*
	1834	* This is unfortunate as the optimiser isnt handling lookahead
	1835	* properly currently.
	1836	*
	1837	*/
	1838	while (1)
	1839	{
	1840	if (OP(first) == OPEN)
	1841	sawopen = 1;
	1842	else
	1843	if (OP(first) == IFMATCH && !FLAGS(first))
	1844	/* for now we can't handle lookbehind IFMATCH */
	1845	sawlookahead = 1;
	1846	else
	1847	if (OP(first) == PLUS)
	1848	sawplus = 1;
	1849	else
	1850	if (OP(first) == MINMOD)
	1851	sawminmod = 1;
	1852	else
	1853	if (!(
	1854	/* An OR of one alternative - should not happen now. */
	1855	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	1856	/* An {n,m} with n>0 */
	1857	(REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) \|\|
	1858	(OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END)
	1859	)){
	1860	break;
	1861	}
	1862
	1863	first = REGNODE_AFTER(first);
	1864	first_next= regnext(first);
	1865	}
	1866
	1867	/* Starting-point info. */
	1868	again:
	1869	DEBUG_PEEP("first:", first, 0, 0);
	1870	/* Ignore EXACT as we deal with it later. */
	1871	if (REGNODE_TYPE(OP(first)) == EXACT) {
	1872	if (! isEXACTFish(OP(first))) {
	1873	NOOP; /* Empty, get anchored substr later. */
	1874	}
	1875	else
	1876	RExC_rxi->regstclass = first;
	1877	}
	1878	#ifdef TRIE_STCLASS
	1879	else if (REGNODE_TYPE(OP(first)) == TRIE &&
	1880	((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0)
	1881	{
	1882	/* this can happen only on restudy
	1883	* Search for "restudy" in this file to find
	1884	* a comment with details. */
	1885	RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
	1886	}
	1887	#endif
	1888	else if (REGNODE_SIMPLE(OP(first)))
	1889	RExC_rxi->regstclass = first;
	1890	else if (REGNODE_TYPE(OP(first)) == BOUND \|\|
	1891	REGNODE_TYPE(OP(first)) == NBOUND)
	1892	RExC_rxi->regstclass = first;
	1893	else if (REGNODE_TYPE(OP(first)) == BOL) {
	1894	RExC_rx->intflags \|= (OP(first) == MBOL
	1895	? PREGf_ANCH_MBOL
	1896	: PREGf_ANCH_SBOL);
	1897	first = REGNODE_AFTER(first);
	1898	goto again;
	1899	}
	1900	else if (OP(first) == GPOS) {
	1901	RExC_rx->intflags \|= PREGf_ANCH_GPOS;
	1902	first = REGNODE_AFTER_type(first,tregnode_GPOS);
	1903	goto again;
	1904	}
	1905	else if ((!sawopen \|\| !RExC_sawback) &&
	1906	!sawlookahead &&
	1907	(OP(first) == STAR &&
	1908	REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
	1909	!(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN))
	1910	{
	1911	/* turn .* into ^.* with an implied $=1 /
	1912	const int type =
	1913	(OP(REGNODE_AFTER(first)) == REG_ANY)
	1914	? PREGf_ANCH_MBOL
	1915	: PREGf_ANCH_SBOL;
	1916	RExC_rx->intflags \|= (type \| PREGf_IMPLICIT);
	1917	first = REGNODE_AFTER(first);
	1918	goto again;
	1919	}
	1920	if (sawplus && !sawminmod && !sawlookahead
	1921	&& (!sawopen \|\| !RExC_sawback)
	1922	&& !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */
	1923	/* x+ must match at the 1st pos of run of x's */
	1924	RExC_rx->intflags \|= PREGf_SKIP;
	1925
	1926	/* Scan is after the zeroth branch, first is atomic matcher. */
	1927	#ifdef TRIE_STUDY_OPT
	1928	/* search for "restudy" in this file for a detailed explanation */
	1929	DEBUG_PARSE_r(
	1930	if (!restudied)
	1931	Perl_re_printf( aTHX_ "first at %" IVdf "\n",
	1932	(IV)(first - scan + 1))
	1933	);
	1934	#else
	1935	DEBUG_PARSE_r(
	1936	Perl_re_printf( aTHX_ "first at %" IVdf "\n",
	1937	(IV)(first - scan + 1))
	1938	);
	1939	#endif
	1940
	1941
	1942	/*
	1943	* If there's something expensive in the r.e., find the
	1944	* longest literal string that must appear and make it the
	1945	* regmust. Resolve ties in favor of later strings, since
	1946	* the regstart check works with the beginning of the r.e.
	1947	* and avoiding duplication strengthens checking. Not a
	1948	* strong reason, but sufficient in the absence of others.
	1949	* [Now we resolve ties in favor of the earlier string if
	1950	* it happens that c_offset_min has been invalidated, since the
	1951	* earlier string may buy us something the later one won't.]
	1952	*/
	1953
	1954	data.substrs[0].str = newSVpvs("");
	1955	data.substrs[1].str = newSVpvs("");
	1956	data.last_found = newSVpvs("");
	1957	data.cur_is_floating = 0; /* initially any found substring is fixed */
	1958	ENTER_with_name("study_chunk");
	1959	SAVEFREESV(data.substrs[0].str);
	1960	SAVEFREESV(data.substrs[1].str);
	1961	SAVEFREESV(data.last_found);
	1962	first = scan;
	1963	if (!RExC_rxi->regstclass) {
	1964	ssc_init(pRExC_state, &ch_class);
	1965	data.start_class = &ch_class;
	1966	stclass_flag = SCF_DO_STCLASS_AND;
	1967	} else /* XXXX Check for BOUND? */
	1968	stclass_flag = 0;
	1969	data.last_closep = &last_close;
	1970	data.last_close_opp = &last_close_op;
	1971
	1972	DEBUG_RExC_seen();
	1973	/*
	1974	* MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
	1975	* (NO top level branches)
	1976	*/
	1977	minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
	1978	scan + RExC_size, /* Up to end */
	1979	&data, -1, 0, NULL,
	1980	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag
	1981	\| (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
	1982	0, TRUE);
	1983	/* search for "restudy" in this file for a detailed explanation
	1984	* of 'restudied' and SCF_TRIE_DOING_RESTUDY */
	1985
	1986
	1987	CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
	1988
	1989
	1990	if ( RExC_total_parens == 1 && !data.cur_is_floating
	1991	&& data.last_start_min == 0 && data.last_end > 0
	1992	&& !RExC_seen_zerolen
	1993	&& !(RExC_seen & REG_VERBARG_SEEN)
	1994	&& !(RExC_seen & REG_GPOS_SEEN)
	1995	){
	1996	RExC_rx->extflags \|= RXf_CHECK_ALL;
	1997	}
	1998	scan_commit(pRExC_state, &data,&minlen, 0);
	1999
	2000
	2001	/* XXX this is done in reverse order because that's the way the
	2002	* code was before it was parameterised. Don't know whether it
	2003	* actually needs doing in reverse order. DAPM */
	2004	for (i = 1; i >= 0; i--) {
	2005	longest_length[i] = CHR_SVLEN(data.substrs[i].str);
	2006
	2007	if ( !( i
	2008	&& SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
	2009	&& data.substrs[0].min_offset
	2010	== data.substrs[1].min_offset
	2011	&& SvCUR(data.substrs[0].str)
	2012	== SvCUR(data.substrs[1].str)
	2013	)
	2014	&& S_setup_longest (aTHX_ pRExC_state,
	2015	&(RExC_rx->substrs->data[i]),
	2016	&(data.substrs[i]),
	2017	longest_length[i]))
	2018	{
	2019	RExC_rx->substrs->data[i].min_offset =
	2020	data.substrs[i].min_offset - data.substrs[i].lookbehind;
	2021
	2022	RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
	2023	/* Don't offset infinity */
	2024	if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
	2025	RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
	2026	SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
	2027	}
	2028	else {
	2029	RExC_rx->substrs->data[i].substr = NULL;
	2030	RExC_rx->substrs->data[i].utf8_substr = NULL;
	2031	longest_length[i] = 0;
	2032	}
	2033	}
	2034
	2035	LEAVE_with_name("study_chunk");
	2036
	2037	if (RExC_rxi->regstclass
	2038	&& (OP(RExC_rxi->regstclass) == REG_ANY \|\| OP(RExC_rxi->regstclass) == SANY))
	2039	RExC_rxi->regstclass = NULL;
	2040
	2041	if ((!(RExC_rx->substrs->data[0].substr \|\| RExC_rx->substrs->data[0].utf8_substr)
	2042	\|\| RExC_rx->substrs->data[0].min_offset)
	2043	&& stclass_flag
	2044	&& ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
	2045	&& is_ssc_worth_it(pRExC_state, data.start_class))
	2046	{
	2047	const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
	2048
	2049	ssc_finalize(pRExC_state, data.start_class);
	2050
	2051	Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
	2052	StructCopy(data.start_class,
	2053	(regnode_ssc*)RExC_rxi->data->data[n],
	2054	regnode_ssc);
	2055	RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
	2056	RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	2057	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	2058	regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
	2059	Perl_re_printf( aTHX_
	2060	"synthetic stclass \"%s\".\n",
	2061	SvPVX_const(sv));});
	2062	data.start_class = NULL;
	2063	}
	2064
	2065	/* A temporary algorithm prefers floated substr to fixed one of
	2066	* same length to dig more info. */
	2067	i = (longest_length[0] <= longest_length[1]);
	2068	RExC_rx->substrs->check_ix = i;
	2069	RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
	2070	RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
	2071	RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
	2072	RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
	2073	RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
	2074	if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL\|PREGf_ANCH_GPOS)))
	2075	RExC_rx->intflags \|= PREGf_NOSCAN;
	2076
	2077	if ((RExC_rx->check_substr \|\| RExC_rx->check_utf8) ) {
	2078	RExC_rx->extflags \|= RXf_USE_INTUIT;
	2079	if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
	2080	RExC_rx->extflags \|= RXf_INTUIT_TAIL;
	2081	}
	2082
	2083	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	2084	if ( (STRLEN)minlen < longest_length[1] )
	2085	minlen= longest_length[1];
	2086	if ( (STRLEN)minlen < longest_length[0] )
	2087	minlen= longest_length[0];
	2088	*/
	2089	}
	2090	else {
	2091	/* Several toplevels. Best we can is to set minlen. */
	2092	SSize_t fake_deltap;
	2093	regnode_ssc ch_class;
	2094	SSize_t last_close = 0;
	2095	regnode *last_close_op = NULL;
	2096
	2097	DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
	2098
	2099	scan = RExC_rxi->program + 1;
	2100	ssc_init(pRExC_state, &ch_class);
	2101	data.start_class = &ch_class;
	2102	data.last_closep = &last_close;
	2103	data.last_close_opp = &last_close_op;
	2104
	2105	DEBUG_RExC_seen();
	2106	/*
	2107	* MAIN ENTRY FOR study_chunk() FOR m/P1\|P2\|.../
	2108	* (patterns WITH top level branches)
	2109	*/
	2110	minlen = study_chunk(pRExC_state,
	2111	&scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
	2112	SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS\|(restudied
	2113	? SCF_TRIE_DOING_RESTUDY
	2114	: 0),
	2115	0, TRUE);
	2116	/* search for "restudy" in this file for a detailed explanation
	2117	* of 'restudied' and SCF_TRIE_DOING_RESTUDY */
	2118
	2119	CHECK_RESTUDY_GOTO_butfirst(NOOP);
	2120
	2121	RExC_rx->check_substr = NULL;
	2122	RExC_rx->check_utf8 = NULL;
	2123	RExC_rx->substrs->data[0].substr = NULL;
	2124	RExC_rx->substrs->data[0].utf8_substr = NULL;
	2125	RExC_rx->substrs->data[1].substr = NULL;
	2126	RExC_rx->substrs->data[1].utf8_substr = NULL;
	2127
	2128	if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
	2129	&& is_ssc_worth_it(pRExC_state, data.start_class))
	2130	{
	2131	const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
	2132
	2133	ssc_finalize(pRExC_state, data.start_class);
	2134
	2135	Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
	2136	StructCopy(data.start_class,
	2137	(regnode_ssc*)RExC_rxi->data->data[n],
	2138	regnode_ssc);
	2139	RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
	2140	RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	2141	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	2142	regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
	2143	Perl_re_printf( aTHX_
	2144	"synthetic stclass \"%s\".\n",
	2145	SvPVX_const(sv));});
	2146	data.start_class = NULL;
	2147	}
	2148	}
	2149
	2150	if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
	2151	RExC_rx->extflags \|= RXf_UNBOUNDED_QUANTIFIER_SEEN;
	2152	RExC_rx->maxlen = REG_INFTY;
	2153	}
	2154	else {
	2155	RExC_rx->maxlen = RExC_maxlen;
	2156	}
	2157
	2158	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	2159	the "real" pattern. */
	2160	DEBUG_OPTIMISE_r({
	2161	Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
	2162	(IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
	2163	});
	2164	RExC_rx->minlenret = minlen;
	2165	if (RExC_rx->minlen < minlen)
	2166	RExC_rx->minlen = minlen;
	2167
	2168	if (RExC_seen & REG_RECURSE_SEEN ) {
	2169	RExC_rx->intflags \|= PREGf_RECURSE_SEEN;
	2170	Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
	2171	}
	2172	if (RExC_seen & REG_GPOS_SEEN)
	2173	RExC_rx->intflags \|= PREGf_GPOS_SEEN;
	2174
	2175	if (RExC_seen & REG_PESSIMIZE_SEEN)
	2176	RExC_rx->intflags \|= PREGf_PESSIMIZE_SEEN;
	2177
	2178	if (RExC_seen & REG_LOOKBEHIND_SEEN)
	2179	RExC_rx->extflags \|= RXf_NO_INPLACE_SUBST; /* inplace might break the
	2180	lookbehind */
	2181	if (pRExC_state->code_blocks)
	2182	RExC_rx->extflags \|= RXf_EVAL_SEEN;
	2183
	2184	if (RExC_seen & REG_VERBARG_SEEN) {
	2185	RExC_rx->intflags \|= PREGf_VERBARG_SEEN;
	2186	RExC_rx->extflags \|= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
	2187	}
	2188
	2189	if (RExC_seen & REG_CUTGROUP_SEEN)
	2190	RExC_rx->intflags \|= PREGf_CUTGROUP_SEEN;
	2191
	2192	if (pm_flags & PMf_USE_RE_EVAL)
	2193	RExC_rx->intflags \|= PREGf_USE_RE_EVAL;
	2194
	2195	if (RExC_paren_names)
	2196	RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	2197	else
	2198	RXp_PAREN_NAMES(RExC_rx) = NULL;
	2199
	2200	/* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
	2201	* so it can be used in pp.c */
	2202	if (RExC_rx->intflags & PREGf_ANCH)
	2203	RExC_rx->extflags \|= RXf_IS_ANCHORED;
	2204
	2205
	2206	{
	2207	/* this is used to identify "special" patterns that might result
	2208	* in Perl NOT calling the regex engine and instead doing the match "itself",
	2209	* particularly special cases in split//. By having the regex compiler
	2210	* do this pattern matching at a regop level (instead of by inspecting the pattern)
	2211	* we avoid weird issues with equivalent patterns resulting in different behavior,
	2212	* AND we allow non Perl engines to get the same optimizations by the setting the
	2213	* flags appropriately - Yves */
	2214	regnode *first = RExC_rxi->program + 1;
	2215	U8 fop = OP(first);
	2216	regnode *next = NULL;
	2217	U8 nop = 0;
	2218	if (fop == NOTHING \|\| fop == MBOL \|\| fop == SBOL \|\| fop == PLUS) {
	2219	next = REGNODE_AFTER(first);
	2220	nop = OP(next);
	2221	}
	2222	/* It's safe to read through *next only if OP(first) is a regop of
	2223	* the right type (not EXACT, for example).
	2224	*/
	2225	if (REGNODE_TYPE(fop) == NOTHING && nop == END)
	2226	RExC_rx->extflags \|= RXf_NULL;
	2227	else if ((fop == MBOL \|\| (fop == SBOL && !FLAGS(first))) && nop == END)
	2228	/* when fop is SBOL first->flags will be true only when it was
	2229	* produced by parsing /\A/, and not when parsing /^/. This is
	2230	* very important for the split code as there we want to
	2231	* treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
	2232	* See rt #122761 for more details. -- Yves */
	2233	RExC_rx->extflags \|= RXf_START_ONLY;
	2234	else if (fop == PLUS
	2235	&& REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
	2236	&& OP(regnext(first)) == END)
	2237	RExC_rx->extflags \|= RXf_WHITE;
	2238	else if ( RExC_rx->extflags & RXf_SPLIT
	2239	&& (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
	2240	&& STR_LEN(first) == 1
	2241	&& *(STRING(first)) == ' '
	2242	&& OP(regnext(first)) == END )
	2243	RExC_rx->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	2244
	2245	}
	2246
	2247	if (RExC_contains_locale) {
	2248	RXp_EXTFLAGS(RExC_rx) \|= RXf_TAINTED;
	2249	}
	2250
	2251	#ifdef DEBUGGING
	2252	if (RExC_paren_names) {
	2253	RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a"));
	2254	RExC_rxi->data->data[RExC_rxi->name_list_idx]
	2255	= (void*)SvREFCNT_inc(RExC_paren_name_list);
	2256	} else
	2257	#endif
	2258	RExC_rxi->name_list_idx = 0;
	2259
	2260	while ( RExC_recurse_count > 0 ) {
	2261	const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
	2262	/*
	2263	* This data structure is set up in study_chunk() and is used
	2264	* to calculate the distance between a GOSUB regopcode and
	2265	* the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
	2266	* it refers to.
	2267	*
	2268	* If for some reason someone writes code that optimises
	2269	* away a GOSUB opcode then the assert should be changed to
	2270	* an if(scan) to guard the ARG2i_SET() - Yves
	2271	*
	2272	*/
	2273	assert(scan && OP(scan) == GOSUB);
	2274	ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan));
	2275	}
	2276	if (RExC_logical_total_parens != RExC_total_parens) {
	2277	Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32);
	2278	/* we rebuild this below */
	2279	Zero(RExC_logical_to_parno, RExC_total_parens, I32);
	2280	for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) {
	2281	int logical_parno= RExC_parno_to_logical[parno];
	2282	assert(logical_parno);
	2283	RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno];
	2284	RExC_logical_to_parno[logical_parno] = parno;
	2285	}
	2286	RExC_rx->logical_to_parno = RExC_logical_to_parno;
	2287	RExC_rx->parno_to_logical = RExC_parno_to_logical;
	2288	RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next;
	2289	RExC_logical_to_parno = NULL;
	2290	RExC_parno_to_logical = NULL;
	2291	RExC_parno_to_logical_next = NULL;
	2292	} else {
	2293	RExC_rx->logical_to_parno = NULL;
	2294	RExC_rx->parno_to_logical = NULL;
	2295	RExC_rx->parno_to_logical_next = NULL;
	2296	}
	2297
	2298	Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair);
	2299	/* assume we don't need to swap parens around before we match */
	2300	DEBUG_TEST_r({
	2301	Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
	2302	(unsigned long)RExC_study_chunk_recursed_count);
	2303	});
	2304	DEBUG_DUMP_r({
	2305	DEBUG_RExC_seen();
	2306	Perl_re_printf( aTHX_ "Final program:\n");
	2307	regdump(RExC_rx);
	2308	});
	2309
	2310	if (RExC_open_parens) {
	2311	Safefree(RExC_open_parens);
	2312	RExC_open_parens = NULL;
	2313	}
	2314	if (RExC_close_parens) {
	2315	Safefree(RExC_close_parens);
	2316	RExC_close_parens = NULL;
	2317	}
	2318	if (RExC_logical_to_parno) {
	2319	Safefree(RExC_logical_to_parno);
	2320	RExC_logical_to_parno = NULL;
	2321	}
	2322	if (RExC_parno_to_logical) {
	2323	Safefree(RExC_parno_to_logical);
	2324	RExC_parno_to_logical = NULL;
	2325	}
	2326
	2327	#ifdef USE_ITHREADS
	2328	/* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
	2329	* by setting the regexp SV to readonly-only instead. If the
	2330	* pattern's been recompiled, the USEDness should remain. */
	2331	if (old_re && SvREADONLY(old_re))
	2332	SvREADONLY_on(Rx);
	2333	#endif
	2334	return Rx;
	2335	}
	2336
	2337
	2338
	2339	SV*
	2340	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	2341	{
	2342	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	2343	PERL_UNUSED_ARG(rx);
	2344	if (0)
	2345	return NULL;
	2346	else
	2347	return newSVpvs("Regexp");
	2348	}
	2349
	2350	/* Scans the name of a named buffer from the pattern.
	2351	* If flags is REG_RSN_RETURN_NULL returns null.
	2352	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	2353	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	2354	* to the parsed name as looked up in the RExC_paren_names hash.
	2355	* If there is an error throws a vFAIL().. type exception.
	2356	*/
	2357
	2358	#define REG_RSN_RETURN_NULL 0
	2359	#define REG_RSN_RETURN_NAME 1
	2360	#define REG_RSN_RETURN_DATA 2
	2361
	2362	STATIC SV*
	2363	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	2364	{
	2365	char *name_start = RExC_parse;
	2366	SV* sv_name;
	2367
	2368	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	2369
	2370	assert (RExC_parse <= RExC_end);
	2371	if (RExC_parse == RExC_end) NOOP;
	2372	else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
	2373	/* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
	2374	* using do...while */
	2375	if (UTF)
	2376	do {
	2377	RExC_parse_inc_utf8();
	2378	} while ( RExC_parse < RExC_end
	2379	&& isWORDCHAR_utf8_safe((U8)RExC_parse, (U8) RExC_end));
	2380	else
	2381	do {
	2382	RExC_parse_inc_by(1);
	2383	} while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
	2384	} else {
	2385	RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
	2386	character */
	2387	vFAIL("Group name must start with a non-digit word character");
	2388	}
	2389	sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	2390	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	2391	if ( flags == REG_RSN_RETURN_NAME)
	2392	return sv_name;
	2393	else if (flags==REG_RSN_RETURN_DATA) {
	2394	HE *he_str = NULL;
	2395	SV *sv_dat = NULL;
	2396	if ( ! sv_name ) /* should not happen*/
	2397	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	2398	if (RExC_paren_names)
	2399	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	2400	if ( he_str )
	2401	sv_dat = HeVAL(he_str);
	2402	if ( ! sv_dat ) { /* Didn't find group */
	2403
	2404	/* It might be a forward reference; we can't fail until we
	2405	* know, by completing the parse to get all the groups, and
	2406	* then reparsing */
	2407	if (ALL_PARENS_COUNTED) {
	2408	vFAIL("Reference to nonexistent named group");
	2409	}
	2410	else {
	2411	REQUIRE_PARENS_PASS;
	2412	}
	2413	}
	2414	return sv_dat;
	2415	}
	2416
	2417	Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
	2418	(unsigned long) flags);
	2419	}
	2420
	2421	#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
	2422	if (RExC_lastparse!=RExC_parse) { \
	2423	Perl_re_printf( aTHX_ "%s", \
	2424	Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
	2425	RExC_end - RExC_parse, 16, \
	2426	"", "", \
	2427	PERL_PV_ESCAPE_UNI_DETECT \| \
	2428	PERL_PV_PRETTY_ELLIPSES \| \
	2429	PERL_PV_PRETTY_LTGT \| \
	2430	PERL_PV_ESCAPE_RE \| \
	2431	PERL_PV_PRETTY_EXACTSIZE \
	2432	) \
	2433	); \
	2434	} else \
	2435	Perl_re_printf( aTHX_ "%16s",""); \
	2436	\
	2437	if (RExC_lastnum!=RExC_emit) \
	2438	Perl_re_printf( aTHX_ "\|%4zu", RExC_emit); \
	2439	else \
	2440	Perl_re_printf( aTHX_ "\|%4s",""); \
	2441	Perl_re_printf( aTHX_ "\|%*s%-4s", \
	2442	(int)((depth*2)), "", \
	2443	(funcname) \
	2444	); \
	2445	RExC_lastnum=RExC_emit; \
	2446	RExC_lastparse=RExC_parse; \
	2447	})
	2448
	2449
	2450
	2451	#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
	2452	DEBUG_PARSE_MSG((funcname)); \
	2453	Perl_re_printf( aTHX_ "%4s","\n"); \
	2454	})
	2455	#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
	2456	DEBUG_PARSE_MSG((funcname)); \
	2457	Perl_re_printf( aTHX_ fmt "\n",args); \
	2458	})
	2459
	2460
	2461	STATIC void
	2462	S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
	2463	{
	2464	/* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
	2465	* constructs, and updates RExC_flags with them. On input, RExC_parse
	2466	* should point to the first flag; it is updated on output to point to the
	2467	* final ')' or ':'. There needs to be at least one flag, or this will
	2468	* abort */
	2469
	2470	/* for (?g), (?gc), and (?o) warnings; warning
	2471	about (?c) will warn about (?g) -- japhy */
	2472
	2473	#define WASTED_O 0x01
	2474	#define WASTED_G 0x02
	2475	#define WASTED_C 0x04
	2476	#define WASTED_GC (WASTED_G\|WASTED_C)
	2477	I32 wastedflags = 0x00;
	2478	U32 posflags = 0, negflags = 0;
	2479	U32 *flagsp = &posflags;
	2480	char has_charset_modifier = '\0';
	2481	regex_charset cs;
	2482	bool has_use_defaults = FALSE;
	2483	const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
	2484	int x_mod_count = 0;
	2485
	2486	PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
	2487
	2488	/* '^' as an initial flag sets certain defaults */
	2489	if (UCHARAT(RExC_parse) == '^') {
	2490	RExC_parse_inc_by(1);
	2491	has_use_defaults = TRUE;
	2492	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	2493	cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
	2494	? REGEX_UNICODE_CHARSET
	2495	: REGEX_DEPENDS_CHARSET;
	2496	set_regex_charset(&RExC_flags, cs);
	2497	}
	2498	else {
	2499	cs = get_regex_charset(RExC_flags);
	2500	if ( cs == REGEX_DEPENDS_CHARSET
	2501	&& (toUSE_UNI_CHARSET_NOT_DEPENDS))
	2502	{
	2503	cs = REGEX_UNICODE_CHARSET;
	2504	}
	2505	}
	2506
	2507	while (RExC_parse < RExC_end) {
	2508	/* && memCHRs("iogcmsx", RExC_parse) /
	2509	/* (?g), (?gc) and (?o) are useless here
	2510	and must be globally applied -- japhy */
	2511	if ((RExC_pm_flags & PMf_WILDCARD)) {
	2512	if (flagsp == & negflags) {
	2513	if (*RExC_parse == 'm') {
	2514	RExC_parse_inc_by(1);
	2515	/* diag_listed_as: Use of %s is not allowed in Unicode
	2516	property wildcard subpatterns in regex; marked by <--
	2517	HERE in m/%s/ */
	2518	vFAIL("Use of modifier '-m' is not allowed in Unicode"
	2519	" property wildcard subpatterns");
	2520	}
	2521	}
	2522	else {
	2523	if (*RExC_parse == 's') {
	2524	goto modifier_illegal_in_wildcard;
	2525	}
	2526	}
	2527	}
	2528
	2529	switch (*RExC_parse) {
	2530
	2531	/* Code for the imsxn flags */
	2532	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
	2533
	2534	case LOCALE_PAT_MOD:
	2535	if (has_charset_modifier) {
	2536	goto excess_modifier;
	2537	}
	2538	else if (flagsp == &negflags) {
	2539	goto neg_modifier;
	2540	}
	2541	cs = REGEX_LOCALE_CHARSET;
	2542	has_charset_modifier = LOCALE_PAT_MOD;
	2543	break;
	2544	case UNICODE_PAT_MOD:
	2545	if (has_charset_modifier) {
	2546	goto excess_modifier;
	2547	}
	2548	else if (flagsp == &negflags) {
	2549	goto neg_modifier;
	2550	}
	2551	cs = REGEX_UNICODE_CHARSET;
	2552	has_charset_modifier = UNICODE_PAT_MOD;
	2553	break;
	2554	case ASCII_RESTRICT_PAT_MOD:
	2555	if (flagsp == &negflags) {
	2556	goto neg_modifier;
	2557	}
	2558	if (has_charset_modifier) {
	2559	if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
	2560	goto excess_modifier;
	2561	}
	2562	/* Doubled modifier implies more restricted */
	2563	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	2564	}
	2565	else {
	2566	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	2567	}
	2568	has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
	2569	break;
	2570	case DEPENDS_PAT_MOD:
	2571	if (has_use_defaults) {
	2572	goto fail_modifiers;
	2573	}
	2574	else if (flagsp == &negflags) {
	2575	goto neg_modifier;
	2576	}
	2577	else if (has_charset_modifier) {
	2578	goto excess_modifier;
	2579	}
	2580
	2581	/* The dual charset means unicode semantics if the
	2582	* pattern (or target, not known until runtime) are
	2583	* utf8, or something in the pattern indicates unicode
	2584	* semantics */
	2585	cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
	2586	? REGEX_UNICODE_CHARSET
	2587	: REGEX_DEPENDS_CHARSET;
	2588	has_charset_modifier = DEPENDS_PAT_MOD;
	2589	break;
	2590	excess_modifier:
	2591	RExC_parse_inc_by(1);
	2592	if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
	2593	vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
	2594	}
	2595	else if (has_charset_modifier == *(RExC_parse - 1)) {
	2596	vFAIL2("Regexp modifier \"%c\" may not appear twice",
	2597	*(RExC_parse - 1));
	2598	}
	2599	else {
	2600	vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
	2601	}
	2602	NOT_REACHED; /NOTREACHED/
	2603	neg_modifier:
	2604	RExC_parse_inc_by(1);
	2605	vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
	2606	*(RExC_parse - 1));
	2607	NOT_REACHED; /NOTREACHED/
	2608	case GLOBAL_PAT_MOD: /* 'g' */
	2609	if (RExC_pm_flags & PMf_WILDCARD) {
	2610	goto modifier_illegal_in_wildcard;
	2611	}
	2612	/FALLTHROUGH/
	2613	case ONCE_PAT_MOD: /* 'o' */
	2614	if (ckWARN(WARN_REGEXP)) {
	2615	const I32 wflagbit = *RExC_parse == 'o'
	2616	? WASTED_O
	2617	: WASTED_G;
	2618	if (! (wastedflags & wflagbit) ) {
	2619	wastedflags \|= wflagbit;
	2620	/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
	2621	vWARN5(
	2622	RExC_parse + 1,
	2623	"Useless (%s%c) - %suse /%c modifier",
	2624	flagsp == &negflags ? "?-" : "?",
	2625	*RExC_parse,
	2626	flagsp == &negflags ? "don't " : "",
	2627	*RExC_parse
	2628	);
	2629	}
	2630	}
	2631	break;
	2632
	2633	case CONTINUE_PAT_MOD: /* 'c' */
	2634	if (RExC_pm_flags & PMf_WILDCARD) {
	2635	goto modifier_illegal_in_wildcard;
	2636	}
	2637	if (ckWARN(WARN_REGEXP)) {
	2638	if (! (wastedflags & WASTED_C) ) {
	2639	wastedflags \|= WASTED_GC;
	2640	/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
	2641	vWARN3(
	2642	RExC_parse + 1,
	2643	"Useless (%sc) - %suse /gc modifier",
	2644	flagsp == &negflags ? "?-" : "?",
	2645	flagsp == &negflags ? "don't " : ""
	2646	);
	2647	}
	2648	}
	2649	break;
	2650	case KEEPCOPY_PAT_MOD: /* 'p' */
	2651	if (RExC_pm_flags & PMf_WILDCARD) {
	2652	goto modifier_illegal_in_wildcard;
	2653	}
	2654	if (flagsp == &negflags) {
	2655	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	2656	} else {
	2657	*flagsp \|= RXf_PMf_KEEPCOPY;
	2658	}
	2659	break;
	2660	case '-':
	2661	/* A flag is a default iff it is following a minus, so
	2662	* if there is a minus, it means will be trying to
	2663	* re-specify a default which is an error */
	2664	if (has_use_defaults \|\| flagsp == &negflags) {
	2665	goto fail_modifiers;
	2666	}
	2667	flagsp = &negflags;
	2668	wastedflags = 0; /* reset so (?g-c) warns twice */
	2669	x_mod_count = 0;
	2670	break;
	2671	case ':':
	2672	case ')':
	2673
	2674	if ( (RExC_pm_flags & PMf_WILDCARD)
	2675	&& cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	2676	{
	2677	RExC_parse_inc_by(1);
	2678	/* diag_listed_as: Use of %s is not allowed in Unicode
	2679	property wildcard subpatterns in regex; marked by <--
	2680	HERE in m/%s/ */
	2681	vFAIL2("Use of modifier '%c' is not allowed in Unicode"
	2682	" property wildcard subpatterns",
	2683	has_charset_modifier);
	2684	}
	2685
	2686	if ((posflags & (RXf_PMf_EXTENDED\|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
	2687	negflags \|= RXf_PMf_EXTENDED_MORE;
	2688	}
	2689	RExC_flags \|= posflags;
	2690
	2691	if (negflags & RXf_PMf_EXTENDED) {
	2692	negflags \|= RXf_PMf_EXTENDED_MORE;
	2693	}
	2694	RExC_flags &= ~negflags;
	2695	set_regex_charset(&RExC_flags, cs);
	2696
	2697	return;
	2698	default:
	2699	fail_modifiers:
	2700	RExC_parse_inc_if_char();
	2701	/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
	2702	vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
	2703	UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
	2704	NOT_REACHED; /NOTREACHED/
	2705	}
	2706
	2707	RExC_parse_inc();
	2708	}
	2709
	2710	vFAIL("Sequence (?... not terminated");
	2711
	2712	modifier_illegal_in_wildcard:
	2713	RExC_parse_inc_by(1);
	2714	/* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
	2715	subpatterns in regex; marked by <-- HERE in m/%s/ */
	2716	vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
	2717	" subpatterns", *(RExC_parse - 1));
	2718	}
	2719
	2720	/*
	2721	- reg - regular expression, i.e. main body or parenthesized thing
	2722	*
	2723	* Caller must absorb opening parenthesis.
	2724	*
	2725	* Combining parenthesis handling with the base level of regular expression
	2726	* is a trifle forced, but the need to tie the tails of the branches to what
	2727	* follows makes it hard to avoid.
	2728	*/
	2729
	2730	STATIC regnode_offset
	2731	S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
	2732	I32 *flagp,
	2733	char * backref_parse_start,
	2734	char ch
	2735	)
	2736	{
	2737	regnode_offset ret;
	2738	char* name_start = RExC_parse;
	2739	U32 num = 0;
	2740	SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
	2741	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	2742
	2743	PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
	2744
	2745	if (RExC_parse != name_start && ch == '}') {
	2746	while (isBLANK(*RExC_parse)) {
	2747	RExC_parse_inc_by(1);
	2748	}
	2749	}
	2750	if (RExC_parse == name_start \|\| *RExC_parse != ch) {
	2751	/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
	2752	vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
	2753	}
	2754
	2755	if (sv_dat) {
	2756	num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
	2757	RExC_rxi->data->data[num]=(void*)sv_dat;
	2758	SvREFCNT_inc_simple_void_NN(sv_dat);
	2759	}
	2760	RExC_sawback = 1;
	2761	ret = reg2node(pRExC_state,
	2762	((! FOLD)
	2763	? REFN
	2764	: (ASCII_FOLD_RESTRICTED)
	2765	? REFFAN
	2766	: (AT_LEAST_UNI_SEMANTICS)
	2767	? REFFUN
	2768	: (LOC)
	2769	? REFFLN
	2770	: REFFN),
	2771	num, RExC_nestroot);
	2772	if (RExC_nestroot && num >= (U32)RExC_nestroot)
	2773	FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
	2774	*flagp \|= HASWIDTH;
	2775
	2776	nextchar(pRExC_state);
	2777	return ret;
	2778	}
	2779
	2780	/* reg_la_NOTHING()
	2781	*
	2782	* Maybe parse a parenthesized lookaround construct that is equivalent to a
	2783	* NOTHING regop when the construct is empty.
	2784	*
	2785	* Calls skip_to_be_ignored_text() before checking if the construct is empty.
	2786	*
	2787	* Checks for unterminated constructs and throws a "not terminated" error
	2788	* with the appropriate type if necessary
	2789	*
	2790	* Assuming it does not throw an exception increments RExC_seen_zerolen.
	2791	*
	2792	* If the construct is empty generates a NOTHING op and returns its
	2793	* regnode_offset, which the caller would then return to its caller.
	2794	*
	2795	* If the construct is not empty increments RExC_in_lookaround, and turns
	2796	* on any flags provided in RExC_seen, and then returns 0 to signify
	2797	* that parsing should continue.
	2798	*
	2799	* PS: I would have called this reg_parse_lookaround_NOTHING() but then
	2800	* any use of it would have had to be broken onto multiple lines, hence
	2801	* the abbreviation.
	2802	*/
	2803	STATIC regnode_offset
	2804	S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
	2805	const char *type)
	2806	{
	2807
	2808	PERL_ARGS_ASSERT_REG_LA_NOTHING;
	2809
	2810	/* false below so we do not force /x */
	2811	skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
	2812
	2813	if (RExC_parse >= RExC_end)
	2814	vFAIL2("Sequence (%s... not terminated", type);
	2815
	2816	/* Always increment as NOTHING regops are zerolen */
	2817	RExC_seen_zerolen++;
	2818
	2819	if (*RExC_parse == ')') {
	2820	regnode_offset ret= reg_node(pRExC_state, NOTHING);
	2821	nextchar(pRExC_state);
	2822	return ret;
	2823	}
	2824
	2825	RExC_seen \|= flags;
	2826	RExC_in_lookaround++;
	2827	return 0; /* keep parsing! */
	2828	}
	2829
	2830	/* reg_la_OPFAIL()
	2831	*
	2832	* Maybe parse a parenthesized lookaround construct that is equivalent to a
	2833	* OPFAIL regop when the construct is empty.
	2834	*
	2835	* Calls skip_to_be_ignored_text() before checking if the construct is empty.
	2836	*
	2837	* Checks for unterminated constructs and throws a "not terminated" error
	2838	* if necessary.
	2839	*
	2840	* If the construct is empty generates an OPFAIL op and returns its
	2841	* regnode_offset which the caller should then return to its caller.
	2842	*
	2843	* If the construct is not empty increments RExC_in_lookaround, and also
	2844	* increments RExC_seen_zerolen, and turns on the flags provided in
	2845	* RExC_seen, and then returns 0 to signify that parsing should continue.
	2846	*
	2847	* PS: I would have called this reg_parse_lookaround_OPFAIL() but then
	2848	* any use of it would have had to be broken onto multiple lines, hence
	2849	* the abbreviation.
	2850	*/
	2851
	2852	STATIC regnode_offset
	2853	S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
	2854	const char *type)
	2855	{
	2856
	2857	PERL_ARGS_ASSERT_REG_LA_OPFAIL;
	2858
	2859	/* FALSE so we don't force to /x below */;
	2860	skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
	2861
	2862	if (RExC_parse >= RExC_end)
	2863	vFAIL2("Sequence (%s... not terminated", type);
	2864
	2865	if (*RExC_parse == ')') {
	2866	regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0);
	2867	nextchar(pRExC_state);
	2868	return ret; /* return produced regop */
	2869	}
	2870
	2871	/* only increment zerolen after we check if we produce an OPFAIL
	2872	* as an OPFAIL does not match a zero length construct, as it
	2873	* does not match ever. */
	2874	RExC_seen_zerolen++;
	2875	RExC_seen \|= flags;
	2876	RExC_in_lookaround++;
	2877	return 0; /* keep parsing! */
	2878	}
	2879
	2880	/* Below are the main parsing routines.
	2881	*
	2882	* S_reg() parses a whole pattern or subpattern. It itself handles things
	2883	* like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
	2884	* alternation '\|' in the '...' pattern.
	2885	* S_regbranch() effectively implements the concatenation operator, handling
	2886	* one alternative of '\|', repeatedly calling S_regpiece on each
	2887	* segment of the input.
	2888	* S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
	2889	* and then adds any quantifier for that chunk.
	2890	* S_regatom() parses the next chunk of the input, returning when it
	2891	* determines it has found a complete atomic chunk. The chunk may
	2892	* be a nested subpattern, in which case S_reg is called
	2893	* recursively
	2894	*
	2895	* The functions generate regnodes as they go along, appending each to the
	2896	* pattern data structure so far. They return the offset of the current final
	2897	* node into that structure, or 0 on failure.
	2898	*
	2899	* There are three parameters common to all of them:
	2900	* pRExC_state is a structure with much information about the current
	2901	* state of the parse. It's easy to add new elements to
	2902	* convey new information, but beware that an error return may
	2903	* require clearing the element.
	2904	* flagp is a pointer to bit flags set in a lower level to pass up
	2905	* to higher levels information, such as the cause of a
	2906	* failure, or some characteristic about the generated node
	2907	* depth is roughly the recursion depth, mostly unused except for
	2908	* pretty printing debugging info.
	2909	*
	2910	* There are ancillary functions that these may farm work out to, using the
	2911	* same parameters.
	2912	*
	2913	* The protocol for handling flags is that each function will, before
	2914	* returning, add into *flagp the flags it needs to pass up. Each function has
	2915	* a second flags variable, typically named 'flags', which it sets and clears
	2916	* at will. Flag bits in it are used in that function, and it calls the next
	2917	* layer down with its 'flagp' parameter set to '&flags'. Thus, upon return,
	2918	* 'flags' will contain whatever it had before the call, plus whatever that
	2919	* function passed up. If it wants to pass any of these up to its caller, it
	2920	* has to add them to its *flagp. This means that it takes extra steps to keep
	2921	* passing a flag upwards, and otherwise the flag bit is cleared for higher
	2922	* functions.
	2923	*/
	2924
	2925	/* On success, returns the offset at which any next node should be placed into
	2926	* the regex engine program being compiled.
	2927	*
	2928	* Returns 0 otherwise, with *flagp set to indicate why:
	2929	* TRYAGAIN at the end of (?) that only sets flags.
	2930	* RESTART_PARSE if the parse needs to be restarted, or'd with
	2931	* NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
	2932	* Otherwise would only return 0 if regbranch() returns 0, which cannot
	2933	* happen. */
	2934	STATIC regnode_offset
	2935	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp, U32 depth)
	2936	/* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
	2937	* 2 is like 1, but indicates that nextchar() has been called to advance
	2938	* RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
	2939	* this flag alerts us to the need to check for that */
	2940	{
	2941	regnode_offset ret = 0; /* Will be the head of the group. */
	2942	regnode_offset br;
	2943	regnode_offset lastbr;
	2944	regnode_offset ender = 0;
	2945	I32 logical_parno = 0;
	2946	I32 parno = 0;
	2947	I32 flags;
	2948	U32 oregflags = RExC_flags;
	2949	bool have_branch = 0;
	2950	bool is_open = 0;
	2951	I32 freeze_paren = 0;
	2952	I32 after_freeze = 0;
	2953	I32 num; /* numeric backreferences */
	2954	SV * max_open; /* Max number of unclosed parens */
	2955	I32 was_in_lookaround = RExC_in_lookaround;
	2956	I32 fake_eval = 0; /* matches paren */
	2957
	2958	/* The difference between the following variables can be seen with *
	2959	* the broken pattern /(?:foo/ where segment_parse_start will point *
	2960	* at the 'f', and reg_parse_start will point at the '(' */
	2961
	2962	/* the following is used for unmatched '(' errors */
	2963	char * const reg_parse_start = RExC_parse;
	2964
	2965	/* the following is used to track where various segments of
	2966	* the pattern that we parse out started. */
	2967	char * segment_parse_start = RExC_parse;
	2968
	2969	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	2970
	2971	PERL_ARGS_ASSERT_REG;
	2972	DEBUG_PARSE("reg ");
	2973
	2974	max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
	2975	assert(max_open);
	2976	if (!SvIOK(max_open)) {
	2977	sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
	2978	}
	2979	if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
	2980	open paren */
	2981	vFAIL("Too many nested open parens");
	2982	}
	2983
	2984	flagp = 0; / Initialize. */
	2985
	2986	/* Having this true makes it feasible to have a lot fewer tests for the
	2987	* parse pointer being in scope. For example, we can write
	2988	* while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
	2989	* instead of
	2990	* while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
	2991	*/
	2992	assert(*RExC_end == '\0');
	2993
	2994	/* Make an OPEN node, if parenthesized. */
	2995	if (paren) {
	2996
	2997	/* Under /x, space and comments can be gobbled up between the '(' and
	2998	* here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
	2999	* intervening space, as the sequence is a token, and a token should be
	3000	* indivisible */
	3001	bool has_intervening_patws = (paren == 2)
	3002	&& *(RExC_parse - 1) != '(';
	3003
	3004	if (RExC_parse >= RExC_end) {
	3005	vFAIL("Unmatched (");
	3006	}
	3007
	3008	if (paren == 'r') { /* Atomic script run */
	3009	paren = '>';
	3010	goto parse_rest;
	3011	}
	3012	else if ( RExC_parse == '') { /* (VERB:ARG), (construct:...) */
	3013	if (RExC_parse[1] == '{') { /* ({ ... }) optimistic EVAL /
	3014	fake_eval = '{';
	3015	goto handle_qmark;
	3016	}
	3017
	3018	char *start_verb = RExC_parse + 1;
	3019	STRLEN verb_len;
	3020	char *start_arg = NULL;
	3021	unsigned char op = 0;
	3022	int arg_required = 0;
	3023	int internal_argval = -1; /* if > -1 no argument allowed */
	3024	bool has_upper = FALSE;
	3025	U32 seen_flag_set = 0; /* RExC_seen flags we must set */
	3026
	3027	if (has_intervening_patws) {
	3028	RExC_parse_inc_by(1); /* past the '' /
	3029
	3030	/* For strict backwards compatibility, don't change the message
	3031	* now that we also have lowercase operands */
	3032	if (isUPPER(*RExC_parse)) {
	3033	vFAIL("In '(VERB...)', the '(' and '' must be adjacent");
	3034	}
	3035	else {
	3036	vFAIL("In '(...)', the '(' and '' must be adjacent");
	3037	}
	3038	}
	3039	while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
	3040	if ( *RExC_parse == ':' ) {
	3041	start_arg = RExC_parse + 1;
	3042	break;
	3043	}
	3044	else if (! UTF) {
	3045	if (isUPPER(*RExC_parse)) {
	3046	has_upper = TRUE;
	3047	}
	3048	RExC_parse_inc_by(1);
	3049	}
	3050	else {
	3051	RExC_parse_inc_utf8();
	3052	}
	3053	}
	3054	verb_len = RExC_parse - start_verb;
	3055	if ( start_arg ) {
	3056	if (RExC_parse >= RExC_end) {
	3057	goto unterminated_verb_pattern;
	3058	}
	3059
	3060	RExC_parse_inc();
	3061	while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
	3062	RExC_parse_inc();
	3063	}
	3064	if ( RExC_parse >= RExC_end \|\| *RExC_parse != ')' ) {
	3065	unterminated_verb_pattern:
	3066	if (has_upper) {
	3067	vFAIL("Unterminated verb pattern argument");
	3068	}
	3069	else {
	3070	vFAIL("Unterminated '(*...' argument");
	3071	}
	3072	}
	3073	} else {
	3074	if ( RExC_parse >= RExC_end \|\| *RExC_parse != ')' ) {
	3075	if (has_upper) {
	3076	vFAIL("Unterminated verb pattern");
	3077	}
	3078	else {
	3079	vFAIL("Unterminated '(*...' construct");
	3080	}
	3081	}
	3082	}
	3083
	3084	/* Here, we know that RExC_parse < RExC_end */
	3085
	3086	switch ( *start_verb ) {
	3087	case 'A': /* (ACCEPT) /
	3088	if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
	3089	op = ACCEPT;
	3090	internal_argval = RExC_nestroot;
	3091	}
	3092	break;
	3093	case 'C': /* (COMMIT) /
	3094	if ( memEQs(start_verb, verb_len,"COMMIT") )
	3095	op = COMMIT;
	3096	break;
	3097	case 'F': /* (FAIL) /
	3098	if ( verb_len==1 \|\| memEQs(start_verb, verb_len,"FAIL") ) {
	3099	op = OPFAIL;
	3100	}
	3101	break;
	3102	case ':': /* (:NAME) /
	3103	case 'M': /* (MARK:NAME) /
	3104	if ( verb_len==0 \|\| memEQs(start_verb, verb_len,"MARK") ) {
	3105	op = MARKPOINT;
	3106	arg_required = 1;
	3107	}
	3108	break;
	3109	case 'P': /* (PRUNE) /
	3110	if ( memEQs(start_verb, verb_len,"PRUNE") )
	3111	op = PRUNE;
	3112	break;
	3113	case 'S': /* (SKIP) /
	3114	if ( memEQs(start_verb, verb_len,"SKIP") )
	3115	op = SKIP;
	3116	break;
	3117	case 'T': /* (THEN) /
	3118	/* [19:06] <TimToady> :: is then */
	3119	if ( memEQs(start_verb, verb_len,"THEN") ) {
	3120	op = CUTGROUP;
	3121	RExC_seen \|= REG_CUTGROUP_SEEN;
	3122	}
	3123	break;
	3124	case 'a':
	3125	if ( memEQs(start_verb, verb_len, "asr")
	3126	\|\| memEQs(start_verb, verb_len, "atomic_script_run"))
	3127	{
	3128	paren = 'r'; /* Mnemonic: recursed run */
	3129	goto script_run;
	3130	}
	3131	else if (memEQs(start_verb, verb_len, "atomic")) {
	3132	paren = 't'; /* AtOMIC */
	3133	goto alpha_assertions;
	3134	}
	3135	break;
	3136	case 'p':
	3137	if ( memEQs(start_verb, verb_len, "plb")
	3138	\|\| memEQs(start_verb, verb_len, "positive_lookbehind"))
	3139	{
	3140	paren = 'b';
	3141	goto lookbehind_alpha_assertions;
	3142	}
	3143	else if ( memEQs(start_verb, verb_len, "pla")
	3144	\|\| memEQs(start_verb, verb_len, "positive_lookahead"))
	3145	{
	3146	paren = 'a';
	3147	goto alpha_assertions;
	3148	}
	3149	break;
	3150	case 'n':
	3151	if ( memEQs(start_verb, verb_len, "nlb")
	3152	\|\| memEQs(start_verb, verb_len, "negative_lookbehind"))
	3153	{
	3154	paren = 'B';
	3155	goto lookbehind_alpha_assertions;
	3156	}
	3157	else if ( memEQs(start_verb, verb_len, "nla")
	3158	\|\| memEQs(start_verb, verb_len, "negative_lookahead"))
	3159	{
	3160	paren = 'A';
	3161	goto alpha_assertions;
	3162	}
	3163	break;
	3164	case 's':
	3165	if ( memEQs(start_verb, verb_len, "sr")
	3166	\|\| memEQs(start_verb, verb_len, "script_run"))
	3167	{
	3168	regnode_offset atomic;
	3169
	3170	paren = 's';
	3171
	3172	script_run:
	3173
	3174	/* This indicates Unicode rules. */
	3175	REQUIRE_UNI_RULES(flagp, 0);
	3176
	3177	if (! start_arg) {
	3178	goto no_colon;
	3179	}
	3180
	3181	RExC_parse_set(start_arg);
	3182
	3183	if (RExC_in_script_run) {
	3184
	3185	/* Nested script runs are treated as no-ops, because
	3186	* if the nested one fails, the outer one must as
	3187	* well. It could fail sooner, and avoid (??{} with
	3188	* side effects, but that is explicitly documented as
	3189	* undefined behavior. */
	3190
	3191	ret = 0;
	3192
	3193	if (paren == 's') {
	3194	paren = ':';
	3195	goto parse_rest;
	3196	}
	3197
	3198	/* But, the atomic part of a nested atomic script run
	3199	* isn't a no-op, but can be treated just like a '(?>'
	3200	* */
	3201	paren = '>';
	3202	goto parse_rest;
	3203	}
	3204
	3205	if (paren == 's') {
	3206	/* Here, we're starting a new regular script run */
	3207	ret = reg_node(pRExC_state, SROPEN);
	3208	RExC_in_script_run = 1;
	3209	is_open = 1;
	3210	goto parse_rest;
	3211	}
	3212
	3213	/* Here, we are starting an atomic script run. This is
	3214	* handled by recursing to deal with the atomic portion
	3215	* separately, enclosed in SROPEN ... SRCLOSE nodes */
	3216
	3217	ret = reg_node(pRExC_state, SROPEN);
	3218
	3219	RExC_in_script_run = 1;
	3220
	3221	atomic = reg(pRExC_state, 'r', &flags, depth);
	3222	if (flags & (RESTART_PARSE\|NEED_UTF8)) {
	3223	*flagp = flags & (RESTART_PARSE\|NEED_UTF8);
	3224	return 0;
	3225	}
	3226
	3227	if (! REGTAIL(pRExC_state, ret, atomic)) {
	3228	REQUIRE_BRANCHJ(flagp, 0);
	3229	}
	3230
	3231	if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
	3232	SRCLOSE)))
	3233	{
	3234	REQUIRE_BRANCHJ(flagp, 0);
	3235	}
	3236
	3237	RExC_in_script_run = 0;
	3238	return ret;
	3239	}
	3240
	3241	break;
	3242
	3243	lookbehind_alpha_assertions:
	3244	seen_flag_set = REG_LOOKBEHIND_SEEN;
	3245	/FALLTHROUGH/
	3246
	3247	alpha_assertions:
	3248
	3249	if ( !start_arg ) {
	3250	goto no_colon;
	3251	}
	3252
	3253	if ( RExC_parse == start_arg ) {
	3254	if ( paren == 'A' \|\| paren == 'B' ) {
	3255	/* An empty negative lookaround assertion is failure.
	3256	* See also: S_reg_la_OPFAIL() */
	3257
	3258	/* Note: OPFAIL is not zerolen. */
	3259	ret = reg1node(pRExC_state, OPFAIL, 0);
	3260	nextchar(pRExC_state);
	3261	return ret;
	3262	}
	3263	else
	3264	if ( paren == 'a' \|\| paren == 'b' ) {
	3265	/* An empty positive lookaround assertion is success.
	3266	* See also: S_reg_la_NOTHING() */
	3267
	3268	/* Note: NOTHING is zerolen, so increment here */
	3269	RExC_seen_zerolen++;
	3270	ret = reg_node(pRExC_state, NOTHING);
	3271	nextchar(pRExC_state);
	3272	return ret;
	3273	}
	3274	}
	3275
	3276	RExC_seen_zerolen++;
	3277	RExC_in_lookaround++;
	3278	RExC_seen \|= seen_flag_set;
	3279
	3280	RExC_parse_set(start_arg);
	3281	goto parse_rest;
	3282
	3283	no_colon:
	3284	vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
	3285	UTF8fARG(UTF, verb_len, start_verb));
	3286	NOT_REACHED; /NOTREACHED/
	3287
	3288	} /* End of switch */
	3289	if ( ! op ) {
	3290	RExC_parse_inc_safe();
	3291	if (has_upper \|\| verb_len == 0) {
	3292	vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
	3293	UTF8fARG(UTF, verb_len, start_verb));
	3294	}
	3295	else {
	3296	vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
	3297	UTF8fARG(UTF, verb_len, start_verb));
	3298	}
	3299	}
	3300	if ( RExC_parse == start_arg ) {
	3301	start_arg = NULL;
	3302	}
	3303	if ( arg_required && !start_arg ) {
	3304	vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
	3305	(int) verb_len, start_verb);
	3306	}
	3307	if (internal_argval == -1) {
	3308	ret = reg1node(pRExC_state, op, 0);
	3309	} else {
	3310	ret = reg2node(pRExC_state, op, 0, internal_argval);
	3311	}
	3312	RExC_seen \|= REG_VERBARG_SEEN;
	3313	if (start_arg) {
	3314	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	3315	ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state,
	3316	STR_WITH_LEN("S"));
	3317	RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv;
	3318	FLAGS(REGNODE_p(ret)) = 1;
	3319	} else {
	3320	FLAGS(REGNODE_p(ret)) = 0;
	3321	}
	3322	if ( internal_argval != -1 )
	3323	ARG2i_SET(REGNODE_p(ret), internal_argval);
	3324	nextchar(pRExC_state);
	3325	return ret;
	3326	}
	3327	else if (RExC_parse == '?') { / (?...) */
	3328	handle_qmark:
	3329	; /* make sure the label has a statement associated with it*/
	3330	bool is_logical = 0, is_optimistic = 0;
	3331	const char * const seqstart = RExC_parse;
	3332	const char * endptr;
	3333	const char non_existent_group_msg[]
	3334	= "Reference to nonexistent group";
	3335	const char impossible_group[] = "Invalid reference to group";
	3336
	3337	if (has_intervening_patws) {
	3338	RExC_parse_inc_by(1);
	3339	vFAIL("In '(?...)', the '(' and '?' must be adjacent");
	3340	}
	3341
	3342	RExC_parse_inc_by(1); /* past the '?' */
	3343	if (!fake_eval) {
	3344	paren = RExC_parse; / might be a trailing NUL, if not
	3345	well-formed */
	3346	is_optimistic = 0;
	3347	} else {
	3348	is_optimistic = 1;
	3349	paren = fake_eval;
	3350	}
	3351	RExC_parse_inc();
	3352	if (RExC_parse > RExC_end) {
	3353	paren = '\0';
	3354	}
	3355	ret = 0; /* For look-ahead/behind. */
	3356	switch (paren) {
	3357
	3358	case 'P': /* (?P...) variants for those used to PCRE/Python */
	3359	paren = *RExC_parse;
	3360	if ( paren == '<') { /* (?P<...>) named capture */
	3361	RExC_parse_inc_by(1);
	3362	if (RExC_parse >= RExC_end) {
	3363	vFAIL("Sequence (?P<... not terminated");
	3364	}
	3365	goto named_capture;
	3366	}
	3367	else if (paren == '>') { /* (?P>name) named recursion */
	3368	RExC_parse_inc_by(1);
	3369	if (RExC_parse >= RExC_end) {
	3370	vFAIL("Sequence (?P>... not terminated");
	3371	}
	3372	goto named_recursion;
	3373	}
	3374	else if (paren == '=') { /* (?P=...) named backref */
	3375	RExC_parse_inc_by(1);
	3376	return handle_named_backref(pRExC_state, flagp,
	3377	segment_parse_start, ')');
	3378	}
	3379	RExC_parse_inc_if_char();
	3380	/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
	3381	vFAIL3("Sequence (%.*s...) not recognized",
	3382	(int) (RExC_parse - seqstart), seqstart);
	3383	NOT_REACHED; /NOTREACHED/
	3384	case '<': /* (?<...) */
	3385	/* If you want to support (?<...), first reconcile with GH #17363 /
	3386	if (*RExC_parse == '!') {
	3387	paren = ','; /* negative lookbehind (?<! ... ) */
	3388	RExC_parse_inc_by(1);
	3389	if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
	3390	return ret;
	3391	break;
	3392	}
	3393	else
	3394	if (*RExC_parse == '=') {
	3395	/* paren = '<' - negative lookahead (?<= ... ) */
	3396	RExC_parse_inc_by(1);
	3397	if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
	3398	return ret;
	3399	break;
	3400	}
	3401	else
	3402	named_capture:
	3403	{ /* (?<...>) */
	3404	char *name_start;
	3405	SV *svname;
	3406	paren= '>';
	3407	/* FALLTHROUGH */
	3408	case '\'': /* (?'...') */
	3409	name_start = RExC_parse;
	3410	svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
	3411	if ( RExC_parse == name_start
	3412	\|\| RExC_parse >= RExC_end
	3413	\|\| *RExC_parse != paren)
	3414	{
	3415	vFAIL2("Sequence (?%c... not terminated",
	3416	paren=='>' ? '<' : (char) paren);
	3417	}
	3418	{
	3419	HE *he_str;
	3420	SV *sv_dat = NULL;
	3421	if (!svname) /* shouldn't happen */
	3422	Perl_croak(aTHX_
	3423	"panic: reg_scan_name returned NULL");
	3424	if (!RExC_paren_names) {
	3425	RExC_paren_names= newHV();
	3426	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	3427	#ifdef DEBUGGING
	3428	RExC_paren_name_list= newAV();
	3429	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	3430	#endif
	3431	}
	3432	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	3433	if ( he_str )
	3434	sv_dat = HeVAL(he_str);
	3435	if ( ! sv_dat ) {
	3436	/* croak baby croak */
	3437	Perl_croak(aTHX_
	3438	"panic: paren_name hash element allocation failed");
	3439	} else if ( SvPOK(sv_dat) ) {
	3440	/* (?\|...) can mean we have dupes so scan to check
	3441	its already been stored. Maybe a flag indicating
	3442	we are inside such a construct would be useful,
	3443	but the arrays are likely to be quite small, so
	3444	for now we punt -- dmq */
	3445	IV count = SvIV(sv_dat);
	3446	I32 pv = (I32)SvPVX(sv_dat);
	3447	IV i;
	3448	for ( i = 0 ; i < count ; i++ ) {
	3449	if ( pv[i] == RExC_npar ) {
	3450	count = 0;
	3451	break;
	3452	}
	3453	}
	3454	if ( count ) {
	3455	pv = (I32*)SvGROW(sv_dat,
	3456	SvCUR(sv_dat) + sizeof(I32)+1);
	3457	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	3458	pv[count] = RExC_npar;
	3459	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	3460	}
	3461	} else {
	3462	(void)SvUPGRADE(sv_dat, SVt_PVNV);
	3463	sv_setpvn(sv_dat, (char *)&(RExC_npar),
	3464	sizeof(I32));
	3465	SvIOK_on(sv_dat);
	3466	SvIV_set(sv_dat, 1);
	3467	}
	3468	#ifdef DEBUGGING
	3469	/* No, this does not cause a memory leak under
	3470	* debugging. RExC_paren_name_list is freed later
	3471	* on in the dump process. - Yves
	3472	*/
	3473	if (!av_store(RExC_paren_name_list,
	3474	RExC_npar, SvREFCNT_inc_NN(svname)))
	3475	SvREFCNT_dec_NN(svname);
	3476	#endif
	3477
	3478	}
	3479	nextchar(pRExC_state);
	3480	paren = 1;
	3481	goto capturing_parens;
	3482	}
	3483	NOT_REACHED; /NOTREACHED/
	3484	case '=': /* (?=...) */
	3485	if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
	3486	return ret;
	3487	break;
	3488	case '!': /* (?!...) */
	3489	if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
	3490	return ret;
	3491	break;
	3492	case '\|': /* (?\|...) */
	3493	/* branch reset, behave like a (?:...) except that
	3494	buffers in alternations share the same numbers */
	3495	paren = ':';
	3496	after_freeze = freeze_paren = RExC_logical_npar;
	3497
	3498	/* XXX This construct currently requires an extra pass.
	3499	* Investigation would be required to see if that could be
	3500	* changed */
	3501	REQUIRE_PARENS_PASS;
	3502	break;
	3503	case ':': /* (?:...) */
	3504	case '>': /* (?>...) */
	3505	break;
	3506	case '$': /* (?$...) */
	3507	case '@': /* (?@...) */
	3508	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	3509	break;
	3510	case '0' : /* (?0) */
	3511	case 'R' : /* (?R) */
	3512	if (RExC_parse == RExC_end \|\| *RExC_parse != ')')
	3513	FAIL("Sequence (?R) not terminated");
	3514	num = 0;
	3515	RExC_seen \|= REG_RECURSE_SEEN;
	3516
	3517	/* XXX These constructs currently require an extra pass.
	3518	* It probably could be changed */
	3519	REQUIRE_PARENS_PASS;
	3520
	3521	*flagp \|= POSTPONED;
	3522	goto gen_recurse_regop;
	3523	/notreached/
	3524	/* named and numeric backreferences */
	3525	case '&': /* (?&NAME) */
	3526	segment_parse_start = RExC_parse - 1;
	3527	named_recursion:
	3528	{
	3529	SV *sv_dat = reg_scan_name(pRExC_state,
	3530	REG_RSN_RETURN_DATA);
	3531	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	3532	}
	3533	if (RExC_parse >= RExC_end \|\| *RExC_parse != ')')
	3534	vFAIL("Sequence (?&... not terminated");
	3535	goto gen_recurse_regop;
	3536	/* NOTREACHED */
	3537	case '+':
	3538	if (! inRANGE(RExC_parse[0], '1', '9')) {
	3539	RExC_parse_inc_by(1);
	3540	vFAIL("Illegal pattern");
	3541	}
	3542	goto parse_recursion;
	3543	/* NOTREACHED*/
	3544	case '-': /* (?-1) */
	3545	if (! inRANGE(RExC_parse[0], '1', '9')) {
	3546	RExC_parse--; /* rewind to let it be handled later */
	3547	goto parse_flags;
	3548	}
	3549	/* FALLTHROUGH */
	3550	case '1': case '2': case '3': case '4': /* (?1) */
	3551	case '5': case '6': case '7': case '8': case '9':
	3552	RExC_parse_set((char ) seqstart + 1); / Point to the digit */
	3553	parse_recursion:
	3554	{
	3555	bool is_neg = FALSE;
	3556	UV unum;
	3557	segment_parse_start = RExC_parse - 1;
	3558	if (*RExC_parse == '-') {
	3559	RExC_parse_inc_by(1);
	3560	is_neg = TRUE;
	3561	}
	3562	endptr = RExC_end;
	3563	if (grok_atoUV(RExC_parse, &unum, &endptr)
	3564	&& unum <= I32_MAX
	3565	) {
	3566	num = (I32)unum;
	3567	RExC_parse_set((char*)endptr);
	3568	}
	3569	else { /* Overflow, or something like that. Position
	3570	beyond all digits for the message */
	3571	while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
	3572	RExC_parse_inc_by(1);
	3573	}
	3574	vFAIL(impossible_group);
	3575	}
	3576	if (is_neg) {
	3577	/* -num is always representable on 1 and 2's complement
	3578	* machines */
	3579	num = -num;
	3580	}
	3581	}
	3582	if (*RExC_parse!=')')
	3583	vFAIL("Expecting close bracket");
	3584
	3585	if (paren == '-' \|\| paren == '+') {
	3586
	3587	/* Don't overflow */
	3588	if (UNLIKELY(I32_MAX - RExC_npar < num)) {
	3589	RExC_parse_inc_by(1);
	3590	vFAIL(impossible_group);
	3591	}
	3592
	3593	/*
	3594	Diagram of capture buffer numbering.
	3595	Top line is the normal capture buffer numbers
	3596	Bottom line is the negative indexing as from
	3597	the X (the (?-2))
	3598
	3599	1 2 3 4 5 X Y 6 7
	3600	/(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
	3601	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	3602	- 5 4 3 2 1 X Y x x
	3603
	3604	Resolve to absolute group. Recall that RExC_npar is +1 of
	3605	the actual parenthesis group number. For lookahead, we
	3606	have to compensate for that. Using the above example, when
	3607	we get to Y in the parse, num is 2 and RExC_npar is 6. We
	3608	want 7 for +2, and 4 for -2.
	3609	*/
	3610	if ( paren == '+' ) {
	3611	num--;
	3612	}
	3613
	3614	num += RExC_npar;
	3615
	3616	if (paren == '-' && num < 1) {
	3617	RExC_parse_inc_by(1);
	3618	vFAIL(non_existent_group_msg);
	3619	}
	3620	}
	3621	else
	3622	if (num && num < RExC_logical_npar) {
	3623	num = RExC_logical_to_parno[num];
	3624	}
	3625	else
	3626	if (ALL_PARENS_COUNTED) {
	3627	if (num < RExC_logical_total_parens) {
	3628	num = RExC_logical_to_parno[num];
	3629	}
	3630	else {
	3631	RExC_parse_inc_by(1);
	3632	vFAIL(non_existent_group_msg);
	3633	}
	3634	}
	3635	else {
	3636	REQUIRE_PARENS_PASS;
	3637	}
	3638
	3639
	3640	gen_recurse_regop:
	3641	if (num >= RExC_npar) {
	3642
	3643	/* It might be a forward reference; we can't fail until we
	3644	* know, by completing the parse to get all the groups, and
	3645	* then reparsing */
	3646	if (ALL_PARENS_COUNTED) {
	3647	if (num >= RExC_total_parens) {
	3648	RExC_parse_inc_by(1);
	3649	vFAIL(non_existent_group_msg);
	3650	}
	3651	}
	3652	else {
	3653	REQUIRE_PARENS_PASS;
	3654	}
	3655	}
	3656
	3657	/* We keep track how many GOSUB items we have produced.
	3658	To start off the ARG2i() of the GOSUB holds its "id",
	3659	which is used later in conjunction with RExC_recurse
	3660	to calculate the offset we need to jump for the GOSUB,
	3661	which it will store in the final representation.
	3662	We have to defer the actual calculation until much later
	3663	as the regop may move.
	3664	*/
	3665	ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count);
	3666	RExC_recurse_count++;
	3667	DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
	3668	"%s%s Recurse #%" UVuf " to %" IVdf "\n",
	3669	22, "\| \|", (int)(depth * 2 + 1), "",
	3670	(UV)ARG1u(REGNODE_p(ret)),
	3671	(IV)ARG2i(REGNODE_p(ret))));
	3672	RExC_seen \|= REG_RECURSE_SEEN;
	3673
	3674	*flagp \|= POSTPONED;
	3675	assert(*RExC_parse == ')');
	3676	nextchar(pRExC_state);
	3677	return ret;
	3678
	3679	/* NOTREACHED */
	3680
	3681	case '?': /* (??...) */
	3682	is_logical = 1;
	3683	if (*RExC_parse != '{') {
	3684	RExC_parse_inc_if_char();
	3685	/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
	3686	vFAIL2utf8f(
	3687	"Sequence (%" UTF8f "...) not recognized",
	3688	UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
	3689	NOT_REACHED; /NOTREACHED/
	3690	}
	3691	*flagp \|= POSTPONED;
	3692	paren = '{';
	3693	RExC_parse_inc_by(1);
	3694	/* FALLTHROUGH */
	3695	case '{': /* (?{...}) */
	3696	{
	3697	U32 n = 0;
	3698	struct reg_code_block *cb;
	3699	OP * o;
	3700
	3701	RExC_seen_zerolen++;
	3702
	3703	if ( !pRExC_state->code_blocks
	3704	\|\| pRExC_state->code_index
	3705	>= pRExC_state->code_blocks->count
	3706	\|\| pRExC_state->code_blocks->cb[pRExC_state->code_index].start
	3707	!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
	3708	- RExC_start)
	3709	) {
	3710	if (RExC_pm_flags & PMf_USE_RE_EVAL)
	3711	FAIL("panic: Sequence (?{...}): no code block found\n");
	3712	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	3713	}
	3714	/* this is a pre-compiled code block (?{...}) */
	3715	cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
	3716	RExC_parse_set(RExC_start + cb->end);
	3717	o = cb->block;
	3718	if (cb->src_regex) {
	3719	n = reg_add_data(pRExC_state, STR_WITH_LEN("rl"));
	3720	RExC_rxi->data->data[n] =
	3721	(void)SvREFCNT_inc((SV)cb->src_regex);
	3722	RExC_rxi->data->data[n+1] = (void*)o;
	3723	}
	3724	else {
	3725	n = reg_add_data(pRExC_state,
	3726	(RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
	3727	RExC_rxi->data->data[n] = (void*)o;
	3728	}
	3729	pRExC_state->code_index++;
	3730	nextchar(pRExC_state);
	3731	if (!is_optimistic)
	3732	RExC_seen \|= REG_PESSIMIZE_SEEN;
	3733
	3734	if (is_logical) {
	3735	regnode_offset eval;
	3736	ret = reg_node(pRExC_state, LOGICAL);
	3737	FLAGS(REGNODE_p(ret)) = 2;
	3738
	3739	eval = reg2node(pRExC_state, EVAL,
	3740	n,
	3741
	3742	/* for later propagation into (??{})
	3743	* return value */
	3744	RExC_flags & RXf_PMf_COMPILETIME
	3745	);
	3746	FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
	3747	if (! REGTAIL(pRExC_state, ret, eval)) {
	3748	REQUIRE_BRANCHJ(flagp, 0);
	3749	}
	3750	return ret;
	3751	}
	3752	ret = reg2node(pRExC_state, EVAL, n, 0);
	3753	FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
	3754
	3755	return ret;
	3756	}
	3757	case '(': /* (?(?{...})...) and (?(?=...)...) */
	3758	{
	3759	int is_define= 0;
	3760	const int DEFINE_len = sizeof("DEFINE") - 1;
	3761	if ( RExC_parse < RExC_end - 1
	3762	&& ( ( RExC_parse[0] == '?' /* (?(?...)) */
	3763	&& ( RExC_parse[1] == '='
	3764	\|\| RExC_parse[1] == '!'
	3765	\|\| RExC_parse[1] == '<'
	3766	\|\| RExC_parse[1] == '{'))
	3767	\|\| ( RExC_parse[0] == '' / (?(...)) /
	3768	&& ( RExC_parse[1] == '{'
	3769	\|\| ( memBEGINs(RExC_parse + 1,
	3770	(Size_t) (RExC_end - (RExC_parse + 1)),
	3771	"pla:")
	3772	\|\| memBEGINs(RExC_parse + 1,
	3773	(Size_t) (RExC_end - (RExC_parse + 1)),
	3774	"plb:")
	3775	\|\| memBEGINs(RExC_parse + 1,
	3776	(Size_t) (RExC_end - (RExC_parse + 1)),
	3777	"nla:")
	3778	\|\| memBEGINs(RExC_parse + 1,
	3779	(Size_t) (RExC_end - (RExC_parse + 1)),
	3780	"nlb:")
	3781	\|\| memBEGINs(RExC_parse + 1,
	3782	(Size_t) (RExC_end - (RExC_parse + 1)),
	3783	"positive_lookahead:")
	3784	\|\| memBEGINs(RExC_parse + 1,
	3785	(Size_t) (RExC_end - (RExC_parse + 1)),
	3786	"positive_lookbehind:")
	3787	\|\| memBEGINs(RExC_parse + 1,
	3788	(Size_t) (RExC_end - (RExC_parse + 1)),
	3789	"negative_lookahead:")
	3790	\|\| memBEGINs(RExC_parse + 1,
	3791	(Size_t) (RExC_end - (RExC_parse + 1)),
	3792	"negative_lookbehind:")))))
	3793	) { /* Lookahead or eval. */
	3794	I32 flag;
	3795	regnode_offset tail;
	3796
	3797	ret = reg_node(pRExC_state, LOGICAL);
	3798	FLAGS(REGNODE_p(ret)) = 1;
	3799
	3800	tail = reg(pRExC_state, 1, &flag, depth+1);
	3801	RETURN_FAIL_ON_RESTART(flag, flagp);
	3802	if (! REGTAIL(pRExC_state, ret, tail)) {
	3803	REQUIRE_BRANCHJ(flagp, 0);
	3804	}
	3805	goto insert_if;
	3806	}
	3807	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	3808	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	3809	{
	3810	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	3811	char *name_start= RExC_parse;
	3812	RExC_parse_inc_by(1);
	3813	U32 num = 0;
	3814	SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
	3815	if ( RExC_parse == name_start
	3816	\|\| RExC_parse >= RExC_end
	3817	\|\| *RExC_parse != ch)
	3818	{
	3819	vFAIL2("Sequence (?(%c... not terminated",
	3820	(ch == '>' ? '<' : ch));
	3821	}
	3822	RExC_parse_inc_by(1);
	3823	if (sv_dat) {
	3824	num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
	3825	RExC_rxi->data->data[num]=(void*)sv_dat;
	3826	SvREFCNT_inc_simple_void_NN(sv_dat);
	3827	}
	3828	ret = reg1node(pRExC_state, GROUPPN, num);
	3829	goto insert_if_check_paren;
	3830	}
	3831	else if (memBEGINs(RExC_parse,
	3832	(STRLEN) (RExC_end - RExC_parse),
	3833	"DEFINE"))
	3834	{
	3835	ret = reg1node(pRExC_state, DEFINEP, 0);
	3836	RExC_parse_inc_by(DEFINE_len);
	3837	is_define = 1;
	3838	goto insert_if_check_paren;
	3839	}
	3840	else if (RExC_parse[0] == 'R') {
	3841	RExC_parse_inc_by(1);
	3842	/* parno == 0 => /(?(R)YES\|NO)/ "in any form of recursion OR eval"
	3843	* parno == 1 => /(?(R0)YES\|NO)/ "in GOSUB (?0) / (?R)"
	3844	* parno == 2 => /(?(R1)YES\|NO)/ "in GOSUB (?1) (parno-1)"
	3845	*/
	3846	parno = 0;
	3847	if (RExC_parse[0] == '0') {
	3848	parno = 1;
	3849	RExC_parse_inc_by(1);
	3850	}
	3851	else if (inRANGE(RExC_parse[0], '1', '9')) {
	3852	UV uv;
	3853	endptr = RExC_end;
	3854	if (grok_atoUV(RExC_parse, &uv, &endptr)
	3855	&& uv <= I32_MAX
	3856	) {
	3857	parno = (I32)uv + 1;
	3858	RExC_parse_set((char*)endptr);
	3859	}
	3860	/* else "Switch condition not recognized" below */
	3861	} else if (RExC_parse[0] == '&') {
	3862	SV *sv_dat;
	3863	RExC_parse_inc_by(1);
	3864	sv_dat = reg_scan_name(pRExC_state,
	3865	REG_RSN_RETURN_DATA);
	3866	if (sv_dat)
	3867	parno = 1 + ((I32 )SvPVX(sv_dat));
	3868	}
	3869	ret = reg1node(pRExC_state, INSUBP, parno);
	3870	goto insert_if_check_paren;
	3871	}
	3872	else if (inRANGE(RExC_parse[0], '1', '9')) {
	3873	/* (?(1)...) */
	3874	char c;
	3875	UV uv;
	3876	endptr = RExC_end;
	3877	if (grok_atoUV(RExC_parse, &uv, &endptr)
	3878	&& uv <= I32_MAX
	3879	) {
	3880	parno = (I32)uv;
	3881	RExC_parse_set((char*)endptr);
	3882	}
	3883	else {
	3884	vFAIL("panic: grok_atoUV returned FALSE");
	3885	}
	3886	ret = reg1node(pRExC_state, GROUPP, parno);
	3887
	3888	insert_if_check_paren:
	3889	if (UCHARAT(RExC_parse) != ')') {
	3890	RExC_parse_inc_safe();
	3891	vFAIL("Switch condition not recognized");
	3892	}
	3893	nextchar(pRExC_state);
	3894	insert_if:
	3895	if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state,
	3896	IFTHEN, 0)))
	3897	{
	3898	REQUIRE_BRANCHJ(flagp, 0);
	3899	}
	3900	br = regbranch(pRExC_state, &flags, 1, depth+1);
	3901	if (br == 0) {
	3902	RETURN_FAIL_ON_RESTART(flags,flagp);
	3903	FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
	3904	(UV) flags);
	3905	} else
	3906	if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state,
	3907	LONGJMP, 0)))
	3908	{
	3909	REQUIRE_BRANCHJ(flagp, 0);
	3910	}
	3911	c = UCHARAT(RExC_parse);
	3912	nextchar(pRExC_state);
	3913	if (flags&HASWIDTH)
	3914	*flagp \|= HASWIDTH;
	3915	if (c == '\|') {
	3916	if (is_define)
	3917	vFAIL("(?(DEFINE)....) does not allow branches");
	3918
	3919	/* Fake one for optimizer. */
	3920	lastbr = reg1node(pRExC_state, IFTHEN, 0);
	3921
	3922	if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
	3923	RETURN_FAIL_ON_RESTART(flags, flagp);
	3924	FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
	3925	(UV) flags);
	3926	}
	3927	if (! REGTAIL(pRExC_state, ret, lastbr)) {
	3928	REQUIRE_BRANCHJ(flagp, 0);
	3929	}
	3930	if (flags&HASWIDTH)
	3931	*flagp \|= HASWIDTH;
	3932	c = UCHARAT(RExC_parse);
	3933	nextchar(pRExC_state);
	3934	}
	3935	else
	3936	lastbr = 0;
	3937	if (c != ')') {
	3938	if (RExC_parse >= RExC_end)
	3939	vFAIL("Switch (?(condition)... not terminated");
	3940	else
	3941	vFAIL("Switch (?(condition)... contains too many branches");
	3942	}
	3943	ender = reg_node(pRExC_state, TAIL);
	3944	if (! REGTAIL(pRExC_state, br, ender)) {
	3945	REQUIRE_BRANCHJ(flagp, 0);
	3946	}
	3947	if (lastbr) {
	3948	if (! REGTAIL(pRExC_state, lastbr, ender)) {
	3949	REQUIRE_BRANCHJ(flagp, 0);
	3950	}
	3951	if (! REGTAIL(pRExC_state,
	3952	REGNODE_OFFSET(
	3953	REGNODE_AFTER(REGNODE_p(lastbr))),
	3954	ender))
	3955	{
	3956	REQUIRE_BRANCHJ(flagp, 0);
	3957	}
	3958	}
	3959	else
	3960	if (! REGTAIL(pRExC_state, ret, ender)) {
	3961	REQUIRE_BRANCHJ(flagp, 0);
	3962	}
	3963	#if 0 /* Removing this doesn't cause failures in the test suite -- khw */
	3964	RExC_size++; /* XXX WHY do we need this?!!
	3965	For large programs it seems to be required
	3966	but I can't figure out why. -- dmq*/
	3967	#endif
	3968	return ret;
	3969	}
	3970	RExC_parse_inc_safe();
	3971	vFAIL("Unknown switch condition (?(...))");
	3972	}
	3973	case '[': /* (?[ ... ]) */
	3974	return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
	3975	case 0: /* A NUL */
	3976	RExC_parse--; /* for vFAIL to print correctly */
	3977	vFAIL("Sequence (? incomplete");
	3978	break;
	3979
	3980	case ')':
	3981	if (RExC_strict) { /* [perl #132851] */
	3982	ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
	3983	}
	3984	/* FALLTHROUGH */
	3985	case '': / If you want to support (?...), first reconcile with GH #17363 /
	3986	/* FALLTHROUGH */
	3987	default: /* e.g., (?i) */
	3988	RExC_parse_set((char *) seqstart + 1);
	3989	parse_flags:
	3990	parse_lparen_question_flags(pRExC_state);
	3991	if (UCHARAT(RExC_parse) != ':') {
	3992	if (RExC_parse < RExC_end)
	3993	nextchar(pRExC_state);
	3994	*flagp = TRYAGAIN;
	3995	return 0;
	3996	}
	3997	paren = ':';
	3998	nextchar(pRExC_state);
	3999	ret = 0;
	4000	goto parse_rest;
	4001	} /* end switch */
	4002	}
	4003	else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
	4004	capturing_parens:
	4005	parno = RExC_npar;
	4006	RExC_npar++;
	4007	if (RExC_npar >= U16_MAX)
	4008	FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar);
	4009
	4010	logical_parno = RExC_logical_npar;
	4011	RExC_logical_npar++;
	4012	if (! ALL_PARENS_COUNTED) {
	4013	/* If we are in our first pass through (and maybe only pass),
	4014	* we need to allocate memory for the capturing parentheses
	4015	* data structures.
	4016	*/
	4017
	4018	if (!RExC_parens_buf_size) {
	4019	/* first guess at number of parens we might encounter */
	4020	RExC_parens_buf_size = 10;
	4021
	4022	/* setup RExC_open_parens, which holds the address of each
	4023	* OPEN tag, and to make things simpler for the 0 index the
	4024	* start of the program - this is used later for offsets */
	4025	Newxz(RExC_open_parens, RExC_parens_buf_size,
	4026	regnode_offset);
	4027	RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
	4028
	4029	/* setup RExC_close_parens, which holds the address of each
	4030	* CLOSE tag, and to make things simpler for the 0 index
	4031	* the end of the program - this is used later for offsets
	4032	* */
	4033	Newxz(RExC_close_parens, RExC_parens_buf_size,
	4034	regnode_offset);
	4035	/* we don't know where end op starts yet, so we don't need to
	4036	* set RExC_close_parens[0] like we do RExC_open_parens[0]
	4037	* above */
	4038
	4039	Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32);
	4040	Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32);
	4041	}
	4042	else if (RExC_npar > RExC_parens_buf_size) {
	4043	I32 old_size = RExC_parens_buf_size;
	4044
	4045	RExC_parens_buf_size *= 2;
	4046
	4047	Renew(RExC_open_parens, RExC_parens_buf_size,
	4048	regnode_offset);
	4049	Zero(RExC_open_parens + old_size,
	4050	RExC_parens_buf_size - old_size, regnode_offset);
	4051
	4052	Renew(RExC_close_parens, RExC_parens_buf_size,
	4053	regnode_offset);
	4054	Zero(RExC_close_parens + old_size,
	4055	RExC_parens_buf_size - old_size, regnode_offset);
	4056
	4057	Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32);
	4058	Zero(RExC_logical_to_parno + old_size,
	4059	RExC_parens_buf_size - old_size, I32);
	4060
	4061	Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32);
	4062	Zero(RExC_parno_to_logical + old_size,
	4063	RExC_parens_buf_size - old_size, I32);
	4064	}
	4065	}
	4066
	4067	ret = reg1node(pRExC_state, OPEN, parno);
	4068	if (!RExC_nestroot)
	4069	RExC_nestroot = parno;
	4070	if (RExC_open_parens && !RExC_open_parens[parno])
	4071	{
	4072	DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
	4073	"%s%s Setting open paren #%" IVdf " to %zu\n",
	4074	22, "\| \|", (int)(depth * 2 + 1), "",
	4075	(IV)parno, ret));
	4076	RExC_open_parens[parno]= ret;
	4077	}
	4078	if (RExC_parno_to_logical) {
	4079	RExC_parno_to_logical[parno] = logical_parno;
	4080	if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno])
	4081	RExC_logical_to_parno[logical_parno] = parno;
	4082	}
	4083	is_open = 1;
	4084	} else {
	4085	/* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
	4086	paren = ':';
	4087	ret = 0;
	4088	}
	4089	}
	4090	else /* ! paren */
	4091	ret = 0;
	4092
	4093	parse_rest:
	4094	/* Pick up the branches, linking them together. */
	4095	segment_parse_start = RExC_parse;
	4096	I32 npar_before_regbranch = RExC_npar - 1;
	4097	br = regbranch(pRExC_state, &flags, 1, depth+1);
	4098
	4099	/* branch_len = (paren != 0); */
	4100
	4101	if (br == 0) {
	4102	RETURN_FAIL_ON_RESTART(flags, flagp);
	4103	FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
	4104	}
	4105	if (*RExC_parse == '\|') {
	4106	if (RExC_use_BRANCHJ) {
	4107	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	4108	ARG2a_SET(REGNODE_p(br), npar_before_regbranch);
	4109	ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
	4110	}
	4111	else {
	4112	reginsert(pRExC_state, BRANCH, br, depth+1);
	4113	ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch);
	4114	ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
	4115	}
	4116	have_branch = 1;
	4117	}
	4118	else if (paren == ':') {
	4119	*flagp \|= flags&SIMPLE;
	4120	}
	4121	if (is_open) { /* Starts with OPEN. */
	4122	if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
	4123	REQUIRE_BRANCHJ(flagp, 0);
	4124	}
	4125	}
	4126	else if (paren != '?') /* Not Conditional */
	4127	ret = br;
	4128	*flagp \|= flags & (HASWIDTH \| POSTPONED);
	4129	lastbr = br;
	4130	while (*RExC_parse == '\|') {
	4131	if (RExC_use_BRANCHJ) {
	4132	bool shut_gcc_up;
	4133
	4134	ender = reg1node(pRExC_state, LONGJMP, 0);
	4135
	4136	/* Append to the previous. */
	4137	shut_gcc_up = REGTAIL(pRExC_state,
	4138	REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
	4139	ender);
	4140	PERL_UNUSED_VAR(shut_gcc_up);
	4141	}
	4142	nextchar(pRExC_state);
	4143	if (freeze_paren) {
	4144	if (RExC_logical_npar > after_freeze)
	4145	after_freeze = RExC_logical_npar;
	4146	RExC_logical_npar = freeze_paren;
	4147	}
	4148	br = regbranch(pRExC_state, &flags, 0, depth+1);
	4149
	4150	if (br == 0) {
	4151	RETURN_FAIL_ON_RESTART(flags, flagp);
	4152	FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
	4153	}
	4154	if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
	4155	REQUIRE_BRANCHJ(flagp, 0);
	4156	}
	4157	assert(OP(REGNODE_p(br)) == BRANCH \|\| OP(REGNODE_p(br))==BRANCHJ);
	4158	assert(OP(REGNODE_p(lastbr)) == BRANCH \|\| OP(REGNODE_p(lastbr))==BRANCHJ);
	4159	if (OP(REGNODE_p(br)) == BRANCH) {
	4160	if (OP(REGNODE_p(lastbr)) == BRANCH)
	4161	ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
	4162	else
	4163	ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
	4164	}
	4165	else
	4166	if (OP(REGNODE_p(br)) == BRANCHJ) {
	4167	if (OP(REGNODE_p(lastbr)) == BRANCH)
	4168	ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
	4169	else
	4170	ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
	4171	}
	4172
	4173	lastbr = br;
	4174	*flagp \|= flags & (HASWIDTH \| POSTPONED);
	4175	}
	4176
	4177	if (have_branch \|\| paren != ':') {
	4178	regnode * br;
	4179
	4180	/* Make a closing node, and hook it on the end. */
	4181	switch (paren) {
	4182	case ':':
	4183	ender = reg_node(pRExC_state, TAIL);
	4184	break;
	4185	case 1: case 2:
	4186	ender = reg1node(pRExC_state, CLOSE, parno);
	4187	if ( RExC_close_parens ) {
	4188	DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
	4189	"%s%s Setting close paren #%" IVdf " to %zu\n",
	4190	22, "\| \|", (int)(depth * 2 + 1), "",
	4191	(IV)parno, ender));
	4192	RExC_close_parens[parno]= ender;
	4193	if (RExC_nestroot == parno)
	4194	RExC_nestroot = 0;
	4195	}
	4196	break;
	4197	case 's':
	4198	ender = reg_node(pRExC_state, SRCLOSE);
	4199	RExC_in_script_run = 0;
	4200	break;
	4201	/* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
	4202	case 'b': /* (positive_lookbehind: ... ) (plb: ... ) */
	4203	case 'B': /* (negative_lookbehind: ... ) (nlb: ... ) */
	4204	case '<': /* (?<= ... ) */
	4205	case ',': /* (?<! ... ) */
	4206	*flagp &= ~HASWIDTH;
	4207	ender = reg_node(pRExC_state, LOOKBEHIND_END);
	4208	break;
	4209	/* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
	4210	case 'a':
	4211	case 'A':
	4212	case '=':
	4213	case '!':
	4214	*flagp &= ~HASWIDTH;
	4215	/* FALLTHROUGH */
	4216	case 't': /* aTomic */
	4217	case '>':
	4218	ender = reg_node(pRExC_state, SUCCEED);
	4219	break;
	4220	case 0:
	4221	ender = reg_node(pRExC_state, END);
	4222	assert(!RExC_end_op); /* there can only be one! */
	4223	RExC_end_op = REGNODE_p(ender);
	4224	if (RExC_close_parens) {
	4225	DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
	4226	"%s%s Setting close paren #0 (END) to %zu\n",
	4227	22, "\| \|", (int)(depth * 2 + 1), "",
	4228	ender));
	4229
	4230	RExC_close_parens[0]= ender;
	4231	}
	4232	break;
	4233	}
	4234	DEBUG_PARSE_r({
	4235	DEBUG_PARSE_MSG("lsbr");
	4236	regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
	4237	regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
	4238	Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
	4239	SvPV_nolen_const(RExC_mysv1),
	4240	(IV)lastbr,
	4241	SvPV_nolen_const(RExC_mysv2),
	4242	(IV)ender,
	4243	(IV)(ender - lastbr)
	4244	);
	4245	});
	4246	if (OP(REGNODE_p(lastbr)) == BRANCH) {
	4247	ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
	4248	}
	4249	else
	4250	if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
	4251	ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
	4252	}
	4253
	4254	if (! REGTAIL(pRExC_state, lastbr, ender)) {
	4255	REQUIRE_BRANCHJ(flagp, 0);
	4256	}
	4257
	4258	if (have_branch) {
	4259	char is_nothing= 1;
	4260	if (depth==1)
	4261	RExC_seen \|= REG_TOP_LEVEL_BRANCHES_SEEN;
	4262
	4263	/* Hook the tails of the branches to the closing node. */
	4264	for (br = REGNODE_p(ret); br; br = regnext(br)) {
	4265	const U8 op = REGNODE_TYPE(OP(br));
	4266	regnode *nextoper = REGNODE_AFTER(br);
	4267	if (op == BRANCH) {
	4268	if (! REGTAIL_STUDY(pRExC_state,
	4269	REGNODE_OFFSET(nextoper),
	4270	ender))
	4271	{
	4272	REQUIRE_BRANCHJ(flagp, 0);
	4273	}
	4274	if ( OP(nextoper) != NOTHING
	4275	\|\| regnext(nextoper) != REGNODE_p(ender))
	4276	is_nothing= 0;
	4277	}
	4278	else if (op == BRANCHJ) {
	4279	bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
	4280	REGNODE_OFFSET(nextoper),
	4281	ender);
	4282	PERL_UNUSED_VAR(shut_gcc_up);
	4283	/* for now we always disable this optimisation * /
	4284	regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
	4285	if ( OP(nopr) != NOTHING
	4286	\|\| regnext(nopr) != REGNODE_p(ender))
	4287	*/
	4288	is_nothing= 0;
	4289	}
	4290	}
	4291	if (is_nothing) {
	4292	regnode * ret_as_regnode = REGNODE_p(ret);
	4293	br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
	4294	? regnext(ret_as_regnode)
	4295	: ret_as_regnode;
	4296	DEBUG_PARSE_r({
	4297	DEBUG_PARSE_MSG("NADA");
	4298	regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
	4299	NULL, pRExC_state);
	4300	regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
	4301	NULL, pRExC_state);
	4302	Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
	4303	SvPV_nolen_const(RExC_mysv1),
	4304	(IV)REG_NODE_NUM(ret_as_regnode),
	4305	SvPV_nolen_const(RExC_mysv2),
	4306	(IV)ender,
	4307	(IV)(ender - ret)
	4308	);
	4309	});
	4310	OP(br)= NOTHING;
	4311	if (OP(REGNODE_p(ender)) == TAIL) {
	4312	NEXT_OFF(br)= 0;
	4313	RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
	4314	} else {
	4315	regnode *opt;
	4316	for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
	4317	OP(opt)= OPTIMIZED;
	4318	NEXT_OFF(br)= REGNODE_p(ender) - br;
	4319	}
	4320	}
	4321	}
	4322	}
	4323
	4324	{
	4325	const char *p;
	4326	/* Even/odd or x=don't care: 010101x10x */
	4327	static const char parens[] = "=!aA<,>Bbt";
	4328	/* flag below is set to 0 up through 'A'; 1 for larger */
	4329
	4330	if (paren && (p = strchr(parens, paren))) {
	4331	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	4332	int flag = (p - parens) > 3;
	4333
	4334	if (paren == '>' \|\| paren == 't') {
	4335	node = SUSPEND, flag = 0;
	4336	}
	4337
	4338	reginsert(pRExC_state, node, ret, depth+1);
	4339	FLAGS(REGNODE_p(ret)) = flag;
	4340	if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
	4341	{
	4342	REQUIRE_BRANCHJ(flagp, 0);
	4343	}
	4344	}
	4345	}
	4346
	4347	/* Check for proper termination. */
	4348	if (paren) {
	4349	/* restore original flags, but keep (?p) and, if we've encountered
	4350	* something in the parse that changes /d rules into /u, keep the /u */
	4351	RExC_flags = oregflags \| (RExC_flags & RXf_PMf_KEEPCOPY);
	4352	if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
	4353	set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
	4354	}
	4355	if (RExC_parse >= RExC_end \|\| UCHARAT(RExC_parse) != ')') {
	4356	RExC_parse_set(reg_parse_start);
	4357	vFAIL("Unmatched (");
	4358	}
	4359	nextchar(pRExC_state);
	4360	}
	4361	else if (!paren && RExC_parse < RExC_end) {
	4362	if (*RExC_parse == ')') {
	4363	RExC_parse_inc_by(1);
	4364	vFAIL("Unmatched )");
	4365	}
	4366	else
	4367	FAIL("Junk on end of regexp"); /* "Can't happen". */
	4368	NOT_REACHED; /* NOTREACHED */
	4369	}
	4370
	4371	if (after_freeze > RExC_logical_npar)
	4372	RExC_logical_npar = after_freeze;
	4373
	4374	RExC_in_lookaround = was_in_lookaround;
	4375
	4376	return(ret);
	4377	}
	4378
	4379	/*
	4380	- regbranch - one alternative of an \| operator
	4381	*
	4382	* Implements the concatenation operator.
	4383	*
	4384	* On success, returns the offset at which any next node should be placed into
	4385	* the regex engine program being compiled.
	4386	*
	4387	* Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
	4388	* to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
	4389	* UTF-8
	4390	*/
	4391	STATIC regnode_offset
	4392	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	4393	{
	4394	regnode_offset ret;
	4395	regnode_offset chain = 0;
	4396	regnode_offset latest;
	4397	regnode *branch_node = NULL;
	4398	I32 flags = 0, c = 0;
	4399	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	4400
	4401	PERL_ARGS_ASSERT_REGBRANCH;
	4402
	4403	DEBUG_PARSE("brnc");
	4404
	4405	if (first)
	4406	ret = 0;
	4407	else {
	4408	if (RExC_use_BRANCHJ) {
	4409	ret = reg2node(pRExC_state, BRANCHJ, 0, 0);
	4410	branch_node = REGNODE_p(ret);
	4411	ARG2a_SET(branch_node, (U16)RExC_npar-1);
	4412	} else {
	4413	ret = reg1node(pRExC_state, BRANCH, 0);
	4414	branch_node = REGNODE_p(ret);
	4415	ARG1a_SET(branch_node, (U16)RExC_npar-1);
	4416	}
	4417	}
	4418
	4419	flagp = 0; / Initialize. */
	4420
	4421	skip_to_be_ignored_text(pRExC_state, &RExC_parse,
	4422	FALSE /* Don't force to /x */ );
	4423	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	4424	flags &= ~TRYAGAIN;
	4425	latest = regpiece(pRExC_state, &flags, depth+1);
	4426	if (latest == 0) {
	4427	if (flags & TRYAGAIN)
	4428	continue;
	4429	RETURN_FAIL_ON_RESTART(flags, flagp);
	4430	FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
	4431	}
	4432	else if (ret == 0)
	4433	ret = latest;
	4434	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	4435	if (chain != 0) {
	4436	/* FIXME adding one for every branch after the first is probably
	4437	* excessive now we have TRIE support. (hv) */
	4438	MARK_NAUGHTY(1);
	4439	if (! REGTAIL(pRExC_state, chain, latest)) {
	4440	/* XXX We could just redo this branch, but figuring out what
	4441	* bookkeeping needs to be reset is a pain, and it's likely
	4442	* that other branches that goto END will also be too large */
	4443	REQUIRE_BRANCHJ(flagp, 0);
	4444	}
	4445	}
	4446	chain = latest;
	4447	c++;
	4448	}
	4449	if (chain == 0) { /* Loop ran zero times. */
	4450	chain = reg_node(pRExC_state, NOTHING);
	4451	if (ret == 0)
	4452	ret = chain;
	4453	}
	4454	if (c == 1) {
	4455	*flagp \|= flags & SIMPLE;
	4456	}
	4457	return ret;
	4458	}
	4459
	4460	#define RBRACE 0
	4461	#define MIN_S 1
	4462	#define MIN_E 2
	4463	#define MAX_S 3
	4464	#define MAX_E 4
	4465
	4466	#ifndef PERL_IN_XSUB_RE
	4467	bool
	4468	Perl_regcurly(const char s, const char e, const char * result[5])
	4469	{
	4470	/* This function matches a {m,n} quantifier. When called with a NULL final
	4471	* argument, it simply parses the input from 's' up through 'e-1', and
	4472	* returns a boolean as to whether or not this input is syntactically a
	4473	* {m,n} quantifier.
	4474	*
	4475	* When called with a non-NULL final parameter, and when the function
	4476	* returns TRUE, it additionally stores information into the array
	4477	* specified by that parameter about what it found in the parse. The
	4478	* parameter must be a pointer into a 5 element array of 'const char *'
	4479	* elements. The returned information is as follows:
	4480	* result[RBRACE] points to the closing brace
	4481	* result[MIN_S] points to the first byte of the lower bound
	4482	* result[MIN_E] points to one beyond the final byte of the lower bound
	4483	* result[MAX_S] points to the first byte of the upper bound
	4484	* result[MAX_E] points to one beyond the final byte of the upper bound
	4485	*
	4486	* If the quantifier is of the form {m,} (meaning an infinite upper
	4487	* bound), result[MAX_E] is set to result[MAX_S]; what they actually point
	4488	* to is irrelevant, just that it's the same place
	4489	*
	4490	* If instead the quantifier is of the form {m} there is actually only
	4491	* one bound, and both the upper and lower result[] elements are set to
	4492	* point to it.
	4493	*
	4494	* This function checks only for syntactic validity; it leaves checking for
	4495	* semantic validity and raising any diagnostics to the caller. This
	4496	* function is called in multiple places to check for syntax, but only from
	4497	* one for semantics. It makes it as simple as possible for the
	4498	* syntax-only callers, while furnishing just enough information for the
	4499	* semantic caller.
	4500	*/
	4501
	4502	const char * min_start = NULL;
	4503	const char * max_start = NULL;
	4504	const char * min_end = NULL;
	4505	const char * max_end = NULL;
	4506
	4507	bool has_comma = FALSE;
	4508
	4509	PERL_ARGS_ASSERT_REGCURLY;
	4510
	4511	if (s >= e \|\| *s++ != '{')
	4512	return FALSE;
	4513
	4514	while (s < e && isBLANK(*s)) {
	4515	s++;
	4516	}
	4517
	4518	if isDIGIT(*s) {
	4519	min_start = s;
	4520	do {
	4521	s++;
	4522	} while (s < e && isDIGIT(*s));
	4523	min_end = s;
	4524	}
	4525
	4526	while (s < e && isBLANK(*s)) {
	4527	s++;
	4528	}
	4529
	4530	if (*s == ',') {
	4531	has_comma = TRUE;
	4532	s++;
	4533
	4534	while (s < e && isBLANK(*s)) {
	4535	s++;
	4536	}
	4537
	4538	if isDIGIT(*s) {
	4539	max_start = s;
	4540	do {
	4541	s++;
	4542	} while (s < e && isDIGIT(*s));
	4543	max_end = s;
	4544	}
	4545	}
	4546
	4547	while (s < e && isBLANK(*s)) {
	4548	s++;
	4549	}
	4550	/* Need at least one number */
	4551	if (s >= e \|\| *s != '}' \|\| (! min_start && ! max_end)) {
	4552	return FALSE;
	4553	}
	4554
	4555	if (result) {
	4556
	4557	result[RBRACE] = s;
	4558
	4559	result[MIN_S] = min_start;
	4560	result[MIN_E] = min_end;
	4561	if (has_comma) {
	4562	if (max_start) {
	4563	result[MAX_S] = max_start;
	4564	result[MAX_E] = max_end;
	4565	}
	4566	else {
	4567	/* Having no value after the comma is signalled by setting
	4568	* start and end to the same value. What that value is isn't
	4569	* relevant; NULL is chosen simply because it will fail if the
	4570	* caller mistakenly uses it */
	4571	result[MAX_S] = result[MAX_E] = NULL;
	4572	}
	4573	}
	4574	else { /* No comma means lower and upper bounds are the same */
	4575	result[MAX_S] = min_start;
	4576	result[MAX_E] = min_end;
	4577	}
	4578	}
	4579
	4580	return TRUE;
	4581	}
	4582	#endif
	4583
	4584	U32
	4585	S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
	4586	const char * start, const char * end)
	4587	{
	4588	/* This is a helper function for regpiece() to compute, given the
	4589	* quantifier {m,n}, the value of either m or n, based on the starting
	4590	* position 'start' in the string, through the byte 'end-1', returning it
	4591	* if valid, and failing appropriately if not. It knows the restrictions
	4592	* imposed on quantifier values */
	4593
	4594	UV uv;
	4595	STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
	4596
	4597	PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
	4598
	4599	if (grok_atoUV(start, &uv, &end)) {
	4600	if (uv < REG_INFTY) { /* A valid, small-enough number */
	4601	return (U32) uv;
	4602	}
	4603	}
	4604	else if (start == '0') { / grok_atoUV() fails for only two reasons:
	4605	leading zeros or overflow */
	4606	RExC_parse_set((char * ) end);
	4607
	4608	/* Perhaps too generic a msg for what is only failure from having
	4609	* leading zeros, but this is how it's always behaved. */
	4610	vFAIL("Invalid quantifier in {,}");
	4611	NOT_REACHED; /NOTREACHED/
	4612	}
	4613
	4614	/* Here, found a quantifier, but was too large; either it overflowed or was
	4615	* too big a legal number */
	4616	RExC_parse_set((char * ) end);
	4617	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	4618
	4619	NOT_REACHED; /NOTREACHED/
	4620	return U32_MAX; /* Perhaps some compilers will be expecting a return */
	4621	}
	4622
	4623	/*
	4624	- regpiece - something followed by possible quantifier * + ? {n,m}
	4625	*
	4626	* Note that the branching code sequences used for ? and the general cases
	4627	* of * and + are somewhat optimized: they use the same NOTHING node as
	4628	* both the endmarker for their branch list and the body of the last branch.
	4629	* It might seem that this node could be dispensed with entirely, but the
	4630	* endmarker role is not redundant.
	4631	*
	4632	* On success, returns the offset at which any next node should be placed into
	4633	* the regex engine program being compiled.
	4634	*
	4635	* Returns 0 otherwise, with *flagp set to indicate why:
	4636	* TRYAGAIN if regatom() returns 0 with TRYAGAIN.
	4637	* RESTART_PARSE if the parse needs to be restarted, or'd with
	4638	* NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
	4639	*/
	4640	STATIC regnode_offset
	4641	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	4642	{
	4643	regnode_offset ret;
	4644	char op;
	4645	I32 flags;
	4646	const char * const origparse = RExC_parse;
	4647	I32 min;
	4648	I32 max = REG_INFTY;
	4649	I32 npar_before = RExC_npar-1;
	4650
	4651	/* Save the original in case we change the emitted regop to a FAIL. */
	4652	const regnode_offset orig_emit = RExC_emit;
	4653
	4654	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	4655
	4656	PERL_ARGS_ASSERT_REGPIECE;
	4657
	4658	DEBUG_PARSE("piec");
	4659
	4660	ret = regatom(pRExC_state, &flags, depth+1);
	4661	if (ret == 0) {
	4662	RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
	4663	FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
	4664	}
	4665	I32 npar_after = RExC_npar-1;
	4666
	4667	op = *RExC_parse;
	4668	switch (op) {
	4669	const char * regcurly_return[5];
	4670
	4671	case '*':
	4672	nextchar(pRExC_state);
	4673	min = 0;
	4674	break;
	4675
	4676	case '+':
	4677	nextchar(pRExC_state);
	4678	min = 1;
	4679	break;
	4680
	4681	case '?':
	4682	nextchar(pRExC_state);
	4683	min = 0; max = 1;
	4684	break;
	4685
	4686	case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
	4687	to determine which */
	4688	if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
	4689	const char * min_start = regcurly_return[MIN_S];
	4690	const char * min_end = regcurly_return[MIN_E];
	4691	const char * max_start = regcurly_return[MAX_S];
	4692	const char * max_end = regcurly_return[MAX_E];
	4693
	4694	if (min_start) {
	4695	min = get_quantifier_value(pRExC_state, min_start, min_end);
	4696	}
	4697	else {
	4698	min = 0;
	4699	}
	4700
	4701	if (max_start == max_end) { /* Was of the form {m,} */
	4702	max = REG_INFTY;
	4703	}
	4704	else if (max_start == min_start) { /* Was of the form {m} */
	4705	max = min;
	4706	}
	4707	else { /* Was of the form {m,n} */
	4708	assert(max_end >= max_start);
	4709
	4710	max = get_quantifier_value(pRExC_state, max_start, max_end);
	4711	}
	4712
	4713	RExC_parse_set((char *) regcurly_return[RBRACE]);
	4714	nextchar(pRExC_state);
	4715
	4716	if (max < min) { /* If can't match, warn and optimize to fail
	4717	unconditionally */
	4718	reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
	4719	ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
	4720	NEXT_OFF(REGNODE_p(orig_emit)) =
	4721	REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
	4722	return ret;
	4723	}
	4724	else if (min == max && *RExC_parse == '?') {
	4725	ckWARN2reg(RExC_parse + 1,
	4726	"Useless use of greediness modifier '%c'",
	4727	*RExC_parse);
	4728	}
	4729
	4730	break;
	4731	} /* End of is {m,n} */
	4732
	4733	/* Here was a '{', but what followed it didn't form a quantifier. */
	4734	/* FALLTHROUGH */
	4735
	4736	default:
	4737	*flagp = flags;
	4738	return(ret);
	4739	NOT_REACHED; /NOTREACHED/
	4740	}
	4741
	4742	/* Here we have a quantifier, and have calculated 'min' and 'max'.
	4743	*
	4744	* Check and possibly adjust a zero width operand */
	4745	if (! (flags & (HASWIDTH\|POSTPONED))) {
	4746	if (max > REG_INFTY/3) {
	4747	ckWARN2reg(RExC_parse,
	4748	"%" UTF8f " matches null string many times",
	4749	UTF8fARG(UTF, (RExC_parse >= origparse
	4750	? RExC_parse - origparse
	4751	: 0),
	4752	origparse));
	4753	}
	4754
	4755	/* There's no point in trying to match something 0 length more than
	4756	* once except for extra side effects, which we don't have here since
	4757	* not POSTPONED */
	4758	if (max > 1) {
	4759	max = 1;
	4760	if (min > max) {
	4761	min = max;
	4762	}
	4763	}
	4764	}
	4765
	4766	/* If this is a code block pass it up */
	4767	*flagp \|= (flags & POSTPONED);
	4768
	4769	if (max > 0) {
	4770	*flagp \|= (flags & HASWIDTH);
	4771	if (max == REG_INFTY)
	4772	RExC_seen \|= REG_UNBOUNDED_QUANTIFIER_SEEN;
	4773	}
	4774
	4775	/* 'SIMPLE' operands don't require full generality */
	4776	if ((flags&SIMPLE)) {
	4777	if (max == REG_INFTY) {
	4778	if (min == 0) {
	4779	if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
	4780	goto min0_maxINF_wildcard_forbidden;
	4781	}
	4782
	4783	reginsert(pRExC_state, STAR, ret, depth+1);
	4784	MARK_NAUGHTY(4);
	4785	goto done_main_op;
	4786	}
	4787	else if (min == 1) {
	4788	reginsert(pRExC_state, PLUS, ret, depth+1);
	4789	MARK_NAUGHTY(3);
	4790	goto done_main_op;
	4791	}
	4792	}
	4793
	4794	/* Here, SIMPLE, but not the '' and '+' special cases /
	4795
	4796	MARK_NAUGHTY_EXP(2, 2);
	4797	reginsert(pRExC_state, CURLY, ret, depth+1);
	4798	}
	4799	else { /* not SIMPLE */
	4800	const regnode_offset w = reg_node(pRExC_state, WHILEM);
	4801
	4802	FLAGS(REGNODE_p(w)) = 0;
	4803	if (! REGTAIL(pRExC_state, ret, w)) {
	4804	REQUIRE_BRANCHJ(flagp, 0);
	4805	}
	4806	if (RExC_use_BRANCHJ) {
	4807	reginsert(pRExC_state, LONGJMP, ret, depth+1);
	4808	reginsert(pRExC_state, NOTHING, ret, depth+1);
	4809	REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP);
	4810	}
	4811	reginsert(pRExC_state, CURLYX, ret, depth+1);
	4812	if (RExC_use_BRANCHJ)
	4813	/* Go over NOTHING to LONGJMP. */
	4814	REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING);
	4815
	4816	if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
	4817	NOTHING)))
	4818	{
	4819	REQUIRE_BRANCHJ(flagp, 0);
	4820	}
	4821	RExC_whilem_seen++;
	4822	MARK_NAUGHTY_EXP(1, 4); /* compound interest */
	4823	}
	4824
	4825	/* Finish up the CURLY/CURLYX case */
	4826	FLAGS(REGNODE_p(ret)) = 0;
	4827
	4828	ARG1i_SET(REGNODE_p(ret), min);
	4829	ARG2i_SET(REGNODE_p(ret), max);
	4830
	4831	/* if we had a npar_after then we need to increment npar_before,
	4832	* we want to track the range of parens we need to reset each iteration
	4833	*/
	4834	if (npar_after!=npar_before) {
	4835	ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1);
	4836	ARG3b_SET(REGNODE_p(ret), (U16)npar_after);
	4837	} else {
	4838	ARG3a_SET(REGNODE_p(ret), 0);
	4839	ARG3b_SET(REGNODE_p(ret), 0);
	4840	}
	4841
	4842	done_main_op:
	4843
	4844	/* Process any greediness modifiers */
	4845	if (*RExC_parse == '?') {
	4846	nextchar(pRExC_state);
	4847	reginsert(pRExC_state, MINMOD, ret, depth+1);
	4848	if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
	4849	REQUIRE_BRANCHJ(flagp, 0);
	4850	}
	4851	}
	4852	else if (*RExC_parse == '+') {
	4853	regnode_offset ender;
	4854	nextchar(pRExC_state);
	4855	ender = reg_node(pRExC_state, SUCCEED);
	4856	if (! REGTAIL(pRExC_state, ret, ender)) {
	4857	REQUIRE_BRANCHJ(flagp, 0);
	4858	}
	4859	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	4860	ender = reg_node(pRExC_state, TAIL);
	4861	if (! REGTAIL(pRExC_state, ret, ender)) {
	4862	REQUIRE_BRANCHJ(flagp, 0);
	4863	}
	4864	}
	4865
	4866	/* Forbid extra quantifiers */
	4867	if (isQUANTIFIER(RExC_parse, RExC_end)) {
	4868	RExC_parse_inc_by(1);
	4869	vFAIL("Nested quantifiers");
	4870	}
	4871
	4872	return(ret);
	4873
	4874	min0_maxINF_wildcard_forbidden:
	4875
	4876	/* Here we are in a wildcard match, and the minimum match length is 0, and
	4877	* the max could be infinity. This is currently forbidden. The only
	4878	* reason is to make it harder to write patterns that take a long long time
	4879	* to halt, and because the use of this construct isn't necessary in
	4880	* matching Unicode property values */
	4881	RExC_parse_inc_by(1);
	4882	/* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
	4883	subpatterns in regex; marked by <-- HERE in m/%s/
	4884	*/
	4885	vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
	4886	" subpatterns");
	4887
	4888	/* Note, don't need to worry about the input being '{0,}', as a '}' isn't
	4889	* legal at all in wildcards, so can't get this far */
	4890
	4891	NOT_REACHED; /NOTREACHED/
	4892	}
	4893
	4894	STATIC bool
	4895	S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
	4896	regnode_offset * node_p,
	4897	UV * code_point_p,
	4898	int * cp_count,
	4899	I32 * flagp,
	4900	const bool strict,
	4901	const U32 depth
	4902	)
	4903	{
	4904	/* This routine teases apart the various meanings of \N and returns
	4905	* accordingly. The input parameters constrain which meaning(s) is/are valid
	4906	* in the current context.
	4907	*
	4908	* Exactly one of <node_p> and <code_point_p> must be non-NULL.
	4909	*
	4910	* If <code_point_p> is not NULL, the context is expecting the result to be a
	4911	* single code point. If this \N instance turns out to a single code point,
	4912	* the function returns TRUE and sets *code_point_p to that code point.
	4913	*
	4914	* If <node_p> is not NULL, the context is expecting the result to be one of
	4915	* the things representable by a regnode. If this \N instance turns out to be
	4916	* one such, the function generates the regnode, returns TRUE and sets *node_p
	4917	* to point to the offset of that regnode into the regex engine program being
	4918	* compiled.
	4919	*
	4920	* If this instance of \N isn't legal in any context, this function will
	4921	* generate a fatal error and not return.
	4922	*
	4923	* On input, RExC_parse should point to the first char following the \N at the
	4924	* time of the call. On successful return, RExC_parse will have been updated
	4925	* to point to just after the sequence identified by this routine. Also
	4926	* *flagp has been updated as needed.
	4927	*
	4928	* When there is some problem with the current context and this \N instance,
	4929	* the function returns FALSE, without advancing RExC_parse, nor setting
	4930	* node_p, nor code_point_p, nor *flagp.
	4931	*
	4932	* If <cp_count> is not NULL, the caller wants to know the length (in code
	4933	* points) that this \N sequence matches. This is set, and the input is
	4934	* parsed for errors, even if the function returns FALSE, as detailed below.
	4935	*
	4936	* There are 6 possibilities here, as detailed in the next 6 paragraphs.
	4937	*
	4938	* Probably the most common case is for the \N to specify a single code point.
	4939	* cp_count will be set to 1, and code_point_p will be set to that code
	4940	* point.
	4941	*
	4942	* Another possibility is for the input to be an empty \N{}. This is no
	4943	* longer accepted, and will generate a fatal error.
	4944	*
	4945	* Another possibility is for a custom charnames handler to be in effect which
	4946	* translates the input name to an empty string. *cp_count will be set to 0.
	4947	* *node_p will be set to a generated NOTHING node.
	4948	*
	4949	* Still another possibility is for the \N to mean [^\n]. *cp_count will be
	4950	* set to 0. *node_p will be set to a generated REG_ANY node.
	4951	*
	4952	* The fifth possibility is that \N resolves to a sequence of more than one
	4953	* code points. *cp_count will be set to the number of code points in the
	4954	* sequence. *node_p will be set to a generated node returned by this
	4955	* function calling S_reg().
	4956	*
	4957	* The sixth and final possibility is that it is premature to be calling this
	4958	* function; the parse needs to be restarted. This can happen when this
	4959	* changes from /d to /u rules, or when the pattern needs to be upgraded to
	4960	* UTF-8. The latter occurs only when the fifth possibility would otherwise
	4961	* be in effect, and is because one of those code points requires the pattern
	4962	* to be recompiled as UTF-8. The function returns FALSE, and sets the
	4963	* RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
	4964	* happens, the caller needs to desist from continuing parsing, and return
	4965	* this information to its caller. This is not set for when there is only one
	4966	* code point, as this can be called as part of an ANYOF node, and they can
	4967	* store above-Latin1 code points without the pattern having to be in UTF-8.
	4968	*
	4969	* For non-single-quoted regexes, the tokenizer has resolved character and
	4970	* sequence names inside \N{...} into their Unicode values, normalizing the
	4971	* result into what we should see here: '\N{U+c1.c2...}', where c1... are the
	4972	* hex-represented code points in the sequence. This is done there because
	4973	* the names can vary based on what charnames pragma is in scope at the time,
	4974	* so we need a way to take a snapshot of what they resolve to at the time of
	4975	* the original parse. [perl #56444].
	4976	*
	4977	* That parsing is skipped for single-quoted regexes, so here we may get
	4978	* '\N{NAME}', which is parsed now. If the single-quoted regex is something
	4979	* like '\N{U+41}', that code point is Unicode, and has to be translated into
	4980	* the native character set for non-ASCII platforms. The other possibilities
	4981	* are already native, so no translation is done. */
	4982
	4983	char * endbrace; /* points to '}' following the name */
	4984	char * e; /* points to final non-blank before endbrace */
	4985	char* p = RExC_parse; /* Temporary */
	4986
	4987	SV * substitute_parse = NULL;
	4988	char *orig_end;
	4989	char *save_start;
	4990	I32 flags;
	4991
	4992	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	4993
	4994	PERL_ARGS_ASSERT_GROK_BSLASH_N;
	4995
	4996	assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
	4997	assert(! (node_p && cp_count)); /* At most 1 should be set */
	4998
	4999	if (cp_count) { /* Initialize return for the most common case */
	5000	*cp_count = 1;
	5001	}
	5002
	5003	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	5004	* modifier. The other meanings do not (except blanks adjacent to and
	5005	* within the braces), so use a temporary until we find out which we are
	5006	* being called with */
	5007	skip_to_be_ignored_text(pRExC_state, &p,
	5008	FALSE /* Don't force to /x */ );
	5009
	5010	/* Disambiguate between \N meaning a named character versus \N meaning
	5011	* [^\n]. The latter is assumed when the {...} following the \N is a legal
	5012	* quantifier, or if there is no '{' at all */
	5013	if (*p != '{' \|\| regcurly(p, RExC_end, NULL)) {
	5014	RExC_parse_set(p);
	5015	if (cp_count) {
	5016	*cp_count = -1;
	5017	}
	5018
	5019	if (! node_p) {
	5020	return FALSE;
	5021	}
	5022
	5023	*node_p = reg_node(pRExC_state, REG_ANY);
	5024	*flagp \|= HASWIDTH\|SIMPLE;
	5025	MARK_NAUGHTY(1);
	5026	return TRUE;
	5027	}
	5028
	5029	/* The test above made sure that the next real character is a '{', but
	5030	* under the /x modifier, it could be separated by space (or a comment and
	5031	* \n) and this is not allowed (for consistency with \x{...} and the
	5032	* tokenizer handling of \N{NAME}). */
	5033	if (*RExC_parse != '{') {
	5034	vFAIL("Missing braces on \\N{}");
	5035	}
	5036
	5037	RExC_parse_inc_by(1); /* Skip past the '{' */
	5038
	5039	endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
	5040	if (! endbrace) { /* no trailing brace */
	5041	vFAIL2("Missing right brace on \\%c{}", 'N');
	5042	}
	5043
	5044	/* Here, we have decided it should be a named character or sequence. These
	5045	* imply Unicode semantics */
	5046	REQUIRE_UNI_RULES(flagp, FALSE);
	5047
	5048	/* \N{_} is what toke.c returns to us to indicate a name that evaluates to
	5049	* nothing at all (not allowed under strict) */
	5050	if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
	5051	RExC_parse_set(endbrace);
	5052	if (strict) {
	5053	RExC_parse_inc_by(1); /* Position after the "}" */
	5054	vFAIL("Zero length \\N{}");
	5055	}
	5056
	5057	if (cp_count) {
	5058	*cp_count = 0;
	5059	}
	5060	nextchar(pRExC_state);
	5061	if (! node_p) {
	5062	return FALSE;
	5063	}
	5064
	5065	*node_p = reg_node(pRExC_state, NOTHING);
	5066	return TRUE;
	5067	}
	5068
	5069	while (isBLANK(*RExC_parse)) {
	5070	RExC_parse_inc_by(1);
	5071	}
	5072
	5073	e = endbrace;
	5074	while (RExC_parse < e && isBLANK(*(e-1))) {
	5075	e--;
	5076	}
	5077
	5078	if (e - RExC_parse < 2 \|\| ! strBEGINs(RExC_parse, "U+")) {
	5079
	5080	/* Here, the name isn't of the form U+.... This can happen if the
	5081	* pattern is single-quoted, so didn't get evaluated in toke.c. Now
	5082	* is the time to find out what the name means */
	5083
	5084	const STRLEN name_len = e - RExC_parse;
	5085	SV * value_sv; /* What does this name evaluate to */
	5086	SV ** value_svp;
	5087	const U8 * value; /* string of name's value */
	5088	STRLEN value_len; /* and its length */
	5089
	5090	/* RExC_unlexed_names is a hash of names that weren't evaluated by
	5091	* toke.c, and their values. Make sure is initialized */
	5092	if (! RExC_unlexed_names) {
	5093	RExC_unlexed_names = newHV();
	5094	}
	5095
	5096	/* If we have already seen this name in this pattern, use that. This
	5097	* allows us to only call the charnames handler once per name per
	5098	* pattern. A broken or malicious handler could return something
	5099	* different each time, which could cause the results to vary depending
	5100	* on if something gets added or subtracted from the pattern that
	5101	* causes the number of passes to change, for example */
	5102	if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
	5103	name_len, 0)))
	5104	{
	5105	value_sv = *value_svp;
	5106	}
	5107	else { /* Otherwise we have to go out and get the name */
	5108	const char * error_msg = NULL;
	5109	value_sv = get_and_check_backslash_N_name(RExC_parse, e,
	5110	UTF,
	5111	&error_msg);
	5112	if (error_msg) {
	5113	RExC_parse_set(endbrace);
	5114	vFAIL(error_msg);
	5115	}
	5116
	5117	/* If no error message, should have gotten a valid return */
	5118	assert (value_sv);
	5119
	5120	/* Save the name's meaning for later use */
	5121	if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
	5122	value_sv, 0))
	5123	{
	5124	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	5125	}
	5126	}
	5127
	5128	/* Here, we have the value the name evaluates to in 'value_sv' */
	5129	value = (U8 *) SvPV(value_sv, value_len);
	5130
	5131	/* See if the result is one code point vs 0 or multiple */
	5132	if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
	5133	? UTF8SKIP(value)
	5134	: 1)))
	5135	{
	5136	/* Here, exactly one code point. If that isn't what is wanted,
	5137	* fail */
	5138	if (! code_point_p) {
	5139	RExC_parse_set(p);
	5140	return FALSE;
	5141	}
	5142
	5143	/* Convert from string to numeric code point */
	5144	*code_point_p = (SvUTF8(value_sv))
	5145	? valid_utf8_to_uvchr(value, NULL)
	5146	: *value;
	5147
	5148	/* Have parsed this entire single code point \N{...}. *cp_count
	5149	* has already been set to 1, so don't do it again. */
	5150	RExC_parse_set(endbrace);
	5151	nextchar(pRExC_state);
	5152	return TRUE;
	5153	} /* End of is a single code point */
	5154
	5155	/* Count the code points, if caller desires. The API says to do this
	5156	* even if we will later return FALSE */
	5157	if (cp_count) {
	5158	*cp_count = 0;
	5159
	5160	*cp_count = (SvUTF8(value_sv))
	5161	? utf8_length(value, value + value_len)
	5162	: value_len;
	5163	}
	5164
	5165	/* Fail if caller doesn't want to handle a multi-code-point sequence.
	5166	* But don't back the pointer up if the caller wants to know how many
	5167	* code points there are (they need to handle it themselves in this
	5168	* case). */
	5169	if (! node_p) {
	5170	if (! cp_count) {
	5171	RExC_parse_set(p);
	5172	}
	5173	return FALSE;
	5174	}
	5175
	5176	/* Convert this to a sub-pattern of the form "(?: ... )", and then call
	5177	* reg recursively to parse it. That way, it retains its atomicness,
	5178	* while not having to worry about any special handling that some code
	5179	* points may have. */
	5180
	5181	substitute_parse = newSVpvs("?:");
	5182	sv_catsv(substitute_parse, value_sv);
	5183	sv_catpv(substitute_parse, ")");
	5184
	5185	/* The value should already be native, so no need to convert on EBCDIC
	5186	* platforms.*/
	5187	assert(! RExC_recode_x_to_native);
	5188
	5189	}
	5190	else { /* \N{U+...} */
	5191	Size_t count = 0; /* code point count kept internally */
	5192
	5193	/* We can get to here when the input is \N{U+...} or when toke.c has
	5194	* converted a name to the \N{U+...} form. This include changing a
	5195	* name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
	5196
	5197	RExC_parse_inc_by(2); /* Skip past the 'U+' */
	5198
	5199	/* Code points are separated by dots. The '}' terminates the whole
	5200	* thing. */
	5201
	5202	do { /* Loop until the ending brace */
	5203	I32 flags = PERL_SCAN_SILENT_OVERFLOW
	5204	\| PERL_SCAN_SILENT_ILLDIGIT
	5205	\| PERL_SCAN_NOTIFY_ILLDIGIT
	5206	\| PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
	5207	\| PERL_SCAN_DISALLOW_PREFIX;
	5208	STRLEN len = e - RExC_parse;
	5209	NV overflow_value;
	5210	char * start_digit = RExC_parse;
	5211	UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
	5212
	5213	if (len == 0) {
	5214	RExC_parse_inc_by(1);
	5215	bad_NU:
	5216	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	5217	}
	5218
	5219	RExC_parse_inc_by(len);
	5220
	5221	if (cp > MAX_LEGAL_CP) {
	5222	vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
	5223	}
	5224
	5225	if (RExC_parse >= e) { /* Got to the closing '}' */
	5226	if (count) {
	5227	goto do_concat;
	5228	}
	5229
	5230	/* Here, is a single code point; fail if doesn't want that */
	5231	if (! code_point_p) {
	5232	RExC_parse_set(p);
	5233	return FALSE;
	5234	}
	5235
	5236	/* A single code point is easy to handle; just return it */
	5237	*code_point_p = UNI_TO_NATIVE(cp);
	5238	RExC_parse_set(endbrace);
	5239	nextchar(pRExC_state);
	5240	return TRUE;
	5241	}
	5242
	5243	/* Here, the parse stopped bfore the ending brace. This is legal
	5244	* only if that character is a dot separating code points, like a
	5245	* multiple character sequence (of the form "\N{U+c1.c2. ... }".
	5246	* So the next character must be a dot (and the one after that
	5247	* can't be the ending brace, or we'd have something like
	5248	* \N{U+100.} )
	5249	* */
	5250	if (*RExC_parse != '.' \|\| RExC_parse + 1 >= e) {
	5251	/point to after 1st invalid /
	5252	RExC_parse_incf(RExC_orig_utf8);
	5253	/Guard against malformed utf8/
	5254	RExC_parse_set(MIN(e, RExC_parse));
	5255	goto bad_NU;
	5256	}
	5257
	5258	/* Here, looks like its really a multiple character sequence. Fail
	5259	* if that's not what the caller wants. But continue with counting
	5260	* and error checking if they still want a count */
	5261	if (! node_p && ! cp_count) {
	5262	return FALSE;
	5263	}
	5264
	5265	/* What is done here is to convert this to a sub-pattern of the
	5266	* form \x{char1}\x{char2}... and then call reg recursively to
	5267	* parse it (enclosing in "(?: ... )" ). That way, it retains its
	5268	* atomicness, while not having to worry about special handling
	5269	* that some code points may have. We don't create a subpattern,
	5270	* but go through the motions of code point counting and error
	5271	* checking, if the caller doesn't want a node returned. */
	5272
	5273	if (node_p && ! substitute_parse) {
	5274	substitute_parse = newSVpvs("?:");
	5275	}
	5276
	5277	do_concat:
	5278
	5279	if (node_p) {
	5280	/* Convert to notation the rest of the code understands */
	5281	sv_catpvs(substitute_parse, "\\x{");
	5282	sv_catpvn(substitute_parse, start_digit,
	5283	RExC_parse - start_digit);
	5284	sv_catpvs(substitute_parse, "}");
	5285	}
	5286
	5287	/* Move to after the dot (or ending brace the final time through.)
	5288	* */
	5289	RExC_parse_inc_by(1);
	5290	count++;
	5291
	5292	} while (RExC_parse < e);
	5293
	5294	if (! node_p) { /* Doesn't want the node */
	5295	assert (cp_count);
	5296
	5297	*cp_count = count;
	5298	return FALSE;
	5299	}
	5300
	5301	sv_catpvs(substitute_parse, ")");
	5302
	5303	/* The values are Unicode, and therefore have to be converted to native
	5304	* on a non-Unicode (meaning non-ASCII) platform. */
	5305	SET_recode_x_to_native(1);
	5306	}
	5307
	5308	/* Here, we have the string the name evaluates to, ready to be parsed,
	5309	* stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
	5310	* constructs. This can be called from within a substitute parse already.
	5311	* The error reporting mechanism doesn't work for 2 levels of this, but the
	5312	* code above has validated this new construct, so there should be no
	5313	* errors generated by the below. And this isn't an exact copy, so the
	5314	* mechanism to seamlessly deal with this won't work, so turn off warnings
	5315	* during it */
	5316	save_start = RExC_start;
	5317	orig_end = RExC_end;
	5318
	5319	RExC_start = SvPVX(substitute_parse);
	5320	RExC_parse_set(RExC_start);
	5321	RExC_end = RExC_parse + SvCUR(substitute_parse);
	5322	TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
	5323
	5324	*node_p = reg(pRExC_state, 1, &flags, depth+1);
	5325
	5326	/* Restore the saved values */
	5327	RESTORE_WARNINGS;
	5328	RExC_start = save_start;
	5329	RExC_parse_set(endbrace);
	5330	RExC_end = orig_end;
	5331	SET_recode_x_to_native(0);
	5332
	5333	SvREFCNT_dec_NN(substitute_parse);
	5334
	5335	if (! *node_p) {
	5336	RETURN_FAIL_ON_RESTART(flags, flagp);
	5337	FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
	5338	(UV) flags);
	5339	}
	5340	*flagp \|= flags&(HASWIDTH\|SIMPLE\|POSTPONED);
	5341
	5342	nextchar(pRExC_state);
	5343
	5344	return TRUE;
	5345	}
	5346
	5347
	5348	STATIC U8
	5349	S_compute_EXACTish(RExC_state_t *pRExC_state)
	5350	{
	5351	U8 op;
	5352
	5353	PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
	5354
	5355	if (! FOLD) {
	5356	return (LOC)
	5357	? EXACTL
	5358	: EXACT;
	5359	}
	5360
	5361	op = get_regex_charset(RExC_flags);
	5362	if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
	5363	op--; /* /a is same as /u, and map /aa's offset to what /a's would have
	5364	been, so there is no hole */
	5365	}
	5366
	5367	return op + EXACTF;
	5368	}
	5369
	5370	/* Parse backref decimal value, unless it's too big to sensibly be a backref,
	5371	* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
	5372
	5373	static I32
	5374	S_backref_value(char p, char e)
	5375	{
	5376	const char* endptr = e;
	5377	UV val;
	5378	if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
	5379	return (I32)val;
	5380	return I32_MAX;
	5381	}
	5382
	5383
	5384	/*
	5385	- regatom - the lowest level
	5386
	5387	Try to identify anything special at the start of the current parse position.
	5388	If there is, then handle it as required. This may involve generating a
	5389	single regop, such as for an assertion; or it may involve recursing, such as
	5390	to handle a () structure.
	5391
	5392	If the string doesn't start with something special then we gobble up
	5393	as much literal text as we can. If we encounter a quantifier, we have to
	5394	back off the final literal character, as that quantifier applies to just it
	5395	and not to the whole string of literals.
	5396
	5397	Once we have been able to handle whatever type of thing started the
	5398	sequence, we return the offset into the regex engine program being compiled
	5399	at which any next regnode should be placed.
	5400
	5401	Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
	5402	Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
	5403	restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
	5404	Otherwise does not return 0.
	5405
	5406	Note: we have to be careful with escapes, as they can be both literal
	5407	and special, and in the case of \10 and friends, context determines which.
	5408
	5409	A summary of the code structure is:
	5410
	5411	switch (first_byte) {
	5412	cases for each special:
	5413	handle this special;
	5414	break;
	5415	case '\\':
	5416	switch (2nd byte) {
	5417	cases for each unambiguous special:
	5418	handle this special;
	5419	break;
	5420	cases for each ambiguous special/literal:
	5421	disambiguate;
	5422	if (special) handle here
	5423	else goto defchar;
	5424	default: // unambiguously literal:
	5425	goto defchar;
	5426	}
	5427	default: // is a literal char
	5428	// FALL THROUGH
	5429	defchar:
	5430	create EXACTish node for literal;
	5431	while (more input and node isn't full) {
	5432	switch (input_byte) {
	5433	cases for each special;
	5434	make sure parse pointer is set so that the next call to
	5435	regatom will see this special first
	5436	goto loopdone; // EXACTish node terminated by prev. char
	5437	default:
	5438	append char to EXACTISH node;
	5439	}
	5440	get next input byte;
	5441	}
	5442	loopdone:
	5443	}
	5444	return the generated node;
	5445
	5446	Specifically there are two separate switches for handling
	5447	escape sequences, with the one for handling literal escapes requiring
	5448	a dummy entry for all of the special escapes that are actually handled
	5449	by the other.
	5450
	5451	*/
	5452
	5453	STATIC regnode_offset
	5454	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	5455	{
	5456	regnode_offset ret = 0;
	5457	I32 flags = 0;
	5458	char *atom_parse_start;
	5459	U8 op;
	5460	int invert = 0;
	5461
	5462	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	5463
	5464	flagp = 0; / Initialize. */
	5465
	5466	DEBUG_PARSE("atom");
	5467
	5468	PERL_ARGS_ASSERT_REGATOM;
	5469
	5470	tryagain:
	5471	atom_parse_start = RExC_parse;
	5472	assert(RExC_parse < RExC_end);
	5473	switch ((U8)*RExC_parse) {
	5474	case '^':
	5475	RExC_seen_zerolen++;
	5476	nextchar(pRExC_state);
	5477	if (RExC_flags & RXf_PMf_MULTILINE)
	5478	ret = reg_node(pRExC_state, MBOL);
	5479	else
	5480	ret = reg_node(pRExC_state, SBOL);
	5481	break;
	5482	case '$':
	5483	nextchar(pRExC_state);
	5484	if (*RExC_parse)
	5485	RExC_seen_zerolen++;
	5486	if (RExC_flags & RXf_PMf_MULTILINE)
	5487	ret = reg_node(pRExC_state, MEOL);
	5488	else
	5489	ret = reg_node(pRExC_state, SEOL);
	5490	break;
	5491	case '.':
	5492	nextchar(pRExC_state);
	5493	if (RExC_flags & RXf_PMf_SINGLELINE)
	5494	ret = reg_node(pRExC_state, SANY);
	5495	else
	5496	ret = reg_node(pRExC_state, REG_ANY);
	5497	*flagp \|= HASWIDTH\|SIMPLE;
	5498	MARK_NAUGHTY(1);
	5499	break;
	5500	case '[':
	5501	{
	5502	char * const cc_parse_start = ++RExC_parse;
	5503	ret = regclass(pRExC_state, flagp, depth+1,
	5504	FALSE, /* means parse the whole char class */
	5505	TRUE, /* allow multi-char folds */
	5506	FALSE, /* don't silence non-portable warnings. */
	5507	(bool) RExC_strict,
	5508	TRUE, /* Allow an optimized regnode result */
	5509	NULL);
	5510	if (ret == 0) {
	5511	RETURN_FAIL_ON_RESTART_FLAGP(flagp);
	5512	FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
	5513	(UV) *flagp);
	5514	}
	5515	if (*RExC_parse != ']') {
	5516	RExC_parse_set(cc_parse_start);
	5517	vFAIL("Unmatched [");
	5518	}
	5519	nextchar(pRExC_state);
	5520	break;
	5521	}
	5522	case '(':
	5523	nextchar(pRExC_state);
	5524	ret = reg(pRExC_state, 2, &flags, depth+1);
	5525	if (ret == 0) {
	5526	if (flags & TRYAGAIN) {
	5527	if (RExC_parse >= RExC_end) {
	5528	/* Make parent create an empty node if needed. */
	5529	*flagp \|= TRYAGAIN;
	5530	return(0);
	5531	}
	5532	goto tryagain;
	5533	}
	5534	RETURN_FAIL_ON_RESTART(flags, flagp);
	5535	FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
	5536	(UV) flags);
	5537	}
	5538	*flagp \|= flags&(HASWIDTH\|SIMPLE\|POSTPONED);
	5539	break;
	5540	case '\|':
	5541	case ')':
	5542	if (flags & TRYAGAIN) {
	5543	*flagp \|= TRYAGAIN;
	5544	return 0;
	5545	}
	5546	vFAIL("Internal urp");
	5547	/* Supposed to be caught earlier. */
	5548	break;
	5549	case '?':
	5550	case '+':
	5551	case '*':
	5552	RExC_parse_inc_by(1);
	5553	vFAIL("Quantifier follows nothing");
	5554	break;
	5555	case '\\':
	5556	/* Special Escapes
	5557
	5558	This switch handles escape sequences that resolve to some kind
	5559	of special regop and not to literal text. Escape sequences that
	5560	resolve to literal text are handled below in the switch marked
	5561	"Literal Escapes".
	5562
	5563	Every entry in this switch must have a corresponding entry
	5564	in the literal escape switch. However, the opposite is not
	5565	required, as the default for this switch is to jump to the
	5566	literal text handling code.
	5567	*/
	5568	RExC_parse_inc_by(1);
	5569	switch ((U8)*RExC_parse) {
	5570	/* Special Escapes */
	5571	case 'A':
	5572	RExC_seen_zerolen++;
	5573	/* Under wildcards, this is changed to match \n; should be
	5574	* invisible to the user, as they have to compile under /m */
	5575	if (RExC_pm_flags & PMf_WILDCARD) {
	5576	ret = reg_node(pRExC_state, MBOL);
	5577	}
	5578	else {
	5579	ret = reg_node(pRExC_state, SBOL);
	5580	/* SBOL is shared with /^/ so we set the flags so we can tell
	5581	* /\A/ from /^/ in split. */
	5582	FLAGS(REGNODE_p(ret)) = 1;
	5583	}
	5584	goto finish_meta_pat;
	5585	case 'G':
	5586	if (RExC_pm_flags & PMf_WILDCARD) {
	5587	RExC_parse_inc_by(1);
	5588	/* diag_listed_as: Use of %s is not allowed in Unicode property
	5589	wildcard subpatterns in regex; marked by <-- HERE in m/%s/
	5590	*/
	5591	vFAIL("Use of '\\G' is not allowed in Unicode property"
	5592	" wildcard subpatterns");
	5593	}
	5594	ret = reg_node(pRExC_state, GPOS);
	5595	RExC_seen \|= REG_GPOS_SEEN;
	5596	goto finish_meta_pat;
	5597	case 'K':
	5598	if (!RExC_in_lookaround) {
	5599	RExC_seen_zerolen++;
	5600	ret = reg_node(pRExC_state, KEEPS);
	5601	/* XXX:dmq : disabling in-place substitution seems to
	5602	* be necessary here to avoid cases of memory corruption, as
	5603	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	5604	*/
	5605	RExC_seen \|= REG_LOOKBEHIND_SEEN;
	5606	goto finish_meta_pat;
	5607	}
	5608	else {
	5609	++RExC_parse; /* advance past the 'K' */
	5610	vFAIL("\\K not permitted in lookahead/lookbehind");
	5611	}
	5612	case 'Z':
	5613	if (RExC_pm_flags & PMf_WILDCARD) {
	5614	/* See comment under \A above */
	5615	ret = reg_node(pRExC_state, MEOL);
	5616	}
	5617	else {
	5618	ret = reg_node(pRExC_state, SEOL);
	5619	}
	5620	RExC_seen_zerolen++; /* Do not optimize RE away */
	5621	goto finish_meta_pat;
	5622	case 'z':
	5623	if (RExC_pm_flags & PMf_WILDCARD) {
	5624	/* See comment under \A above */
	5625	ret = reg_node(pRExC_state, MEOL);
	5626	}
	5627	else {
	5628	ret = reg_node(pRExC_state, EOS);
	5629	}
	5630	RExC_seen_zerolen++; /* Do not optimize RE away */
	5631	goto finish_meta_pat;
	5632	case 'C':
	5633	vFAIL("\\C no longer supported");
	5634	case 'X':
	5635	ret = reg_node(pRExC_state, CLUMP);
	5636	*flagp \|= HASWIDTH;
	5637	goto finish_meta_pat;
	5638
	5639	case 'B':
	5640	invert = 1;
	5641	/* FALLTHROUGH */
	5642	case 'b':
	5643	{
	5644	U8 flags = 0;
	5645	regex_charset charset = get_regex_charset(RExC_flags);
	5646
	5647	RExC_seen_zerolen++;
	5648	RExC_seen \|= REG_LOOKBEHIND_SEEN;
	5649	op = BOUND + charset;
	5650
	5651	if (RExC_parse >= RExC_end \|\| *(RExC_parse + 1) != '{') {
	5652	flags = TRADITIONAL_BOUND;
	5653	if (op > BOUNDA) { /* /aa is same as /a */
	5654	op = BOUNDA;
	5655	}
	5656	}
	5657	else {
	5658	STRLEN length;
	5659	char name = *RExC_parse;
	5660	char * endbrace = (char *) memchr(RExC_parse, '}',
	5661	RExC_end - RExC_parse);
	5662	char * e = endbrace;
	5663
	5664	RExC_parse_inc_by(2);
	5665
	5666	if (! endbrace) {
	5667	vFAIL2("Missing right brace on \\%c{}", name);
	5668	}
	5669
	5670	while (isBLANK(*RExC_parse)) {
	5671	RExC_parse_inc_by(1);
	5672	}
	5673
	5674	while (RExC_parse < e && isBLANK(*(e - 1))) {
	5675	e--;
	5676	}
	5677
	5678	if (e == RExC_parse) {
	5679	RExC_parse_set(endbrace + 1); /* After the '}' */
	5680	vFAIL2("Empty \\%c{}", name);
	5681	}
	5682
	5683	length = e - RExC_parse;
	5684
	5685	switch (*RExC_parse) {
	5686	case 'g':
	5687	if ( length != 1
	5688	&& (memNEs(RExC_parse + 1, length - 1, "cb")))
	5689	{
	5690	goto bad_bound_type;
	5691	}
	5692	flags = GCB_BOUND;
	5693	break;
	5694	case 'l':
	5695	if (length != 2 \|\| *(RExC_parse + 1) != 'b') {
	5696	goto bad_bound_type;
	5697	}
	5698	flags = LB_BOUND;
	5699	break;
	5700	case 's':
	5701	if (length != 2 \|\| *(RExC_parse + 1) != 'b') {
	5702	goto bad_bound_type;
	5703	}
	5704	flags = SB_BOUND;
	5705	break;
	5706	case 'w':
	5707	if (length != 2 \|\| *(RExC_parse + 1) != 'b') {
	5708	goto bad_bound_type;
	5709	}
	5710	flags = WB_BOUND;
	5711	break;
	5712	default:
	5713	bad_bound_type:
	5714	RExC_parse_set(e);
	5715	vFAIL2utf8f(
	5716	"'%" UTF8f "' is an unknown bound type",
	5717	UTF8fARG(UTF, length, e - length));
	5718	NOT_REACHED; /NOTREACHED/
	5719	}
	5720	RExC_parse_set(endbrace);
	5721	REQUIRE_UNI_RULES(flagp, 0);
	5722
	5723	if (op == BOUND) {
	5724	op = BOUNDU;
	5725	}
	5726	else if (op >= BOUNDA) { /* /aa is same as /a */
	5727	op = BOUNDU;
	5728	length += 4;
	5729
	5730	/* Don't have to worry about UTF-8, in this message because
	5731	* to get here the contents of the \b must be ASCII */
	5732	ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
	5733	"Using /u for '%.*s' instead of /%s",
	5734	(unsigned) length,
	5735	endbrace - length + 1,
	5736	(charset == REGEX_ASCII_RESTRICTED_CHARSET)
	5737	? ASCII_RESTRICT_PAT_MODS
	5738	: ASCII_MORE_RESTRICT_PAT_MODS);
	5739	}
	5740	}
	5741
	5742	if (op == BOUND) {
	5743	RExC_seen_d_op = TRUE;
	5744	}
	5745	else if (op == BOUNDL) {
	5746	RExC_contains_locale = 1;
	5747	}
	5748
	5749	if (invert) {
	5750	op += NBOUND - BOUND;
	5751	}
	5752
	5753	ret = reg_node(pRExC_state, op);
	5754	FLAGS(REGNODE_p(ret)) = flags;
	5755
	5756	goto finish_meta_pat;
	5757	}
	5758
	5759	case 'R':
	5760	ret = reg_node(pRExC_state, LNBREAK);
	5761	*flagp \|= HASWIDTH\|SIMPLE;
	5762	goto finish_meta_pat;
	5763
	5764	case 'd':
	5765	case 'D':
	5766	case 'h':
	5767	case 'H':
	5768	case 'p':
	5769	case 'P':
	5770	case 's':
	5771	case 'S':
	5772	case 'v':
	5773	case 'V':
	5774	case 'w':
	5775	case 'W':
	5776	/* These all have the same meaning inside [brackets], and it knows
	5777	* how to do the best optimizations for them. So, pretend we found
	5778	* these within brackets, and let it do the work */
	5779	RExC_parse--;
	5780
	5781	ret = regclass(pRExC_state, flagp, depth+1,
	5782	TRUE, /* means just parse this element */
	5783	FALSE, /* don't allow multi-char folds */
	5784	FALSE, /* don't silence non-portable warnings. It
	5785	would be a bug if these returned
	5786	non-portables */
	5787	(bool) RExC_strict,
	5788	TRUE, /* Allow an optimized regnode result */
	5789	NULL);
	5790	RETURN_FAIL_ON_RESTART_FLAGP(flagp);
	5791	/* regclass() can only return RESTART_PARSE and NEED_UTF8 if
	5792	* multi-char folds are allowed. */
	5793	if (!ret)
	5794	FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
	5795	(UV) *flagp);
	5796
	5797	RExC_parse--; /* regclass() leaves this one too far ahead */
	5798
	5799	finish_meta_pat:
	5800	/* The escapes above that don't take a parameter can't be
	5801	* followed by a '{'. But 'pX', 'p{foo}' and
	5802	* correspondingly 'P' can be */
	5803	if ( RExC_parse - atom_parse_start == 1
	5804	&& UCHARAT(RExC_parse + 1) == '{'
	5805	&& UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
	5806	{
	5807	RExC_parse_inc_by(2);
	5808	vFAIL("Unescaped left brace in regex is illegal here");
	5809	}
	5810	nextchar(pRExC_state);
	5811	break;
	5812	case 'N':
	5813	/* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
	5814	* \N{...} evaluates to a sequence of more than one code points).
	5815	* The function call below returns a regnode, which is our result.
	5816	* The parameters cause it to fail if the \N{} evaluates to a
	5817	* single code point; we handle those like any other literal. The
	5818	* reason that the multicharacter case is handled here and not as
	5819	* part of the EXACtish code is because of quantifiers. In
	5820	* /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
	5821	* this way makes that Just Happen. dmq.
	5822	* join_exact() will join this up with adjacent EXACTish nodes
	5823	* later on, if appropriate. */
	5824	++RExC_parse;
	5825	if (grok_bslash_N(pRExC_state,
	5826	&ret, /* Want a regnode returned */
	5827	NULL, /* Fail if evaluates to a single code
	5828	point */
	5829	NULL, /* Don't need a count of how many code
	5830	points */
	5831	flagp,
	5832	RExC_strict,
	5833	depth)
	5834	) {
	5835	break;
	5836	}
	5837
	5838	RETURN_FAIL_ON_RESTART_FLAGP(flagp);
	5839
	5840	/* Here, evaluates to a single code point. Go get that */
	5841	RExC_parse_set(atom_parse_start);
	5842	goto defchar;
	5843
	5844	case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
	5845	parse_named_seq: /* Also handle non-numeric \g{...} */
	5846	{
	5847	char ch;
	5848	if ( RExC_parse >= RExC_end - 1
	5849	\|\| (( ch = RExC_parse[1]) != '<'
	5850	&& ch != '\''
	5851	&& ch != '{'))
	5852	{
	5853	RExC_parse_inc_by(1);
	5854	/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
	5855	vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
	5856	} else {
	5857	RExC_parse_inc_by(2);
	5858	if (ch == '{') {
	5859	while (isBLANK(*RExC_parse)) {
	5860	RExC_parse_inc_by(1);
	5861	}
	5862	}
	5863	ret = handle_named_backref(pRExC_state,
	5864	flagp,
	5865	atom_parse_start,
	5866	(ch == '<')
	5867	? '>'
	5868	: (ch == '{')
	5869	? '}'
	5870	: '\'');
	5871	}
	5872	break;
	5873	}
	5874	case 'g':
	5875	case '1': case '2': case '3': case '4':
	5876	case '5': case '6': case '7': case '8': case '9':
	5877	{
	5878	I32 num;
	5879	char * endbrace = NULL;
	5880	char * s = RExC_parse;
	5881	char * e = RExC_end;
	5882
	5883	if (*s == 'g') {
	5884	bool isrel = 0;
	5885
	5886	s++;
	5887	if (*s == '{') {
	5888	endbrace = (char *) memchr(s, '}', RExC_end - s);
	5889	if (! endbrace ) {
	5890
	5891	/* Missing '}'. Position after the number to give
	5892	* a better indication to the user of where the
	5893	* problem is. */
	5894	s++;
	5895	if (*s == '-') {
	5896	s++;
	5897	}
	5898
	5899	/* If it looks to be a name and not a number, go
	5900	* handle it there */
	5901	if (! isDIGIT(*s)) {
	5902	goto parse_named_seq;
	5903	}
	5904
	5905	do {
	5906	s++;
	5907	} while isDIGIT(*s);
	5908
	5909	RExC_parse_set(s);
	5910	vFAIL("Unterminated \\g{...} pattern");
	5911	}
	5912
	5913	s++; /* Past the '{' */
	5914
	5915	while (isBLANK(*s)) {
	5916	s++;
	5917	}
	5918
	5919	/* Ignore trailing blanks */
	5920	e = endbrace;
	5921	while (s < e && isBLANK(*(e - 1))) {
	5922	e--;
	5923	}
	5924	}
	5925
	5926	/* Here, have isolated the meat of the construct from any
	5927	* surrounding braces */
	5928
	5929	if (*s == '-') {
	5930	isrel = 1;
	5931	s++;
	5932	}
	5933
	5934	if (endbrace && !isDIGIT(*s)) {
	5935	goto parse_named_seq;
	5936	}
	5937
	5938	RExC_parse_set(s);
	5939	num = S_backref_value(RExC_parse, RExC_end);
	5940	if (num == 0)
	5941	vFAIL("Reference to invalid group 0");
	5942	else if (num == I32_MAX) {
	5943	if (isDIGIT(*RExC_parse))
	5944	vFAIL("Reference to nonexistent group");
	5945	else
	5946	vFAIL("Unterminated \\g... pattern");
	5947	}
	5948
	5949	if (isrel) {
	5950	num = RExC_npar - num;
	5951	if (num < 1)
	5952	vFAIL("Reference to nonexistent or unclosed group");
	5953	}
	5954	else
	5955	if (num < RExC_logical_npar) {
	5956	num = RExC_logical_to_parno[num];
	5957	}
	5958	else
	5959	if (ALL_PARENS_COUNTED) {
	5960	if (num < RExC_logical_total_parens)
	5961	num = RExC_logical_to_parno[num];
	5962	else {
	5963	num = -1;
	5964	}
	5965	}
	5966	else{
	5967	REQUIRE_PARENS_PASS;
	5968	}
	5969	}
	5970	else {
	5971	num = S_backref_value(RExC_parse, RExC_end);
	5972	/* bare \NNN might be backref or octal - if it is larger
	5973	* than or equal RExC_npar then it is assumed to be an
	5974	* octal escape. Note RExC_npar is +1 from the actual
	5975	* number of parens. */
	5976	/* Note we do NOT check if num == I32_MAX here, as that is
	5977	* handled by the RExC_npar check */
	5978
	5979	if ( /* any numeric escape < 10 is always a backref */
	5980	num > 9
	5981	/* any numeric escape < RExC_npar is a backref */
	5982	&& num >= RExC_logical_npar
	5983	/* cannot be an octal escape if it starts with [89]
	5984	* */
	5985	&& ! inRANGE(*RExC_parse, '8', '9')
	5986	) {
	5987	/* Probably not meant to be a backref, instead likely
	5988	* to be an octal character escape, e.g. \35 or \777.
	5989	* The above logic should make it obvious why using
	5990	* octal escapes in patterns is problematic. - Yves */
	5991	RExC_parse_set(atom_parse_start);
	5992	goto defchar;
	5993	}
	5994	if (num < RExC_logical_npar) {
	5995	num = RExC_logical_to_parno[num];
	5996	}
	5997	else
	5998	if (ALL_PARENS_COUNTED) {
	5999	if (num < RExC_logical_total_parens) {
	6000	num = RExC_logical_to_parno[num];
	6001	} else {
	6002	num = -1;
	6003	}
	6004	} else {
	6005	REQUIRE_PARENS_PASS;
	6006	}
	6007	}
	6008
	6009	/* At this point RExC_parse points at a numeric escape like
	6010	* \12 or \88 or the digits in \g{34} or \g34 or something
	6011	* similar, which we should NOT treat as an octal escape. It
	6012	* may or may not be a valid backref escape. For instance
	6013	* \88888888 is unlikely to be a valid backref.
	6014	*
	6015	* We've already figured out what value the digits represent.
	6016	* Now, move the parse to beyond them. */
	6017	if (endbrace) {
	6018	RExC_parse_set(endbrace + 1);
	6019	}
	6020	else while (isDIGIT(*RExC_parse)) {
	6021	RExC_parse_inc_by(1);
	6022	}
	6023	if (num < 0)
	6024	vFAIL("Reference to nonexistent group");
	6025
	6026	if (num >= (I32)RExC_npar) {
	6027	/* It might be a forward reference; we can't fail until we
	6028	* know, by completing the parse to get all the groups, and
	6029	* then reparsing */
	6030	if (ALL_PARENS_COUNTED) {
	6031	if (num >= RExC_total_parens) {
	6032	vFAIL("Reference to nonexistent group");
	6033	}
	6034	}
	6035	else {
	6036	REQUIRE_PARENS_PASS;
	6037	}
	6038	}
	6039	RExC_sawback = 1;
	6040	ret = reg2node(pRExC_state,
	6041	((! FOLD)
	6042	? REF
	6043	: (ASCII_FOLD_RESTRICTED)
	6044	? REFFA
	6045	: (AT_LEAST_UNI_SEMANTICS)
	6046	? REFFU
	6047	: (LOC)
	6048	? REFFL
	6049	: REFF),
	6050	num, RExC_nestroot);
	6051	if (RExC_nestroot && num >= RExC_nestroot)
	6052	FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
	6053	if (OP(REGNODE_p(ret)) == REFF) {
	6054	RExC_seen_d_op = TRUE;
	6055	}
	6056	*flagp \|= HASWIDTH;
	6057
	6058	skip_to_be_ignored_text(pRExC_state, &RExC_parse,
	6059	FALSE /* Don't force to /x */ );
	6060	}
	6061	break;
	6062	case '\0':
	6063	if (RExC_parse >= RExC_end)
	6064	FAIL("Trailing \\");
	6065	/* FALLTHROUGH */
	6066	default:
	6067	/* Do not generate "unrecognized" warnings here, we fall
	6068	back into the quick-grab loop below */
	6069	RExC_parse_set(atom_parse_start);
	6070	goto defchar;
	6071	} /* end of switch on a \foo sequence */
	6072	break;
	6073
	6074	case '#':
	6075
	6076	/* '#' comments should have been spaced over before this function was
	6077	* called */
	6078	assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
	6079	/*
	6080	if (RExC_flags & RXf_PMf_EXTENDED) {
	6081	RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
	6082	if (RExC_parse < RExC_end)
	6083	goto tryagain;
	6084	}
	6085	*/
	6086
	6087	/* FALLTHROUGH */
	6088
	6089	default:
	6090	defchar: {
	6091
	6092	/* Here, we have determined that the next thing is probably a
	6093	* literal character. RExC_parse points to the first byte of its
	6094	* definition. (It still may be an escape sequence that evaluates
	6095	* to a single character) */
	6096
	6097	STRLEN len = 0;
	6098	UV ender = 0;
	6099	char *p;
	6100	char s, old_s = NULL, *old_old_s = NULL;
	6101	char *s0;
	6102	U32 max_string_len = 255;
	6103
	6104	/* We may have to reparse the node, artificially stopping filling
	6105	* it early, based on info gleaned in the first parse. This
	6106	* variable gives where we stop. Make it above the normal stopping
	6107	* place first time through; otherwise it would stop too early */
	6108	U32 upper_fill = max_string_len + 1;
	6109
	6110	/* We start out as an EXACT node, even if under /i, until we find a
	6111	* character which is in a fold. The algorithm now segregates into
	6112	* separate nodes, characters that fold from those that don't under
	6113	* /i. (This hopefully will create nodes that are fixed strings
	6114	* even under /i, giving the optimizer something to grab on to.)
	6115	* So, if a node has something in it and the next character is in
	6116	* the opposite category, that node is closed up, and the function
	6117	* returns. Then regatom is called again, and a new node is
	6118	* created for the new category. */
	6119	U8 node_type = EXACT;
	6120
	6121	/* Assume the node will be fully used; the excess is given back at
	6122	* the end. Under /i, we may need to temporarily add the fold of
	6123	* an extra character or two at the end to check for splitting
	6124	* multi-char folds, so allocate extra space for that. We can't
	6125	* make any other length assumptions, as a byte input sequence
	6126	* could shrink down. */
	6127	Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
	6128	+ ((! FOLD)
	6129	? 0
	6130	: 2 * ((UTF)
	6131	? UTF8_MAXBYTES_CASE
	6132	/* Max non-UTF-8 expansion is 2 */ : 2)));
	6133
	6134	bool next_is_quantifier;
	6135	char * oldp = NULL;
	6136
	6137	/* We can convert EXACTF nodes to EXACTFU if they contain only
	6138	* characters that match identically regardless of the target
	6139	* string's UTF8ness. The reason to do this is that EXACTF is not
	6140	* trie-able, EXACTFU is, and EXACTFU requires fewer operations at
	6141	* runtime.
	6142	*
	6143	* Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
	6144	* contain only above-Latin1 characters (hence must be in UTF8),
	6145	* which don't participate in folds with Latin1-range characters,
	6146	* as the latter's folds aren't known until runtime. */
	6147	bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS \|\| LOC);
	6148
	6149	/* Single-character EXACTish nodes are almost always SIMPLE. This
	6150	* allows us to override this as encountered */
	6151	U8 maybe_SIMPLE = SIMPLE;
	6152
	6153	/* Does this node contain something that can't match unless the
	6154	* target string is (also) in UTF-8 */
	6155	bool requires_utf8_target = FALSE;
	6156
	6157	/* The sequence 'ss' is problematic in non-UTF-8 patterns. */
	6158	bool has_ss = FALSE;
	6159
	6160	/* So is the MICRO SIGN */
	6161	bool has_micro_sign = FALSE;
	6162
	6163	/* Set when we fill up the current node and there is still more
	6164	* text to process */
	6165	bool overflowed;
	6166
	6167	/* Allocate an EXACT node. The node_type may change below to
	6168	* another EXACTish node, but since the size of the node doesn't
	6169	* change, it works */
	6170	ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
	6171	FILL_NODE(ret, node_type);
	6172	RExC_emit += NODE_STEP_REGNODE;
	6173
	6174	s = STRING(REGNODE_p(ret));
	6175
	6176	s0 = s;
	6177
	6178	reparse:
	6179
	6180	p = RExC_parse;
	6181	len = 0;
	6182	s = s0;
	6183	node_type = EXACT;
	6184	oldp = NULL;
	6185	maybe_exactfu = FOLD && (DEPENDS_SEMANTICS \|\| LOC);
	6186	maybe_SIMPLE = SIMPLE;
	6187	requires_utf8_target = FALSE;
	6188	has_ss = FALSE;
	6189	has_micro_sign = FALSE;
	6190
	6191	continue_parse:
	6192
	6193	/* This breaks under rare circumstances. If folding, we do not
	6194	* want to split a node at a character that is a non-final in a
	6195	* multi-char fold, as an input string could just happen to want to
	6196	* match across the node boundary. The code at the end of the loop
	6197	* looks for this, and backs off until it finds not such a
	6198	* character, but it is possible (though extremely, extremely
	6199	* unlikely) for all characters in the node to be non-final fold
	6200	* ones, in which case we just leave the node fully filled, and
	6201	* hope that it doesn't match the string in just the wrong place */
	6202
	6203	assert( ! UTF /* Is at the beginning of a character */
	6204	\|\| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
	6205	\|\| UTF8_IS_START(UCHARAT(RExC_parse)));
	6206
	6207	overflowed = FALSE;
	6208
	6209	/* Here, we have a literal character. Find the maximal string of
	6210	* them in the input that we can fit into a single EXACTish node.
	6211	* We quit at the first non-literal or when the node gets full, or
	6212	* under /i the categorization of folding/non-folding character
	6213	* changes */
	6214	while (p < RExC_end && len < upper_fill) {
	6215
	6216	/* In most cases each iteration adds one byte to the output.
	6217	* The exceptions override this */
	6218	Size_t added_len = 1;
	6219
	6220	oldp = p;
	6221	old_old_s = old_s;
	6222	old_s = s;
	6223
	6224	/* White space has already been ignored */
	6225	assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
	6226	\|\| ! is_PATWS_safe((p), RExC_end, UTF));
	6227
	6228	switch ((U8)*p) {
	6229	const char* message;
	6230	U32 packed_warn;
	6231	U8 grok_c_char;
	6232
	6233	case '^':
	6234	case '$':
	6235	case '.':
	6236	case '[':
	6237	case '(':
	6238	case ')':
	6239	case '\|':
	6240	goto loopdone;
	6241	case '\\':
	6242	/* Literal Escapes Switch
	6243
	6244	This switch is meant to handle escape sequences that
	6245	resolve to a literal character.
	6246
	6247	Every escape sequence that represents something
	6248	else, like an assertion or a char class, is handled
	6249	in the switch marked 'Special Escapes' above in this
	6250	routine, but also has an entry here as anything that
	6251	isn't explicitly mentioned here will be treated as
	6252	an unescaped equivalent literal.
	6253	*/
	6254
	6255	switch ((U8)*++p) {
	6256
	6257	/* These are all the special escapes. */
	6258	case 'A': /* Start assertion */
	6259	case 'b': case 'B': /* Word-boundary assertion*/
	6260	case 'C': /* Single char !DANGEROUS! */
	6261	case 'd': case 'D': /* digit class */
	6262	case 'g': case 'G': /* generic-backref, pos assertion */
	6263	case 'h': case 'H': /* HORIZWS */
	6264	case 'k': case 'K': /* named backref, keep marker */
	6265	case 'p': case 'P': /* Unicode property */
	6266	case 'R': /* LNBREAK */
	6267	case 's': case 'S': /* space class */
	6268	case 'v': case 'V': /* VERTWS */
	6269	case 'w': case 'W': /* word class */
	6270	case 'X': /* eXtended Unicode "combining
	6271	character sequence" */
	6272	case 'z': case 'Z': /* End of line/string assertion */
	6273	--p;
	6274	goto loopdone;
	6275
	6276	/* Anything after here is an escape that resolves to a
	6277	literal. (Except digits, which may or may not)
	6278	*/
	6279	case 'n':
	6280	ender = '\n';
	6281	p++;
	6282	break;
	6283	case 'N': /* Handle a single-code point named character. */
	6284	RExC_parse_set( p + 1 );
	6285	if (! grok_bslash_N(pRExC_state,
	6286	NULL, /* Fail if evaluates to
	6287	anything other than a
	6288	single code point */
	6289	&ender, /* The returned single code
	6290	point */
	6291	NULL, /* Don't need a count of
	6292	how many code points */
	6293	flagp,
	6294	RExC_strict,
	6295	depth)
	6296	) {
	6297	if (*flagp & NEED_UTF8)
	6298	FAIL("panic: grok_bslash_N set NEED_UTF8");
	6299	RETURN_FAIL_ON_RESTART_FLAGP(flagp);
	6300
	6301	/* Here, it wasn't a single code point. Go close
	6302	* up this EXACTish node. The switch() prior to
	6303	* this switch handles the other cases */
	6304	p = oldp;
	6305	RExC_parse_set(p);
	6306	goto loopdone;
	6307	}
	6308	p = RExC_parse;
	6309	RExC_parse_set(atom_parse_start);
	6310
	6311	/* The \N{} means the pattern, if previously /d,
	6312	* becomes /u. That means it can't be an EXACTF node,
	6313	* but an EXACTFU */
	6314	if (node_type == EXACTF) {
	6315	node_type = EXACTFU;
	6316
	6317	/* If the node already contains something that
	6318	* differs between EXACTF and EXACTFU, reparse it
	6319	* as EXACTFU */
	6320	if (! maybe_exactfu) {
	6321	len = 0;
	6322	s = s0;
	6323	goto reparse;
	6324	}
	6325	}
	6326
	6327	break;
	6328	case 'r':
	6329	ender = '\r';
	6330	p++;
	6331	break;
	6332	case 't':
	6333	ender = '\t';
	6334	p++;
	6335	break;
	6336	case 'f':
	6337	ender = '\f';
	6338	p++;
	6339	break;
	6340	case 'e':
	6341	ender = ESC_NATIVE;
	6342	p++;
	6343	break;
	6344	case 'a':
	6345	ender = '\a';
	6346	p++;
	6347	break;
	6348	case 'o':
	6349	if (! grok_bslash_o(&p,
	6350	RExC_end,
	6351	&ender,
	6352	&message,
	6353	&packed_warn,
	6354	(bool) RExC_strict,
	6355	FALSE, /* No illegal cp's */
	6356	UTF))
	6357	{
	6358	RExC_parse_set(p); /* going to die anyway; point to
	6359	exact spot of failure */
	6360	vFAIL(message);
	6361	}
	6362
	6363	if (message && TO_OUTPUT_WARNINGS(p)) {
	6364	warn_non_literal_string(p, packed_warn, message);
	6365	}
	6366	break;
	6367	case 'x':
	6368	if (! grok_bslash_x(&p,
	6369	RExC_end,
	6370	&ender,
	6371	&message,
	6372	&packed_warn,
	6373	(bool) RExC_strict,
	6374	FALSE, /* No illegal cp's */
	6375	UTF))
	6376	{
	6377	RExC_parse_set(p); /* going to die anyway; point
	6378	to exact spot of failure */
	6379	vFAIL(message);
	6380	}
	6381
	6382	if (message && TO_OUTPUT_WARNINGS(p)) {
	6383	warn_non_literal_string(p, packed_warn, message);
	6384	}
	6385
	6386	#ifdef EBCDIC
	6387	if (ender < 0x100) {
	6388	if (RExC_recode_x_to_native) {
	6389	ender = LATIN1_TO_NATIVE(ender);
	6390	}
	6391	}
	6392	#endif
	6393	break;
	6394	case 'c':
	6395	p++;
	6396	if (! grok_bslash_c(*p, &grok_c_char,
	6397	&message, &packed_warn))
	6398	{
	6399	/* going to die anyway; point to exact spot of
	6400	* failure */
	6401	char *new_p= p + ((UTF)
	6402	? UTF8_SAFE_SKIP(p, RExC_end)
	6403	: 1);
	6404	RExC_parse_set(new_p);
	6405	vFAIL(message);
	6406	}
	6407
	6408	ender = grok_c_char;
	6409	p++;
	6410	if (message && TO_OUTPUT_WARNINGS(p)) {
	6411	warn_non_literal_string(p, packed_warn, message);
	6412	}
	6413
	6414	break;
	6415	case '8': case '9': /* must be a backreference */
	6416	--p;
	6417	/* we have an escape like \8 which cannot be an octal escape
	6418	* so we exit the loop, and let the outer loop handle this
	6419	* escape which may or may not be a legitimate backref. */
	6420	goto loopdone;
	6421	case '1': case '2': case '3':case '4':
	6422	case '5': case '6': case '7':
	6423
	6424	/* When we parse backslash escapes there is ambiguity
	6425	* between backreferences and octal escapes. Any escape
	6426	* from \1 - \9 is a backreference, any multi-digit
	6427	* escape which does not start with 0 and which when
	6428	* evaluated as decimal could refer to an already
	6429	* parsed capture buffer is a back reference. Anything
	6430	* else is octal.
	6431	*
	6432	* Note this implies that \118 could be interpreted as
	6433	* 118 OR as "\11" . "8" depending on whether there
	6434	* were 118 capture buffers defined already in the
	6435	* pattern. */
	6436
	6437	/* NOTE, RExC_npar is 1 more than the actual number of
	6438	* parens we have seen so far, hence the "<" as opposed
	6439	* to "<=" */
	6440	if ( !isDIGIT(p[1]) \|\| S_backref_value(p, RExC_end) < RExC_npar)
	6441	{ /* Not to be treated as an octal constant, go
	6442	find backref */
	6443	p = oldp;
	6444	goto loopdone;
	6445	}
	6446	/* FALLTHROUGH */
	6447	case '0':
	6448	{
	6449	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	6450	\| PERL_SCAN_NOTIFY_ILLDIGIT;
	6451	STRLEN numlen = 3;
	6452	ender = grok_oct(p, &numlen, &flags, NULL);
	6453	p += numlen;
	6454	if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
	6455	&& isDIGIT(p) / like \08, \178 */
	6456	&& ckWARN(WARN_REGEXP))
	6457	{
	6458	reg_warn_non_literal_string(
	6459	p + 1,
	6460	form_alien_digit_msg(8, numlen, p,
	6461	RExC_end, UTF, FALSE));
	6462	}
	6463	}
	6464	break;
	6465	case '\0':
	6466	if (p >= RExC_end)
	6467	FAIL("Trailing \\");
	6468	/* FALLTHROUGH */
	6469	default:
	6470	if (isALPHANUMERIC(*p)) {
	6471	/* An alpha followed by '{' is going to fail next
	6472	* iteration, so don't output this warning in that
	6473	* case */
	6474	if (! isALPHA(p) \|\| (p + 1) != '{') {
	6475	ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
	6476	" passed through", p);
	6477	}
	6478	}
	6479	goto normal_default;
	6480	} /* End of switch on '\' */
	6481	break;
	6482	case '{':
	6483	/* Trying to gain new uses for '{' without breaking too
	6484	* much existing code is hard. The solution currently
	6485	* adopted is:
	6486	* 1) If there is no ambiguity that a '{' should always
	6487	* be taken literally, at the start of a construct, we
	6488	* just do so.
	6489	* 2) If the literal '{' conflicts with our desired use
	6490	* of it as a metacharacter, we die. The deprecation
	6491	* cycles for this have come and gone.
	6492	* 3) If there is ambiguity, we raise a simple warning.
	6493	* This could happen, for example, if the user
	6494	* intended it to introduce a quantifier, but slightly
	6495	* misspelled the quantifier. Without this warning,
	6496	* the quantifier would silently be taken as a literal
	6497	* string of characters instead of a meta construct */
	6498	if (len \|\| (p > RExC_start && isALPHA_A(*(p - 1)))) {
	6499	if ( RExC_strict
	6500	\|\| ( p > atom_parse_start + 1
	6501	&& isALPHA_A(*(p - 1))
	6502	&& *(p - 2) == '\\'))
	6503	{
	6504	RExC_parse_set(p + 1);
	6505	vFAIL("Unescaped left brace in regex is "
	6506	"illegal here");
	6507	}
	6508	ckWARNreg(p + 1, "Unescaped left brace in regex is"
	6509	" passed through");
	6510	}
	6511	goto normal_default;
	6512	case '}':
	6513	case ']':
	6514	if (p > RExC_parse && RExC_strict) {
	6515	ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
	6516	}
	6517	/FALLTHROUGH/
	6518	default: /* A literal character */
	6519	normal_default:
	6520	if (! UTF8_IS_INVARIANT(*p) && UTF) {
	6521	STRLEN numlen;
	6522	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	6523	&numlen, UTF8_ALLOW_DEFAULT);
	6524	p += numlen;
	6525	}
	6526	else
	6527	ender = (U8) *p++;
	6528	break;
	6529	} /* End of switch on the literal */
	6530
	6531	/* Here, have looked at the literal character, and <ender>
	6532	* contains its ordinal; <p> points to the character after it.
	6533	* */
	6534
	6535	if (ender > 255) {
	6536	REQUIRE_UTF8(flagp);
	6537	if ( UNICODE_IS_PERL_EXTENDED(ender)
	6538	&& TO_OUTPUT_WARNINGS(p))
	6539	{
	6540	ckWARN2_non_literal_string(p,
	6541	packWARN(WARN_PORTABLE),
	6542	PL_extended_cp_format,
	6543	ender);
	6544	}
	6545	}
	6546
	6547	/* We need to check if the next non-ignored thing is a
	6548	* quantifier. Move <p> to after anything that should be
	6549	* ignored, which, as a side effect, positions <p> for the next
	6550	* loop iteration */
	6551	skip_to_be_ignored_text(pRExC_state, &p,
	6552	FALSE /* Don't force to /x */ );
	6553
	6554	/* If the next thing is a quantifier, it applies to this
	6555	* character only, which means that this character has to be in
	6556	* its own node and can't just be appended to the string in an
	6557	* existing node, so if there are already other characters in
	6558	* the node, close the node with just them, and set up to do
	6559	* this character again next time through, when it will be the
	6560	* only thing in its new node */
	6561
	6562	next_is_quantifier = LIKELY(p < RExC_end)
	6563	&& UNLIKELY(isQUANTIFIER(p, RExC_end));
	6564
	6565	if (next_is_quantifier && LIKELY(len)) {
	6566	p = oldp;
	6567	goto loopdone;
	6568	}
	6569
	6570	/* Ready to add 'ender' to the node */
	6571
	6572	if (! FOLD) { /* The simple case, just append the literal */
	6573	not_fold_common:
	6574
	6575	/* Don't output if it would overflow */
	6576	if (UNLIKELY(len > max_string_len - ((UTF)
	6577	? UVCHR_SKIP(ender)
	6578	: 1)))
	6579	{
	6580	overflowed = TRUE;
	6581	break;
	6582	}
	6583
	6584	if (UVCHR_IS_INVARIANT(ender) \|\| ! UTF) {
	6585	*(s++) = (char) ender;
	6586	}
	6587	else {
	6588	U8 * new_s = uvchr_to_utf8((U8*)s, ender);
	6589	added_len = (char *) new_s - s;
	6590	s = (char *) new_s;
	6591
	6592	if (ender > 255) {
	6593	requires_utf8_target = TRUE;
	6594	}
	6595	}
	6596	}
	6597	else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
	6598
	6599	/* Here are folding under /l, and the code point is
	6600	* problematic. If this is the first character in the
	6601	* node, change the node type to folding. Otherwise, if
	6602	* this is the first problematic character, close up the
	6603	* existing node, so can start a new node with this one */
	6604	if (! len) {
	6605	node_type = EXACTFL;
	6606	RExC_contains_locale = 1;
	6607	}
	6608	else if (node_type == EXACT) {
	6609	p = oldp;
	6610	goto loopdone;
	6611	}
	6612
	6613	/* This problematic code point means we can't simplify
	6614	* things */
	6615	maybe_exactfu = FALSE;
	6616
	6617	/* Although these two characters have folds that are
	6618	* locale-problematic, they also have folds to above Latin1
	6619	* that aren't a problem. Doing these now helps at
	6620	* runtime. */
	6621	if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU
	6622	\|\| ender == LATIN_CAPITAL_LETTER_SHARP_S))
	6623	{
	6624	goto fold_anyway;
	6625	}
	6626
	6627	/* Here, we are adding a problematic fold character.
	6628	* "Problematic" in this context means that its fold isn't
	6629	* known until runtime. (The non-problematic code points
	6630	* are the above-Latin1 ones that fold to also all
	6631	* above-Latin1. Their folds don't vary no matter what the
	6632	* locale is.) But here we have characters whose fold
	6633	* depends on the locale. We just add in the unfolded
	6634	* character, and wait until runtime to fold it */
	6635	goto not_fold_common;
	6636	}
	6637	else /* regular fold; see if actually is in a fold */
	6638	if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
	6639	\|\| (ender > 255
	6640	&& ! _invlist_contains_cp(PL_in_some_fold, ender)))
	6641	{
	6642	/* Here, folding, but the character isn't in a fold.
	6643	*
	6644	* Start a new node if previous characters in the node were
	6645	* folded */
	6646	if (len && node_type != EXACT) {
	6647	p = oldp;
	6648	goto loopdone;
	6649	}
	6650
	6651	/* Here, continuing a node with non-folded characters. Add
	6652	* this one */
	6653	goto not_fold_common;
	6654	}
	6655	else { /* Here, does participate in some fold */
	6656
	6657	/* If this is the first character in the node, change its
	6658	* type to folding. Otherwise, if this is the first
	6659	* folding character in the node, close up the existing
	6660	* node, so can start a new node with this one. */
	6661	if (! len) {
	6662	node_type = compute_EXACTish(pRExC_state);
	6663	}
	6664	else if (node_type == EXACT) {
	6665	p = oldp;
	6666	goto loopdone;
	6667	}
	6668
	6669	if (UTF) { /* Alway use the folded value for UTF-8
	6670	patterns */
	6671	if (UVCHR_IS_INVARIANT(ender)) {
	6672	if (UNLIKELY(len + 1 > max_string_len)) {
	6673	overflowed = TRUE;
	6674	break;
	6675	}
	6676
	6677	*(s)++ = (U8) toFOLD(ender);
	6678	}
	6679	else {
	6680	UV folded;
	6681
	6682	fold_anyway:
	6683	folded = _to_uni_fold_flags(
	6684	ender,
	6685	(U8 ) s, / We have allocated extra space
	6686	in 's' so can't run off the
	6687	end */
	6688	&added_len,
	6689	FOLD_FLAGS_FULL
	6690	\| (( ASCII_FOLD_RESTRICTED
	6691	\|\| node_type == EXACTFL)
	6692	? FOLD_FLAGS_NOMIX_ASCII
	6693	: 0));
	6694	if (UNLIKELY(len + added_len > max_string_len)) {
	6695	overflowed = TRUE;
	6696	break;
	6697	}
	6698
	6699	s += added_len;
	6700
	6701	if ( folded > 255
	6702	&& LIKELY(folded != GREEK_SMALL_LETTER_MU))
	6703	{
	6704	/* U+B5 folds to the MU, so its possible for a
	6705	* non-UTF-8 target to match it */
	6706	requires_utf8_target = TRUE;
	6707	}
	6708	}
	6709	}
	6710	else { /* Here is non-UTF8. */
	6711
	6712	/* The fold will be one or (rarely) two characters.
	6713	* Check that there's room for at least a single one
	6714	* before setting any flags, etc. Because otherwise an
	6715	* overflowing character could cause a flag to be set
	6716	* even though it doesn't end up in this node. (For
	6717	* the two character fold, we check again, before
	6718	* setting any flags) */
	6719	if (UNLIKELY(len + 1 > max_string_len)) {
	6720	overflowed = TRUE;
	6721	break;
	6722	}
	6723
	6724	#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
	6725	\|\| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
	6726	\|\| UNICODE_DOT_DOT_VERSION > 0)
	6727
	6728	/* On non-ancient Unicodes, check for the only possible
	6729	* multi-char fold */
	6730	if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
	6731
	6732	/* This potential multi-char fold means the node
	6733	* can't be simple (because it could match more
	6734	* than a single char). And in some cases it will
	6735	* match 'ss', so set that flag */
	6736	maybe_SIMPLE = 0;
	6737	has_ss = TRUE;
	6738
	6739	/* It can't change to be an EXACTFU (unless already
	6740	* is one). We fold it iff under /u rules. */
	6741	if (node_type != EXACTFU) {
	6742	maybe_exactfu = FALSE;
	6743	}
	6744	else {
	6745	if (UNLIKELY(len + 2 > max_string_len)) {
	6746	overflowed = TRUE;
	6747	break;
	6748	}
	6749
	6750	*(s++) = 's';
	6751	*(s++) = 's';
	6752	added_len = 2;
	6753
	6754	goto done_with_this_char;
	6755	}
	6756	}
	6757	else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
	6758	&& LIKELY(len > 0)
	6759	&& UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
	6760	{
	6761	/* Also, the sequence 'ss' is special when not
	6762	* under /u. If the target string is UTF-8, it
	6763	* should match SHARP S; otherwise it won't. So,
	6764	* here we have to exclude the possibility of this
	6765	* node moving to /u.*/
	6766	has_ss = TRUE;
	6767	maybe_exactfu = FALSE;
	6768	}
	6769	#endif
	6770	/* Here, the fold will be a single character */
	6771
	6772	if (UNLIKELY(ender == MICRO_SIGN)) {
	6773	has_micro_sign = TRUE;
	6774	}
	6775	else if (PL_fold[ender] != PL_fold_latin1[ender]) {
	6776
	6777	/* If the character's fold differs between /d and
	6778	* /u, this can't change to be an EXACTFU node */
	6779	maybe_exactfu = FALSE;
	6780	}
	6781
	6782	*(s++) = (DEPENDS_SEMANTICS)
	6783	? (char) toFOLD(ender)
	6784
	6785	/* Under /u, the fold of any character in
	6786	* the 0-255 range happens to be its
	6787	* lowercase equivalent, except for LATIN
	6788	* SMALL LETTER SHARP S, which was handled
	6789	* above, and the MICRO SIGN, whose fold
	6790	* requires UTF-8 to represent. */
	6791	: (char) toLOWER_L1(ender);
	6792	}
	6793	} /* End of adding current character to the node */
	6794
	6795	done_with_this_char:
	6796
	6797	len += added_len;
	6798
	6799	if (next_is_quantifier) {
	6800
	6801	/* Here, the next input is a quantifier, and to get here,
	6802	* the current character is the only one in the node. */
	6803	goto loopdone;
	6804	}
	6805
	6806	} /* End of loop through literal characters */
	6807
	6808	/* Here we have either exhausted the input or run out of room in
	6809	* the node. If the former, we are done. (If we encountered a
	6810	* character that can't be in the node, transfer is made directly
	6811	* to <loopdone>, and so we wouldn't have fallen off the end of the
	6812	* loop.) */
	6813	if (LIKELY(! overflowed)) {
	6814	goto loopdone;
	6815	}
	6816
	6817	/* Here we have run out of room. We can grow plain EXACT and
	6818	* LEXACT nodes. If the pattern is gigantic enough, though,
	6819	* eventually we'll have to artificially chunk the pattern into
	6820	* multiple nodes. */
	6821	if (! LOC && (node_type == EXACT \|\| node_type == LEXACT)) {
	6822	Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
	6823	Size_t overhead_expansion = 0;
	6824	char temp[256];
	6825	Size_t max_nodes_for_string;
	6826	Size_t achievable;
	6827	SSize_t delta;
	6828
	6829	/* Here we couldn't fit the final character in the current
	6830	* node, so it will have to be reparsed, no matter what else we
	6831	* do */
	6832	p = oldp;
	6833
	6834	/* If would have overflowed a regular EXACT node, switch
	6835	* instead to an LEXACT. The code below is structured so that
	6836	* the actual growing code is common to changing from an EXACT
	6837	* or just increasing the LEXACT size. This means that we have
	6838	* to save the string in the EXACT case before growing, and
	6839	* then copy it afterwards to its new location */
	6840	if (node_type == EXACT) {
	6841	overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
	6842	RExC_emit += overhead_expansion;
	6843	Copy(s0, temp, len, char);
	6844	}
	6845
	6846	/* Ready to grow. If it was a plain EXACT, the string was
	6847	* saved, and the first few bytes of it overwritten by adding
	6848	* an argument field. We assume, as we do elsewhere in this
	6849	* file, that one byte of remaining input will translate into
	6850	* one byte of output, and if that's too small, we grow again,
	6851	* if too large the excess memory is freed at the end */
	6852
	6853	max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
	6854	achievable = MIN(max_nodes_for_string,
	6855	current_string_nodes + STR_SZ(RExC_end - p));
	6856	delta = achievable - current_string_nodes;
	6857
	6858	/* If there is just no more room, go finish up this chunk of
	6859	* the pattern. */
	6860	if (delta <= 0) {
	6861	goto loopdone;
	6862	}
	6863
	6864	change_engine_size(pRExC_state, delta + overhead_expansion);
	6865	current_string_nodes += delta;
	6866	max_string_len
	6867	= sizeof(struct regnode) * current_string_nodes;
	6868	upper_fill = max_string_len + 1;
	6869
	6870	/* If the length was small, we know this was originally an
	6871	* EXACT node now converted to LEXACT, and the string has to be
	6872	* restored. Otherwise the string was untouched. 260 is just
	6873	* a number safely above 255 so don't have to worry about
	6874	* getting it precise */
	6875	if (len < 260) {
	6876	node_type = LEXACT;
	6877	FILL_NODE(ret, node_type);
	6878	s0 = STRING(REGNODE_p(ret));
	6879	Copy(temp, s0, len, char);
	6880	s = s0 + len;
	6881	}
	6882
	6883	goto continue_parse;
	6884	}
	6885	else if (FOLD) {
	6886	bool splittable = FALSE;
	6887	bool backed_up = FALSE;
	6888	char * e; /* should this be U8? */
	6889	char * s_start; /* should this be U8? */
	6890
	6891	/* Here is /i. Running out of room creates a problem if we are
	6892	* folding, and the split happens in the middle of a
	6893	* multi-character fold, as a match that should have occurred,
	6894	* won't, due to the way nodes are matched, and our artificial
	6895	* boundary. So back off until we aren't splitting such a
	6896	* fold. If there is no such place to back off to, we end up
	6897	* taking the entire node as-is. This can happen if the node
	6898	* consists entirely of 'f' or entirely of 's' characters (or
	6899	* things that fold to them) as 'ff' and 'ss' are
	6900	* multi-character folds.
	6901	*
	6902	* The Unicode standard says that multi character folds consist
	6903	* of either two or three characters. That means we would be
	6904	* splitting one if the final character in the node is at the
	6905	* beginning of either type, or is the second of a three
	6906	* character fold.
	6907	*
	6908	* At this point:
	6909	* ender is the code point of the character that won't fit
	6910	* in the node
	6911	* s points to just beyond the final byte in the node.
	6912	* It's where we would place ender if there were
	6913	* room, and where in fact we do place ender's fold
	6914	* in the code below, as we've over-allocated space
	6915	* for s0 (hence s) to allow for this
	6916	* e starts at 's' and advances as we append things.
	6917	* old_s is the same as 's'. (If ender had fit, 's' would
	6918	* have been advanced to beyond it).
	6919	* old_old_s points to the beginning byte of the final
	6920	* character in the node
	6921	* p points to the beginning byte in the input of the
	6922	* character beyond 'ender'.
	6923	* oldp points to the beginning byte in the input of
	6924	* 'ender'.
	6925	*
	6926	* In the case of /il, we haven't folded anything that could be
	6927	* affected by the locale. That means only above-Latin1
	6928	* characters that fold to other above-latin1 characters get
	6929	* folded at compile time. To check where a good place to
	6930	* split nodes is, everything in it will have to be folded.
	6931	* The boolean 'maybe_exactfu' keeps track in /il if there are
	6932	* any unfolded characters in the node. */
	6933	bool need_to_fold_loc = LOC && ! maybe_exactfu;
	6934
	6935	/* If we do need to fold the node, we need a place to store the
	6936	* folded copy, and a way to map back to the unfolded original
	6937	* */
	6938	char * locfold_buf = NULL;
	6939	Size_t * loc_correspondence = NULL;
	6940
	6941	if (! need_to_fold_loc) { /* The normal case. Just
	6942	initialize to the actual node */
	6943	e = s;
	6944	s_start = s0;
	6945	s = old_old_s; /* Point to the beginning of the final char
	6946	that fits in the node */
	6947	}
	6948	else {
	6949
	6950	/* Here, we have filled a /il node, and there are unfolded
	6951	* characters in it. If the runtime locale turns out to be
	6952	* UTF-8, there are possible multi-character folds, just
	6953	* like when not under /l. The node hence can't terminate
	6954	* in the middle of such a fold. To determine this, we
	6955	* have to create a folded copy of this node. That means
	6956	* reparsing the node, folding everything assuming a UTF-8
	6957	* locale. (If at runtime it isn't such a locale, the
	6958	* actions here wouldn't have been necessary, but we have
	6959	* to assume the worst case.) If we find we need to back
	6960	* off the folded string, we do so, and then map that
	6961	* position back to the original unfolded node, which then
	6962	* gets output, truncated at that spot */
	6963
	6964	char * redo_p = RExC_parse;
	6965	char * redo_e;
	6966	char * old_redo_e;
	6967
	6968	/* Allow enough space assuming a single byte input folds to
	6969	* a single byte output, plus assume that the two unparsed
	6970	* characters (that we may need) fold to the largest number
	6971	* of bytes possible, plus extra for one more worst case
	6972	* scenario. In the loop below, if we start eating into
	6973	* that final spare space, we enlarge this initial space */
	6974	Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
	6975
	6976	Newxz(locfold_buf, size, char);
	6977	Newxz(loc_correspondence, size, Size_t);
	6978
	6979	/* Redo this node's parse, folding into 'locfold_buf' */
	6980	redo_p = RExC_parse;
	6981	old_redo_e = redo_e = locfold_buf;
	6982	while (redo_p <= oldp) {
	6983
	6984	old_redo_e = redo_e;
	6985	loc_correspondence[redo_e - locfold_buf]
	6986	= redo_p - RExC_parse;
	6987
	6988	if (UTF) {
	6989	Size_t added_len;
	6990
	6991	(void) _to_utf8_fold_flags((U8 *) redo_p,
	6992	(U8 *) RExC_end,
	6993	(U8 *) redo_e,
	6994	&added_len,
	6995	FOLD_FLAGS_FULL);
	6996	redo_e += added_len;
	6997	redo_p += UTF8SKIP(redo_p);
	6998	}
	6999	else {
	7000
	7001	/* Note that if this code is run on some ancient
	7002	* Unicode versions, SHARP S doesn't fold to 'ss',
	7003	* but rather than clutter the code with #ifdef's,
	7004	* as is done above, we ignore that possibility.
	7005	* This is ok because this code doesn't affect what
	7006	* gets matched, but merely where the node gets
	7007	* split */
	7008	if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
	7009	*redo_e++ = toLOWER_L1(UCHARAT(redo_p));
	7010	}
	7011	else {
	7012	*redo_e++ = 's';
	7013	*redo_e++ = 's';
	7014	}
	7015	redo_p++;
	7016	}
	7017
	7018
	7019	/* If we're getting so close to the end that a
	7020	* worst-case fold in the next character would cause us
	7021	* to overflow, increase, assuming one byte output byte
	7022	* per one byte input one, plus room for another worst
	7023	* case fold */
	7024	if ( redo_p <= oldp
	7025	&& redo_e > locfold_buf + size
	7026	- (UTF8_MAXBYTES_CASE + 1))
	7027	{
	7028	Size_t new_size = size
	7029	+ (oldp - redo_p)
	7030	+ UTF8_MAXBYTES_CASE + 1;
	7031	Ptrdiff_t e_offset = redo_e - locfold_buf;
	7032
	7033	Renew(locfold_buf, new_size, char);
	7034	Renew(loc_correspondence, new_size, Size_t);
	7035	size = new_size;
	7036
	7037	redo_e = locfold_buf + e_offset;
	7038	}
	7039	}
	7040
	7041	/* Set so that things are in terms of the folded, temporary
	7042	* string */
	7043	s = old_redo_e;
	7044	s_start = locfold_buf;
	7045	e = redo_e;
	7046
	7047	}
	7048
	7049	/* Here, we have 's', 's_start' and 'e' set up to point to the
	7050	* input that goes into the node, folded.
	7051	*
	7052	* If the final character of the node and the fold of ender
	7053	* form the first two characters of a three character fold, we
	7054	* need to peek ahead at the next (unparsed) character in the
	7055	* input to determine if the three actually do form such a
	7056	* fold. Just looking at that character is not generally
	7057	* sufficient, as it could be, for example, an escape sequence
	7058	* that evaluates to something else, and it needs to be folded.
	7059	*
	7060	* khw originally thought to just go through the parse loop one
	7061	* extra time, but that doesn't work easily as that iteration
	7062	* could cause things to think that the parse is over and to
	7063	* goto loopdone. The character could be a '$' for example, or
	7064	* the character beyond could be a quantifier, and other
	7065	* glitches as well.
	7066	*
	7067	* The solution used here for peeking ahead is to look at that
	7068	* next character. If it isn't ASCII punctuation, then it will
	7069	* be something that would continue on in an EXACTish node if
	7070	* there were space. We append the fold of it to s, having
	7071	* reserved enough room in s0 for the purpose. If we can't
	7072	* reasonably peek ahead, we instead assume the worst case:
	7073	* that it is something that would form the completion of a
	7074	* multi-char fold.
	7075	*
	7076	* If we can't split between s and ender, we work backwards
	7077	* character-by-character down to s0. At each current point
	7078	* see if we are at the beginning of a multi-char fold. If so,
	7079	* that means we would be splitting the fold across nodes, and
	7080	* so we back up one and try again.
	7081	*
	7082	* If we're not at the beginning, we still could be at the
	7083	* final two characters of a (rare) three character fold. We
	7084	* check if the sequence starting at the character before the
	7085	* current position (and including the current and next
	7086	* characters) is a three character fold. If not, the node can
	7087	* be split here. If it is, we have to backup two characters
	7088	* and try again.
	7089	*
	7090	* Otherwise, the node can be split at the current position.
	7091	*
	7092	* The same logic is used for UTF-8 patterns and not */
	7093	if (UTF) {
	7094	Size_t added_len;
	7095
	7096	/* Append the fold of ender */
	7097	(void) _to_uni_fold_flags(
	7098	ender,
	7099	(U8 *) e,
	7100	&added_len,
	7101	FOLD_FLAGS_FULL \| ((ASCII_FOLD_RESTRICTED)
	7102	? FOLD_FLAGS_NOMIX_ASCII
	7103	: 0));
	7104	e += added_len;
	7105
	7106	/* 's' and the character folded to by ender may be the
	7107	* first two of a three-character fold, in which case the
	7108	* node should not be split here. That may mean examining
	7109	* the so-far unparsed character starting at 'p'. But if
	7110	* ender folded to more than one character, we already have
	7111	* three characters to look at. Also, we first check if
	7112	* the sequence consisting of s and the next character form
	7113	* the first two of some three character fold. If not,
	7114	* there's no need to peek ahead. */
	7115	if ( added_len <= UTF8SKIP(e - added_len)
	7116	&& UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
	7117	{
	7118	/* Here, the two do form the beginning of a potential
	7119	* three character fold. The unexamined character may
	7120	* or may not complete it. Peek at it. It might be
	7121	* something that ends the node or an escape sequence,
	7122	* in which case we don't know without a lot of work
	7123	* what it evaluates to, so we have to assume the worst
	7124	* case: that it does complete the fold, and so we
	7125	* can't split here. All such instances will have
	7126	* that character be an ASCII punctuation character,
	7127	* like a backslash. So, for that case, backup one and
	7128	* drop down to try at that position */
	7129	if (isPUNCT(*p)) {
	7130	s = (char ) utf8_hop_back((U8 ) s, -1,
	7131	(U8 *) s_start);
	7132	backed_up = TRUE;
	7133	}
	7134	else {
	7135	/* Here, since it's not punctuation, it must be a
	7136	* real character, and we can append its fold to
	7137	* 'e' (having deliberately reserved enough space
	7138	* for this eventuality) and drop down to check if
	7139	* the three actually do form a folded sequence */
	7140	(void) _to_utf8_fold_flags(
	7141	(U8 ) p, (U8 ) RExC_end,
	7142	(U8 *) e,
	7143	&added_len,
	7144	FOLD_FLAGS_FULL \| ((ASCII_FOLD_RESTRICTED)
	7145	? FOLD_FLAGS_NOMIX_ASCII
	7146	: 0));
	7147	e += added_len;
	7148	}
	7149	}
	7150
	7151	/* Here, we either have three characters available in
	7152	* sequence starting at 's', or we have two characters and
	7153	* know that the following one can't possibly be part of a
	7154	* three character fold. We go through the node backwards
	7155	* until we find a place where we can split it without
	7156	* breaking apart a multi-character fold. At any given
	7157	* point we have to worry about if such a fold begins at
	7158	* the current 's', and also if a three-character fold
	7159	* begins at s-1, (containing s and s+1). Splitting in
	7160	* either case would break apart a fold */
	7161	do {
	7162	char prev_s = (char ) utf8_hop_back((U8 *) s, -1,
	7163	(U8 *) s_start);
	7164
	7165	/* If is a multi-char fold, can't split here. Backup
	7166	* one char and try again */
	7167	if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
	7168	s = prev_s;
	7169	backed_up = TRUE;
	7170	continue;
	7171	}
	7172
	7173	/* If the two characters beginning at 's' are part of a
	7174	* three character fold starting at the character
	7175	* before s, we can't split either before or after s.
	7176	* Backup two chars and try again */
	7177	if ( LIKELY(s > s_start)
	7178	&& UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
	7179	{
	7180	s = prev_s;
	7181	s = (char ) utf8_hop_back((U8 ) s, -1, (U8 *) s_start);
	7182	backed_up = TRUE;
	7183	continue;
	7184	}
	7185
	7186	/* Here there's no multi-char fold between s and the
	7187	* next character following it. We can split */
	7188	splittable = TRUE;
	7189	break;
	7190
	7191	} while (s > s_start); /* End of loops backing up through the node */
	7192
	7193	/* Here we either couldn't find a place to split the node,
	7194	* or else we broke out of the loop setting 'splittable' to
	7195	* true. In the latter case, the place to split is between
	7196	* the first and second characters in the sequence starting
	7197	* at 's' */
	7198	if (splittable) {
	7199	s += UTF8SKIP(s);
	7200	}
	7201	}
	7202	else { /* Pattern not UTF-8 */
	7203	if ( ender != LATIN_SMALL_LETTER_SHARP_S
	7204	\|\| ASCII_FOLD_RESTRICTED)
	7205	{
	7206	assert( toLOWER_L1(ender) < 256 );
	7207	e++ = (char)(toLOWER_L1(ender)); / should e and the cast be U8? */
	7208	}
	7209	else {
	7210	*e++ = 's';
	7211	*e++ = 's';
	7212	}
	7213
	7214	if ( e - s <= 1
	7215	&& UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
	7216	{
	7217	if (isPUNCT(*p)) {
	7218	s--;
	7219	backed_up = TRUE;
	7220	}
	7221	else {
	7222	if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
	7223	\|\| ASCII_FOLD_RESTRICTED)
	7224	{
	7225	assert( toLOWER_L1(ender) < 256 );
	7226	e++ = (char)(toLOWER_L1(ender)); / should e and the cast be U8? */
	7227	}
	7228	else {
	7229	*e++ = 's';
	7230	*e++ = 's';
	7231	}
	7232	}
	7233	}
	7234
	7235	do {
	7236	if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
	7237	s--;
	7238	backed_up = TRUE;
	7239	continue;
	7240	}
	7241
	7242	if ( LIKELY(s > s_start)
	7243	&& UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
	7244	{
	7245	s -= 2;
	7246	backed_up = TRUE;
	7247	continue;
	7248	}
	7249
	7250	splittable = TRUE;
	7251	break;
	7252
	7253	} while (s > s_start);
	7254
	7255	if (splittable) {
	7256	s++;
	7257	}
	7258	}
	7259
	7260	/* Here, we are done backing up. If we didn't backup at all
	7261	* (the likely case), just proceed */
	7262	if (backed_up) {
	7263
	7264	/* If we did find a place to split, reparse the entire node
	7265	* stopping where we have calculated. */
	7266	if (splittable) {
	7267
	7268	/* If we created a temporary folded string under /l, we
	7269	* have to map that back to the original */
	7270	if (need_to_fold_loc) {
	7271	upper_fill = loc_correspondence[s - s_start];
	7272	if (upper_fill == 0) {
	7273	FAIL2("panic: loc_correspondence[%d] is 0",
	7274	(int) (s - s_start));
	7275	}
	7276	Safefree(locfold_buf);
	7277	Safefree(loc_correspondence);
	7278	}
	7279	else {
	7280	upper_fill = s - s0;
	7281	}
	7282	goto reparse;
	7283	}
	7284
	7285	/* Here the node consists entirely of non-final multi-char
	7286	* folds. (Likely it is all 'f's or all 's's.) There's no
	7287	* decent place to split it, so give up and just take the
	7288	* whole thing */
	7289	len = old_s - s0;
	7290	}
	7291
	7292	if (need_to_fold_loc) {
	7293	Safefree(locfold_buf);
	7294	Safefree(loc_correspondence);
	7295	}
	7296	} /* End of verifying node ends with an appropriate char */
	7297
	7298	/* We need to start the next node at the character that didn't fit
	7299	* in this one */
	7300	p = oldp;
	7301
	7302	loopdone: /* Jumped to when encounters something that shouldn't be
	7303	in the node */
	7304
	7305	/* Free up any over-allocated space; cast is to silence bogus
	7306	* warning in MS VC */
	7307	change_engine_size(pRExC_state,
	7308	- (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
	7309
	7310	/* I (khw) don't know if you can get here with zero length, but the
	7311	* old code handled this situation by creating a zero-length EXACT
	7312	* node. Might as well be NOTHING instead */
	7313	if (len == 0) {
	7314	OP(REGNODE_p(ret)) = NOTHING;
	7315	}
	7316	else {
	7317
	7318	/* If the node type is EXACT here, check to see if it
	7319	* should be EXACTL, or EXACT_REQ8. */
	7320	if (node_type == EXACT) {
	7321	if (LOC) {
	7322	node_type = EXACTL;
	7323	}
	7324	else if (requires_utf8_target) {
	7325	node_type = EXACT_REQ8;
	7326	}
	7327	}
	7328	else if (node_type == LEXACT) {
	7329	if (requires_utf8_target) {
	7330	node_type = LEXACT_REQ8;
	7331	}
	7332	}
	7333	else if (FOLD) {
	7334	if ( UNLIKELY(has_micro_sign \|\| has_ss)
	7335	&& (node_type == EXACTFU \|\| ( node_type == EXACTF
	7336	&& maybe_exactfu)))
	7337	{ /* These two conditions are problematic in non-UTF-8
	7338	EXACTFU nodes. */
	7339	assert(! UTF);
	7340	node_type = EXACTFUP;
	7341	}
	7342	else if (node_type == EXACTFL) {
	7343
	7344	/* 'maybe_exactfu' is deliberately set above to
	7345	* indicate this node type, where all code points in it
	7346	* are above 255 */
	7347	if (maybe_exactfu) {
	7348	node_type = EXACTFLU8;
	7349	}
	7350	else if (UNLIKELY(
	7351	_invlist_contains_cp(PL_HasMultiCharFold, ender)))
	7352	{
	7353	/* A character that folds to more than one will
	7354	* match multiple characters, so can't be SIMPLE.
	7355	* We don't have to worry about this with EXACTFLU8
	7356	* nodes just above, as they have already been
	7357	* folded (since the fold doesn't vary at run
	7358	* time). Here, if the final character in the node
	7359	* folds to multiple, it can't be simple. (This
	7360	* only has an effect if the node has only a single
	7361	* character, hence the final one, as elsewhere we
	7362	* turn off simple for nodes whose length > 1 */
	7363	maybe_SIMPLE = 0;
	7364	}
	7365	}
	7366	else if (node_type == EXACTF) { /* Means is /di */
	7367
	7368	/* This intermediate variable is needed solely because
	7369	* the asserts in the macro where used exceed Win32's
	7370	* literal string capacity */
	7371	char first_char = * STRING(REGNODE_p(ret));
	7372
	7373	/* If 'maybe_exactfu' is clear, then we need to stay
	7374	* /di. If it is set, it means there are no code
	7375	* points that match differently depending on UTF8ness
	7376	* of the target string, so it can become an EXACTFU
	7377	* node */
	7378	if (! maybe_exactfu) {
	7379	RExC_seen_d_op = TRUE;
	7380	}
	7381	else if ( isALPHA_FOLD_EQ(first_char, 's')
	7382	\|\| isALPHA_FOLD_EQ(ender, 's'))
	7383	{
	7384	/* But, if the node begins or ends in an 's' we
	7385	* have to defer changing it into an EXACTFU, as
	7386	* the node could later get joined with another one
	7387	* that ends or begins with 's' creating an 'ss'
	7388	* sequence which would then wrongly match the
	7389	* sharp s without the target being UTF-8. We
	7390	* create a special node that we resolve later when
	7391	* we join nodes together */
	7392
	7393	node_type = EXACTFU_S_EDGE;
	7394	}
	7395	else {
	7396	node_type = EXACTFU;
	7397	}
	7398	}
	7399
	7400	if (requires_utf8_target && node_type == EXACTFU) {
	7401	node_type = EXACTFU_REQ8;
	7402	}
	7403	}
	7404
	7405	OP(REGNODE_p(ret)) = node_type;
	7406	setSTR_LEN(REGNODE_p(ret), len);
	7407	RExC_emit += STR_SZ(len);
	7408
	7409	/* If the node isn't a single character, it can't be SIMPLE */
	7410	if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
	7411	maybe_SIMPLE = 0;
	7412	}
	7413
	7414	*flagp \|= HASWIDTH \| maybe_SIMPLE;
	7415	}
	7416
	7417	RExC_parse_set(p);
	7418
	7419	{
	7420	/* len is STRLEN which is unsigned, need to copy to signed */
	7421	IV iv = len;
	7422	if (iv < 0)
	7423	vFAIL("Internal disaster");
	7424	}
	7425
	7426	} /* End of label 'defchar:' */
	7427	break;
	7428	} /* End of giant switch on input character */
	7429
	7430	/* Position parse to next real character */
	7431	skip_to_be_ignored_text(pRExC_state, &RExC_parse,
	7432	FALSE /* Don't force to /x */ );
	7433	if ( *RExC_parse == '{'
	7434	&& OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
	7435	{
	7436	if (RExC_strict) {
	7437	RExC_parse_inc_by(1);
	7438	vFAIL("Unescaped left brace in regex is illegal here");
	7439	}
	7440	ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
	7441	" passed through");
	7442	}
	7443
	7444	return(ret);
	7445	}
	7446
	7447
	7448	#ifdef PERL_RE_BUILD_AUX
	7449	void
	7450	Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode node, SV* invlist_ptr)
	7451	{
	7452	/* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
	7453	* sets up the bitmap and any flags, removing those code points from the
	7454	* inversion list, setting it to NULL should it become completely empty */
	7455
	7456
	7457	PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
	7458
	7459	/* There is no bitmap for this node type */
	7460	if (REGNODE_TYPE(OP(node)) != ANYOF) {
	7461	return;
	7462	}
	7463
	7464	ANYOF_BITMAP_ZERO(node);
	7465	if (*invlist_ptr) {
	7466
	7467	/* This gets set if we actually need to modify things */
	7468	bool change_invlist = FALSE;
	7469
	7470	UV start, end;
	7471
	7472	/* Start looking through invlist_ptr /
	7473	invlist_iterinit(*invlist_ptr);
	7474	while (invlist_iternext(*invlist_ptr, &start, &end)) {
	7475	UV high;
	7476	int i;
	7477
	7478	/* Quit if are above what we should change */
	7479	if (start >= NUM_ANYOF_CODE_POINTS) {
	7480	break;
	7481	}
	7482
	7483	change_invlist = TRUE;
	7484
	7485	/* Set all the bits in the range, up to the max that we are doing */
	7486	high = (end < NUM_ANYOF_CODE_POINTS - 1)
	7487	? end
	7488	: NUM_ANYOF_CODE_POINTS - 1;
	7489	for (i = start; i <= (int) high; i++) {
	7490	ANYOF_BITMAP_SET(node, i);
	7491	}
	7492	}
	7493	invlist_iterfinish(*invlist_ptr);
	7494
	7495	/* Done with loop; remove any code points that are in the bitmap from
	7496	* invlist_ptr /
	7497	if (change_invlist) {
	7498	_invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
	7499	}
	7500
	7501	/* If have completely emptied it, remove it completely */
	7502	if (_invlist_len(*invlist_ptr) == 0) {
	7503	SvREFCNT_dec_NN(*invlist_ptr);
	7504	*invlist_ptr = NULL;
	7505	}
	7506	}
	7507	}
	7508	#endif /* PERL_RE_BUILD_AUX */
	7509
	7510	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	7511	Character classes ([:foo:]) can also be negated ([:^foo:]).
	7512	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	7513	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	7514	but trigger failures because they are currently unimplemented. */
	7515
	7516	#define POSIXCC_DONE(c) ((c) == ':')
	7517	#define POSIXCC_NOTYET(c) ((c) == '=' \|\| (c) == '.')
	7518	#define POSIXCC(c) (POSIXCC_DONE(c) \|\| POSIXCC_NOTYET(c))
	7519	#define MAYBE_POSIXCC(c) (POSIXCC(c) \|\| (c) == '^' \|\| (c) == ';')
	7520
	7521	#define WARNING_PREFIX "Assuming NOT a POSIX class since "
	7522	#define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
	7523	#define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
	7524
	7525	#define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
	7526
	7527	/* 'posix_warnings' and 'warn_text' are names of variables in the following
	7528	* routine. q.v. */
	7529	#define ADD_POSIX_WARNING(p, text) STMT_START { \
	7530	if (posix_warnings) { \
	7531	if (! RExC_warn_text ) RExC_warn_text = \
	7532	(AV ) sv_2mortal((SV ) newAV()); \
	7533	av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_ \
	7534	WARNING_PREFIX \
	7535	text \
	7536	REPORT_LOCATION, \
	7537	REPORT_LOCATION_ARGS(p))); \
	7538	} \
	7539	} STMT_END
	7540	#define CLEAR_POSIX_WARNINGS() \
	7541	STMT_START { \
	7542	if (posix_warnings && RExC_warn_text) \
	7543	av_clear(RExC_warn_text); \
	7544	} STMT_END
	7545
	7546	#define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
	7547	STMT_START { \
	7548	CLEAR_POSIX_WARNINGS(); \
	7549	return ret; \
	7550	} STMT_END
	7551
	7552	STATIC int
	7553	S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
	7554
	7555	const char * const s, /* Where the putative posix class begins.
	7556	Normally, this is one past the '['. This
	7557	parameter exists so it can be somewhere
	7558	besides RExC_parse. */
	7559	char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
	7560	NULL */
	7561	AV ** posix_warnings, /* Where to place any generated warnings, or
	7562	NULL */
	7563	const bool check_only /* Don't die if error */
	7564	)
	7565	{
	7566	/* This parses what the caller thinks may be one of the three POSIX
	7567	* constructs:
	7568	* 1) a character class, like [:blank:]
	7569	* 2) a collating symbol, like [. .]
	7570	* 3) an equivalence class, like [= =]
	7571	* In the latter two cases, it croaks if it finds a syntactically legal
	7572	* one, as these are not handled by Perl.
	7573	*
	7574	* The main purpose is to look for a POSIX character class. It returns:
	7575	* a) the class number
	7576	* if it is a completely syntactically and semantically legal class.
	7577	* 'updated_parse_ptr', if not NULL, is set to point to just after the
	7578	* closing ']' of the class
	7579	* b) OOB_NAMEDCLASS
	7580	* if it appears that one of the three POSIX constructs was meant, but
	7581	* its specification was somehow defective. 'updated_parse_ptr', if
	7582	* not NULL, is set to point to the character just after the end
	7583	* character of the class. See below for handling of warnings.
	7584	* c) NOT_MEANT_TO_BE_A_POSIX_CLASS
	7585	* if it doesn't appear that a POSIX construct was intended.
	7586	* 'updated_parse_ptr' is not changed. No warnings nor errors are
	7587	* raised.
	7588	*
	7589	* In b) there may be errors or warnings generated. If 'check_only' is
	7590	* TRUE, then any errors are discarded. Warnings are returned to the
	7591	* caller via an AV* created into '*posix_warnings' if it is not NULL. If
	7592	* instead it is NULL, warnings are suppressed.
	7593	*
	7594	* The reason for this function, and its complexity is that a bracketed
	7595	* character class can contain just about anything. But it's easy to
	7596	* mistype the very specific posix class syntax but yielding a valid
	7597	* regular bracketed class, so it silently gets compiled into something
	7598	* quite unintended.
	7599	*
	7600	* The solution adopted here maintains backward compatibility except that
	7601	* it adds a warning if it looks like a posix class was intended but
	7602	* improperly specified. The warning is not raised unless what is input
	7603	* very closely resembles one of the 14 legal posix classes. To do this,
	7604	* it uses fuzzy parsing. It calculates how many single-character edits it
	7605	* would take to transform what was input into a legal posix class. Only
	7606	* if that number is quite small does it think that the intention was a
	7607	* posix class. Obviously these are heuristics, and there will be cases
	7608	* where it errs on one side or another, and they can be tweaked as
	7609	* experience informs.
	7610	*
	7611	* The syntax for a legal posix class is:
	7612	*
	7613	* qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
	7614	*
	7615	* What this routine considers syntactically to be an intended posix class
	7616	* is this (the comments indicate some restrictions that the pattern
	7617	* doesn't show):
	7618	*
	7619	* qr/(?x: \[? # The left bracket, possibly
	7620	* # omitted
	7621	* \h* # possibly followed by blanks
	7622	* (?: \^ \h* )? # possibly a misplaced caret
	7623	* [:;]? # The opening class character,
	7624	* # possibly omitted. A typo
	7625	* # semi-colon can also be used.
	7626	* \h*
	7627	* \^? # possibly a correctly placed
	7628	* # caret, but not if there was also
	7629	* # a misplaced one
	7630	* \h*
	7631	* .{3,15} # The class name. If there are
	7632	* # deviations from the legal syntax,
	7633	* # its edit distance must be close
	7634	* # to a real class name in order
	7635	* # for it to be considered to be
	7636	* # an intended posix class.
	7637	* \h*
	7638	* [[:punct:]]? # The closing class character,
	7639	* # possibly omitted. If not a colon
	7640	* # nor semi colon, the class name
	7641	* # must be even closer to a valid
	7642	* # one
	7643	* \h*
	7644	* \]? # The right bracket, possibly
	7645	* # omitted.
	7646	* )/
	7647	*
	7648	* In the above, \h must be ASCII-only.
	7649	*
	7650	* These are heuristics, and can be tweaked as field experience dictates.
	7651	* There will be cases when someone didn't intend to specify a posix class
	7652	* that this warns as being so. The goal is to minimize these, while
	7653	* maximizing the catching of things intended to be a posix class that
	7654	* aren't parsed as such.
	7655	*/
	7656
	7657	const char* p = s;
	7658	const char * const e = RExC_end;
	7659	unsigned complement = 0; /* If to complement the class */
	7660	bool found_problem = FALSE; /* Assume OK until proven otherwise */
	7661	bool has_opening_bracket = FALSE;
	7662	bool has_opening_colon = FALSE;
	7663	int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
	7664	valid class */
	7665	const char * possible_end = NULL; /* used for a 2nd parse pass */
	7666	const char* name_start; /* ptr to class name first char */
	7667
	7668	/* If the number of single-character typos the input name is away from a
	7669	* legal name is no more than this number, it is considered to have meant
	7670	* the legal name */
	7671	int max_distance = 2;
	7672
	7673	/* to store the name. The size determines the maximum length before we
	7674	* decide that no posix class was intended. Should be at least
	7675	* sizeof("alphanumeric") */
	7676	UV input_text[15];
	7677	STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
	7678
	7679	PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
	7680
	7681	CLEAR_POSIX_WARNINGS();
	7682
	7683	if (p >= e) {
	7684	return NOT_MEANT_TO_BE_A_POSIX_CLASS;
	7685	}
	7686
	7687	if (*(p - 1) != '[') {
	7688	ADD_POSIX_WARNING(p, "it doesn't start with a '['");
	7689	found_problem = TRUE;
	7690	}
	7691	else {
	7692	has_opening_bracket = TRUE;
	7693	}
	7694
	7695	/* They could be confused and think you can put spaces between the
	7696	* components */
	7697	if (isBLANK(*p)) {
	7698	found_problem = TRUE;
	7699
	7700	do {
	7701	p++;
	7702	} while (p < e && isBLANK(*p));
	7703
	7704	ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
	7705	}
	7706
	7707	/* For [. .] and [= =]. These are quite different internally from [: :],
	7708	* so they are handled separately. */
	7709	if (POSIXCC_NOTYET(p) && p < e - 3) / 1 for the close, and 1 for the ']'
	7710	and 1 for at least one char in it
	7711	*/
	7712	{
	7713	const char open_char = *p;
	7714	const char * temp_ptr = p + 1;
	7715
	7716	/* These two constructs are not handled by perl, and if we find a
	7717	* syntactically valid one, we croak. khw, who wrote this code, finds
	7718	* this explanation of them very unclear:
	7719	* https://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
	7720	* And searching the rest of the internet wasn't very helpful either.
	7721	* It looks like just about any byte can be in these constructs,
	7722	* depending on the locale. But unless the pattern is being compiled
	7723	* under /l, which is very rare, Perl runs under the C or POSIX locale.
	7724	* In that case, it looks like [= =] isn't allowed at all, and that
	7725	* [. .] could be any single code point, but for longer strings the
	7726	* constituent characters would have to be the ASCII alphabetics plus
	7727	* the minus-hyphen. Any sensible locale definition would limit itself
	7728	* to these. And any portable one definitely should. Trying to parse
	7729	* the general case is a nightmare (see [perl #127604]). So, this code
	7730	* looks only for interiors of these constructs that match:
	7731	* qr/.\|[-\w]{2,}/
	7732	* Using \w relaxes the apparent rules a little, without adding much
	7733	* danger of mistaking something else for one of these constructs.
	7734	*
	7735	* [. .] in some implementations described on the internet is usable to
	7736	* escape a character that otherwise is special in bracketed character
	7737	* classes. For example [.].] means a literal right bracket instead of
	7738	* the ending of the class
	7739	*
	7740	* [= =] can legitimately contain a [. .] construct, but we don't
	7741	* handle this case, as that [. .] construct will later get parsed
	7742	* itself and croak then. And [= =] is checked for even when not under
	7743	* /l, as Perl has long done so.
	7744	*
	7745	* The code below relies on there being a trailing NUL, so it doesn't
	7746	* have to keep checking if the parse ptr < e.
	7747	*/
	7748	if (temp_ptr[1] == open_char) {
	7749	temp_ptr++;
	7750	}
	7751	else while ( temp_ptr < e
	7752	&& (isWORDCHAR(temp_ptr) \|\| temp_ptr == '-'))
	7753	{
	7754	temp_ptr++;
	7755	}
	7756
	7757	if (*temp_ptr == open_char) {
	7758	temp_ptr++;
	7759	if (*temp_ptr == ']') {
	7760	temp_ptr++;
	7761	if (! found_problem && ! check_only) {
	7762	RExC_parse_set((char *) temp_ptr);
	7763	vFAIL3("POSIX syntax [%c %c] is reserved for future "
	7764	"extensions", open_char, open_char);
	7765	}
	7766
	7767	/* Here, the syntax wasn't completely valid, or else the call
	7768	* is to check-only */
	7769	if (updated_parse_ptr) {
	7770	updated_parse_ptr = (char ) temp_ptr;
	7771	}
	7772
	7773	CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
	7774	}
	7775	}
	7776
	7777	/* If we find something that started out to look like one of these
	7778	* constructs, but isn't, we continue below so that it can be checked
	7779	* for being a class name with a typo of '.' or '=' instead of a colon.
	7780	* */
	7781	}
	7782
	7783	/* Here, we think there is a possibility that a [: :] class was meant, and
	7784	* we have the first real character. It could be they think the '^' comes
	7785	* first */
	7786	if (*p == '^') {
	7787	found_problem = TRUE;
	7788	ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
	7789	complement = 1;
	7790	p++;
	7791
	7792	if (isBLANK(*p)) {
	7793	found_problem = TRUE;
	7794
	7795	do {
	7796	p++;
	7797	} while (p < e && isBLANK(*p));
	7798
	7799	ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
	7800	}
	7801	}
	7802
	7803	/* But the first character should be a colon, which they could have easily
	7804	* mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
	7805	* distinguish from a colon, so treat that as a colon). */
	7806	if (*p == ':') {
	7807	p++;
	7808	has_opening_colon = TRUE;
	7809	}
	7810	else if (*p == ';') {
	7811	found_problem = TRUE;
	7812	p++;
	7813	ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
	7814	has_opening_colon = TRUE;
	7815	}
	7816	else {
	7817	found_problem = TRUE;
	7818	ADD_POSIX_WARNING(p, "there must be a starting ':'");
	7819
	7820	/* Consider an initial punctuation (not one of the recognized ones) to
	7821	* be a left terminator */
	7822	if (p != '^' && p != ']' && isPUNCT(*p)) {
	7823	p++;
	7824	}
	7825	}
	7826
	7827	/* They may think that you can put spaces between the components */
	7828	if (isBLANK(*p)) {
	7829	found_problem = TRUE;
	7830
	7831	do {
	7832	p++;
	7833	} while (p < e && isBLANK(*p));
	7834
	7835	ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
	7836	}
	7837
	7838	if (*p == '^') {
	7839
	7840	/* We consider something like [^:^alnum:]] to not have been intended to
	7841	* be a posix class, but XXX maybe we should */
	7842	if (complement) {
	7843	CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
	7844	}
	7845
	7846	complement = 1;
	7847	p++;
	7848	}
	7849
	7850	/* Again, they may think that you can put spaces between the components */
	7851	if (isBLANK(*p)) {
	7852	found_problem = TRUE;
	7853
	7854	do {
	7855	p++;
	7856	} while (p < e && isBLANK(*p));
	7857
	7858	ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
	7859	}
	7860
	7861	if (*p == ']') {
	7862
	7863	/* XXX This ']' may be a typo, and something else was meant. But
	7864	* treating it as such creates enough complications, that that
	7865	* possibility isn't currently considered here. So we assume that the
	7866	* ']' is what is intended, and if we've already found an initial '[',
	7867	* this leaves this construct looking like [:] or [:^], which almost
	7868	* certainly weren't intended to be posix classes */
	7869	if (has_opening_bracket) {
	7870	CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
	7871	}
	7872
	7873	/* But this function can be called when we parse the colon for
	7874	* something like qr/[alpha:]]/, so we back up to look for the
	7875	* beginning */
	7876	p--;
	7877
	7878	if (*p == ';') {
	7879	found_problem = TRUE;
	7880	ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
	7881	}
	7882	else if (*p != ':') {
	7883
	7884	/* XXX We are currently very restrictive here, so this code doesn't
	7885	* consider the possibility that, say, /[alpha.]]/ was intended to
	7886	* be a posix class. */
	7887	CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
	7888	}
	7889
	7890	/* Here we have something like 'foo:]'. There was no initial colon,
	7891	* and we back up over 'foo. XXX Unlike the going forward case, we
	7892	* don't handle typos of non-word chars in the middle */
	7893	has_opening_colon = FALSE;
	7894	p--;
	7895
	7896	while (p > RExC_start && isWORDCHAR(*p)) {
	7897	p--;
	7898	}
	7899	p++;
	7900
	7901	/* Here, we have positioned ourselves to where we think the first
	7902	* character in the potential class is */
	7903	}
	7904
	7905	/* Now the interior really starts. There are certain key characters that
	7906	* can end the interior, or these could just be typos. To catch both
	7907	* cases, we may have to do two passes. In the first pass, we keep on
	7908	* going unless we come to a sequence that matches
	7909	* qr/ [[:punct:]] [[:blank:]]* \] /xa
	7910	* This means it takes a sequence to end the pass, so two typos in a row if
	7911	* that wasn't what was intended. If the class is perfectly formed, just
	7912	* this one pass is needed. We also stop if there are too many characters
	7913	* being accumulated, but this number is deliberately set higher than any
	7914	* real class. It is set high enough so that someone who thinks that
	7915	* 'alphanumeric' is a correct name would get warned that it wasn't.
	7916	* While doing the pass, we keep track of where the key characters were in
	7917	* it. If we don't find an end to the class, and one of the key characters
	7918	* was found, we redo the pass, but stop when we get to that character.
	7919	* Thus the key character was considered a typo in the first pass, but a
	7920	* terminator in the second. If two key characters are found, we stop at
	7921	* the second one in the first pass. Again this can miss two typos, but
	7922	* catches a single one
	7923	*
	7924	* In the first pass, 'possible_end' starts as NULL, and then gets set to
	7925	* point to the first key character. For the second pass, it starts as -1.
	7926	* */
	7927
	7928	name_start = p;
	7929	parse_name:
	7930	{
	7931	bool has_blank = FALSE;
	7932	bool has_upper = FALSE;
	7933	bool has_terminating_colon = FALSE;
	7934	bool has_terminating_bracket = FALSE;
	7935	bool has_semi_colon = FALSE;
	7936	unsigned int name_len = 0;
	7937	int punct_count = 0;
	7938
	7939	while (p < e) {
	7940
	7941	/* Squeeze out blanks when looking up the class name below */
	7942	if (isBLANK(*p) ) {
	7943	has_blank = TRUE;
	7944	found_problem = TRUE;
	7945	p++;
	7946	continue;
	7947	}
	7948
	7949	/* The name will end with a punctuation */
	7950	if (isPUNCT(*p)) {
	7951	const char * peek = p + 1;
	7952
	7953	/* Treat any non-']' punctuation followed by a ']' (possibly
	7954	* with intervening blanks) as trying to terminate the class.
	7955	* ']]' is very likely to mean a class was intended (but
	7956	* missing the colon), but the warning message that gets
	7957	* generated shows the error position better if we exit the
	7958	* loop at the bottom (eventually), so skip it here. */
	7959	if (*p != ']') {
	7960	if (peek < e && isBLANK(*peek)) {
	7961	has_blank = TRUE;
	7962	found_problem = TRUE;
	7963	do {
	7964	peek++;
	7965	} while (peek < e && isBLANK(*peek));
	7966	}
	7967
	7968	if (peek < e && *peek == ']') {
	7969	has_terminating_bracket = TRUE;
	7970	if (*p == ':') {
	7971	has_terminating_colon = TRUE;
	7972	}
	7973	else if (*p == ';') {
	7974	has_semi_colon = TRUE;
	7975	has_terminating_colon = TRUE;
	7976	}
	7977	else {
	7978	found_problem = TRUE;
	7979	}
	7980	p = peek + 1;
	7981	goto try_posix;
	7982	}
	7983	}
	7984
	7985	/* Here we have punctuation we thought didn't end the class.
	7986	* Keep track of the position of the key characters that are
	7987	* more likely to have been class-enders */
	7988	if (p == ']' \|\| p == '[' \|\| p == ':' \|\| p == ';') {
	7989
	7990	/* Allow just one such possible class-ender not actually
	7991	* ending the class. */
	7992	if (possible_end) {
	7993	break;
	7994	}
	7995	possible_end = p;
	7996	}
	7997
	7998	/* If we have too many punctuation characters, no use in
	7999	* keeping going */
	8000	if (++punct_count > max_distance) {
	8001	break;
	8002	}
	8003
	8004	/* Treat the punctuation as a typo. */
	8005	input_text[name_len++] = *p;
	8006	p++;
	8007	}
	8008	else if (isUPPER(p)) { / Use lowercase for lookup */
	8009	input_text[name_len++] = toLOWER(*p);
	8010	has_upper = TRUE;
	8011	found_problem = TRUE;
	8012	p++;
	8013	} else if (! UTF \|\| UTF8_IS_INVARIANT(*p)) {
	8014	input_text[name_len++] = *p;
	8015	p++;
	8016	}
	8017	else {
	8018	input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
	8019	p+= UTF8SKIP(p);
	8020	}
	8021
	8022	/* The declaration of 'input_text' is how long we allow a potential
	8023	* class name to be, before saying they didn't mean a class name at
	8024	* all */
	8025	if (name_len >= C_ARRAY_LENGTH(input_text)) {
	8026	break;
	8027	}
	8028	}
	8029
	8030	/* We get to here when the possible class name hasn't been properly
	8031	* terminated before:
	8032	* 1) we ran off the end of the pattern; or
	8033	* 2) found two characters, each of which might have been intended to
	8034	* be the name's terminator
	8035	* 3) found so many punctuation characters in the purported name,
	8036	* that the edit distance to a valid one is exceeded
	8037	* 4) we decided it was more characters than anyone could have
	8038	* intended to be one. */
	8039
	8040	found_problem = TRUE;
	8041
	8042	/* In the final two cases, we know that looking up what we've
	8043	* accumulated won't lead to a match, even a fuzzy one. */
	8044	if ( name_len >= C_ARRAY_LENGTH(input_text)
	8045	\|\| punct_count > max_distance)
	8046	{
	8047	/* If there was an intermediate key character that could have been
	8048	* an intended end, redo the parse, but stop there */
	8049	if (possible_end && possible_end != (char *) -1) {
	8050	possible_end = (char ) -1; / Special signal value to say
	8051	we've done a first pass */
	8052	p = name_start;
	8053	goto parse_name;
	8054	}
	8055
	8056	/* Otherwise, it can't have meant to have been a class */
	8057	CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
	8058	}
	8059
	8060	/* If we ran off the end, and the final character was a punctuation
	8061	* one, back up one, to look at that final one just below. Later, we
	8062	* will restore the parse pointer if appropriate */
	8063	if (name_len && p == e && isPUNCT(*(p-1))) {
	8064	p--;
	8065	name_len--;
	8066	}
	8067
	8068	if (p < e && isPUNCT(*p)) {
	8069	if (*p == ']') {
	8070	has_terminating_bracket = TRUE;
	8071
	8072	/* If this is a 2nd ']', and the first one is just below this
	8073	* one, consider that to be the real terminator. This gives a
	8074	* uniform and better positioning for the warning message */
	8075	if ( possible_end
	8076	&& possible_end != (char *) -1
	8077	&& *possible_end == ']'
	8078	&& name_len && input_text[name_len - 1] == ']')
	8079	{
	8080	name_len--;
	8081	p = possible_end;
	8082
	8083	/* And this is actually equivalent to having done the 2nd
	8084	* pass now, so set it to not try again */
	8085	possible_end = (char *) -1;
	8086	}
	8087	}
	8088	else {
	8089	if (*p == ':') {
	8090	has_terminating_colon = TRUE;
	8091	}
	8092	else if (*p == ';') {
	8093	has_semi_colon = TRUE;
	8094	has_terminating_colon = TRUE;
	8095	}
	8096	p++;
	8097	}
	8098	}
	8099
	8100	try_posix:
	8101
	8102	/* Here, we have a class name to look up. We can short circuit the
	8103	* stuff below for short names that can't possibly be meant to be a
	8104	* class name. (We can do this on the first pass, as any second pass
	8105	* will yield an even shorter name) */
	8106	if (name_len < 3) {
	8107	CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
	8108	}
	8109
	8110	/* Find which class it is. Initially switch on the length of the name.
	8111	* */
	8112	switch (name_len) {
	8113	case 4:
	8114	if (memEQs(name_start, 4, "word")) {
	8115	/* this is not POSIX, this is the Perl \w */
	8116	class_number = ANYOF_WORDCHAR;
	8117	}
	8118	break;
	8119	case 5:
	8120	/* Names all of length 5: alnum alpha ascii blank cntrl digit
	8121	* graph lower print punct space upper
	8122	* Offset 4 gives the best switch position. */
	8123	switch (name_start[4]) {
	8124	case 'a':
	8125	if (memBEGINs(name_start, 5, "alph")) /* alpha */
	8126	class_number = ANYOF_ALPHA;
	8127	break;
	8128	case 'e':
	8129	if (memBEGINs(name_start, 5, "spac")) /* space */
	8130	class_number = ANYOF_SPACE;
	8131	break;
	8132	case 'h':
	8133	if (memBEGINs(name_start, 5, "grap")) /* graph */
	8134	class_number = ANYOF_GRAPH;
	8135	break;
	8136	case 'i':
	8137	if (memBEGINs(name_start, 5, "asci")) /* ascii */
	8138	class_number = ANYOF_ASCII;
	8139	break;
	8140	case 'k':
	8141	if (memBEGINs(name_start, 5, "blan")) /* blank */
	8142	class_number = ANYOF_BLANK;
	8143	break;
	8144	case 'l':
	8145	if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
	8146	class_number = ANYOF_CNTRL;
	8147	break;
	8148	case 'm':
	8149	if (memBEGINs(name_start, 5, "alnu")) /* alnum */
	8150	class_number = ANYOF_ALPHANUMERIC;
	8151	break;
	8152	case 'r':
	8153	if (memBEGINs(name_start, 5, "lowe")) /* lower */
	8154	class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
	8155	else if (memBEGINs(name_start, 5, "uppe")) /* upper */
	8156	class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
	8157	break;
	8158	case 't':
	8159	if (memBEGINs(name_start, 5, "digi")) /* digit */
	8160	class_number = ANYOF_DIGIT;
	8161	else if (memBEGINs(name_start, 5, "prin")) /* print */
	8162	class_number = ANYOF_PRINT;
	8163	else if (memBEGINs(name_start, 5, "punc")) /* punct */
	8164	class_number = ANYOF_PUNCT;
	8165	break;
	8166	}
	8167	break;
	8168	case 6:
	8169	if (memEQs(name_start, 6, "xdigit"))
	8170	class_number = ANYOF_XDIGIT;
	8171	break;
	8172	}
	8173
	8174	/* If the name exactly matches a posix class name the class number will
	8175	* here be set to it, and the input almost certainly was meant to be a
	8176	* posix class, so we can skip further checking. If instead the syntax
	8177	* is exactly correct, but the name isn't one of the legal ones, we
	8178	* will return that as an error below. But if neither of these apply,
	8179	* it could be that no posix class was intended at all, or that one
	8180	* was, but there was a typo. We tease these apart by doing fuzzy
	8181	* matching on the name */
	8182	if (class_number == OOB_NAMEDCLASS && found_problem) {
	8183	const UV posix_names[][6] = {
	8184	{ 'a', 'l', 'n', 'u', 'm' },
	8185	{ 'a', 'l', 'p', 'h', 'a' },
	8186	{ 'a', 's', 'c', 'i', 'i' },
	8187	{ 'b', 'l', 'a', 'n', 'k' },
	8188	{ 'c', 'n', 't', 'r', 'l' },
	8189	{ 'd', 'i', 'g', 'i', 't' },
	8190	{ 'g', 'r', 'a', 'p', 'h' },
	8191	{ 'l', 'o', 'w', 'e', 'r' },
	8192	{ 'p', 'r', 'i', 'n', 't' },
	8193	{ 'p', 'u', 'n', 'c', 't' },
	8194	{ 's', 'p', 'a', 'c', 'e' },
	8195	{ 'u', 'p', 'p', 'e', 'r' },
	8196	{ 'w', 'o', 'r', 'd' },
	8197	{ 'x', 'd', 'i', 'g', 'i', 't' }
	8198	};
	8199	/* The names of the above all have added NULs to make them the same
	8200	* size, so we need to also have the real lengths */
	8201	const UV posix_name_lengths[] = {
	8202	sizeof("alnum") - 1,
	8203	sizeof("alpha") - 1,
	8204	sizeof("ascii") - 1,
	8205	sizeof("blank") - 1,
	8206	sizeof("cntrl") - 1,
	8207	sizeof("digit") - 1,
	8208	sizeof("graph") - 1,
	8209	sizeof("lower") - 1,
	8210	sizeof("print") - 1,
	8211	sizeof("punct") - 1,
	8212	sizeof("space") - 1,
	8213	sizeof("upper") - 1,
	8214	sizeof("word") - 1,
	8215	sizeof("xdigit")- 1
	8216	};
	8217	unsigned int i;
	8218	int temp_max = max_distance; /* Use a temporary, so if we
	8219	reparse, we haven't changed the
	8220	outer one */
	8221
	8222	/* Use a smaller max edit distance if we are missing one of the
	8223	* delimiters */
	8224	if ( has_opening_bracket + has_opening_colon < 2
	8225	\|\| has_terminating_bracket + has_terminating_colon < 2)
	8226	{
	8227	temp_max--;
	8228	}
	8229
	8230	/* See if the input name is close to a legal one */
	8231	for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
	8232
	8233	/* Short circuit call if the lengths are too far apart to be
	8234	* able to match */
	8235	if (abs( (int) (name_len - posix_name_lengths[i]))
	8236	> temp_max)
	8237	{
	8238	continue;
	8239	}
	8240
	8241	if (edit_distance(input_text,
	8242	posix_names[i],
	8243	name_len,
	8244	posix_name_lengths[i],
	8245	temp_max
	8246	)
	8247	> -1)
	8248	{ /* If it is close, it probably was intended to be a class */
	8249	goto probably_meant_to_be;
	8250	}
	8251	}
	8252
	8253	/* Here the input name is not close enough to a valid class name
	8254	* for us to consider it to be intended to be a posix class. If
	8255	* we haven't already done so, and the parse found a character that
	8256	* could have been terminators for the name, but which we absorbed
	8257	* as typos during the first pass, repeat the parse, signalling it
	8258	* to stop at that character */
	8259	if (possible_end && possible_end != (char *) -1) {
	8260	possible_end = (char *) -1;
	8261	p = name_start;
	8262	goto parse_name;
	8263	}
	8264
	8265	/* Here neither pass found a close-enough class name */
	8266	CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
	8267	}
	8268
	8269	probably_meant_to_be:
	8270
	8271	/* Here we think that a posix specification was intended. Update any
	8272	* parse pointer */
	8273	if (updated_parse_ptr) {
	8274	updated_parse_ptr = (char ) p;
	8275	}
	8276
	8277	/* If a posix class name was intended but incorrectly specified, we
	8278	* output or return the warnings */
	8279	if (found_problem) {
	8280
	8281	/* We set flags for these issues in the parse loop above instead of
	8282	* adding them to the list of warnings, because we can parse it
	8283	* twice, and we only want one warning instance */
	8284	if (has_upper) {
	8285	ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
	8286	}
	8287	if (has_blank) {
	8288	ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
	8289	}
	8290	if (has_semi_colon) {
	8291	ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
	8292	}
	8293	else if (! has_terminating_colon) {
	8294	ADD_POSIX_WARNING(p, "there is no terminating ':'");
	8295	}
	8296	if (! has_terminating_bracket) {
	8297	ADD_POSIX_WARNING(p, "there is no terminating ']'");
	8298	}
	8299
	8300	if ( posix_warnings
	8301	&& RExC_warn_text
	8302	&& av_count(RExC_warn_text) > 0)
	8303	{
	8304	*posix_warnings = RExC_warn_text;
	8305	}
	8306	}
	8307	else if (class_number != OOB_NAMEDCLASS) {
	8308	/* If it is a known class, return the class. The class number
	8309	* #defines are structured so each complement is +1 to the normal
	8310	* one */
	8311	CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
	8312	}
	8313	else if (! check_only) {
	8314
	8315	/* Here, it is an unrecognized class. This is an error (unless the
	8316	* call is to check only, which we've already handled above) */
	8317	const char * const complement_string = (complement)
	8318	? "^"
	8319	: "";
	8320	RExC_parse_set((char *) p);
	8321	vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
	8322	complement_string,
	8323	UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
	8324	}
	8325	}
	8326
	8327	return OOB_NAMEDCLASS;
	8328	}
	8329	#undef ADD_POSIX_WARNING
	8330
	8331	STATIC unsigned int
	8332	S_regex_set_precedence(const U8 my_operator) {
	8333
	8334	/* Returns the precedence in the (?[...]) construct of the input operator,
	8335	* specified by its character representation. The precedence follows
	8336	* general Perl rules, but it extends this so that ')' and ']' have (low)
	8337	* precedence even though they aren't really operators */
	8338
	8339	switch (my_operator) {
	8340	case '!':
	8341	return 5;
	8342	case '&':
	8343	return 4;
	8344	case '^':
	8345	case '\|':
	8346	case '+':
	8347	case '-':
	8348	return 3;
	8349	case ')':
	8350	return 2;
	8351	case ']':
	8352	return 1;
	8353	}
	8354
	8355	NOT_REACHED; /* NOTREACHED */
	8356	return 0; /* Silence compiler warning */
	8357	}
	8358
	8359	STATIC regnode_offset
	8360	S_handle_regex_sets(pTHX_ RExC_state_t pRExC_state, SV* return_invlist,
	8361	I32 *flagp, U32 depth)
	8362	{
	8363	/* Handle the (?[...]) construct to do set operations */
	8364
	8365	U8 curchar; /* Current character being parsed */
	8366	UV start, end; /* End points of code point ranges */
	8367	SV* final = NULL; /* The end result inversion list */
	8368	SV* result_string; /* 'final' stringified */
	8369	AV* stack; /* stack of operators and operands not yet
	8370	resolved */
	8371	AV* fence_stack = NULL; /* A stack containing the positions in
	8372	'stack' of where the undealt-with left
	8373	parens would be if they were actually
	8374	put there */
	8375	/* The 'volatile' is a workaround for an optimiser bug
	8376	* in Solaris Studio 12.3. See RT #127455 */
	8377	volatile IV fence = 0; /* Position of where most recent undealt-
	8378	with left paren in stack is; -1 if none.
	8379	*/
	8380	STRLEN len; /* Temporary */
	8381	regnode_offset node; /* Temporary, and final regnode returned by
	8382	this function */
	8383	const bool save_fold = FOLD; /* Temporary */
	8384	char save_end, save_parse; /* Temporaries */
	8385	const bool in_locale = LOC; /* we turn off /l during processing */
	8386
	8387	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	8388
	8389	PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
	8390
	8391	DEBUG_PARSE("xcls");
	8392
	8393	if (in_locale) {
	8394	set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
	8395	}
	8396
	8397	/* The use of this operator implies /u. This is required so that the
	8398	* compile time values are valid in all runtime cases */
	8399	REQUIRE_UNI_RULES(flagp, 0);
	8400
	8401	/* Everything in this construct is a metacharacter. Operands begin with
	8402	* either a '\' (for an escape sequence), or a '[' for a bracketed
	8403	* character class. Any other character should be an operator, or
	8404	* parenthesis for grouping. Both types of operands are handled by calling
	8405	* regclass() to parse them. It is called with a parameter to indicate to
	8406	* return the computed inversion list. The parsing here is implemented via
	8407	* a stack. Each entry on the stack is a single character representing one
	8408	* of the operators; or else a pointer to an operand inversion list. */
	8409
	8410	#define IS_OPERATOR(a) SvIOK(a)
	8411	#define IS_OPERAND(a) (! IS_OPERATOR(a))
	8412
	8413	/* The stack is kept in Łukasiewicz order. (That's pronounced similar
	8414	* to luke-a-shave-itch (or -itz), but people who didn't want to bother
	8415	* with pronouncing it called it Reverse Polish instead, but now that YOU
	8416	* know how to pronounce it you can use the correct term, thus giving due
	8417	* credit to the person who invented it, and impressing your geek friends.
	8418	* Wikipedia says that the pronunciation of "Ł" has been changing so that
	8419	* it is now more like an English initial W (as in wonk) than an L.)
	8420	*
	8421	* This means that, for example, 'a \| b & c' is stored on the stack as
	8422	*
	8423	* c [4]
	8424	* b [3]
	8425	* & [2]
	8426	* a [1]
	8427	* \| [0]
	8428	*
	8429	* where the numbers in brackets give the stack [array] element number.
	8430	* In this implementation, parentheses are not stored on the stack.
	8431	* Instead a '(' creates a "fence" so that the part of the stack below the
	8432	* fence is invisible except to the corresponding ')' (this allows us to
	8433	* replace testing for parens, by using instead subtraction of the fence
	8434	* position). As new operands are processed they are pushed onto the stack
	8435	* (except as noted in the next paragraph). New operators of higher
	8436	* precedence than the current final one are inserted on the stack before
	8437	* the lhs operand (so that when the rhs is pushed next, everything will be
	8438	* in the correct positions shown above. When an operator of equal or
	8439	* lower precedence is encountered in parsing, all the stacked operations
	8440	* of equal or higher precedence are evaluated, leaving the result as the
	8441	* top entry on the stack. This makes higher precedence operations
	8442	* evaluate before lower precedence ones, and causes operations of equal
	8443	* precedence to left associate.
	8444	*
	8445	* The only unary operator '!' is immediately pushed onto the stack when
	8446	* encountered. When an operand is encountered, if the top of the stack is
	8447	* a '!", the complement is immediately performed, and the '!' popped. The
	8448	* resulting value is treated as a new operand, and the logic in the
	8449	* previous paragraph is executed. Thus in the expression
	8450	* [a] + ! [b]
	8451	* the stack looks like
	8452	*
	8453	* !
	8454	* a
	8455	* +
	8456	*
	8457	* as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
	8458	* becomes
	8459	*
	8460	* !b
	8461	* a
	8462	* +
	8463	*
	8464	* A ')' is treated as an operator with lower precedence than all the
	8465	* aforementioned ones, which causes all operations on the stack above the
	8466	* corresponding '(' to be evaluated down to a single resultant operand.
	8467	* Then the fence for the '(' is removed, and the operand goes through the
	8468	* algorithm above, without the fence.
	8469	*
	8470	* A separate stack is kept of the fence positions, so that the position of
	8471	* the latest so-far unbalanced '(' is at the top of it.
	8472	*
	8473	* The ']' ending the construct is treated as the lowest operator of all,
	8474	* so that everything gets evaluated down to a single operand, which is the
	8475	* result */
	8476
	8477	stack = (AV*)newSV_type_mortal(SVt_PVAV);
	8478	fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
	8479
	8480	while (RExC_parse < RExC_end) {
	8481	I32 top_index; /* Index of top-most element in 'stack' */
	8482	SV** top_ptr; /* Pointer to top 'stack' element */
	8483	SV* current = NULL; /* To contain the current inversion list
	8484	operand */
	8485	SV* only_to_avoid_leaks;
	8486
	8487	skip_to_be_ignored_text(pRExC_state, &RExC_parse,
	8488	TRUE /* Force /x */ );
	8489	if (RExC_parse >= RExC_end) { /* Fail */
	8490	break;
	8491	}
	8492
	8493	curchar = UCHARAT(RExC_parse);
	8494
	8495	redo_curchar:
	8496
	8497	#ifdef ENABLE_REGEX_SETS_DEBUGGING
	8498	/* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
	8499	DEBUG_U(dump_regex_sets_structures(pRExC_state,
	8500	stack, fence, fence_stack));
	8501	#endif
	8502
	8503	top_index = av_tindex_skip_len_mg(stack);
	8504
	8505	switch (curchar) {
	8506	SV** stacked_ptr; /* Ptr to something already on 'stack' */
	8507	char stacked_operator; /* The topmost operator on the 'stack'. */
	8508	SV* lhs; /* Operand to the left of the operator */
	8509	SV* rhs; /* Operand to the right of the operator */
	8510	SV* fence_ptr; /* Pointer to top element of the fence
	8511	stack */
	8512	case '(':
	8513
	8514	if ( RExC_parse < RExC_end - 2
	8515	&& UCHARAT(RExC_parse + 1) == '?'
	8516	&& strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
	8517	{
	8518	const regnode_offset orig_emit = RExC_emit;
	8519	SV * resultant_invlist;
	8520
	8521	/* Here it could be an embedded '(?flags:(?[...])'.
	8522	* This happens when we have some thing like
	8523	*
	8524	* my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
	8525	* ...
	8526	* qr/(?[ \p{Digit} & $thai_or_lao ])/;
	8527	*
	8528	* Here we would be handling the interpolated
	8529	* '$thai_or_lao'. We handle this by a recursive call to
	8530	* reg which returns the inversion list the
	8531	* interpolated expression evaluates to. Actually, the
	8532	* return is a special regnode containing a pointer to that
	8533	* inversion list. If the return isn't that regnode alone,
	8534	* we know that this wasn't such an interpolation, which is
	8535	* an error: we need to get a single inversion list back
	8536	* from the recursion */
	8537
	8538	RExC_parse_inc_by(1);
	8539	RExC_sets_depth++;
	8540
	8541	node = reg(pRExC_state, 2, flagp, depth+1);
	8542	RETURN_FAIL_ON_RESTART(*flagp, flagp);
	8543
	8544	if ( OP(REGNODE_p(node)) != REGEX_SET
	8545	/* If more than a single node returned, the nested
	8546	* parens evaluated to more than just a (?[...]),
	8547	* which isn't legal */
	8548	\|\| RExC_emit != orig_emit
	8549	+ NODE_STEP_REGNODE
	8550	+ REGNODE_ARG_LEN(REGEX_SET))
	8551	{
	8552	vFAIL("Expecting interpolated extended charclass");
	8553	}
	8554	resultant_invlist = (SV *) ARGp(REGNODE_p(node));
	8555	current = invlist_clone(resultant_invlist, NULL);
	8556	SvREFCNT_dec(resultant_invlist);
	8557
	8558	RExC_sets_depth--;
	8559	RExC_emit = orig_emit;
	8560	goto handle_operand;
	8561	}
	8562
	8563	/* A regular '('. Look behind for illegal syntax */
	8564	if (top_index - fence >= 0) {
	8565	/* If the top entry on the stack is an operator, it had
	8566	* better be a '!', otherwise the entry below the top
	8567	* operand should be an operator */
	8568	if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
	8569	\|\| (IS_OPERATOR(top_ptr) && SvUV(top_ptr) != '!')
	8570	\|\| ( IS_OPERAND(*top_ptr)
	8571	&& ( top_index - fence < 1
	8572	\|\| ! (stacked_ptr = av_fetch(stack,
	8573	top_index - 1,
	8574	FALSE))
	8575	\|\| ! IS_OPERATOR(*stacked_ptr))))
	8576	{
	8577	RExC_parse_inc_by(1);
	8578	vFAIL("Unexpected '(' with no preceding operator");
	8579	}
	8580	}
	8581
	8582	/* Stack the position of this undealt-with left paren */
	8583	av_push_simple(fence_stack, newSViv(fence));
	8584	fence = top_index + 1;
	8585	break;
	8586
	8587	case '\\':
	8588	/* regclass() can only return RESTART_PARSE and NEED_UTF8 if
	8589	* multi-char folds are allowed. */
	8590	if (!regclass(pRExC_state, flagp, depth+1,
	8591	TRUE, /* means parse just the next thing */
	8592	FALSE, /* don't allow multi-char folds */
	8593	FALSE, /* don't silence non-portable warnings. */
	8594	TRUE, /* strict */
	8595	FALSE, /* Require return to be an ANYOF */
	8596	&current))
	8597	{
	8598	RETURN_FAIL_ON_RESTART(*flagp, flagp);
	8599	goto regclass_failed;
	8600	}
	8601
	8602	assert(current);
	8603
	8604	/* regclass() will return with parsing just the \ sequence,
	8605	* leaving the parse pointer at the next thing to parse */
	8606	RExC_parse--;
	8607	goto handle_operand;
	8608
	8609	case '[': /* Is a bracketed character class */
	8610	{
	8611	/* See if this is a [:posix:] class. */
	8612	bool is_posix_class = (OOB_NAMEDCLASS
	8613	< handle_possible_posix(pRExC_state,
	8614	RExC_parse + 1,
	8615	NULL,
	8616	NULL,
	8617	TRUE /* checking only */));
	8618	/* If it is a posix class, leave the parse pointer at the '['
	8619	* to fool regclass() into thinking it is part of a
	8620	* '[[:posix:]]'. */
	8621	if (! is_posix_class) {
	8622	RExC_parse_inc_by(1);
	8623	}
	8624
	8625	/* regclass() can only return RESTART_PARSE and NEED_UTF8 if
	8626	* multi-char folds are allowed. */
	8627	if (!regclass(pRExC_state, flagp, depth+1,
	8628	is_posix_class, /* parse the whole char
	8629	class only if not a
	8630	posix class */
	8631	FALSE, /* don't allow multi-char folds */
	8632	TRUE, /* silence non-portable warnings. */
	8633	TRUE, /* strict */
	8634	FALSE, /* Require return to be an ANYOF */
	8635	&current))
	8636	{
	8637	RETURN_FAIL_ON_RESTART(*flagp, flagp);
	8638	goto regclass_failed;
	8639	}
	8640
	8641	assert(current);
	8642
	8643	/* function call leaves parse pointing to the ']', except if we
	8644	* faked it */
	8645	if (is_posix_class) {
	8646	RExC_parse--;
	8647	}
	8648
	8649	goto handle_operand;
	8650	}
	8651
	8652	case ']':
	8653	if (top_index >= 1) {
	8654	goto join_operators;
	8655	}
	8656
	8657	/* Only a single operand on the stack: are done */
	8658	goto done;
	8659
	8660	case ')':
	8661	if (av_tindex_skip_len_mg(fence_stack) < 0) {
	8662	if (UCHARAT(RExC_parse - 1) == ']') {
	8663	break;
	8664	}
	8665	RExC_parse_inc_by(1);
	8666	vFAIL("Unexpected ')'");
	8667	}
	8668
	8669	/* If nothing after the fence, is missing an operand */
	8670	if (top_index - fence < 0) {
	8671	RExC_parse_inc_by(1);
	8672	goto bad_syntax;
	8673	}
	8674	/* If at least two things on the stack, treat this as an
	8675	* operator */
	8676	if (top_index - fence >= 1) {
	8677	goto join_operators;
	8678	}
	8679
	8680	/* Here only a single thing on the fenced stack, and there is a
	8681	* fence. Get rid of it */
	8682	fence_ptr = av_pop(fence_stack);
	8683	assert(fence_ptr);
	8684	fence = SvIV(fence_ptr);
	8685	SvREFCNT_dec_NN(fence_ptr);
	8686	fence_ptr = NULL;
	8687
	8688	if (fence < 0) {
	8689	fence = 0;
	8690	}
	8691
	8692	/* Having gotten rid of the fence, we pop the operand at the
	8693	* stack top and process it as a newly encountered operand */
	8694	current = av_pop(stack);
	8695	if (IS_OPERAND(current)) {
	8696	goto handle_operand;
	8697	}
	8698
	8699	RExC_parse_inc_by(1);
	8700	goto bad_syntax;
	8701
	8702	case '&':
	8703	case '\|':
	8704	case '+':
	8705	case '-':
	8706	case '^':
	8707
	8708	/* These binary operators should have a left operand already
	8709	* parsed */
	8710	if ( top_index - fence < 0
	8711	\|\| top_index - fence == 1
	8712	\|\| ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
	8713	\|\| ! IS_OPERAND(*top_ptr))
	8714	{
	8715	goto unexpected_binary;
	8716	}
	8717
	8718	/* If only the one operand is on the part of the stack visible
	8719	* to us, we just place this operator in the proper position */
	8720	if (top_index - fence < 2) {
	8721
	8722	/* Place the operator before the operand */
	8723
	8724	SV* lhs = av_pop(stack);
	8725	av_push_simple(stack, newSVuv(curchar));
	8726	av_push_simple(stack, lhs);
	8727	break;
	8728	}
	8729
	8730	/* But if there is something else on the stack, we need to
	8731	* process it before this new operator if and only if the
	8732	* stacked operation has equal or higher precedence than the
	8733	* new one */
	8734
	8735	join_operators:
	8736
	8737	/* The operator on the stack is supposed to be below both its
	8738	* operands */
	8739	if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
	8740	\|\| IS_OPERAND(*stacked_ptr))
	8741	{
	8742	/* But if not, it's legal and indicates we are completely
	8743	* done if and only if we're currently processing a ']',
	8744	* which should be the final thing in the expression */
	8745	if (curchar == ']') {
	8746	goto done;
	8747	}
	8748
	8749	unexpected_binary:
	8750	RExC_parse_inc_by(1);
	8751	vFAIL2("Unexpected binary operator '%c' with no "
	8752	"preceding operand", curchar);
	8753	}
	8754	stacked_operator = (char) SvUV(*stacked_ptr);
	8755
	8756	if (regex_set_precedence(curchar)
	8757	> regex_set_precedence(stacked_operator))
	8758	{
	8759	/* Here, the new operator has higher precedence than the
	8760	* stacked one. This means we need to add the new one to
	8761	* the stack to await its rhs operand (and maybe more
	8762	* stuff). We put it before the lhs operand, leaving
	8763	* untouched the stacked operator and everything below it
	8764	* */
	8765	lhs = av_pop(stack);
	8766	assert(IS_OPERAND(lhs));
	8767	av_push_simple(stack, newSVuv(curchar));
	8768	av_push_simple(stack, lhs);
	8769	break;
	8770	}
	8771
	8772	/* Here, the new operator has equal or lower precedence than
	8773	* what's already there. This means the operation already
	8774	* there should be performed now, before the new one. */
	8775
	8776	rhs = av_pop(stack);
	8777	if (! IS_OPERAND(rhs)) {
	8778
	8779	/* This can happen when a ! is not followed by an operand,
	8780	* like in /(?[\t &!])/ */
	8781	goto bad_syntax;
	8782	}
	8783
	8784	lhs = av_pop(stack);
	8785
	8786	if (! IS_OPERAND(lhs)) {
	8787
	8788	/* This can happen when there is an empty (), like in
	8789	* /(?[[0]+()+])/ */
	8790	goto bad_syntax;
	8791	}
	8792
	8793	switch (stacked_operator) {
	8794	case '&':
	8795	_invlist_intersection(lhs, rhs, &rhs);
	8796	break;
	8797
	8798	case '\|':
	8799	case '+':
	8800	_invlist_union(lhs, rhs, &rhs);
	8801	break;
	8802
	8803	case '-':
	8804	_invlist_subtract(lhs, rhs, &rhs);
	8805	break;
	8806
	8807	case '^': /* The union minus the intersection */
	8808	{
	8809	SV* i = NULL;
	8810	SV* u = NULL;
	8811
	8812	_invlist_union(lhs, rhs, &u);
	8813	_invlist_intersection(lhs, rhs, &i);
	8814	_invlist_subtract(u, i, &rhs);
	8815	SvREFCNT_dec_NN(i);
	8816	SvREFCNT_dec_NN(u);
	8817	break;
	8818	}
	8819	}
	8820	SvREFCNT_dec(lhs);
	8821
	8822	/* Here, the higher precedence operation has been done, and the
	8823	* result is in 'rhs'. We overwrite the stacked operator with
	8824	* the result. Then we redo this code to either push the new
	8825	* operator onto the stack or perform any higher precedence
	8826	* stacked operation */
	8827	only_to_avoid_leaks = av_pop(stack);
	8828	SvREFCNT_dec(only_to_avoid_leaks);
	8829	av_push_simple(stack, rhs);
	8830	goto redo_curchar;
	8831
	8832	case '!': /* Highest priority, right associative */
	8833
	8834	/* If what's already at the top of the stack is another '!",
	8835	* they just cancel each other out */
	8836	if ( (top_ptr = av_fetch(stack, top_index, FALSE))
	8837	&& (IS_OPERATOR(top_ptr) && SvUV(top_ptr) == '!'))
	8838	{
	8839	only_to_avoid_leaks = av_pop(stack);
	8840	SvREFCNT_dec(only_to_avoid_leaks);
	8841	}
	8842	else { /* Otherwise, since it's right associative, just push
	8843	onto the stack */
	8844	av_push_simple(stack, newSVuv(curchar));
	8845	}
	8846	break;
	8847
	8848	default:
	8849	RExC_parse_inc();
	8850	if (RExC_parse >= RExC_end) {
	8851	break;
	8852	}
	8853	vFAIL("Unexpected character");
	8854
	8855	handle_operand:
	8856
	8857	/* Here 'current' is the operand. If something is already on the
	8858	* stack, we have to check if it is a !. But first, the code above
	8859	* may have altered the stack in the time since we earlier set
	8860	* 'top_index'. */
	8861
	8862	top_index = av_tindex_skip_len_mg(stack);
	8863	if (top_index - fence >= 0) {
	8864	/* If the top entry on the stack is an operator, it had better
	8865	* be a '!', otherwise the entry below the top operand should
	8866	* be an operator */
	8867	top_ptr = av_fetch(stack, top_index, FALSE);
	8868	assert(top_ptr);
	8869	if (IS_OPERATOR(*top_ptr)) {
	8870
	8871	/* The only permissible operator at the top of the stack is
	8872	* '!', which is applied immediately to this operand. */
	8873	curchar = (char) SvUV(*top_ptr);
	8874	if (curchar != '!') {
	8875	SvREFCNT_dec(current);
	8876	vFAIL2("Unexpected binary operator '%c' with no "
	8877	"preceding operand", curchar);
	8878	}
	8879
	8880	_invlist_invert(current);
	8881
	8882	only_to_avoid_leaks = av_pop(stack);
	8883	SvREFCNT_dec(only_to_avoid_leaks);
	8884
	8885	/* And we redo with the inverted operand. This allows
	8886	* handling multiple ! in a row */
	8887	goto handle_operand;
	8888	}
	8889	/* Single operand is ok only for the non-binary ')'
	8890	* operator */
	8891	else if ((top_index - fence == 0 && curchar != ')')
	8892	\|\| (top_index - fence > 0
	8893	&& (! (stacked_ptr = av_fetch(stack,
	8894	top_index - 1,
	8895	FALSE))
	8896	\|\| IS_OPERAND(*stacked_ptr))))
	8897	{
	8898	SvREFCNT_dec(current);
	8899	vFAIL("Operand with no preceding operator");
	8900	}
	8901	}
	8902
	8903	/* Here there was nothing on the stack or the top element was
	8904	* another operand. Just add this new one */
	8905	av_push_simple(stack, current);
	8906
	8907	} /* End of switch on next parse token */
	8908
	8909	RExC_parse_inc();
	8910	} /* End of loop parsing through the construct */
	8911
	8912	vFAIL("Syntax error in (?[...])");
	8913
	8914	done:
	8915
	8916	if (RExC_parse >= RExC_end \|\| RExC_parse[1] != ')') {
	8917	if (RExC_parse < RExC_end) {
	8918	RExC_parse_inc_by(1);
	8919	}
	8920
	8921	vFAIL("Unexpected ']' with no following ')' in (?[...");
	8922	}
	8923
	8924	if (av_tindex_skip_len_mg(fence_stack) >= 0) {
	8925	vFAIL("Unmatched (");
	8926	}
	8927
	8928	if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
	8929	\|\| ((final = av_pop(stack)) == NULL)
	8930	\|\| ! IS_OPERAND(final)
	8931	\|\| ! is_invlist(final)
	8932	\|\| av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
	8933	{
	8934	bad_syntax:
	8935	SvREFCNT_dec(final);
	8936	vFAIL("Incomplete expression within '(?[ ])'");
	8937	}
	8938
	8939	/* Here, 'final' is the resultant inversion list from evaluating the
	8940	* expression. Return it if so requested */
	8941	if (return_invlist) {
	8942	*return_invlist = final;
	8943	return END;
	8944	}
	8945
	8946	if (RExC_sets_depth) { /* If within a recursive call, return in a special
	8947	regnode */
	8948	RExC_parse_inc_by(1);
	8949	node = regpnode(pRExC_state, REGEX_SET, final);
	8950	}
	8951	else {
	8952
	8953	/* Otherwise generate a resultant node, based on 'final'. regclass()
	8954	* is expecting a string of ranges and individual code points */
	8955	invlist_iterinit(final);
	8956	result_string = newSVpvs("");
	8957	while (invlist_iternext(final, &start, &end)) {
	8958	if (start == end) {
	8959	Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
	8960	}
	8961	else {
	8962	Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
	8963	UVXf "}", start, end);
	8964	}
	8965	}
	8966
	8967	/* About to generate an ANYOF (or similar) node from the inversion list
	8968	* we have calculated */
	8969	save_parse = RExC_parse;
	8970	RExC_parse_set(SvPV(result_string, len));
	8971	save_end = RExC_end;
	8972	RExC_end = RExC_parse + len;
	8973	TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
	8974
	8975	/* We turn off folding around the call, as the class we have
	8976	* constructed already has all folding taken into consideration, and we
	8977	* don't want regclass() to add to that */
	8978	RExC_flags &= ~RXf_PMf_FOLD;
	8979	/* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
	8980	* folds are allowed. */
	8981	node = regclass(pRExC_state, flagp, depth+1,
	8982	FALSE, /* means parse the whole char class */
	8983	FALSE, /* don't allow multi-char folds */
	8984	TRUE, /* silence non-portable warnings. The above may
	8985	very well have generated non-portable code
	8986	points, but they're valid on this machine */
	8987	FALSE, /* similarly, no need for strict */
	8988
	8989	/* We can optimize into something besides an ANYOF,
	8990	* except under /l, which needs to be ANYOF because of
	8991	* runtime checks for locale sanity, etc */
	8992	! in_locale,
	8993	NULL
	8994	);
	8995
	8996	RESTORE_WARNINGS;
	8997	RExC_parse_set(save_parse + 1);
	8998	RExC_end = save_end;
	8999	SvREFCNT_dec_NN(final);
	9000	SvREFCNT_dec_NN(result_string);
	9001
	9002	if (save_fold) {
	9003	RExC_flags \|= RXf_PMf_FOLD;
	9004	}
	9005
	9006	if (!node) {
	9007	RETURN_FAIL_ON_RESTART(*flagp, flagp);
	9008	goto regclass_failed;
	9009	}
	9010
	9011	/* Fix up the node type if we are in locale. (We have pretended we are
	9012	* under /u for the purposes of regclass(), as this construct will only
	9013	* work under UTF-8 locales. But now we change the opcode to be ANYOFL
	9014	* (so as to cause any warnings about bad locales to be output in
	9015	* regexec.c), and add the flag that indicates to check if not in a
	9016	* UTF-8 locale. The reason we above forbid optimization into
	9017	* something other than an ANYOF node is simply to minimize the number
	9018	* of code changes in regexec.c. Otherwise we would have to create new
	9019	* EXACTish node types and deal with them. This decision could be
	9020	* revisited should this construct become popular.
	9021	*
	9022	* (One might think we could look at the resulting ANYOF node and
	9023	* suppress the flag if everything is above 255, as those would be
	9024	* UTF-8 only, but this isn't true, as the components that led to that
	9025	* result could have been locale-affected, and just happen to cancel
	9026	* each other out under UTF-8 locales.) */
	9027	if (in_locale) {
	9028	set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
	9029
	9030	assert(OP(REGNODE_p(node)) == ANYOF);
	9031
	9032	OP(REGNODE_p(node)) = ANYOFL;
	9033	ANYOF_FLAGS(REGNODE_p(node)) \|= ANYOFL_UTF8_LOCALE_REQD;
	9034	}
	9035	}
	9036
	9037	nextchar(pRExC_state);
	9038	return node;
	9039
	9040	regclass_failed:
	9041	FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
	9042	(UV) *flagp);
	9043	}
	9044
	9045	#ifdef ENABLE_REGEX_SETS_DEBUGGING
	9046
	9047	STATIC void
	9048	S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
	9049	AV * stack, const IV fence, AV * fence_stack)
	9050	{ /* Dumps the stacks in handle_regex_sets() */
	9051
	9052	const SSize_t stack_top = av_tindex_skip_len_mg(stack);
	9053	const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
	9054	SSize_t i;
	9055
	9056	PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
	9057
	9058	PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
	9059
	9060	if (stack_top < 0) {
	9061	PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
	9062	}
	9063	else {
	9064	PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
	9065	for (i = stack_top; i >= 0; i--) {
	9066	SV ** element_ptr = av_fetch(stack, i, FALSE);
	9067	if (! element_ptr) {
	9068	}
	9069
	9070	if (IS_OPERATOR(*element_ptr)) {
	9071	PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
	9072	(int) i, (int) SvIV(*element_ptr));
	9073	}
	9074	else {
	9075	PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
	9076	sv_dump(*element_ptr);
	9077	}
	9078	}
	9079	}
	9080
	9081	if (fence_stack_top < 0) {
	9082	PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
	9083	}
	9084	else {
	9085	PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
	9086	for (i = fence_stack_top; i >= 0; i--) {
	9087	SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE);
	9088	if (! element_ptr) {
	9089	}
	9090
	9091	PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
	9092	(int) i, (int) SvIV(*element_ptr));
	9093	}
	9094	}
	9095	}
	9096
	9097	#endif
	9098
	9099	#undef IS_OPERATOR
	9100	#undef IS_OPERAND
	9101
	9102	#ifdef PERL_RE_BUILD_AUX
	9103	void
	9104	Perl_add_above_Latin1_folds(pTHX_ RExC_state_t pRExC_state, const U8 cp, SV* invlist)
	9105	{
	9106	/* This adds the Latin1/above-Latin1 folding rules.
	9107	*
	9108	* This should be called only for a Latin1-range code points, cp, which is
	9109	* known to be involved in a simple fold with other code points above
	9110	* Latin1. It would give false results if /aa has been specified.
	9111	* Multi-char folds are outside the scope of this, and must be handled
	9112	* specially. */
	9113
	9114	PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
	9115
	9116	assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
	9117
	9118	/* The rules that are valid for all Unicode versions are hard-coded in */
	9119	switch (cp) {
	9120	case 'k':
	9121	case 'K':
	9122	*invlist =
	9123	add_cp_to_invlist(*invlist, KELVIN_SIGN);
	9124	break;
	9125	case 's':
	9126	case 'S':
	9127	invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_LONG_S);
	9128	break;
	9129	case MICRO_SIGN:
	9130	invlist = add_cp_to_invlist(invlist, GREEK_CAPITAL_LETTER_MU);
	9131	invlist = add_cp_to_invlist(invlist, GREEK_SMALL_LETTER_MU);
	9132	break;
	9133	case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	9134	case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	9135	invlist = add_cp_to_invlist(invlist, ANGSTROM_SIGN);
	9136	break;
	9137	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	9138	invlist = add_cp_to_invlist(invlist,
	9139	LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	9140	break;
	9141
	9142	default: /* Other code points are checked against the data for the
	9143	current Unicode version */
	9144	{
	9145	Size_t folds_count;
	9146	U32 first_fold;
	9147	const U32 * remaining_folds;
	9148	UV folded_cp;
	9149
	9150	if (isASCII(cp)) {
	9151	folded_cp = toFOLD(cp);
	9152	}
	9153	else {
	9154	U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
	9155	Size_t dummy_len;
	9156	folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
	9157	}
	9158
	9159	if (folded_cp > 255) {
	9160	invlist = add_cp_to_invlist(invlist, folded_cp);
	9161	}
	9162
	9163	folds_count = _inverse_folds(folded_cp, &first_fold,
	9164	&remaining_folds);
	9165	if (folds_count == 0) {
	9166
	9167	/* Use deprecated warning to increase the chances of this being
	9168	* output */
	9169	ckWARN2reg_d(RExC_parse,
	9170	"Perl folding rules are not up-to-date for 0x%02X;"
	9171	" please use the perlbug utility to report;", cp);
	9172	}
	9173	else {
	9174	unsigned int i;
	9175
	9176	if (first_fold > 255) {
	9177	invlist = add_cp_to_invlist(invlist, first_fold);
	9178	}
	9179	for (i = 0; i < folds_count - 1; i++) {
	9180	if (remaining_folds[i] > 255) {
	9181	invlist = add_cp_to_invlist(invlist,
	9182	remaining_folds[i]);
	9183	}
	9184	}
	9185	}
	9186	break;
	9187	}
	9188	}
	9189	}
	9190	#endif /* PERL_RE_BUILD_AUX */
	9191
	9192
	9193	STATIC void
	9194	S_output_posix_warnings(pTHX_ RExC_state_t pRExC_state, AV posix_warnings)
	9195	{
	9196	/* Output the elements of the array given by '*posix_warnings' as REGEXP
	9197	* warnings. */
	9198
	9199	SV * msg;
	9200	const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
	9201
	9202	PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
	9203
	9204	if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
	9205	CLEAR_POSIX_WARNINGS();
	9206	return;
	9207	}
	9208
	9209	while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
	9210	if (first_is_fatal) { /* Avoid leaking this */
	9211	av_undef(posix_warnings); /* This isn't necessary if the
	9212	array is mortal, but is a
	9213	fail-safe */
	9214	(void) sv_2mortal(msg);
	9215	PREPARE_TO_DIE;
	9216	}
	9217	Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
	9218	SvREFCNT_dec_NN(msg);
	9219	}
	9220
	9221	UPDATE_WARNINGS_LOC(RExC_parse);
	9222	}
	9223
	9224	PERL_STATIC_INLINE Size_t
	9225	S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
	9226	{
	9227	const U8 * const start = s1;
	9228	const U8 * const send = start + max;
	9229
	9230	PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
	9231
	9232	while (s1 < send && s1 == s2) {
	9233	s1++; s2++;
	9234	}
	9235
	9236	return s1 - start;
	9237	}
	9238
	9239	STATIC AV *
	9240	S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
	9241	{
	9242	/* This adds the string scalar <multi_string> to the array
	9243	* <multi_char_matches>. <multi_string> is known to have exactly
	9244	* <cp_count> code points in it. This is used when constructing a
	9245	* bracketed character class and we find something that needs to match more
	9246	* than a single character.
	9247	*
	9248	* <multi_char_matches> is actually an array of arrays. Each top-level
	9249	* element is an array that contains all the strings known so far that are
	9250	* the same length. And that length (in number of code points) is the same
	9251	* as the index of the top-level array. Hence, the [2] element is an
	9252	* array, each element thereof is a string containing TWO code points;
	9253	* while element [3] is for strings of THREE characters, and so on. Since
	9254	* this is for multi-char strings there can never be a [0] nor [1] element.
	9255	*
	9256	* When we rewrite the character class below, we will do so such that the
	9257	* longest strings are written first, so that it prefers the longest
	9258	* matching strings first. This is done even if it turns out that any
	9259	* quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
	9260	* Christiansen has agreed that this is ok. This makes the test for the
	9261	* ligature 'ffi' come before the test for 'ff', for example */
	9262
	9263	AV* this_array;
	9264	AV** this_array_ptr;
	9265
	9266	PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
	9267
	9268	if (! multi_char_matches) {
	9269	multi_char_matches = newAV();
	9270	}
	9271
	9272	if (av_exists(multi_char_matches, cp_count)) {
	9273	this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE);
	9274	this_array = *this_array_ptr;
	9275	}
	9276	else {
	9277	this_array = newAV();
	9278	av_store_simple(multi_char_matches, cp_count,
	9279	(SV*) this_array);
	9280	}
	9281	av_push_simple(this_array, multi_string);
	9282
	9283	return multi_char_matches;
	9284	}
	9285
	9286	/* The names of properties whose definitions are not known at compile time are
	9287	* stored in this SV, after a constant heading. So if the length has been
	9288	* changed since initialization, then there is a run-time definition. */
	9289	#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
	9290	(SvCUR(listsv) != initial_listsv_len)
	9291
	9292	/* There is a restricted set of white space characters that are legal when
	9293	* ignoring white space in a bracketed character class. This generates the
	9294	* code to skip them.
	9295	*
	9296	* There is a line below that uses the same white space criteria but is outside
	9297	* this macro. Both here and there must use the same definition */
	9298	#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
	9299	STMT_START { \
	9300	if (do_skip) { \
	9301	while (p < stop_p && isBLANK_A(UCHARAT(p))) \
	9302	{ \
	9303	p++; \
	9304	} \
	9305	} \
	9306	} STMT_END
	9307
	9308	STATIC regnode_offset
	9309	S_regclass(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth,
	9310	const bool stop_at_1, /* Just parse the next thing, don't
	9311	look for a full character class */
	9312	bool allow_mutiple_chars,
	9313	const bool silence_non_portable, /* Don't output warnings
	9314	about too large
	9315	characters */
	9316	const bool strict,
	9317	bool optimizable, /* ? Allow a non-ANYOF return
	9318	node */
	9319	SV** ret_invlist /* Return an inversion list, not a node */
	9320	)
	9321	{
	9322	/* parse a bracketed class specification. Most of these will produce an
	9323	* ANYOF node; but something like [a] will produce an EXACT node; [aA], an
	9324	* EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
	9325	* under /i with multi-character folds: it will be rewritten following the
	9326	* paradigm of this example, where the <multi-fold>s are characters which
	9327	* fold to multiple character sequences:
	9328	* /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
	9329	* gets effectively rewritten as:
	9330	* /(?:\x{multi-fold1}\|\x{multi-fold2}\|[abcdefghi]/i
	9331	* reg() gets called (recursively) on the rewritten version, and this
	9332	* function will return what it constructs. (Actually the <multi-fold>s
	9333	* aren't physically removed from the [abcdefghi], it's just that they are
	9334	* ignored in the recursion by means of a flag:
	9335	* <RExC_in_multi_char_class>.)
	9336	*
	9337	* ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
	9338	* characters, with the corresponding bit set if that character is in the
	9339	* list. For characters above this, an inversion list is used. There
	9340	* are extra bits for \w, etc. in locale ANYOFs, as what these match is not
	9341	* determinable at compile time
	9342	*
	9343	* On success, returns the offset at which any next node should be placed
	9344	* into the regex engine program being compiled.
	9345	*
	9346	* Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
	9347	* to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
	9348	* UTF-8
	9349	*/
	9350
	9351	UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
	9352	IV range = 0;
	9353	UV value = OOB_UNICODE, save_value = OOB_UNICODE;
	9354	regnode_offset ret = -1; /* Initialized to an illegal value */
	9355	STRLEN numlen;
	9356	int namedclass = OOB_NAMEDCLASS;
	9357	char *rangebegin = NULL;
	9358	SV listsv = NULL; / List of \p{user-defined} whose definitions
	9359	aren't available at the time this was called */
	9360	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	9361	than just initialized. */
	9362	SV* properties = NULL; /* Code points that match \p{} \P{} */
	9363	SV* posixes = NULL; /* Code points that match classes like [:word:],
	9364	extended beyond the Latin1 range. These have to
	9365	be kept separate from other code points for much
	9366	of this function because their handling is
	9367	different under /i, and for most classes under
	9368	/d as well */
	9369	SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
	9370	separate for a while from the non-complemented
	9371	versions because of complications with /d
	9372	matching */
	9373	SV* simple_posixes = NULL; /* But under some conditions, the classes can be
	9374	treated more simply than the general case,
	9375	leading to less compilation and execution
	9376	work */
	9377	UV element_count = 0; /* Number of distinct elements in the class.
	9378	Optimizations may be possible if this is tiny */
	9379	AV * multi_char_matches = NULL; /* Code points that fold to more than one
	9380	character; used under /i */
	9381	UV n;
	9382	char * stop_ptr = RExC_end; /* where to stop parsing */
	9383
	9384	/* ignore unescaped whitespace? */
	9385	const bool skip_white = cBOOL( ret_invlist
	9386	\|\| (RExC_flags & RXf_PMf_EXTENDED_MORE));
	9387
	9388	/* inversion list of code points this node matches only when the target
	9389	* string is in UTF-8. These are all non-ASCII, < 256. (Because is under
	9390	* /d) */
	9391	SV* upper_latin1_only_utf8_matches = NULL;
	9392
	9393	/* Inversion list of code points this node matches regardless of things
	9394	* like locale, folding, utf8ness of the target string */
	9395	SV* cp_list = NULL;
	9396
	9397	/* Like cp_list, but code points on this list need to be checked for things
	9398	* that fold to/from them under /i */
	9399	SV* cp_foldable_list = NULL;
	9400
	9401	/* Like cp_list, but code points on this list are valid only when the
	9402	* runtime locale is UTF-8 */
	9403	SV* only_utf8_locale_list = NULL;
	9404
	9405	/* In a range, if one of the endpoints is non-character-set portable,
	9406	* meaning that it hard-codes a code point that may mean a different
	9407	* character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
	9408	* mnemonic '\t' which each mean the same character no matter which
	9409	* character set the platform is on. */
	9410	unsigned int non_portable_endpoint = 0;
	9411
	9412	/* Is the range unicode? which means on a platform that isn't 1-1 native
	9413	* to Unicode (i.e. non-ASCII), each code point in it should be considered
	9414	* to be a Unicode value. */
	9415	bool unicode_range = FALSE;
	9416	bool invert = FALSE; /* Is this class to be complemented */
	9417
	9418	bool warn_super = ALWAYS_WARN_SUPER;
	9419
	9420	const char * orig_parse = RExC_parse;
	9421
	9422	/* This variable is used to mark where the end in the input is of something
	9423	* that looks like a POSIX construct but isn't. During the parse, when
	9424	* something looks like it could be such a construct is encountered, it is
	9425	* checked for being one, but not if we've already checked this area of the
	9426	* input. Only after this position is reached do we check again */
	9427	char *not_posix_region_end = RExC_parse - 1;
	9428
	9429	AV* posix_warnings = NULL;
	9430	const bool do_posix_warnings = ckWARN(WARN_REGEXP);
	9431	U8 op = ANYOF; /* The returned node-type, initialized to the expected
	9432	type. */
	9433	U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
	9434	U32 posixl = 0; /* bit field of posix classes matched under /l */
	9435
	9436
	9437	/* Flags as to what things aren't knowable until runtime. (Note that these are
	9438	* mutually exclusive.) */
	9439	#define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
	9440	haven't been defined as of yet */
	9441	#define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
	9442	UTF-8 or not */
	9443	#define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
	9444	what gets folded */
	9445	U32 has_runtime_dependency = 0; /* OR of the above flags */
	9446
	9447	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	9448
	9449	PERL_ARGS_ASSERT_REGCLASS;
	9450	#ifndef DEBUGGING
	9451	PERL_UNUSED_ARG(depth);
	9452	#endif
	9453
	9454	assert(! (ret_invlist && allow_mutiple_chars));
	9455
	9456	/* If wants an inversion list returned, we can't optimize to something
	9457	* else. */
	9458	if (ret_invlist) {
	9459	optimizable = FALSE;
	9460	}
	9461
	9462	DEBUG_PARSE("clas");
	9463
	9464	#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
	9465	\|\| (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
	9466	&& UNICODE_DOT_DOT_VERSION == 0)
	9467	allow_mutiple_chars = FALSE;
	9468	#endif
	9469
	9470	/* We include the /i status at the beginning of this so that we can
	9471	* know it at runtime */
	9472	listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
	9473	initial_listsv_len = SvCUR(listsv);
	9474	SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
	9475
	9476	SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
	9477
	9478	assert(RExC_parse <= RExC_end);
	9479
	9480	if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
	9481	RExC_parse_inc_by(1);
	9482	invert = TRUE;
	9483	allow_mutiple_chars = FALSE;
	9484	MARK_NAUGHTY(1);
	9485	SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
	9486	}
	9487
	9488	/* Check that they didn't say [:posix:] instead of [[:posix:]] */
	9489	if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
	9490	int maybe_class = handle_possible_posix(pRExC_state,
	9491	RExC_parse,
	9492	&not_posix_region_end,
	9493	NULL,
	9494	TRUE /* checking only */);
	9495	if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
	9496	ckWARN4reg(not_posix_region_end,
	9497	"POSIX syntax [%c %c] belongs inside character classes%s",
	9498	RExC_parse, RExC_parse,
	9499	(maybe_class == OOB_NAMEDCLASS)
	9500	? ((POSIXCC_NOTYET(*RExC_parse))
	9501	? " (but this one isn't implemented)"
	9502	: " (but this one isn't fully valid)")
	9503	: ""
	9504	);
	9505	}
	9506	}
	9507
	9508	/* If the caller wants us to just parse a single element, accomplish this
	9509	* by faking the loop ending condition */
	9510	if (stop_at_1 && RExC_end > RExC_parse) {
	9511	stop_ptr = RExC_parse + 1;
	9512	}
	9513
	9514	/* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
	9515	if (UCHARAT(RExC_parse) == ']')
	9516	goto charclassloop;
	9517
	9518	while (1) {
	9519
	9520	if ( posix_warnings
	9521	&& av_tindex_skip_len_mg(posix_warnings) >= 0
	9522	&& RExC_parse > not_posix_region_end)
	9523	{
	9524	/* Warnings about posix class issues are considered tentative until
	9525	* we are far enough along in the parse that we can no longer
	9526	* change our mind, at which point we output them. This is done
	9527	* each time through the loop so that a later class won't zap them
	9528	* before they have been dealt with. */
	9529	output_posix_warnings(pRExC_state, posix_warnings);
	9530	}
	9531
	9532	SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
	9533
	9534	if (RExC_parse >= stop_ptr) {
	9535	break;
	9536	}
	9537
	9538	if (UCHARAT(RExC_parse) == ']') {
	9539	break;
	9540	}
	9541
	9542	charclassloop:
	9543
	9544	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	9545	save_value = value;
	9546	save_prevvalue = prevvalue;
	9547
	9548	if (!range) {
	9549	rangebegin = RExC_parse;
	9550	element_count++;
	9551	non_portable_endpoint = 0;
	9552	}
	9553	if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
	9554	value = utf8n_to_uvchr((U8*)RExC_parse,
	9555	RExC_end - RExC_parse,
	9556	&numlen, UTF8_ALLOW_DEFAULT);
	9557	RExC_parse_inc_by(numlen);
	9558	}
	9559	else {
	9560	value = UCHARAT(RExC_parse);
	9561	RExC_parse_inc_by(1);
	9562	}
	9563
	9564	if (value == '[') {
	9565	char * posix_class_end;
	9566	namedclass = handle_possible_posix(pRExC_state,
	9567	RExC_parse,
	9568	&posix_class_end,
	9569	do_posix_warnings ? &posix_warnings : NULL,
	9570	FALSE /* die if error */);
	9571	if (namedclass > OOB_NAMEDCLASS) {
	9572
	9573	/* If there was an earlier attempt to parse this particular
	9574	* posix class, and it failed, it was a false alarm, as this
	9575	* successful one proves */
	9576	if ( posix_warnings
	9577	&& av_tindex_skip_len_mg(posix_warnings) >= 0
	9578	&& not_posix_region_end >= RExC_parse
	9579	&& not_posix_region_end <= posix_class_end)
	9580	{
	9581	av_undef(posix_warnings);
	9582	}
	9583
	9584	RExC_parse_set(posix_class_end);
	9585	}
	9586	else if (namedclass == OOB_NAMEDCLASS) {
	9587	not_posix_region_end = posix_class_end;
	9588	}
	9589	else {
	9590	namedclass = OOB_NAMEDCLASS;
	9591	}
	9592	}
	9593	else if ( RExC_parse - 1 > not_posix_region_end
	9594	&& MAYBE_POSIXCC(value))
	9595	{
	9596	(void) handle_possible_posix(
	9597	pRExC_state,
	9598	RExC_parse - 1, /* -1 because parse has already been
	9599	advanced */
	9600	&not_posix_region_end,
	9601	do_posix_warnings ? &posix_warnings : NULL,
	9602	TRUE /* checking only */);
	9603	}
	9604	else if ( strict && ! skip_white
	9605	&& ( generic_isCC_(value, CC_VERTSPACE_)
	9606	\|\| is_VERTWS_cp_high(value)))
	9607	{
	9608	vFAIL("Literal vertical space in [] is illegal except under /x");
	9609	}
	9610	else if (value == '\\') {
	9611	/* Is a backslash; get the code point of the char after it */
	9612
	9613	if (RExC_parse >= RExC_end) {
	9614	vFAIL("Unmatched [");
	9615	}
	9616
	9617	if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
	9618	value = utf8n_to_uvchr((U8*)RExC_parse,
	9619	RExC_end - RExC_parse,
	9620	&numlen, UTF8_ALLOW_DEFAULT);
	9621	RExC_parse_inc_by(numlen);
	9622	}
	9623	else {
	9624	value = UCHARAT(RExC_parse);
	9625	RExC_parse_inc_by(1);
	9626	}
	9627
	9628	/* Some compilers cannot handle switching on 64-bit integer
	9629	* values, therefore value cannot be an UV. Yes, this will
	9630	* be a problem later if we want switch on Unicode.
	9631	* A similar issue a little bit later when switching on
	9632	* namedclass. --jhi */
	9633
	9634	/* If the \ is escaping white space when white space is being
	9635	* skipped, it means that that white space is wanted literally, and
	9636	* is already in 'value'. Otherwise, need to translate the escape
	9637	* into what it signifies. */
	9638	if (! skip_white \|\| ! isBLANK_A(value)) switch ((I32)value) {
	9639	const char * message;
	9640	U32 packed_warn;
	9641	U8 grok_c_char;
	9642
	9643	case 'w': namedclass = ANYOF_WORDCHAR; break;
	9644	case 'W': namedclass = ANYOF_NWORDCHAR; break;
	9645	case 's': namedclass = ANYOF_SPACE; break;
	9646	case 'S': namedclass = ANYOF_NSPACE; break;
	9647	case 'd': namedclass = ANYOF_DIGIT; break;
	9648	case 'D': namedclass = ANYOF_NDIGIT; break;
	9649	case 'v': namedclass = ANYOF_VERTWS; break;
	9650	case 'V': namedclass = ANYOF_NVERTWS; break;
	9651	case 'h': namedclass = ANYOF_HORIZWS; break;
	9652	case 'H': namedclass = ANYOF_NHORIZWS; break;
	9653	case 'N': /* Handle \N{NAME} in class */
	9654	{
	9655	const char * const backslash_N_beg = RExC_parse - 2;
	9656	int cp_count;
	9657
	9658	if (! grok_bslash_N(pRExC_state,
	9659	NULL, /* No regnode */
	9660	&value, /* Yes single value */
	9661	&cp_count, /* Multiple code pt count */
	9662	flagp,
	9663	strict,
	9664	depth)
	9665	) {
	9666
	9667	if (*flagp & NEED_UTF8)
	9668	FAIL("panic: grok_bslash_N set NEED_UTF8");
	9669
	9670	RETURN_FAIL_ON_RESTART_FLAGP(flagp);
	9671
	9672	if (cp_count < 0) {
	9673	vFAIL("\\N in a character class must be a named character: \\N{...}");
	9674	}
	9675	else if (cp_count == 0) {
	9676	ckWARNreg(RExC_parse,
	9677	"Ignoring zero length \\N{} in character class");
	9678	}
	9679	else { /* cp_count > 1 */
	9680	assert(cp_count > 1);
	9681	if (! RExC_in_multi_char_class) {
	9682	if ( ! allow_mutiple_chars
	9683	\|\| invert
	9684	\|\| range
	9685	\|\| *RExC_parse == '-')
	9686	{
	9687	if (strict) {
	9688	RExC_parse--;
	9689	vFAIL("\\N{} here is restricted to one character");
	9690	}
	9691	ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
	9692	break; /* <value> contains the first code
	9693	point. Drop out of the switch to
	9694	process it */
	9695	}
	9696	else {
	9697	SV * multi_char_N = newSVpvn(backslash_N_beg,
	9698	RExC_parse - backslash_N_beg);
	9699	multi_char_matches
	9700	= add_multi_match(multi_char_matches,
	9701	multi_char_N,
	9702	cp_count);
	9703	}
	9704	}
	9705	} /* End of cp_count != 1 */
	9706
	9707	/* This element should not be processed further in this
	9708	* class */
	9709	element_count--;
	9710	value = save_value;
	9711	prevvalue = save_prevvalue;
	9712	continue; /* Back to top of loop to get next char */
	9713	}
	9714
	9715	/* Here, is a single code point, and <value> contains it */
	9716	unicode_range = TRUE; /* \N{} are Unicode */
	9717	}
	9718	break;
	9719	case 'p':
	9720	case 'P':
	9721	{
	9722	char *e;
	9723
	9724	if (RExC_pm_flags & PMf_WILDCARD) {
	9725	RExC_parse_inc_by(1);
	9726	/* diag_listed_as: Use of %s is not allowed in Unicode
	9727	property wildcard subpatterns in regex; marked by <--
	9728	HERE in m/%s/ */
	9729	vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
	9730	" wildcard subpatterns", (char) value, *(RExC_parse - 1));
	9731	}
	9732
	9733	/* \p means they want Unicode semantics */
	9734	REQUIRE_UNI_RULES(flagp, 0);
	9735
	9736	if (RExC_parse >= RExC_end)
	9737	vFAIL2("Empty \\%c", (U8)value);
	9738	if (*RExC_parse == '{') {
	9739	const U8 c = (U8)value;
	9740	e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
	9741	if (!e) {
	9742	RExC_parse_inc_by(1);
	9743	vFAIL2("Missing right brace on \\%c{}", c);
	9744	}
	9745
	9746	RExC_parse_inc_by(1);
	9747
	9748	/* White space is allowed adjacent to the braces and after
	9749	* any '^', even when not under /x */
	9750	while (isSPACE(*RExC_parse)) {
	9751	RExC_parse_inc_by(1);
	9752	}
	9753
	9754	if (UCHARAT(RExC_parse) == '^') {
	9755
	9756	/* toggle. (The rhs xor gets the single bit that
	9757	* differs between P and p; the other xor inverts just
	9758	* that bit) */
	9759	value ^= 'P' ^ 'p';
	9760
	9761	RExC_parse_inc_by(1);
	9762	while (isSPACE(*RExC_parse)) {
	9763	RExC_parse_inc_by(1);
	9764	}
	9765	}
	9766
	9767	if (e == RExC_parse)
	9768	vFAIL2("Empty \\%c{}", c);
	9769
	9770	n = e - RExC_parse;
	9771	while (isSPACE(*(RExC_parse + n - 1)))
	9772	n--;
	9773
	9774	} /* The \p isn't immediately followed by a '{' */
	9775	else if (! isALPHA(*RExC_parse)) {
	9776	RExC_parse_inc_safe();
	9777	vFAIL2("Character following \\%c must be '{' or a "
	9778	"single-character Unicode property name",
	9779	(U8) value);
	9780	}
	9781	else {
	9782	e = RExC_parse;
	9783	n = 1;
	9784	}
	9785	{
	9786	char* name = RExC_parse;
	9787
	9788	/* Any message returned about expanding the definition */
	9789	SV* msg = newSVpvs_flags("", SVs_TEMP);
	9790
	9791	/* If set TRUE, the property is user-defined as opposed to
	9792	* official Unicode */
	9793	bool user_defined = FALSE;
	9794	AV * strings = NULL;
	9795
	9796	SV * prop_definition = parse_uniprop_string(
	9797	name, n, UTF, FOLD,
	9798	FALSE, /* This is compile-time */
	9799
	9800	/* We can't defer this defn when
	9801	* the full result is required in
	9802	* this call */
	9803	! cBOOL(ret_invlist),
	9804
	9805	&strings,
	9806	&user_defined,
	9807	msg,
	9808	0 /* Base level */
	9809	);
	9810	if (SvCUR(msg)) { /* Assumes any error causes a msg */
	9811	assert(prop_definition == NULL);
	9812	RExC_parse_set(e + 1);
	9813	if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
	9814	thing so, or else the display is
	9815	mojibake */
	9816	RExC_utf8 = TRUE;
	9817	}
	9818	/* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
	9819	vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
	9820	SvCUR(msg), SvPVX(msg)));
	9821	}
	9822
	9823	assert(prop_definition \|\| strings);
	9824
	9825	if (strings) {
	9826	if (ret_invlist) {
	9827	if (! prop_definition) {
	9828	RExC_parse_set(e + 1);
	9829	vFAIL("Unicode string properties are not implemented in (?[...])");
	9830	}
	9831	else {
	9832	ckWARNreg(e + 1,
	9833	"Using just the single character results"
	9834	" returned by \\p{} in (?[...])");
	9835	}
	9836	}
	9837	else if (! RExC_in_multi_char_class) {
	9838	if (invert ^ (value == 'P')) {
	9839	RExC_parse_set(e + 1);
	9840	vFAIL("Inverting a character class which contains"
	9841	" a multi-character sequence is illegal");
	9842	}
	9843
	9844	/* For each multi-character string ... */
	9845	while (av_count(strings) > 0) {
	9846	/* ... Each entry is itself an array of code
	9847	* points. */
	9848	AV * this_string = (AV *) av_shift( strings);
	9849	STRLEN cp_count = av_count(this_string);
	9850	SV * final = newSV(cp_count ? cp_count * 4 : 1);
	9851	SvPVCLEAR_FRESH(final);
	9852
	9853	/* Create another string of sequences of \x{...} */
	9854	while (av_count(this_string) > 0) {
	9855	SV * character = av_shift(this_string);
	9856	UV cp = SvUV(character);
	9857
	9858	if (cp > 255) {
	9859	REQUIRE_UTF8(flagp);
	9860	}
	9861	Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
	9862	cp);
	9863	SvREFCNT_dec_NN(character);
	9864	}
	9865	SvREFCNT_dec_NN(this_string);
	9866
	9867	/* And add that to the list of such things */
	9868	multi_char_matches
	9869	= add_multi_match(multi_char_matches,
	9870	final,
	9871	cp_count);
	9872	}
	9873	}
	9874	SvREFCNT_dec_NN(strings);
	9875	}
	9876
	9877	if (! prop_definition) { /* If we got only a string,
	9878	this iteration didn't really
	9879	find a character */
	9880	element_count--;
	9881	}
	9882	else if (! is_invlist(prop_definition)) {
	9883
	9884	/* Here, the definition isn't known, so we have gotten
	9885	* returned a string that will be evaluated if and when
	9886	* encountered at runtime. We add it to the list of
	9887	* such properties, along with whether it should be
	9888	* complemented or not */
	9889	if (value == 'P') {
	9890	sv_catpvs(listsv, "!");
	9891	}
	9892	else {
	9893	sv_catpvs(listsv, "+");
	9894	}
	9895	sv_catsv(listsv, prop_definition);
	9896
	9897	has_runtime_dependency \|= HAS_USER_DEFINED_PROPERTY;
	9898
	9899	/* We don't know yet what this matches, so have to flag
	9900	* it */
	9901	anyof_flags \|= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
	9902	}
	9903	else {
	9904	assert (prop_definition && is_invlist(prop_definition));
	9905
	9906	/* Here we do have the complete property definition
	9907	*
	9908	* Temporary workaround for [GH #16520]. For this
	9909	* precise input that is in the .t that is failing,
	9910	* load utf8.pm, which is what the test wants, so that
	9911	* that .t passes */
	9912	if ( memEQs(RExC_start, e + 1 - RExC_start,
	9913	"foo\\p{Alnum}")
	9914	&& ! hv_common(GvHVn(PL_incgv),
	9915	NULL,
	9916	"utf8.pm", sizeof("utf8.pm") - 1,
	9917	0, HV_FETCH_ISEXISTS, NULL, 0))
	9918	{
	9919	require_pv("utf8.pm");
	9920	}
	9921
	9922	if (! user_defined &&
	9923	/* We warn on matching an above-Unicode code point
	9924	* if the match would return true, except don't
	9925	* warn for \p{All}, which has exactly one element
	9926	* = 0 */
	9927	(_invlist_contains_cp(prop_definition, 0x110000)
	9928	&& (! (_invlist_len(prop_definition) == 1
	9929	&& *invlist_array(prop_definition) == 0))))
	9930	{
	9931	warn_super = TRUE;
	9932	}
	9933
	9934	/* Invert if asking for the complement */
	9935	if (value == 'P') {
	9936	_invlist_union_complement_2nd(properties,
	9937	prop_definition,
	9938	&properties);
	9939	}
	9940	else {
	9941	_invlist_union(properties, prop_definition, &properties);
	9942	}
	9943	}
	9944	}
	9945
	9946	RExC_parse_set(e + 1);
	9947	namedclass = ANYOF_UNIPROP; /* no official name, but it's
	9948	named */
	9949	}
	9950	break;
	9951	case 'n': value = '\n'; break;
	9952	case 'r': value = '\r'; break;
	9953	case 't': value = '\t'; break;
	9954	case 'f': value = '\f'; break;
	9955	case 'b': value = '\b'; break;
	9956	case 'e': value = ESC_NATIVE; break;
	9957	case 'a': value = '\a'; break;
	9958	case 'o':
	9959	RExC_parse--; /* function expects to be pointed at the 'o' */
	9960	if (! grok_bslash_o(&RExC_parse,
	9961	RExC_end,
	9962	&value,
	9963	&message,
	9964	&packed_warn,
	9965	strict,
	9966	cBOOL(range), /* MAX_UV allowed for range
	9967	upper limit */
	9968	UTF))
	9969	{
	9970	vFAIL(message);
	9971	}
	9972	else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
	9973	warn_non_literal_string(RExC_parse, packed_warn, message);
	9974	}
	9975
	9976	if (value < 256) {
	9977	non_portable_endpoint++;
	9978	}
	9979	break;
	9980	case 'x':
	9981	RExC_parse--; /* function expects to be pointed at the 'x' */
	9982	if (! grok_bslash_x(&RExC_parse,
	9983	RExC_end,
	9984	&value,
	9985	&message,
	9986	&packed_warn,
	9987	strict,
	9988	cBOOL(range), /* MAX_UV allowed for range
	9989	upper limit */
	9990	UTF))
	9991	{
	9992	vFAIL(message);
	9993	}
	9994	else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
	9995	warn_non_literal_string(RExC_parse, packed_warn, message);
	9996	}
	9997
	9998	if (value < 256) {
	9999	non_portable_endpoint++;
	10000	}
	10001	break;
	10002	case 'c':
	10003	if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
	10004	&packed_warn))
	10005	{
	10006	/* going to die anyway; point to exact spot of
	10007	* failure */
	10008	RExC_parse_inc_safe();
	10009	vFAIL(message);
	10010	}
	10011
	10012	value = grok_c_char;
	10013	RExC_parse_inc_by(1);
	10014	if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
	10015	warn_non_literal_string(RExC_parse, packed_warn, message);
	10016	}
	10017
	10018	non_portable_endpoint++;
	10019	break;
	10020	case '0': case '1': case '2': case '3': case '4':
	10021	case '5': case '6': case '7':
	10022	{
	10023	/* Take 1-3 octal digits */
	10024	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	10025	\| PERL_SCAN_NOTIFY_ILLDIGIT;
	10026	numlen = (strict) ? 4 : 3;
	10027	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	10028	RExC_parse_inc_by(numlen);
	10029	if (numlen != 3) {
	10030	if (strict) {
	10031	RExC_parse_inc_safe();
	10032	vFAIL("Need exactly 3 octal digits");
	10033	}
	10034	else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
	10035	&& RExC_parse < RExC_end
	10036	&& isDIGIT(*RExC_parse)
	10037	&& ckWARN(WARN_REGEXP))
	10038	{
	10039	reg_warn_non_literal_string(
	10040	RExC_parse + 1,
	10041	form_alien_digit_msg(8, numlen, RExC_parse,
	10042	RExC_end, UTF, FALSE));
	10043	}
	10044	}
	10045	if (value < 256) {
	10046	non_portable_endpoint++;
	10047	}
	10048	break;
	10049	}
	10050	default:
	10051	/* Allow \_ to not give an error */
	10052	if (isWORDCHAR(value) && value != '_') {
	10053	if (strict) {
	10054	vFAIL2("Unrecognized escape \\%c in character class",
	10055	(int)value);
	10056	}
	10057	else {
	10058	ckWARN2reg(RExC_parse,
	10059	"Unrecognized escape \\%c in character class passed through",
	10060	(int)value);
	10061	}
	10062	}
	10063	break;
	10064	} /* End of switch on char following backslash */
	10065	} /* end of handling backslash escape sequences */
	10066
	10067	/* Here, we have the current token in 'value' */
	10068
	10069	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	10070	U8 classnum;
	10071
	10072	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	10073	* literal, as is the character that began the false range, i.e.
	10074	* the 'a' in the examples */
	10075	if (range) {
	10076	const int w = (RExC_parse >= rangebegin)
	10077	? RExC_parse - rangebegin
	10078	: 0;
	10079	if (strict) {
	10080	vFAIL2utf8f(
	10081	"False [] range \"%" UTF8f "\"",
	10082	UTF8fARG(UTF, w, rangebegin));
	10083	}
	10084	else {
	10085	ckWARN2reg(RExC_parse,
	10086	"False [] range \"%" UTF8f "\"",
	10087	UTF8fARG(UTF, w, rangebegin));
	10088	cp_list = add_cp_to_invlist(cp_list, '-');
	10089	cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
	10090	prevvalue);
	10091	}
	10092
	10093	range = 0; /* this was not a true range */
	10094	element_count += 2; /* So counts for three values */
	10095	}
	10096
	10097	classnum = namedclass_to_classnum(namedclass);
	10098
	10099	if (LOC && namedclass < ANYOF_POSIXL_MAX
	10100	#ifndef HAS_ISASCII
	10101	&& classnum != CC_ASCII_
	10102	#endif
	10103	) {
	10104	SV* scratch_list = NULL;
	10105
	10106	/* What the Posix classes (like \w, [:space:]) match isn't
	10107	* generally knowable under locale until actual match time. A
	10108	* special node is used for these which has extra space for a
	10109	* bitmap, with a bit reserved for each named class that is to
	10110	* be matched against. (This isn't needed for \p{} and
	10111	* pseudo-classes, as they are not affected by locale, and
	10112	* hence are dealt with separately.) However, if a named class
	10113	* and its complement are both present, then it matches
	10114	* everything, and there is no runtime dependency. Odd numbers
	10115	* are the complements of the next lower number, so xor works.
	10116	* (Note that something like [\w\D] should match everything,
	10117	* because \d should be a proper subset of \w. But rather than
	10118	* trust that the locale is well behaved, we leave this to
	10119	* runtime to sort out) */
	10120	if (POSIXL_TEST(posixl, namedclass ^ 1)) {
	10121	cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
	10122	POSIXL_ZERO(posixl);
	10123	has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
	10124	anyof_flags &= ~ANYOF_MATCHES_POSIXL;
	10125	continue; /* We could ignore the rest of the class, but
	10126	best to parse it for any errors */
	10127	}
	10128	else { /* Here, isn't the complement of any already parsed
	10129	class */
	10130	POSIXL_SET(posixl, namedclass);
	10131	has_runtime_dependency \|= HAS_L_RUNTIME_DEPENDENCY;
	10132	anyof_flags \|= ANYOF_MATCHES_POSIXL;
	10133
	10134	/* The above-Latin1 characters are not subject to locale
	10135	* rules. Just add them to the unconditionally-matched
	10136	* list */
	10137
	10138	/* Get the list of the above-Latin1 code points this
	10139	* matches */
	10140	_invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
	10141	PL_XPosix_ptrs[classnum],
	10142
	10143	/* Odd numbers are complements,
	10144	* like NDIGIT, NASCII, ... */
	10145	namedclass % 2 != 0,
	10146	&scratch_list);
	10147	/* Checking if 'cp_list' is NULL first saves an extra
	10148	* clone. Its reference count will be decremented at the
	10149	* next union, etc, or if this is the only instance, at the
	10150	* end of the routine */
	10151	if (! cp_list) {
	10152	cp_list = scratch_list;
	10153	}
	10154	else {
	10155	_invlist_union(cp_list, scratch_list, &cp_list);
	10156	SvREFCNT_dec_NN(scratch_list);
	10157	}
	10158	continue; /* Go get next character */
	10159	}
	10160	}
	10161	else {
	10162
	10163	/* Here, is not /l, or is a POSIX class for which /l doesn't
	10164	* matter (or is a Unicode property, which is skipped here). */
	10165	if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
	10166	if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
	10167
	10168	/* Here, should be \h, \H, \v, or \V. None of /d, /i
	10169	* nor /l make a difference in what these match,
	10170	* therefore we just add what they match to cp_list. */
	10171	if (classnum != CC_VERTSPACE_) {
	10172	assert( namedclass == ANYOF_HORIZWS
	10173	\|\| namedclass == ANYOF_NHORIZWS);
	10174
	10175	/* It turns out that \h is just a synonym for
	10176	* XPosixBlank */
	10177	classnum = CC_BLANK_;
	10178	}
	10179
	10180	_invlist_union_maybe_complement_2nd(
	10181	cp_list,
	10182	PL_XPosix_ptrs[classnum],
	10183	namedclass % 2 != 0, /* Complement if odd
	10184	(NHORIZWS, NVERTWS)
	10185	*/
	10186	&cp_list);
	10187	}
	10188	}
	10189	else if ( AT_LEAST_UNI_SEMANTICS
	10190	\|\| classnum == CC_ASCII_
	10191	\|\| (DEPENDS_SEMANTICS && ( classnum == CC_DIGIT_
	10192	\|\| classnum == CC_XDIGIT_)))
	10193	{
	10194	/* We usually have to worry about /d affecting what POSIX
	10195	* classes match, with special code needed because we won't
	10196	* know until runtime what all matches. But there is no
	10197	* extra work needed under /u and /a; and [:ascii:] is
	10198	* unaffected by /d; and :digit: and :xdigit: don't have
	10199	* runtime differences under /d. So we can special case
	10200	* these, and avoid some extra work below, and at runtime.
	10201	* */
	10202	_invlist_union_maybe_complement_2nd(
	10203	simple_posixes,
	10204	((AT_LEAST_ASCII_RESTRICTED)
	10205	? PL_Posix_ptrs[classnum]
	10206	: PL_XPosix_ptrs[classnum]),
	10207	namedclass % 2 != 0,
	10208	&simple_posixes);
	10209	}
	10210	else { /* Garden variety class. If is NUPPER, NALPHA, ...
	10211	complement and use nposixes */
	10212	SV** posixes_ptr = namedclass % 2 == 0
	10213	? &posixes
	10214	: &nposixes;
	10215	_invlist_union_maybe_complement_2nd(
	10216	*posixes_ptr,
	10217	PL_XPosix_ptrs[classnum],
	10218	namedclass % 2 != 0,
	10219	posixes_ptr);
	10220	}
	10221	}
	10222	} /* end of namedclass \blah */
	10223
	10224	SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
	10225
	10226	/* If 'range' is set, 'value' is the ending of a range--check its
	10227	* validity. (If value isn't a single code point in the case of a
	10228	* range, we should have figured that out above in the code that
	10229	* catches false ranges). Later, we will handle each individual code
	10230	* point in the range. If 'range' isn't set, this could be the
	10231	* beginning of a range, so check for that by looking ahead to see if
	10232	* the next real character to be processed is the range indicator--the
	10233	* minus sign */
	10234
	10235	if (range) {
	10236	#ifdef EBCDIC
	10237	/* For unicode ranges, we have to test that the Unicode as opposed
	10238	* to the native values are not decreasing. (Above 255, there is
	10239	* no difference between native and Unicode) */
	10240	if (unicode_range && prevvalue < 255 && value < 255) {
	10241	if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
	10242	goto backwards_range;
	10243	}
	10244	}
	10245	else
	10246	#endif
	10247	if (prevvalue > value) /* b-a */ {
	10248	int w;
	10249	#ifdef EBCDIC
	10250	backwards_range:
	10251	#endif
	10252	w = RExC_parse - rangebegin;
	10253	vFAIL2utf8f(
	10254	"Invalid [] range \"%" UTF8f "\"",
	10255	UTF8fARG(UTF, w, rangebegin));
	10256	NOT_REACHED; /* NOTREACHED */
	10257	}
	10258	}
	10259	else {
	10260	prevvalue = value; /* save the beginning of the potential range */
	10261	if (! stop_at_1 /* Can't be a range if parsing just one thing */
	10262	&& *RExC_parse == '-')
	10263	{
	10264	char* next_char_ptr = RExC_parse + 1;
	10265
	10266	/* Get the next real char after the '-' */
	10267	SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
	10268
	10269	/* If the '-' is at the end of the class (just before the ']',
	10270	* it is a literal minus; otherwise it is a range */
	10271	if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
	10272	RExC_parse_set(next_char_ptr);
	10273
	10274	/* a bad range like \w-, [:word:]- ? */
	10275	if (namedclass > OOB_NAMEDCLASS) {
	10276	if (strict \|\| ckWARN(WARN_REGEXP)) {
	10277	const int w = RExC_parse >= rangebegin
	10278	? RExC_parse - rangebegin
	10279	: 0;
	10280	if (strict) {
	10281	vFAIL4("False [] range \"%.s\"",
	10282	w, w, rangebegin);
	10283	}
	10284	else {
	10285	vWARN4(RExC_parse,
	10286	"False [] range \"%.s\"",
	10287	w, w, rangebegin);
	10288	}
	10289	}
	10290	cp_list = add_cp_to_invlist(cp_list, '-');
	10291	element_count++;
	10292	} else
	10293	range = 1; /* yeah, it's a range! */
	10294	continue; /* but do it the next time */
	10295	}
	10296	}
	10297	}
	10298
	10299	if (namedclass > OOB_NAMEDCLASS) {
	10300	continue;
	10301	}
	10302
	10303	/* Here, we have a single value this time through the loop, and
	10304	* <prevvalue> is the beginning of the range, if any; or <value> if
	10305	* not. */
	10306
	10307	/* non-Latin1 code point implies unicode semantics. */
	10308	if (value > 255) {
	10309	if (value > MAX_LEGAL_CP && ( value != UV_MAX
	10310	\|\| prevvalue > MAX_LEGAL_CP))
	10311	{
	10312	vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
	10313	}
	10314	REQUIRE_UNI_RULES(flagp, 0);
	10315	if ( ! silence_non_portable
	10316	&& UNICODE_IS_PERL_EXTENDED(value)
	10317	&& TO_OUTPUT_WARNINGS(RExC_parse))
	10318	{
	10319	ckWARN2_non_literal_string(RExC_parse,
	10320	packWARN(WARN_PORTABLE),
	10321	PL_extended_cp_format,
	10322	value);
	10323	}
	10324	}
	10325
	10326	/* Ready to process either the single value, or the completed range.
	10327	* For single-valued non-inverted ranges, we consider the possibility
	10328	* of multi-char folds. (We made a conscious decision to not do this
	10329	* for the other cases because it can often lead to non-intuitive
	10330	* results. For example, you have the peculiar case that:
	10331	* "s s" =~ /^[^\xDF]+$/i => Y
	10332	* "ss" =~ /^[^\xDF]+$/i => N
	10333	*
	10334	* See [perl #89750] */
	10335	if (FOLD && allow_mutiple_chars && value == prevvalue) {
	10336	if ( value == LATIN_SMALL_LETTER_SHARP_S
	10337	\|\| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
	10338	value)))
	10339	{
	10340	/* Here <value> is indeed a multi-char fold. Get what it is */
	10341
	10342	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	10343	STRLEN foldlen;
	10344
	10345	UV folded = _to_uni_fold_flags(
	10346	value,
	10347	foldbuf,
	10348	&foldlen,
	10349	FOLD_FLAGS_FULL \| (ASCII_FOLD_RESTRICTED
	10350	? FOLD_FLAGS_NOMIX_ASCII
	10351	: 0)
	10352	);
	10353
	10354	/* Here, <folded> should be the first character of the
	10355	* multi-char fold of <value>, with <foldbuf> containing the
	10356	* whole thing. But, if this fold is not allowed (because of
	10357	* the flags), <fold> will be the same as <value>, and should
	10358	* be processed like any other character, so skip the special
	10359	* handling */
	10360	if (folded != value) {
	10361
	10362	/* Skip if we are recursed, currently parsing the class
	10363	* again. Otherwise add this character to the list of
	10364	* multi-char folds. */
	10365	if (! RExC_in_multi_char_class) {
	10366	STRLEN cp_count = utf8_length(foldbuf,
	10367	foldbuf + foldlen);
	10368	SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
	10369
	10370	Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
	10371
	10372	multi_char_matches
	10373	= add_multi_match(multi_char_matches,
	10374	multi_fold,
	10375	cp_count);
	10376
	10377	}
	10378
	10379	/* This element should not be processed further in this
	10380	* class */
	10381	element_count--;
	10382	value = save_value;
	10383	prevvalue = save_prevvalue;
	10384	continue;
	10385	}
	10386	}
	10387	}
	10388
	10389	if (strict && ckWARN(WARN_REGEXP)) {
	10390	if (range) {
	10391
	10392	/* If the range starts above 255, everything is portable and
	10393	* likely to be so for any forseeable character set, so don't
	10394	* warn. */
	10395	if (unicode_range && non_portable_endpoint && prevvalue < 256) {
	10396	vWARN(RExC_parse, "Both or neither range ends should be Unicode");
	10397	}
	10398	else if (prevvalue != value) {
	10399
	10400	/* Under strict, ranges that stop and/or end in an ASCII
	10401	* printable should have each end point be a portable value
	10402	* for it (preferably like 'A', but we don't warn if it is
	10403	* a (portable) Unicode name or code point), and the range
	10404	* must be all digits or all letters of the same case.
	10405	* Otherwise, the range is non-portable and unclear as to
	10406	* what it contains */
	10407	if ( (isPRINT_A(prevvalue) \|\| isPRINT_A(value))
	10408	&& ( non_portable_endpoint
	10409	\|\| ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
	10410	\|\| (isLOWER_A(prevvalue) && isLOWER_A(value))
	10411	\|\| (isUPPER_A(prevvalue) && isUPPER_A(value))
	10412	))) {
	10413	vWARN(RExC_parse, "Ranges of ASCII printables should"
	10414	" be some subset of \"0-9\","
	10415	" \"A-Z\", or \"a-z\"");
	10416	}
	10417	else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
	10418	SSize_t index_start;
	10419	SSize_t index_final;
	10420
	10421	/* But the nature of Unicode and languages mean we
	10422	* can't do the same checks for above-ASCII ranges,
	10423	* except in the case of digit ones. These should
	10424	* contain only digits from the same group of 10. The
	10425	* ASCII case is handled just above. Hence here, the
	10426	* range could be a range of digits. First some
	10427	* unlikely special cases. Grandfather in that a range
	10428	* ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
	10429	* if its starting value is one of the 10 digits prior
	10430	* to it. This is because it is an alternate way of
	10431	* writing 19D1, and some people may expect it to be in
	10432	* that group. But it is bad, because it won't give
	10433	* the expected results. In Unicode 5.2 it was
	10434	* considered to be in that group (of 11, hence), but
	10435	* this was fixed in the next version */
	10436
	10437	if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
	10438	goto warn_bad_digit_range;
	10439	}
	10440	else if (UNLIKELY( prevvalue >= 0x1D7CE
	10441	&& value <= 0x1D7FF))
	10442	{
	10443	/* This is the only other case currently in Unicode
	10444	* where the algorithm below fails. The code
	10445	* points just above are the end points of a single
	10446	* range containing only decimal digits. It is 5
	10447	* different series of 0-9. All other ranges of
	10448	* digits currently in Unicode are just a single
	10449	* series. (And mktables will notify us if a later
	10450	* Unicode version breaks this.)
	10451	*
	10452	* If the range being checked is at most 9 long,
	10453	* and the digit values represented are in
	10454	* numerical order, they are from the same series.
	10455	* */
	10456	if ( value - prevvalue > 9
	10457	\|\| ((( value - 0x1D7CE) % 10)
	10458	<= (prevvalue - 0x1D7CE) % 10))
	10459	{
	10460	goto warn_bad_digit_range;
	10461	}
	10462	}
	10463	else {
	10464
	10465	/* For all other ranges of digits in Unicode, the
	10466	* algorithm is just to check if both end points
	10467	* are in the same series, which is the same range.
	10468	* */
	10469	index_start = _invlist_search(
	10470	PL_XPosix_ptrs[CC_DIGIT_],
	10471	prevvalue);
	10472
	10473	/* Warn if the range starts and ends with a digit,
	10474	* and they are not in the same group of 10. */
	10475	if ( index_start >= 0
	10476	&& ELEMENT_RANGE_MATCHES_INVLIST(index_start)
	10477	&& (index_final =
	10478	_invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
	10479	value)) != index_start
	10480	&& index_final >= 0
	10481	&& ELEMENT_RANGE_MATCHES_INVLIST(index_final))
	10482	{
	10483	warn_bad_digit_range:
	10484	vWARN(RExC_parse, "Ranges of digits should be"
	10485	" from the same group of"
	10486	" 10");
	10487	}
	10488	}
	10489	}
	10490	}
	10491	}
	10492	if ((! range \|\| prevvalue == value) && non_portable_endpoint) {
	10493	if (isPRINT_A(value)) {
	10494	char literal[3];
	10495	unsigned d = 0;
	10496	if (isBACKSLASHED_PUNCT(value)) {
	10497	literal[d++] = '\\';
	10498	}
	10499	literal[d++] = (char) value;
	10500	literal[d++] = '\0';
	10501
	10502	vWARN4(RExC_parse,
	10503	"\"%.*s\" is more clearly written simply as \"%s\"",
	10504	(int) (RExC_parse - rangebegin),
	10505	rangebegin,
	10506	literal
	10507	);
	10508	}
	10509	else if (isMNEMONIC_CNTRL(value)) {
	10510	vWARN4(RExC_parse,
	10511	"\"%.*s\" is more clearly written simply as \"%s\"",
	10512	(int) (RExC_parse - rangebegin),
	10513	rangebegin,
	10514	cntrl_to_mnemonic((U8) value)
	10515	);
	10516	}
	10517	}
	10518	}
	10519
	10520	/* Deal with this element of the class */
	10521
	10522	#ifndef EBCDIC
	10523	cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
	10524	prevvalue, value);
	10525	#else
	10526	/* On non-ASCII platforms, for ranges that span all of 0..255, and ones
	10527	* that don't require special handling, we can just add the range like
	10528	* we do for ASCII platforms */
	10529	if ((UNLIKELY(prevvalue == 0) && value >= 255)
	10530	\|\| ! (prevvalue < 256
	10531	&& (unicode_range
	10532	\|\| (! non_portable_endpoint
	10533	&& ((isLOWER_A(prevvalue) && isLOWER_A(value))
	10534	\|\| (isUPPER_A(prevvalue)
	10535	&& isUPPER_A(value)))))))
	10536	{
	10537	cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
	10538	prevvalue, value);
	10539	}
	10540	else {
	10541	/* Here, requires special handling. This can be because it is a
	10542	* range whose code points are considered to be Unicode, and so
	10543	* must be individually translated into native, or because its a
	10544	* subrange of 'A-Z' or 'a-z' which each aren't contiguous in
	10545	* EBCDIC, but we have defined them to include only the "expected"
	10546	* upper or lower case ASCII alphabetics. Subranges above 255 are
	10547	* the same in native and Unicode, so can be added as a range */
	10548	U8 start = NATIVE_TO_LATIN1(prevvalue);
	10549	unsigned j;
	10550	U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
	10551	for (j = start; j <= end; j++) {
	10552	cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
	10553	}
	10554	if (value > 255) {
	10555	cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
	10556	256, value);
	10557	}
	10558	}
	10559	#endif
	10560
	10561	range = 0; /* this range (if it was one) is done now */
	10562	} /* End of loop through all the text within the brackets */
	10563
	10564	if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
	10565	output_posix_warnings(pRExC_state, posix_warnings);
	10566	}
	10567
	10568	/* If anything in the class expands to more than one character, we have to
	10569	* deal with them by building up a substitute parse string, and recursively
	10570	* calling reg() on it, instead of proceeding */
	10571	if (multi_char_matches) {
	10572	SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
	10573	I32 cp_count;
	10574	STRLEN len;
	10575	char *save_end = RExC_end;
	10576	char *save_parse = RExC_parse;
	10577	char *save_start = RExC_start;
	10578	Size_t constructed_prefix_len = 0; /* This gives the length of the
	10579	constructed portion of the
	10580	substitute parse. */
	10581	bool first_time = TRUE; /* First multi-char occurrence doesn't get
	10582	a "\|" */
	10583	I32 reg_flags;
	10584
	10585	assert(! invert);
	10586	/* Only one level of recursion allowed */
	10587	assert(RExC_copy_start_in_constructed == RExC_precomp);
	10588
	10589	#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
	10590	because too confusing */
	10591	if (invert) {
	10592	sv_catpvs(substitute_parse, "(?:");
	10593	}
	10594	#endif
	10595
	10596	/* Look at the longest strings first */
	10597	for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
	10598	cp_count > 0;
	10599	cp_count--)
	10600	{
	10601
	10602	if (av_exists(multi_char_matches, cp_count)) {
	10603	AV** this_array_ptr;
	10604	SV* this_sequence;
	10605
	10606	this_array_ptr = (AV**) av_fetch_simple(multi_char_matches,
	10607	cp_count, FALSE);
	10608	while ((this_sequence = av_pop(*this_array_ptr)) !=
	10609	&PL_sv_undef)
	10610	{
	10611	if (! first_time) {
	10612	sv_catpvs(substitute_parse, "\|");
	10613	}
	10614	first_time = FALSE;
	10615
	10616	sv_catpv(substitute_parse, SvPVX(this_sequence));
	10617	}
	10618	}
	10619	}
	10620
	10621	/* If the character class contains anything else besides these
	10622	* multi-character strings, have to include it in recursive parsing */
	10623	if (element_count) {
	10624	bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
	10625
	10626	sv_catpvs(substitute_parse, "\|");
	10627	if (has_l_bracket) { /* Add an [ if the original had one */
	10628	sv_catpvs(substitute_parse, "[");
	10629	}
	10630	constructed_prefix_len = SvCUR(substitute_parse);
	10631	sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
	10632
	10633	/* Put in a closing ']' to match any opening one, but not if going
	10634	* off the end, as otherwise we are adding something that really
	10635	* isn't there */
	10636	if (has_l_bracket && RExC_parse < RExC_end) {
	10637	sv_catpvs(substitute_parse, "]");
	10638	}
	10639	}
	10640
	10641	sv_catpvs(substitute_parse, ")");
	10642	#if 0
	10643	if (invert) {
	10644	/* This is a way to get the parse to skip forward a whole named
	10645	* sequence instead of matching the 2nd character when it fails the
	10646	* first */
	10647	sv_catpvs(substitute_parse, "(THEN)(SKIP)(*FAIL)\|.)");
	10648	}
	10649	#endif
	10650
	10651	/* Set up the data structure so that any errors will be properly
	10652	* reported. See the comments at the definition of
	10653	* REPORT_LOCATION_ARGS for details */
	10654	RExC_copy_start_in_input = (char *) orig_parse;
	10655	RExC_start = SvPV(substitute_parse, len);
	10656	RExC_parse_set( RExC_start );
	10657	RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
	10658	RExC_end = RExC_parse + len;
	10659	RExC_in_multi_char_class = 1;
	10660
	10661	ret = reg(pRExC_state, 1, &reg_flags, depth+1);
	10662
	10663	*flagp \|= reg_flags & (HASWIDTH\|SIMPLE\|POSTPONED\|RESTART_PARSE\|NEED_UTF8);
	10664
	10665	/* And restore so can parse the rest of the pattern */
	10666	RExC_parse_set(save_parse);
	10667	RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
	10668	RExC_end = save_end;
	10669	RExC_in_multi_char_class = 0;
	10670	SvREFCNT_dec_NN(multi_char_matches);
	10671	SvREFCNT_dec(properties);
	10672	SvREFCNT_dec(cp_list);
	10673	SvREFCNT_dec(simple_posixes);
	10674	SvREFCNT_dec(posixes);
	10675	SvREFCNT_dec(nposixes);
	10676	SvREFCNT_dec(cp_foldable_list);
	10677	return ret;
	10678	}
	10679
	10680	/* If folding, we calculate all characters that could fold to or from the
	10681	* ones already on the list */
	10682	if (cp_foldable_list) {
	10683	if (FOLD) {
	10684	UV start, end; /* End points of code point ranges */
	10685
	10686	SV* fold_intersection = NULL;
	10687	SV** use_list;
	10688
	10689	/* Our calculated list will be for Unicode rules. For locale
	10690	* matching, we have to keep a separate list that is consulted at
	10691	* runtime only when the locale indicates Unicode rules (and we
	10692	* don't include potential matches in the ASCII/Latin1 range, as
	10693	* any code point could fold to any other, based on the run-time
	10694	* locale). For non-locale, we just use the general list */
	10695	if (LOC) {
	10696	use_list = &only_utf8_locale_list;
	10697	}
	10698	else {
	10699	use_list = &cp_list;
	10700	}
	10701
	10702	/* Only the characters in this class that participate in folds need
	10703	* be checked. Get the intersection of this class and all the
	10704	* possible characters that are foldable. This can quickly narrow
	10705	* down a large class */
	10706	_invlist_intersection(PL_in_some_fold, cp_foldable_list,
	10707	&fold_intersection);
	10708
	10709	/* Now look at the foldable characters in this class individually */
	10710	invlist_iterinit(fold_intersection);
	10711	while (invlist_iternext(fold_intersection, &start, &end)) {
	10712	UV j;
	10713	UV folded;
	10714
	10715	/* Look at every character in the range */
	10716	for (j = start; j <= end; j++) {
	10717	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	10718	STRLEN foldlen;
	10719	unsigned int k;
	10720	Size_t folds_count;
	10721	U32 first_fold;
	10722	const U32 * remaining_folds;
	10723
	10724	if (j < 256) {
	10725
	10726	/* Under /l, we don't know what code points below 256
	10727	* fold to, except we do know the MICRO SIGN folds to
	10728	* an above-255 character if the locale is UTF-8, so we
	10729	* add it to the special list (in *use_list) Otherwise
	10730	* we know now what things can match, though some folds
	10731	* are valid under /d only if the target is UTF-8.
	10732	* Those go in a separate list */
	10733	if ( IS_IN_SOME_FOLD_L1(j)
	10734	&& ! (LOC && j != MICRO_SIGN))
	10735	{
	10736
	10737	/* ASCII is always matched; non-ASCII is matched
	10738	* only under Unicode rules (which could happen
	10739	* under /l if the locale is a UTF-8 one */
	10740	if (isASCII(j) \|\| ! DEPENDS_SEMANTICS) {
	10741	use_list = add_cp_to_invlist(use_list,
	10742	PL_fold_latin1[j]);
	10743	}
	10744	else if (j != PL_fold_latin1[j]) {
	10745	upper_latin1_only_utf8_matches
	10746	= add_cp_to_invlist(
	10747	upper_latin1_only_utf8_matches,
	10748	PL_fold_latin1[j]);
	10749	}
	10750	}
	10751
	10752	if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
	10753	&& (! isASCII(j) \|\| ! ASCII_FOLD_RESTRICTED))
	10754	{
	10755	add_above_Latin1_folds(pRExC_state,
	10756	(U8) j,
	10757	use_list);
	10758	}
	10759	continue;
	10760	}
	10761
	10762	/* Here is an above Latin1 character. We don't have the
	10763	* rules hard-coded for it. First, get its fold. This is
	10764	* the simple fold, as the multi-character folds have been
	10765	* handled earlier and separated out */
	10766	folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
	10767	(ASCII_FOLD_RESTRICTED)
	10768	? FOLD_FLAGS_NOMIX_ASCII
	10769	: 0);
	10770
	10771	/* Single character fold of above Latin1. Add everything
	10772	* in its fold closure to the list that this node should
	10773	* match. */
	10774	folds_count = _inverse_folds(folded, &first_fold,
	10775	&remaining_folds);
	10776	for (k = 0; k <= folds_count; k++) {
	10777	UV c = (k == 0) /* First time through use itself */
	10778	? folded
	10779	: (k == 1) /* 2nd time use, the first fold */
	10780	? first_fold
	10781
	10782	/* Then the remaining ones */
	10783	: remaining_folds[k-2];
	10784
	10785	/* /aa doesn't allow folds between ASCII and non- */
	10786	if (( ASCII_FOLD_RESTRICTED
	10787	&& (isASCII(c) != isASCII(j))))
	10788	{
	10789	continue;
	10790	}
	10791
	10792	/* Folds under /l which cross the 255/256 boundary are
	10793	* added to a separate list. (These are valid only
	10794	* when the locale is UTF-8.) */
	10795	if (c < 256 && LOC) {
	10796	use_list = add_cp_to_invlist(use_list, c);
	10797	continue;
	10798	}
	10799
	10800	if (isASCII(c) \|\| c > 255 \|\| AT_LEAST_UNI_SEMANTICS)
	10801	{
	10802	cp_list = add_cp_to_invlist(cp_list, c);
	10803	}
	10804	else {
	10805	/* Similarly folds involving non-ascii Latin1
	10806	* characters under /d are added to their list */
	10807	upper_latin1_only_utf8_matches
	10808	= add_cp_to_invlist(
	10809	upper_latin1_only_utf8_matches,
	10810	c);
	10811	}
	10812	}
	10813	}
	10814	}
	10815	SvREFCNT_dec_NN(fold_intersection);
	10816	}
	10817
	10818	/* Now that we have finished adding all the folds, there is no reason
	10819	* to keep the foldable list separate */
	10820	_invlist_union(cp_list, cp_foldable_list, &cp_list);
	10821	SvREFCNT_dec_NN(cp_foldable_list);
	10822	}
	10823
	10824	/* And combine the result (if any) with any inversion lists from posix
	10825	* classes. The lists are kept separate up to now because we don't want to
	10826	* fold the classes */
	10827	if (simple_posixes) { /* These are the classes known to be unaffected by
	10828	/a, /aa, and /d */
	10829	if (cp_list) {
	10830	_invlist_union(cp_list, simple_posixes, &cp_list);
	10831	SvREFCNT_dec_NN(simple_posixes);
	10832	}
	10833	else {
	10834	cp_list = simple_posixes;
	10835	}
	10836	}
	10837	if (posixes \|\| nposixes) {
	10838	if (! DEPENDS_SEMANTICS) {
	10839
	10840	/* For everything but /d, we can just add the current 'posixes' and
	10841	* 'nposixes' to the main list */
	10842	if (posixes) {
	10843	if (cp_list) {
	10844	_invlist_union(cp_list, posixes, &cp_list);
	10845	SvREFCNT_dec_NN(posixes);
	10846	}
	10847	else {
	10848	cp_list = posixes;
	10849	}
	10850	}
	10851	if (nposixes) {
	10852	if (cp_list) {
	10853	_invlist_union(cp_list, nposixes, &cp_list);
	10854	SvREFCNT_dec_NN(nposixes);
	10855	}
	10856	else {
	10857	cp_list = nposixes;
	10858	}
	10859	}
	10860	}
	10861	else {
	10862	/* Under /d, things like \w match upper Latin1 characters only if
	10863	* the target string is in UTF-8. But things like \W match all the
	10864	* upper Latin1 characters if the target string is not in UTF-8.
	10865	*
	10866	* Handle the case with something like \W separately */
	10867	if (nposixes) {
	10868	SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
	10869
	10870	/* A complemented posix class matches all upper Latin1
	10871	* characters if not in UTF-8. And it matches just certain
	10872	* ones when in UTF-8. That means those certain ones are
	10873	* matched regardless, so can just be added to the
	10874	* unconditional list */
	10875	if (cp_list) {
	10876	_invlist_union(cp_list, nposixes, &cp_list);
	10877	SvREFCNT_dec_NN(nposixes);
	10878	nposixes = NULL;
	10879	}
	10880	else {
	10881	cp_list = nposixes;
	10882	}
	10883
	10884	/* Likewise for 'posixes' */
	10885	_invlist_union(posixes, cp_list, &cp_list);
	10886	SvREFCNT_dec(posixes);
	10887
	10888	/* Likewise for anything else in the range that matched only
	10889	* under UTF-8 */
	10890	if (upper_latin1_only_utf8_matches) {
	10891	_invlist_union(cp_list,
	10892	upper_latin1_only_utf8_matches,
	10893	&cp_list);
	10894	SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
	10895	upper_latin1_only_utf8_matches = NULL;
	10896	}
	10897
	10898	/* If we don't match all the upper Latin1 characters regardless
	10899	* of UTF-8ness, we have to set a flag to match the rest when
	10900	* not in UTF-8 */
	10901	_invlist_subtract(only_non_utf8_list, cp_list,
	10902	&only_non_utf8_list);
	10903	if (_invlist_len(only_non_utf8_list) != 0) {
	10904	anyof_flags \|= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
	10905	}
	10906	SvREFCNT_dec_NN(only_non_utf8_list);
	10907	}
	10908	else {
	10909	/* Here there were no complemented posix classes. That means
	10910	* the upper Latin1 characters in 'posixes' match only when the
	10911	* target string is in UTF-8. So we have to add them to the
	10912	* list of those types of code points, while adding the
	10913	* remainder to the unconditional list.
	10914	*
	10915	* First calculate what they are */
	10916	SV* nonascii_but_latin1_properties = NULL;
	10917	_invlist_intersection(posixes, PL_UpperLatin1,
	10918	&nonascii_but_latin1_properties);
	10919
	10920	/* And add them to the final list of such characters. */
	10921	_invlist_union(upper_latin1_only_utf8_matches,
	10922	nonascii_but_latin1_properties,
	10923	&upper_latin1_only_utf8_matches);
	10924
	10925	/* Remove them from what now becomes the unconditional list */
	10926	_invlist_subtract(posixes, nonascii_but_latin1_properties,
	10927	&posixes);
	10928
	10929	/* And add those unconditional ones to the final list */
	10930	if (cp_list) {
	10931	_invlist_union(cp_list, posixes, &cp_list);
	10932	SvREFCNT_dec_NN(posixes);
	10933	posixes = NULL;
	10934	}
	10935	else {
	10936	cp_list = posixes;
	10937	}
	10938
	10939	SvREFCNT_dec(nonascii_but_latin1_properties);
	10940
	10941	/* Get rid of any characters from the conditional list that we
	10942	* now know are matched unconditionally, which may make that
	10943	* list empty */
	10944	_invlist_subtract(upper_latin1_only_utf8_matches,
	10945	cp_list,
	10946	&upper_latin1_only_utf8_matches);
	10947	if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
	10948	SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
	10949	upper_latin1_only_utf8_matches = NULL;
	10950	}
	10951	}
	10952	}
	10953	}
	10954
	10955	/* And combine the result (if any) with any inversion list from properties.
	10956	* The lists are kept separate up to now so that we can distinguish the two
	10957	* in regards to matching above-Unicode. A run-time warning is generated
	10958	* if a Unicode property is matched against a non-Unicode code point. But,
	10959	* we allow user-defined properties to match anything, without any warning,
	10960	* and we also suppress the warning if there is a portion of the character
	10961	* class that isn't a Unicode property, and which matches above Unicode, \W
	10962	* or [\x{110000}] for example.
	10963	* (Note that in this case, unlike the Posix one above, there is no
	10964	* <upper_latin1_only_utf8_matches>, because having a Unicode property
	10965	* forces Unicode semantics */
	10966	if (properties) {
	10967	if (cp_list) {
	10968
	10969	/* If it matters to the final outcome, see if a non-property
	10970	* component of the class matches above Unicode. If so, the
	10971	* warning gets suppressed. This is true even if just a single
	10972	* such code point is specified, as, though not strictly correct if
	10973	* another such code point is matched against, the fact that they
	10974	* are using above-Unicode code points indicates they should know
	10975	* the issues involved */
	10976	if (warn_super) {
	10977	warn_super = ! (invert
	10978	^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
	10979	}
	10980
	10981	_invlist_union(properties, cp_list, &cp_list);
	10982	SvREFCNT_dec_NN(properties);
	10983	}
	10984	else {
	10985	cp_list = properties;
	10986	}
	10987
	10988	if (warn_super) {
	10989	anyof_flags \|= ANYOF_WARN_SUPER__shared;
	10990
	10991	/* Because an ANYOF node is the only one that warns, this node
	10992	* can't be optimized into something else */
	10993	optimizable = FALSE;
	10994	}
	10995	}
	10996
	10997	/* Here, we have calculated what code points should be in the character
	10998	* class.
	10999	*
	11000	* Now we can see about various optimizations. Fold calculation (which we
	11001	* did above) needs to take place before inversion. Otherwise /[^k]/i
	11002	* would invert to include K, which under /i would match k, which it
	11003	* shouldn't. Therefore we can't invert folded locale now, as it won't be
	11004	* folded until runtime */
	11005
	11006	/* If we didn't do folding, it's because some information isn't available
	11007	* until runtime; set the run-time fold flag for these We know to set the
	11008	* flag if we have a non-NULL list for UTF-8 locales, or the class matches
	11009	* at least one 0-255 range code point */
	11010	if (LOC && FOLD) {
	11011
	11012	/* Some things on the list might be unconditionally included because of
	11013	* other components. Remove them, and clean up the list if it goes to
	11014	* 0 elements */
	11015	if (only_utf8_locale_list && cp_list) {
	11016	_invlist_subtract(only_utf8_locale_list, cp_list,
	11017	&only_utf8_locale_list);
	11018
	11019	if (_invlist_len(only_utf8_locale_list) == 0) {
	11020	SvREFCNT_dec_NN(only_utf8_locale_list);
	11021	only_utf8_locale_list = NULL;
	11022	}
	11023	}
	11024	if ( only_utf8_locale_list
	11025	\|\| ( cp_list
	11026	&& ( _invlist_contains_cp(cp_list,
	11027	LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
	11028	\|\| _invlist_contains_cp(cp_list,
	11029	LATIN_SMALL_LETTER_DOTLESS_I))))
	11030	{
	11031	has_runtime_dependency \|= HAS_L_RUNTIME_DEPENDENCY;
	11032	anyof_flags \|= ANYOFL_FOLD\|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
	11033	}
	11034	else if (cp_list && invlist_lowest(cp_list) < 256) {
	11035	/* If nothing is below 256, has no locale dependency; otherwise it
	11036	* does */
	11037	anyof_flags \|= ANYOFL_FOLD;
	11038	has_runtime_dependency \|= HAS_L_RUNTIME_DEPENDENCY;
	11039
	11040	/* In a Turkish locale these could match, notify the run-time code
	11041	* to check for that */
	11042	if ( _invlist_contains_cp(cp_list, 'I')
	11043	\|\| _invlist_contains_cp(cp_list, 'i'))
	11044	{
	11045	anyof_flags \|= ANYOFL_FOLD\|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
	11046	}
	11047	}
	11048	}
	11049	else if ( DEPENDS_SEMANTICS
	11050	&& ( upper_latin1_only_utf8_matches
	11051	\|\| ( anyof_flags
	11052	& ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
	11053	{
	11054	RExC_seen_d_op = TRUE;
	11055	has_runtime_dependency \|= HAS_D_RUNTIME_DEPENDENCY;
	11056	}
	11057
	11058	/* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
	11059	* compile time. */
	11060	if ( cp_list
	11061	&& invert
	11062	&& ! has_runtime_dependency)
	11063	{
	11064	_invlist_invert(cp_list);
	11065
	11066	/* Clear the invert flag since have just done it here */
	11067	invert = FALSE;
	11068	}
	11069
	11070	/* All possible optimizations below still have these characteristics.
	11071	* (Multi-char folds aren't SIMPLE, but they don't get this far in this
	11072	* routine) */
	11073	*flagp \|= HASWIDTH\|SIMPLE;
	11074
	11075	if (ret_invlist) {
	11076	*ret_invlist = cp_list;
	11077
	11078	return (cp_list) ? RExC_emit : 0;
	11079	}
	11080
	11081	if (anyof_flags & ANYOF_LOCALE_FLAGS) {
	11082	RExC_contains_locale = 1;
	11083	}
	11084
	11085	if (optimizable) {
	11086
	11087	/* Some character classes are equivalent to other nodes. Such nodes
	11088	* take up less room, and some nodes require fewer operations to
	11089	* execute, than ANYOF nodes. EXACTish nodes may be joinable with
	11090	* adjacent nodes to improve efficiency. */
	11091	op = optimize_regclass(pRExC_state, cp_list,
	11092	only_utf8_locale_list,
	11093	upper_latin1_only_utf8_matches,
	11094	has_runtime_dependency,
	11095	posixl,
	11096	&anyof_flags, &invert, &ret, flagp);
	11097	RETURN_FAIL_ON_RESTART_FLAGP(flagp);
	11098
	11099	/* If optimized to something else and emitted, clean up and return */
	11100	if (ret >= 0) {
	11101	SvREFCNT_dec(cp_list);;
	11102	SvREFCNT_dec(only_utf8_locale_list);
	11103	SvREFCNT_dec(upper_latin1_only_utf8_matches);
	11104	return ret;
	11105	}
	11106
	11107	/* If no optimization was found, an END was returned and we will now
	11108	* emit an ANYOF */
	11109	if (op == END) {
	11110	op = ANYOF;
	11111	}
	11112	}
	11113
	11114	/* Here are going to emit an ANYOF; set the particular type */
	11115	if (op == ANYOF) {
	11116	if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
	11117	op = ANYOFD;
	11118	}
	11119	else if (posixl) {
	11120	op = ANYOFPOSIXL;
	11121	}
	11122	else if (LOC) {
	11123	op = ANYOFL;
	11124	}
	11125	}
	11126
	11127	ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
	11128	FILL_NODE(ret, op); /* We set the argument later */
	11129	RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
	11130	ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
	11131
	11132	/* Here, <cp_list> contains all the code points we can determine at
	11133	* compile time that match under all conditions. Go through it, and
	11134	* for things that belong in the bitmap, put them there, and delete from
	11135	* <cp_list>. While we are at it, see if everything above 255 is in the
	11136	* list, and if so, set a flag to speed up execution */
	11137
	11138	populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
	11139
	11140	if (posixl) {
	11141	ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
	11142	}
	11143
	11144	if (invert) {
	11145	ANYOF_FLAGS(REGNODE_p(ret)) \|= ANYOF_INVERT;
	11146	}
	11147
	11148	/* Here, the bitmap has been populated with all the Latin1 code points that
	11149	* always match. Can now add to the overall list those that match only
	11150	* when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
	11151	* */
	11152	if (upper_latin1_only_utf8_matches) {
	11153	if (cp_list) {
	11154	_invlist_union(cp_list,
	11155	upper_latin1_only_utf8_matches,
	11156	&cp_list);
	11157	SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
	11158	}
	11159	else {
	11160	cp_list = upper_latin1_only_utf8_matches;
	11161	}
	11162	ANYOF_FLAGS(REGNODE_p(ret)) \|= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
	11163	}
	11164
	11165	set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
	11166	(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	11167	? listsv
	11168	: NULL,
	11169	only_utf8_locale_list);
	11170
	11171	SvREFCNT_dec(cp_list);;
	11172	SvREFCNT_dec(only_utf8_locale_list);
	11173	return ret;
	11174	}
	11175
	11176	STATIC U8
	11177	S_optimize_regclass(pTHX_
	11178	RExC_state_t *pRExC_state,
	11179	SV * cp_list,
	11180	SV* only_utf8_locale_list,
	11181	SV* upper_latin1_only_utf8_matches,
	11182	const U32 has_runtime_dependency,
	11183	const U32 posixl,
	11184	U8 * anyof_flags,
	11185	bool * invert,
	11186	regnode_offset * ret,
	11187	I32 *flagp
	11188	)
	11189	{
	11190	/* This function exists just to make S_regclass() smaller. It extracts out
	11191	* the code that looks for potential optimizations away from a full generic
	11192	* ANYOF node. The parameter names are the same as the corresponding
	11193	* variables in S_regclass.
	11194	*
	11195	* It returns the new op (the impossible END one if no optimization found)
	11196	* and sets *ret to any created regnode. If the new op is sufficiently
	11197	* like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
	11198	*
	11199	* Certain of the parameters may be updated as a result of the changes
	11200	* herein */
	11201
	11202	U8 op = END; /* The returned node-type, initialized to an impossible
	11203	one. */
	11204	UV value = 0;
	11205	PERL_UINT_FAST8_T i;
	11206	UV partial_cp_count = 0;
	11207	UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
	11208	UV end[MAX_FOLD_FROMS+1] = { 0 };
	11209	bool single_range = FALSE;
	11210	UV lowest_cp = 0, highest_cp = 0;
	11211
	11212	PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
	11213
	11214	if (cp_list) { /* Count the code points in enough ranges that we would see
	11215	all the ones possible in any fold in this version of
	11216	Unicode */
	11217
	11218	invlist_iterinit(cp_list);
	11219	for (i = 0; i <= MAX_FOLD_FROMS; i++) {
	11220	if (! invlist_iternext(cp_list, &start[i], &end[i])) {
	11221	break;
	11222	}
	11223	partial_cp_count += end[i] - start[i] + 1;
	11224	}
	11225
	11226	if (i == 1) {
	11227	single_range = TRUE;
	11228	}
	11229	invlist_iterfinish(cp_list);
	11230
	11231	/* If we know at compile time that this matches every possible code
	11232	* point, any run-time dependencies don't matter */
	11233	if (start[0] == 0 && end[0] == UV_MAX) {
	11234	if (*invert) {
	11235	goto return_OPFAIL;
	11236	}
	11237	else {
	11238	goto return_SANY;
	11239	}
	11240	}
	11241
	11242	/* Use a clearer mnemonic for below */
	11243	lowest_cp = start[0];
	11244
	11245	highest_cp = invlist_highest(cp_list);
	11246	}
	11247
	11248	/* Similarly, for /l posix classes, if both a class and its complement
	11249	* match, any run-time dependencies don't matter */
	11250	if (posixl) {
	11251	int namedclass;
	11252	for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
	11253	if ( POSIXL_TEST(posixl, namedclass) /* class */
	11254	&& POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
	11255	{
	11256	if (*invert) {
	11257	goto return_OPFAIL;
	11258	}
	11259	goto return_SANY;
	11260	}
	11261	}
	11262
	11263	/* For well-behaved locales, some classes are subsets of others, so
	11264	* complementing the subset and including the non-complemented superset
	11265	* should match everything, like [\D[:alnum:]], and
	11266	* [[:^alpha:][:alnum:]], but some implementations of locales are
	11267	* buggy, and khw thinks its a bad idea to have optimization change
	11268	* behavior, even if it avoids an OS bug in a given case */
	11269
	11270	#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
	11271
	11272	/* If is a single posix /l class, can optimize to just that op. Such a
	11273	* node will not match anything in the Latin1 range, as that is not
	11274	* determinable until runtime, but will match whatever the class does
	11275	* outside that range. (Note that some classes won't match anything
	11276	* outside the range, like [:ascii:]) */
	11277	if ( isSINGLE_BIT_SET(posixl)
	11278	&& (partial_cp_count == 0 \|\| lowest_cp > 255))
	11279	{
	11280	U8 classnum;
	11281	SV * class_above_latin1 = NULL;
	11282	bool already_inverted;
	11283	bool are_equivalent;
	11284
	11285
	11286	namedclass = single_1bit_pos32(posixl);
	11287	classnum = namedclass_to_classnum(namedclass);
	11288
	11289	/* The named classes are such that the inverted number is one
	11290	* larger than the non-inverted one */
	11291	already_inverted = namedclass - classnum_to_namedclass(classnum);
	11292
	11293	/* Create an inversion list of the official property, inverted if
	11294	* the constructed node list is inverted, and restricted to only
	11295	* the above latin1 code points, which are the only ones known at
	11296	* compile time */
	11297	_invlist_intersection_maybe_complement_2nd(
	11298	PL_AboveLatin1,
	11299	PL_XPosix_ptrs[classnum],
	11300	already_inverted,
	11301	&class_above_latin1);
	11302	are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
	11303	SvREFCNT_dec_NN(class_above_latin1);
	11304
	11305	if (are_equivalent) {
	11306
	11307	/* Resolve the run-time inversion flag with this possibly
	11308	* inverted class */
	11309	invert = invert ^ already_inverted;
	11310
	11311	op = POSIXL + invert (NPOSIXL - POSIXL);
	11312	*ret = reg_node(pRExC_state, op);
	11313	FLAGS(REGNODE_p(*ret)) = classnum;
	11314	return op;
	11315	}
	11316	}
	11317	}
	11318
	11319	/* khw can't think of any other possible transformation involving these. */
	11320	if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
	11321	return END;
	11322	}
	11323
	11324	if (! has_runtime_dependency) {
	11325
	11326	/* If the list is empty, nothing matches. This happens, for example,
	11327	* when a Unicode property that doesn't match anything is the only
	11328	* element in the character class (perluniprops.pod notes such
	11329	* properties). */
	11330	if (partial_cp_count == 0) {
	11331	if (*invert) {
	11332	goto return_SANY;
	11333	}
	11334	else {
	11335	goto return_OPFAIL;
	11336	}
	11337	}
	11338
	11339	/* If matches everything but \n */
	11340	if ( start[0] == 0 && end[0] == '\n' - 1
	11341	&& start[1] == '\n' + 1 && end[1] == UV_MAX)
	11342	{
	11343	assert (! *invert);
	11344	op = REG_ANY;
	11345	*ret = reg_node(pRExC_state, op);
	11346	MARK_NAUGHTY(1);
	11347	return op;
	11348	}
	11349	}
	11350
	11351	/* Next see if can optimize classes that contain just a few code points
	11352	* into an EXACTish node. The reason to do this is to let the optimizer
	11353	* join this node with adjacent EXACTish ones, and ANYOF nodes require
	11354	* runtime conversion to code point from UTF-8, which we'd like to avoid.
	11355	*
	11356	* An EXACTFish node can be generated even if not under /i, and vice versa.
	11357	* But care must be taken. An EXACTFish node has to be such that it only
	11358	* matches precisely the code points in the class, but we want to generate
	11359	* the least restrictive one that does that, to increase the odds of being
	11360	* able to join with an adjacent node. For example, if the class contains
	11361	* [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
	11362	* from matching. Whether we are under /i or not is irrelevant in this
	11363	* case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER
	11364	* LETTER APOSTROPHE. That is supposed to match the single character U+0149
	11365	* LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there
	11366	* is no simple fold that includes \X{02BC}, there is a multi-char fold
	11367	* that does, and so the node generated for it must be an EXACTFish one.
	11368	* On the other hand qr/:/i should generate a plain EXACT node since the
	11369	* colon participates in no fold whatsoever, and having it be EXACT tells
	11370	* the optimizer the target string cannot match unless it has a colon in
	11371	* it. */
	11372	if ( ! posixl
	11373	&& ! *invert
	11374
	11375	/* Only try if there are no more code points in the class than in
	11376	* the max possible fold */
	11377	&& inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
	11378	{
	11379	/* We can always make a single code point class into an EXACTish node.
	11380	* */
	11381	if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
	11382	if (LOC) {
	11383
	11384	/* Here is /l: Use EXACTL, except if there is a fold not known
	11385	* until runtime so shows as only a single code point here.
	11386	* For code points above 255, we know which can cause problems
	11387	* by having a potential fold to the Latin1 range. */
	11388	if ( ! FOLD
	11389	\|\| ( lowest_cp > 255
	11390	&& ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
	11391	{
	11392	op = EXACTL;
	11393	}
	11394	else {
	11395	op = EXACTFL;
	11396	}
	11397	}
	11398	else if (! FOLD) { /* Not /l and not /i */
	11399	op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
	11400	}
	11401	else if (lowest_cp < 256) { /* /i, not /l, and the code point is
	11402	small */
	11403
	11404	/* Under /i, it gets a little tricky. A code point that
	11405	* doesn't participate in a fold should be an EXACT node. We
	11406	* know this one isn't the result of a simple fold, or there'd
	11407	* be more than one code point in the list, but it could be
	11408	* part of a multi-character fold. In that case we better not
	11409	* create an EXACT node, as we would wrongly be telling the
	11410	* optimizer that this code point must be in the target string,
	11411	* and that is wrong. This is because if the sequence around
	11412	* this code point forms a multi-char fold, what needs to be in
	11413	* the string could be the code point that folds to the
	11414	* sequence.
	11415	*
	11416	* This handles the case of below-255 code points, as we have
	11417	* an easy look up for those. The next clause handles the
	11418	* above-256 one */
	11419	op = IS_IN_SOME_FOLD_L1(lowest_cp)
	11420	? EXACTFU
	11421	: EXACT;
	11422	}
	11423	else { /* /i, larger code point. Since we are under /i, and have
	11424	just this code point, we know that it can't fold to
	11425	something else, so PL_InMultiCharFold applies to it */
	11426	op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
	11427	? EXACTFU_REQ8
	11428	: EXACT_REQ8;
	11429	}
	11430
	11431	value = lowest_cp;
	11432	}
	11433	else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
	11434	&& _invlist_contains_cp(PL_in_some_fold, lowest_cp))
	11435	{
	11436	/* Here, the only runtime dependency, if any, is from /d, and the
	11437	* class matches more than one code point, and the lowest code
	11438	* point participates in some fold. It might be that the other
	11439	* code points are /i equivalent to this one, and hence they would
	11440	* be representable by an EXACTFish node. Above, we eliminated
	11441	* classes that contain too many code points to be EXACTFish, with
	11442	* the test for MAX_FOLD_FROMS
	11443	*
	11444	* First, special case the ASCII fold pairs, like 'B' and 'b'. We
	11445	* do this because we have EXACTFAA at our disposal for the ASCII
	11446	* range */
	11447	if (partial_cp_count == 2 && isASCII(lowest_cp)) {
	11448
	11449	/* The only ASCII characters that participate in folds are
	11450	* alphabetics */
	11451	assert(isALPHA(lowest_cp));
	11452	if ( end[0] == start[0] /* First range is a single
	11453	character, so 2nd exists */
	11454	&& isALPHA_FOLD_EQ(start[0], start[1]))
	11455	{
	11456	/* Here, is part of an ASCII fold pair */
	11457
	11458	if ( ASCII_FOLD_RESTRICTED
	11459	\|\| HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
	11460	{
	11461	/* If the second clause just above was true, it means
	11462	* we can't be under /i, or else the list would have
	11463	* included more than this fold pair. Therefore we
	11464	* have to exclude the possibility of whatever else it
	11465	* is that folds to these, by using EXACTFAA */
	11466	op = EXACTFAA;
	11467	}
	11468	else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
	11469
	11470	/* Here, there's no simple fold that lowest_cp is part
	11471	* of, but there is a multi-character one. If we are
	11472	* not under /i, we want to exclude that possibility;
	11473	* if under /i, we want to include it */
	11474	op = (FOLD) ? EXACTFU : EXACTFAA;
	11475	}
	11476	else {
	11477
	11478	/* Here, the only possible fold lowest_cp participates in
	11479	* is with start[1]. /i or not isn't relevant */
	11480	op = EXACTFU;
	11481	}
	11482
	11483	value = toFOLD(lowest_cp);
	11484	}
	11485	}
	11486	else if ( ! upper_latin1_only_utf8_matches
	11487	\|\| ( _invlist_len(upper_latin1_only_utf8_matches) == 2
	11488	&& PL_fold_latin1[
	11489	invlist_highest(upper_latin1_only_utf8_matches)]
	11490	== lowest_cp))
	11491	{
	11492	/* Here, the smallest character is non-ascii or there are more
	11493	* than 2 code points matched by this node. Also, we either
	11494	* don't have /d UTF-8 dependent matches, or if we do, they
	11495	* look like they could be a single character that is the fold
	11496	* of the lowest one is in the always-match list. This test
	11497	* quickly excludes most of the false positives when there are
	11498	* /d UTF-8 depdendent matches. These are like LATIN CAPITAL
	11499	* LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
	11500	* iff the target string is UTF-8. (We don't have to worry
	11501	* above about exceeding the array bounds of PL_fold_latin1[]
	11502	* because any code point in 'upper_latin1_only_utf8_matches'
	11503	* is below 256.)
	11504	*
	11505	* EXACTFAA would apply only to pairs (hence exactly 2 code
	11506	* points) in the ASCII range, so we can't use it here to
	11507	* artificially restrict the fold domain, so we check if the
	11508	* class does or does not match some EXACTFish node. Further,
	11509	* if we aren't under /i, and and the folded-to character is
	11510	* part of a multi-character fold, we can't do this
	11511	* optimization, as the sequence around it could be that
	11512	* multi-character fold, and we don't here know the context, so
	11513	* we have to assume it is that multi-char fold, to prevent
	11514	* potential bugs.
	11515	*
	11516	* To do the general case, we first find the fold of the lowest
	11517	* code point (which may be higher than that lowest unfolded
	11518	* one), then find everything that folds to it. (The data
	11519	* structure we have only maps from the folded code points, so
	11520	* we have to do the earlier step.) */
	11521
	11522	Size_t foldlen;
	11523	U8 foldbuf[UTF8_MAXBYTES_CASE];
	11524	UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
	11525	U32 first_fold;
	11526	const U32 * remaining_folds;
	11527	Size_t folds_to_this_cp_count = _inverse_folds(
	11528	folded,
	11529	&first_fold,
	11530	&remaining_folds);
	11531	Size_t folds_count = folds_to_this_cp_count + 1;
	11532	SV * fold_list = _new_invlist(folds_count);
	11533	unsigned int i;
	11534
	11535	/* If there are UTF-8 dependent matches, create a temporary
	11536	* list of what this node matches, including them. */
	11537	SV * all_cp_list = NULL;
	11538	SV ** use_this_list = &cp_list;
	11539
	11540	if (upper_latin1_only_utf8_matches) {
	11541	all_cp_list = _new_invlist(0);
	11542	use_this_list = &all_cp_list;
	11543	_invlist_union(cp_list,
	11544	upper_latin1_only_utf8_matches,
	11545	use_this_list);
	11546	}
	11547
	11548	/* Having gotten everything that participates in the fold
	11549	* containing the lowest code point, we turn that into an
	11550	* inversion list, making sure everything is included. */
	11551	fold_list = add_cp_to_invlist(fold_list, lowest_cp);
	11552	fold_list = add_cp_to_invlist(fold_list, folded);
	11553	if (folds_to_this_cp_count > 0) {
	11554	fold_list = add_cp_to_invlist(fold_list, first_fold);
	11555	for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
	11556	fold_list = add_cp_to_invlist(fold_list,
	11557	remaining_folds[i]);
	11558	}
	11559	}
	11560
	11561	/* If the fold list is identical to what's in this ANYOF node,
	11562	* the node can be represented by an EXACTFish one instead */
	11563	if (_invlistEQ(*use_this_list, fold_list,
	11564	0 /* Don't complement */ )
	11565	) {
	11566
	11567	/* But, we have to be careful, as mentioned above. Just
	11568	* the right sequence of characters could match this if it
	11569	* is part of a multi-character fold. That IS what we want
	11570	* if we are under /i. But it ISN'T what we want if not
	11571	* under /i, as it could match when it shouldn't. So, when
	11572	* we aren't under /i and this character participates in a
	11573	* multi-char fold, we don't optimize into an EXACTFish
	11574	* node. So, for each case below we have to check if we
	11575	* are folding, and if not, if it is not part of a
	11576	* multi-char fold. */
	11577	if (lowest_cp > 255) { /* Highish code point */
	11578	if (FOLD \|\| ! _invlist_contains_cp(
	11579	PL_InMultiCharFold, folded))
	11580	{
	11581	op = (LOC)
	11582	? EXACTFLU8
	11583	: (ASCII_FOLD_RESTRICTED)
	11584	? EXACTFAA
	11585	: EXACTFU_REQ8;
	11586	value = folded;
	11587	}
	11588	} /* Below, the lowest code point < 256 */
	11589	else if ( FOLD
	11590	&& folded == 's'
	11591	&& DEPENDS_SEMANTICS)
	11592	{ /* An EXACTF node containing a single character 's',
	11593	can be an EXACTFU if it doesn't get joined with an
	11594	adjacent 's' */
	11595	op = EXACTFU_S_EDGE;
	11596	value = folded;
	11597	}
	11598	else if ( FOLD
	11599	\|\| ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
	11600	{
	11601	if (upper_latin1_only_utf8_matches) {
	11602	op = EXACTF;
	11603
	11604	/* We can't use the fold, as that only matches
	11605	* under UTF-8 */
	11606	value = lowest_cp;
	11607	}
	11608	else if ( UNLIKELY(lowest_cp == MICRO_SIGN)
	11609	&& ! UTF)
	11610	{ /* EXACTFUP is a special node for this character */
	11611	op = (ASCII_FOLD_RESTRICTED)
	11612	? EXACTFAA
	11613	: EXACTFUP;
	11614	value = MICRO_SIGN;
	11615	}
	11616	else if ( ASCII_FOLD_RESTRICTED
	11617	&& ! isASCII(lowest_cp))
	11618	{ /* For ASCII under /iaa, we can use EXACTFU below
	11619	*/
	11620	op = EXACTFAA;
	11621	value = folded;
	11622	}
	11623	else {
	11624	op = EXACTFU;
	11625	value = folded;
	11626	}
	11627	}
	11628	}
	11629
	11630	SvREFCNT_dec_NN(fold_list);
	11631	SvREFCNT_dec(all_cp_list);
	11632	}
	11633	}
	11634
	11635	if (op != END) {
	11636	U8 len;
	11637
	11638	/* Here, we have calculated what EXACTish node to use. Have to
	11639	* convert to UTF-8 if not already there */
	11640	if (value > 255) {
	11641	if (! UTF) {
	11642	SvREFCNT_dec(cp_list);;
	11643	REQUIRE_UTF8(flagp);
	11644	}
	11645
	11646	/* This is a kludge to the special casing issues with this
	11647	* ligature under /aa. FB05 should fold to FB06, but the call
	11648	* above to _to_uni_fold_flags() didn't find this, as it didn't
	11649	* use the /aa restriction in order to not miss other folds
	11650	* that would be affected. This is the only instance likely to
	11651	* ever be a problem in all of Unicode. So special case it. */
	11652	if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
	11653	&& ASCII_FOLD_RESTRICTED)
	11654	{
	11655	value = LATIN_SMALL_LIGATURE_ST;
	11656	}
	11657	}
	11658
	11659	len = (UTF) ? UVCHR_SKIP(value) : 1;
	11660
	11661	*ret = REGNODE_GUTS(pRExC_state, op, len);
	11662	FILL_NODE(*ret, op);
	11663	RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
	11664	setSTR_LEN(REGNODE_p(*ret), len);
	11665	if (len == 1) {
	11666	STRINGs(REGNODE_p(ret)) = (U8) value;
	11667	}
	11668	else {
	11669	uvchr_to_utf8((U8 ) STRINGs(REGNODE_p(ret)), value);
	11670	}
	11671
	11672	return op;
	11673	}
	11674	}
	11675
	11676	if (! has_runtime_dependency) {
	11677
	11678	/* See if this can be turned into an ANYOFM node. Think about the bit
	11679	* patterns in two different bytes. In some positions, the bits in
	11680	* each will be 1; and in other positions both will be 0; and in some
	11681	* positions the bit will be 1 in one byte, and 0 in the other. Let
	11682	* 'n' be the number of positions where the bits differ. We create a
	11683	* mask which has exactly 'n' 0 bits, each in a position where the two
	11684	* bytes differ. Now take the set of all bytes that when ANDed with
	11685	* the mask yield the same result. That set has 2**n elements, and is
	11686	* representable by just two 8 bit numbers: the result and the mask.
	11687	* Importantly, matching the set can be vectorized by creating a word
	11688	* full of the result bytes, and a word full of the mask bytes,
	11689	* yielding a significant speed up. Here, see if this node matches
	11690	* such a set. As a concrete example consider [01], and the byte
	11691	* representing '0' which is 0x30 on ASCII machines. It has the bits
	11692	* 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with
	11693	* that mask we get 0x30. Any other bytes ANDed yield something else.
	11694	* So [01], which is a common usage, is optimizable into ANYOFM, and
	11695	* can benefit from the speed up. We can only do this on UTF-8
	11696	* invariant bytes, because they have the same bit patterns under UTF-8
	11697	* as not. */
	11698	PERL_UINT_FAST8_T inverted = 0;
	11699
	11700	/* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
	11701	* EBCDIC */
	11702	const PERL_UINT_FAST8_T max_permissible
	11703	= nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
	11704
	11705	/* If doesn't fit the criteria for ANYOFM, invert and try again. If
	11706	* that works we will instead later generate an NANYOFM, and invert
	11707	* back when through */
	11708	if (highest_cp > max_permissible) {
	11709	_invlist_invert(cp_list);
	11710	inverted = 1;
	11711	}
	11712
	11713	if (invlist_highest(cp_list) <= max_permissible) {
	11714	UV this_start, this_end;
	11715	UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
	11716	U8 bits_differing = 0;
	11717	Size_t full_cp_count = 0;
	11718	bool first_time = TRUE;
	11719
	11720	/* Go through the bytes and find the bit positions that differ */
	11721	invlist_iterinit(cp_list);
	11722	while (invlist_iternext(cp_list, &this_start, &this_end)) {
	11723	unsigned int i = this_start;
	11724
	11725	if (first_time) {
	11726	if (! UVCHR_IS_INVARIANT(i)) {
	11727	goto done_anyofm;
	11728	}
	11729
	11730	first_time = FALSE;
	11731	lowest_cp = this_start;
	11732
	11733	/* We have set up the code point to compare with. Don't
	11734	* compare it with itself */
	11735	i++;
	11736	}
	11737
	11738	/* Find the bit positions that differ from the lowest code
	11739	* point in the node. Keep track of all such positions by
	11740	* OR'ing */
	11741	for (; i <= this_end; i++) {
	11742	if (! UVCHR_IS_INVARIANT(i)) {
	11743	goto done_anyofm;
	11744	}
	11745
	11746	bits_differing \|= i ^ lowest_cp;
	11747	}
	11748
	11749	full_cp_count += this_end - this_start + 1;
	11750	}
	11751
	11752	/* At the end of the loop, we count how many bits differ from the
	11753	* bits in lowest code point, call the count 'd'. If the set we
	11754	* found contains 2**d elements, it is the closure of all code
	11755	* points that differ only in those bit positions. To convince
	11756	* yourself of that, first note that the number in the closure must
	11757	* be a power of 2, which we test for. The only way we could have
	11758	* that count and it be some differing set, is if we got some code
	11759	* points that don't differ from the lowest code point in any
	11760	* position, but do differ from each other in some other position.
	11761	* That means one code point has a 1 in that position, and another
	11762	* has a 0. But that would mean that one of them differs from the
	11763	* lowest code point in that position, which possibility we've
	11764	* already excluded. */
	11765	if ( (inverted \|\| full_cp_count > 1)
	11766	&& full_cp_count == 1U << PL_bitcount[bits_differing])
	11767	{
	11768	U8 ANYOFM_mask;
	11769
	11770	op = ANYOFM + inverted;;
	11771
	11772	/* We need to make the bits that differ be 0's */
	11773	ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
	11774
	11775	/* The argument is the lowest code point */
	11776	*ret = reg1node(pRExC_state, op, lowest_cp);
	11777	FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
	11778	}
	11779
	11780	done_anyofm:
	11781	invlist_iterfinish(cp_list);
	11782	}
	11783
	11784	if (inverted) {
	11785	_invlist_invert(cp_list);
	11786	}
	11787
	11788	if (op != END) {
	11789	return op;
	11790	}
	11791
	11792	/* XXX We could create an ANYOFR_LOW node here if we saved above if all
	11793	* were invariants, it wasn't inverted, and there is a single range.
	11794	* This would be faster than some of the posix nodes we create below
	11795	* like /\d/a, but would be twice the size. Without having actually
	11796	* measured the gain, khw doesn't think the tradeoff is really worth it
	11797	* */
	11798	}
	11799
	11800	if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
	11801	PERL_UINT_FAST8_T type;
	11802	SV * intersection = NULL;
	11803	SV* d_invlist = NULL;
	11804
	11805	/* See if this matches any of the POSIX classes. The POSIXA and POSIXD
	11806	* ones are about the same speed as ANYOF ops, but take less room; the
	11807	* ones that have above-Latin1 code point matches are somewhat faster
	11808	* than ANYOF. */
	11809
	11810	for (type = POSIXA; type >= POSIXD; type--) {
	11811	int posix_class;
	11812
	11813	if (type == POSIXL) { /* But not /l posix classes */
	11814	continue;
	11815	}
	11816
	11817	for (posix_class = 0;
	11818	posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
	11819	posix_class++)
	11820	{
	11821	SV** our_code_points = &cp_list;
	11822	SV** official_code_points;
	11823	int try_inverted;
	11824
	11825	if (type == POSIXA) {
	11826	official_code_points = &PL_Posix_ptrs[posix_class];
	11827	}
	11828	else {
	11829	official_code_points = &PL_XPosix_ptrs[posix_class];
	11830	}
	11831
	11832	/* Skip non-existent classes of this type. e.g. \v only has an
	11833	* entry in PL_XPosix_ptrs */
	11834	if (! *official_code_points) {
	11835	continue;
	11836	}
	11837
	11838	/* Try both the regular class, and its inversion */
	11839	for (try_inverted = 0; try_inverted < 2; try_inverted++) {
	11840	bool this_inverted = *invert ^ try_inverted;
	11841
	11842	if (type != POSIXD) {
	11843
	11844	/* This class that isn't /d can't match if we have /d
	11845	* dependencies */
	11846	if (has_runtime_dependency
	11847	& HAS_D_RUNTIME_DEPENDENCY)
	11848	{
	11849	continue;
	11850	}
	11851	}
	11852	else /* is /d */ if (! this_inverted) {
	11853
	11854	/* /d classes don't match anything non-ASCII below 256
	11855	* unconditionally (which cp_list contains) */
	11856	_invlist_intersection(cp_list, PL_UpperLatin1,
	11857	&intersection);
	11858	if (_invlist_len(intersection) != 0) {
	11859	continue;
	11860	}
	11861
	11862	SvREFCNT_dec(d_invlist);
	11863	d_invlist = invlist_clone(cp_list, NULL);
	11864
	11865	/* But under UTF-8 it turns into using /u rules. Add
	11866	* the things it matches under these conditions so that
	11867	* we check below that these are identical to what the
	11868	* tested class should match */
	11869	if (upper_latin1_only_utf8_matches) {
	11870	_invlist_union(
	11871	d_invlist,
	11872	upper_latin1_only_utf8_matches,
	11873	&d_invlist);
	11874	}
	11875	our_code_points = &d_invlist;
	11876	}
	11877	else { /* POSIXD, inverted. If this doesn't have this
	11878	flag set, it isn't /d. */
	11879	if (! ( *anyof_flags
	11880	& ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
	11881	{
	11882	continue;
	11883	}
	11884
	11885	our_code_points = &cp_list;
	11886	}
	11887
	11888	/* Here, have weeded out some things. We want to see if
	11889	* the list of characters this node contains
	11890	* ('*our_code_points') precisely matches those of the
	11891	* class we are currently checking against
	11892	* ('official_code_points'). /
	11893	if (_invlistEQ(*our_code_points,
	11894	*official_code_points,
	11895	try_inverted))
	11896	{
	11897	/* Here, they precisely match. Optimize this ANYOF
	11898	* node into its equivalent POSIX one of the correct
	11899	* type, possibly inverted.
	11900	*
	11901	* Some of these nodes match a single range of
	11902	* characters (or [:alpha:] matches two parallel ranges
	11903	* on ASCII platforms). The array lookup at execution
	11904	* time could be replaced by a range check for such
	11905	* nodes. But regnodes are a finite resource, and the
	11906	* possible performance boost isn't large, so this
	11907	* hasn't been done. An attempt to use just one node
	11908	* (and its inverse) to encompass all such cases was
	11909	* made in d62feba66bf43f35d092bb026694f927e9f94d38.
	11910	* But the shifting/masking it used ended up being
	11911	* slower than the array look up, so it was reverted */
	11912	op = (try_inverted)
	11913	? type + NPOSIXA - POSIXA
	11914	: type;
	11915	*ret = reg_node(pRExC_state, op);
	11916	FLAGS(REGNODE_p(*ret)) = posix_class;
	11917	SvREFCNT_dec(d_invlist);
	11918	SvREFCNT_dec(intersection);
	11919	return op;
	11920	}
	11921	}
	11922	}
	11923	}
	11924	SvREFCNT_dec(d_invlist);
	11925	SvREFCNT_dec(intersection);
	11926	}
	11927
	11928	/* If it is a single contiguous range, ANYOFR is an efficient regnode, both
	11929	* in size and speed. Currently, a 20 bit range base (smallest code point
	11930	* in the range), and a 12 bit maximum delta are packed into a 32 bit word.
	11931	* This allows for using it on all of the Unicode code points except for
	11932	* the highest plane, which is only for private use code points. khw
	11933	* doubts that a bigger delta is likely in real world applications */
	11934	if ( single_range
	11935	&& ! has_runtime_dependency
	11936	&& *anyof_flags == 0
	11937	&& start[0] < (1 << ANYOFR_BASE_BITS)
	11938	&& end[0] - start[0]
	11939	< ((1U << (sizeof(ARG1u_LOC(NULL))
	11940	* CHARBITS - ANYOFR_BASE_BITS))))
	11941
	11942	{
	11943	U8 low_utf8[UTF8_MAXBYTES+1];
	11944	U8 high_utf8[UTF8_MAXBYTES+1];
	11945
	11946	op = ANYOFR;
	11947	*ret = reg1node(pRExC_state, op,
	11948	(start[0] \| (end[0] - start[0]) << ANYOFR_BASE_BITS));
	11949
	11950	/* Place the lowest UTF-8 start byte in the flags field, so as to allow
	11951	* efficient ruling out at run time of many possible inputs. */
	11952	(void) uvchr_to_utf8(low_utf8, start[0]);
	11953	(void) uvchr_to_utf8(high_utf8, end[0]);
	11954
	11955	/* If all code points share the same first byte, this can be an
	11956	* ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
	11957	* quickly rule out many inputs at run-time without having to compute
	11958	* the code point from UTF-8. For EBCDIC, we use I8, as not doing that
	11959	* transformation would not rule out nearly so many things */
	11960	if (low_utf8[0] == high_utf8[0]) {
	11961	op = ANYOFRb;
	11962	OP(REGNODE_p(*ret)) = op;
	11963	ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
	11964	}
	11965	else {
	11966	ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
	11967	}
	11968
	11969	return op;
	11970	}
	11971
	11972	/* If didn't find an optimization and there is no need for a bitmap,
	11973	* of the lowest code points, optimize to indicate that */
	11974	if ( lowest_cp >= NUM_ANYOF_CODE_POINTS
	11975	&& ! LOC
	11976	&& ! upper_latin1_only_utf8_matches
	11977	&& *anyof_flags == 0)
	11978	{
	11979	U8 low_utf8[UTF8_MAXBYTES+1];
	11980	UV highest_cp = invlist_highest(cp_list);
	11981
	11982	/* Currently the maximum allowed code point by the system is IV_MAX.
	11983	* Higher ones are reserved for future internal use. This particular
	11984	* regnode can be used for higher ones, but we can't calculate the code
	11985	* point of those. IV_MAX suffices though, as it will be a large first
	11986	* byte */
	11987	Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
	11988	- low_utf8;
	11989
	11990	/* We store the lowest possible first byte of the UTF-8 representation,
	11991	* using the flags field. This allows for quick ruling out of some
	11992	* inputs without having to convert from UTF-8 to code point. For
	11993	* EBCDIC, we use I8, as not doing that transformation would not rule
	11994	* out nearly so many things */
	11995	*anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
	11996
	11997	op = ANYOFH;
	11998
	11999	/* If the first UTF-8 start byte for the highest code point in the
	12000	* range is suitably small, we may be able to get an upper bound as
	12001	* well */
	12002	if (highest_cp <= IV_MAX) {
	12003	U8 high_utf8[UTF8_MAXBYTES+1];
	12004	Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
	12005
	12006	/* If the lowest and highest are the same, we can get an exact
	12007	* first byte instead of a just minimum or even a sequence of exact
	12008	* leading bytes. We signal these with different regnodes */
	12009	if (low_utf8[0] == high_utf8[0]) {
	12010	Size_t len = find_first_differing_byte_pos(low_utf8,
	12011	high_utf8,
	12012	MIN(low_len, high_len));
	12013	if (len == 1) {
	12014
	12015	/* No need to convert to I8 for EBCDIC as this is an exact
	12016	* match */
	12017	*anyof_flags = low_utf8[0];
	12018
	12019	if (high_len == 2) {
	12020	/* If the elements matched all have a 2-byte UTF-8
	12021	* representation, with the first byte being the same,
	12022	* we can use a compact, fast regnode. capable of
	12023	* matching any combination of continuation byte
	12024	* patterns.
	12025	*
	12026	* (A similar regnode could be created for the Latin1
	12027	* range; the complication being that it could match
	12028	* non-UTF8 targets. The internal bitmap would serve
	12029	* both cases; with some extra code in regexec.c) */
	12030	op = ANYOFHbbm;
	12031	*ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
	12032	FILL_NODE(*ret, op);
	12033	FIRST_BYTE((struct regnode_bbm ) REGNODE_p(ret)) = low_utf8[0],
	12034
	12035	/* The 64 bit (or 32 on EBCCDIC) map can be looked up
	12036	* directly based on the continuation byte, without
	12037	* needing to convert to code point */
	12038	populate_bitmap_from_invlist(
	12039	cp_list,
	12040
	12041	/* The base code point is from the start byte */
	12042	TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
	12043	UTF_CONTINUATION_MARK \| 0),
	12044
	12045	((struct regnode_bbm ) REGNODE_p(ret))->bitmap,
	12046	REGNODE_BBM_BITMAP_LEN);
	12047	RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
	12048	return op;
	12049	}
	12050	else {
	12051	op = ANYOFHb;
	12052	}
	12053	}
	12054	else {
	12055	op = ANYOFHs;
	12056	*ret = REGNODE_GUTS(pRExC_state, op,
	12057	REGNODE_ARG_LEN(op) + STR_SZ(len));
	12058	FILL_NODE(*ret, op);
	12059	STR_LEN_U8((struct regnode_anyofhs ) REGNODE_p(ret))
	12060	= len;
	12061	Copy(low_utf8, /* Add the common bytes */
	12062	((struct regnode_anyofhs ) REGNODE_p(ret))->string,
	12063	len, U8);
	12064	RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
	12065	set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
	12066	NULL, only_utf8_locale_list);
	12067	return op;
	12068	}
	12069	}
	12070	else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
	12071
	12072	/* Here, the high byte is not the same as the low, but is small
	12073	* enough that its reasonable to have a loose upper bound,
	12074	* which is packed in with the strict lower bound. See
	12075	* comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC
	12076	* platforms, I8 is used. On ASCII platforms I8 is the same
	12077	* thing as UTF-8 */
	12078
	12079	U8 bits = 0;
	12080	U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
	12081	U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
	12082	- *anyof_flags;
	12083
	12084	if (range_diff <= max_range_diff / 8) {
	12085	bits = 3;
	12086	}
	12087	else if (range_diff <= max_range_diff / 4) {
	12088	bits = 2;
	12089	}
	12090	else if (range_diff <= max_range_diff / 2) {
	12091	bits = 1;
	12092	}
	12093	anyof_flags = (anyof_flags - 0xC0) << 2 \| bits;
	12094	op = ANYOFHr;
	12095	}
	12096	}
	12097	}
	12098
	12099	return op;
	12100
	12101	return_OPFAIL:
	12102	op = OPFAIL;
	12103	*ret = reg1node(pRExC_state, op, 0);
	12104	return op;
	12105
	12106	return_SANY:
	12107	op = SANY;
	12108	*ret = reg_node(pRExC_state, op);
	12109	MARK_NAUGHTY(1);
	12110	return op;
	12111	}
	12112
	12113	#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
	12114
	12115	#ifdef PERL_RE_BUILD_AUX
	12116	void
	12117	Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
	12118	regnode* const node,
	12119	SV* const cp_list,
	12120	SV* const runtime_defns,
	12121	SV* const only_utf8_locale_list)
	12122	{
	12123	/* Sets the arg field of an ANYOF-type node 'node', using information about
	12124	* the node passed-in. If only the bitmap is needed to determine what
	12125	* matches, the arg is set appropriately to either
	12126	* 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
	12127	* 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
	12128	*
	12129	* Otherwise, it sets the argument to the count returned by reg_add_data(),
	12130	* having allocated and stored an array, av, as follows:
	12131	* av[0] stores the inversion list defining this class as far as known at
	12132	* this time, or PL_sv_undef if nothing definite is now known.
	12133	* av[1] stores the inversion list of code points that match only if the
	12134	* current locale is UTF-8, or if none, PL_sv_undef if there is an
	12135	* av[2], or no entry otherwise.
	12136	* av[2] stores the list of user-defined properties whose subroutine
	12137	* definitions aren't known at this time, or no entry if none. */
	12138
	12139	UV n;
	12140
	12141	PERL_ARGS_ASSERT_SET_ANYOF_ARG;
	12142
	12143	/* If this is set, the final disposition won't be known until runtime, so
	12144	* we can't do any of the compile time optimizations */
	12145	if (! runtime_defns) {
	12146
	12147	/* On plain ANYOF nodes without the possibility of a runtime locale
	12148	* making a difference, maybe there's no information to be gleaned
	12149	* except for what's in the bitmap */
	12150	if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
	12151
	12152	/* There are two such cases:
	12153	* 1) there is no list of code points matched outside the bitmap
	12154	*/
	12155	if (! cp_list) {
	12156	ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
	12157	return;
	12158	}
	12159
	12160	/* 2) the list indicates everything outside the bitmap matches */
	12161	if ( invlist_highest(cp_list) == UV_MAX
	12162	&& invlist_highest_range_start(cp_list)
	12163	<= NUM_ANYOF_CODE_POINTS)
	12164	{
	12165	ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
	12166	return;
	12167	}
	12168
	12169	/* In all other cases there are things outside the bitmap that we
	12170	* may need to check at runtime. */
	12171	}
	12172
	12173	/* Here, we have resolved all the possible run-time matches, and they
	12174	* are stored in one or both of two possible lists. (While some match
	12175	* only under certain runtime circumstances, we know all the possible
	12176	* ones for each such circumstance.)
	12177	*
	12178	* It may very well be that the pattern being compiled contains an
	12179	* identical class, already encountered. Reusing that class here saves
	12180	* space. Look through all classes so far encountered. */
	12181	U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
	12182	for (unsigned int i = 0; i < existing_items; i++) {
	12183
	12184	/* Only look at auxiliary data of this type */
	12185	if (RExC_rxi->data->what[i] != 's') {
	12186	continue;
	12187	}
	12188
	12189	SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
	12190	AV * const av = MUTABLE_AV(SvRV(rv));
	12191
	12192	/* If the already encountered class has data that won't be known
	12193	* until runtime (stored in the final element of the array), we
	12194	* can't share */
	12195	if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
	12196	continue;
	12197	}
	12198
	12199	SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
	12200	false /* no lvalue */);
	12201
	12202	/* The new and the existing one both have to have or both not
	12203	* have this element, for this one to duplicate that one */
	12204	if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
	12205	continue;
	12206	}
	12207
	12208	/* If the inversion lists aren't equivalent, can't share */
	12209	if (cp_list && ! _invlistEQ(cp_list,
	12210	*stored_cp_list_ptr,
	12211	FALSE /* don't complement */))
	12212	{
	12213	continue;
	12214	}
	12215
	12216	/* Similarly for the other list */
	12217	SV ** stored_only_utf8_locale_list_ptr = av_fetch(
	12218	av,
	12219	ONLY_LOCALE_MATCHES_INDEX,
	12220	false /* no lvalue */);
	12221	if ( cBOOL(only_utf8_locale_list)
	12222	!= cBOOL(stored_only_utf8_locale_list_ptr))
	12223	{
	12224	continue;
	12225	}
	12226
	12227	if (only_utf8_locale_list && ! _invlistEQ(
	12228	only_utf8_locale_list,
	12229	*stored_only_utf8_locale_list_ptr,
	12230	FALSE /* don't complement */))
	12231	{
	12232	continue;
	12233	}
	12234
	12235	/* Here, the existence and contents of both compile-time lists
	12236	* are identical between the new and existing data. Re-use the
	12237	* existing one */
	12238	ARG1u_SET(node, i);
	12239	return;
	12240	} /* end of loop through existing classes */
	12241	}
	12242
	12243	/* Here, we need to create a new auxiliary data element; either because
	12244	* this doesn't duplicate an existing one, or we can't tell at this time if
	12245	* it eventually will */
	12246
	12247	AV * const av = newAV();
	12248	SV *rv;
	12249
	12250	if (cp_list) {
	12251	av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
	12252	}
	12253
	12254	/* (Note that if any of this changes, the size calculations in
	12255	* S_optimize_regclass() might need to be updated.) */
	12256
	12257	if (only_utf8_locale_list) {
	12258	av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX,
	12259	SvREFCNT_inc_NN(only_utf8_locale_list));
	12260	}
	12261
	12262	if (runtime_defns) {
	12263	av_store_simple(av, DEFERRED_USER_DEFINED_INDEX,
	12264	SvREFCNT_inc_NN(runtime_defns));
	12265	}
	12266
	12267	rv = newRV_noinc(MUTABLE_SV(av));
	12268	n = reg_add_data(pRExC_state, STR_WITH_LEN("s"));
	12269	RExC_rxi->data->data[n] = (void*)rv;
	12270	ARG1u_SET(node, n);
	12271	}
	12272	#endif /* PERL_RE_BUILD_AUX */
	12273
	12274	SV *
	12275
	12276	#if !defined(PERL_IN_XSUB_RE) \|\| defined(PLUGGABLE_RE_EXTENSION)
	12277	Perl_get_regclass_aux_data(pTHX_ const regexp prog, const regnode node, bool doinit, SV listsvp, SV only_utf8_locale_ptr, SV** output_invlist)
	12278	#else
	12279	Perl_get_re_gclass_aux_data(pTHX_ const regexp prog, const regnode node, bool doinit, SV listsvp, SV only_utf8_locale_ptr, SV** output_invlist)
	12280	#endif
	12281
	12282	{
	12283	/* For internal core use only.
	12284	* Returns the inversion list for the input 'node' in the regex 'prog'.
	12285	* If <doinit> is 'true', will attempt to create the inversion list if not
	12286	* already done. If it is created, it will add to the normal inversion
	12287	* list any that comes from user-defined properties. It croaks if this
	12288	* is called before such a list is ready to be generated, that is when a
	12289	* user-defined property has been declared, buyt still not yet defined.
	12290	* If <listsvp> is non-null, will return the printable contents of the
	12291	* property definition. This can be used to get debugging information
	12292	* even before the inversion list exists, by calling this function with
	12293	* 'doinit' set to false, in which case the components that will be used
	12294	* to eventually create the inversion list are returned (in a printable
	12295	* form).
	12296	* If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
	12297	* store an inversion list of code points that should match only if the
	12298	* execution-time locale is a UTF-8 one.
	12299	* If <output_invlist> is not NULL, it is where this routine is to store an
	12300	* inversion list of the code points that would be instead returned in
	12301	* <listsvp> if this were NULL. Thus, what gets output in <listsvp>
	12302	* when this parameter is used, is just the non-code point data that
	12303	* will go into creating the inversion list. This currently should be just
	12304	* user-defined properties whose definitions were not known at compile
	12305	* time. Using this parameter allows for easier manipulation of the
	12306	* inversion list's data by the caller. It is illegal to call this
	12307	* function with this parameter set, but not <listsvp>
	12308	*
	12309	* Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
	12310	* that, in spite of this function's name, the inversion list it returns
	12311	* may include the bitmap data as well */
	12312
	12313	SV si = NULL; / Input initialization string */
	12314	SV* invlist = NULL;
	12315
	12316	RXi_GET_DECL_NULL(prog, progi);
	12317	const struct reg_data * const data = prog ? progi->data : NULL;
	12318
	12319	#if !defined(PERL_IN_XSUB_RE) \|\| defined(PLUGGABLE_RE_EXTENSION)
	12320	PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
	12321	#else
	12322	PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
	12323	#endif
	12324	assert(! output_invlist \|\| listsvp);
	12325
	12326	if (data && data->count) {
	12327	const U32 n = ARG1u(node);
	12328
	12329	if (data->what[n] == 's') {
	12330	SV * const rv = MUTABLE_SV(data->data[n]);
	12331	AV * const av = MUTABLE_AV(SvRV(rv));
	12332	SV **const ary = AvARRAY(av);
	12333
	12334	invlist = ary[INVLIST_INDEX];
	12335
	12336	if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
	12337	*only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
	12338	}
	12339
	12340	if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
	12341	si = ary[DEFERRED_USER_DEFINED_INDEX];
	12342	}
	12343
	12344	if (doinit && (si \|\| invlist)) {
	12345	if (si) {
	12346	bool user_defined;
	12347	SV * msg = newSVpvs_flags("", SVs_TEMP);
	12348
	12349	SV * prop_definition = handle_user_defined_property(
	12350	"", 0, FALSE, /* There is no \p{}, \P{} */
	12351	SvPVX_const(si)[1] - '0', /* /i or not has been
	12352	stored here for just
	12353	this occasion */
	12354	TRUE, /* run time */
	12355	FALSE, /* This call must find the defn */
	12356	si, /* The property definition */
	12357	&user_defined,
	12358	msg,
	12359	0 /* base level call */
	12360	);
	12361
	12362	if (SvCUR(msg)) {
	12363	assert(prop_definition == NULL);
	12364
	12365	Perl_croak(aTHX_ "%" UTF8f,
	12366	UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
	12367	}
	12368
	12369	if (invlist) {
	12370	_invlist_union(invlist, prop_definition, &invlist);
	12371	SvREFCNT_dec_NN(prop_definition);
	12372	}
	12373	else {
	12374	invlist = prop_definition;
	12375	}
	12376
	12377	STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
	12378	STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
	12379
	12380	ary[INVLIST_INDEX] = invlist;
	12381	av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
	12382	? ONLY_LOCALE_MATCHES_INDEX
	12383	: INVLIST_INDEX);
	12384	si = NULL;
	12385	}
	12386	}
	12387	}
	12388	}
	12389
	12390	/* If requested, return a printable version of what this ANYOF node matches
	12391	* */
	12392	if (listsvp) {
	12393	SV* matches_string = NULL;
	12394
	12395	/* This function can be called at compile-time, before everything gets
	12396	* resolved, in which case we return the currently best available
	12397	* information, which is the string that will eventually be used to do
	12398	* that resolving, 'si' */
	12399	if (si) {
	12400	/* Here, we only have 'si' (and possibly some passed-in data in
	12401	* 'invlist', which is handled below) If the caller only wants
	12402	* 'si', use that. */
	12403	if (! output_invlist) {
	12404	matches_string = newSVsv(si);
	12405	}
	12406	else {
	12407	/* But if the caller wants an inversion list of the node, we
	12408	* need to parse 'si' and place as much as possible in the
	12409	* desired output inversion list, making 'matches_string' only
	12410	* contain the currently unresolvable things */
	12411	const char *si_string = SvPVX(si);
	12412	STRLEN remaining = SvCUR(si);
	12413	UV prev_cp = 0;
	12414	U8 count = 0;
	12415
	12416	/* Ignore everything before and including the first new-line */
	12417	si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
	12418	assert (si_string != NULL);
	12419	si_string++;
	12420	remaining = SvPVX(si) + SvCUR(si) - si_string;
	12421
	12422	while (remaining > 0) {
	12423
	12424	/* The data consists of just strings defining user-defined
	12425	* property names, but in prior incarnations, and perhaps
	12426	* somehow from pluggable regex engines, it could still
	12427	* hold hex code point definitions, all of which should be
	12428	* legal (or it wouldn't have gotten this far). Each
	12429	* component of a range would be separated by a tab, and
	12430	* each range by a new-line. If these are found, instead
	12431	* add them to the inversion list */
	12432	I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
	12433	\|PERL_SCAN_SILENT_NON_PORTABLE;
	12434	STRLEN len = remaining;
	12435	UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
	12436
	12437	/* If the hex decode routine found something, it should go
	12438	* up to the next \n */
	12439	if ( *(si_string + len) == '\n') {
	12440	if (count) { /* 2nd code point on line */
	12441	output_invlist = _add_range_to_invlist(output_invlist, prev_cp, cp);
	12442	}
	12443	else {
	12444	output_invlist = add_cp_to_invlist(output_invlist, cp);
	12445	}
	12446	count = 0;
	12447	goto prepare_for_next_iteration;
	12448	}
	12449
	12450	/* If the hex decode was instead for the lower range limit,
	12451	* save it, and go parse the upper range limit */
	12452	if (*(si_string + len) == '\t') {
	12453	assert(count == 0);
	12454
	12455	prev_cp = cp;
	12456	count = 1;
	12457	prepare_for_next_iteration:
	12458	si_string += len + 1;
	12459	remaining -= len + 1;
	12460	continue;
	12461	}
	12462
	12463	/* Here, didn't find a legal hex number. Just add the text
	12464	* from here up to the next \n, omitting any trailing
	12465	* markers. */
	12466
	12467	remaining -= len;
	12468	len = strcspn(si_string,
	12469	DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
	12470	remaining -= len;
	12471	if (matches_string) {
	12472	sv_catpvn(matches_string, si_string, len);
	12473	}
	12474	else {
	12475	matches_string = newSVpvn(si_string, len);
	12476	}
	12477	sv_catpvs(matches_string, " ");
	12478
	12479	si_string += len;
	12480	if ( remaining
	12481	&& UCHARAT(si_string)
	12482	== DEFERRED_COULD_BE_OFFICIAL_MARKERc)
	12483	{
	12484	si_string++;
	12485	remaining--;
	12486	}
	12487	if (remaining && UCHARAT(si_string) == '\n') {
	12488	si_string++;
	12489	remaining--;
	12490	}
	12491	} /* end of loop through the text */
	12492
	12493	assert(matches_string);
	12494	if (SvCUR(matches_string)) { /* Get rid of trailing blank */
	12495	SvCUR_set(matches_string, SvCUR(matches_string) - 1);
	12496	}
	12497	} /* end of has an 'si' */
	12498	}
	12499
	12500	/* Add the stuff that's already known */
	12501	if (invlist) {
	12502
	12503	/* Again, if the caller doesn't want the output inversion list, put
	12504	* everything in 'matches-string' */
	12505	if (! output_invlist) {
	12506	if ( ! matches_string) {
	12507	matches_string = newSVpvs("\n");
	12508	}
	12509	sv_catsv(matches_string, invlist_contents(invlist,
	12510	TRUE /* traditional style */
	12511	));
	12512	}
	12513	else if (! *output_invlist) {
	12514	*output_invlist = invlist_clone(invlist, NULL);
	12515	}
	12516	else {
	12517	_invlist_union(*output_invlist, invlist, output_invlist);
	12518	}
	12519	}
	12520
	12521	*listsvp = matches_string;
	12522	}
	12523
	12524	return invlist;
	12525	}
	12526
	12527	/* reg_skipcomment()
	12528
	12529	Absorbs an /x style # comment from the input stream,
	12530	returning a pointer to the first character beyond the comment, or if the
	12531	comment terminates the pattern without anything following it, this returns
	12532	one past the final character of the pattern (in other words, RExC_end) and
	12533	sets the REG_RUN_ON_COMMENT_SEEN flag.
	12534
	12535	Note it's the callers responsibility to ensure that we are
	12536	actually in /x mode
	12537
	12538	*/
	12539
	12540	PERL_STATIC_INLINE char*
	12541	S_reg_skipcomment(RExC_state_t pRExC_state, char p)
	12542	{
	12543	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	12544
	12545	assert(*p == '#');
	12546
	12547	while (p < RExC_end) {
	12548	if (*(++p) == '\n') {
	12549	return p+1;
	12550	}
	12551	}
	12552
	12553	/* we ran off the end of the pattern without ending the comment, so we have
	12554	* to add an \n when wrapping */
	12555	RExC_seen \|= REG_RUN_ON_COMMENT_SEEN;
	12556	return p;
	12557	}
	12558
	12559	STATIC void
	12560	S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
	12561	char ** p,
	12562	const bool force_to_xmod
	12563	)
	12564	{
	12565	/* If the text at the current parse position '*p' is a '(?#...)' comment,
	12566	* or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
	12567	* is /x whitespace, advance '*p' so that on exit it points to the first
	12568	* byte past all such white space and comments */
	12569
	12570	const bool use_xmod = force_to_xmod \|\| (RExC_flags & RXf_PMf_EXTENDED);
	12571
	12572	PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
	12573
	12574	assert( ! UTF \|\| UTF8_IS_INVARIANT(p) \|\| UTF8_IS_START(p));
	12575
	12576	for (;;) {
	12577	if (RExC_end - (*p) >= 3
	12578	&& (p) == '('
	12579	&& (p + 1) == '?'
	12580	&& (p + 2) == '#')
	12581	{
	12582	while ((p) != ')') {
	12583	if ((*p) == RExC_end)
	12584	FAIL("Sequence (?#... not terminated");
	12585	(*p)++;
	12586	}
	12587	(*p)++;
	12588	continue;
	12589	}
	12590
	12591	if (use_xmod) {
	12592	const char * save_p = *p;
	12593	while ((*p) < RExC_end) {
	12594	STRLEN len;
	12595	if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
	12596	(*p) += len;
	12597	}
	12598	else if ((p) == '#') {
	12599	(p) = reg_skipcomment(pRExC_state, (p));
	12600	}
	12601	else {
	12602	break;
	12603	}
	12604	}
	12605	if (*p != save_p) {
	12606	continue;
	12607	}
	12608	}
	12609
	12610	break;
	12611	}
	12612
	12613	return;
	12614	}
	12615
	12616	/* nextchar()
	12617
	12618	Advances the parse position by one byte, unless that byte is the beginning
	12619	of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
	12620	those two cases, the parse position is advanced beyond all such comments and
	12621	white space.
	12622
	12623	This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
	12624	*/
	12625
	12626	STATIC void
	12627	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	12628	{
	12629	PERL_ARGS_ASSERT_NEXTCHAR;
	12630
	12631	if (RExC_parse < RExC_end) {
	12632	assert( ! UTF
	12633	\|\| UTF8_IS_INVARIANT(*RExC_parse)
	12634	\|\| UTF8_IS_START(*RExC_parse));
	12635
	12636	RExC_parse_inc_safe();
	12637
	12638	skip_to_be_ignored_text(pRExC_state, &RExC_parse,
	12639	FALSE /* Don't force /x */ );
	12640	}
	12641	}
	12642
	12643	STATIC void
	12644	S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
	12645	{
	12646	/* 'size' is the delta number of smallest regnode equivalents to add or
	12647	* subtract from the current memory allocated to the regex engine being
	12648	* constructed. */
	12649
	12650	PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
	12651
	12652	RExC_size += size;
	12653
	12654	Renewc(RExC_rxi,
	12655	sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
	12656	/* +1 for REG_MAGIC */
	12657	char,
	12658	regexp_internal);
	12659	if ( RExC_rxi == NULL )
	12660	FAIL("Regexp out of space");
	12661	RXi_SET(RExC_rx, RExC_rxi);
	12662
	12663	RExC_emit_start = RExC_rxi->program;
	12664	if (size > 0) {
	12665	Zero(REGNODE_p(RExC_emit), size, regnode);
	12666	}
	12667	}
	12668
	12669	STATIC regnode_offset
	12670	S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
	12671	{
	12672	/* Allocate a regnode that is (1 + extra_size) times as big as the
	12673	* smallest regnode worth of space, and also aligns and increments
	12674	* RExC_size appropriately.
	12675	*
	12676	* It returns the regnode's offset into the regex engine program */
	12677
	12678	const regnode_offset ret = RExC_emit;
	12679
	12680	PERL_ARGS_ASSERT_REGNODE_GUTS;
	12681
	12682	SIZE_ALIGN(RExC_size);
	12683	change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
	12684	NODE_ALIGN_FILL(REGNODE_p(ret));
	12685	return(ret);
	12686	}
	12687
	12688	#ifdef DEBUGGING
	12689
	12690	STATIC regnode_offset
	12691	S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
	12692	PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
	12693	assert(extra_size >= REGNODE_ARG_LEN(op) \|\| REGNODE_TYPE(op) == ANYOF);
	12694	return S_regnode_guts(aTHX_ pRExC_state, extra_size);
	12695	}
	12696
	12697	#endif
	12698
	12699
	12700
	12701	/*
	12702	- reg_node - emit a node
	12703	*/
	12704	STATIC regnode_offset /* Location. */
	12705	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	12706	{
	12707	const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
	12708	regnode_offset ptr = ret;
	12709
	12710	PERL_ARGS_ASSERT_REG_NODE;
	12711
	12712	assert(REGNODE_ARG_LEN(op) == 0);
	12713
	12714	FILL_ADVANCE_NODE(ptr, op);
	12715	RExC_emit = ptr;
	12716	return(ret);
	12717	}
	12718
	12719	/*
	12720	- reg1node - emit a node with an argument
	12721	*/
	12722	STATIC regnode_offset /* Location. */
	12723	S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	12724	{
	12725	const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
	12726	regnode_offset ptr = ret;
	12727
	12728	PERL_ARGS_ASSERT_REG1NODE;
	12729
	12730	/* ANYOF are special cased to allow non-length 1 args */
	12731	assert(REGNODE_ARG_LEN(op) == 1);
	12732
	12733	FILL_ADVANCE_NODE_ARG1u(ptr, op, arg);
	12734	RExC_emit = ptr;
	12735	return(ret);
	12736	}
	12737
	12738	/*
	12739	- regpnode - emit a temporary node with a SV* argument
	12740	*/
	12741	STATIC regnode_offset /* Location. */
	12742	S_regpnode(pTHX_ RExC_state_t pRExC_state, U8 op, SV arg)
	12743	{
	12744	const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
	12745	regnode_offset ptr = ret;
	12746
	12747	PERL_ARGS_ASSERT_REGPNODE;
	12748
	12749	FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
	12750	RExC_emit = ptr;
	12751	return(ret);
	12752	}
	12753
	12754	STATIC regnode_offset
	12755	S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
	12756	{
	12757	/* emit a node with U32 and I32 arguments */
	12758
	12759	const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
	12760	regnode_offset ptr = ret;
	12761
	12762	PERL_ARGS_ASSERT_REG2NODE;
	12763
	12764	assert(REGNODE_ARG_LEN(op) == 2);
	12765
	12766	FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2);
	12767	RExC_emit = ptr;
	12768	return(ret);
	12769	}
	12770
	12771	/*
	12772	- reginsert - insert an operator in front of already-emitted operand
	12773	*
	12774	* That means that on exit 'operand' is the offset of the newly inserted
	12775	* operator, and the original operand has been relocated.
	12776	*
	12777	* IMPORTANT NOTE - it is the callers responsibility to correctly
	12778	* set up NEXT_OFF() of the inserted node if needed. Something like this:
	12779	*
	12780	* reginsert(pRExC, OPFAIL, orig_emit, depth+1);
	12781	* NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
	12782	*
	12783	* ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
	12784	*/
	12785	STATIC void
	12786	S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
	12787	const regnode_offset operand, const U32 depth)
	12788	{
	12789	regnode *src;
	12790	regnode *dst;
	12791	regnode *place;
	12792	const int offset = REGNODE_ARG_LEN((U8)op);
	12793	const int size = NODE_STEP_REGNODE + offset;
	12794	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	12795
	12796	PERL_ARGS_ASSERT_REGINSERT;
	12797	PERL_UNUSED_CONTEXT;
	12798	PERL_UNUSED_ARG(depth);
	12799	DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
	12800	assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
	12801	studying. If this is wrong then we need to adjust RExC_recurse
	12802	below like we do with RExC_open_parens/RExC_close_parens. */
	12803	change_engine_size(pRExC_state, (Ptrdiff_t) size);
	12804	src = REGNODE_p(RExC_emit);
	12805	RExC_emit += size;
	12806	dst = REGNODE_p(RExC_emit);
	12807
	12808	/* If we are in a "count the parentheses" pass, the numbers are unreliable,
	12809	* and [perl #133871] shows this can lead to problems, so skip this
	12810	* realignment of parens until a later pass when they are reliable */
	12811	if (! IN_PARENS_PASS && RExC_open_parens) {
	12812	int paren;
	12813	/DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);/
	12814	/* remember that RExC_npar is rex->nparens + 1,
	12815	* iow it is 1 more than the number of parens seen in
	12816	* the pattern so far. */
	12817	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	12818	/* note, RExC_open_parens[0] is the start of the
	12819	* regex, it can't move. RExC_close_parens[0] is the end
	12820	* of the regex, it can move. */
	12821	if ( paren && RExC_open_parens[paren] >= operand ) {
	12822	/DEBUG_PARSE_FMT("open"," - %d", size);/
	12823	RExC_open_parens[paren] += size;
	12824	} else {
	12825	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	12826	}
	12827	if ( RExC_close_parens[paren] >= operand ) {
	12828	/DEBUG_PARSE_FMT("close"," - %d", size);/
	12829	RExC_close_parens[paren] += size;
	12830	} else {
	12831	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	12832	}
	12833	}
	12834	}
	12835	if (RExC_end_op)
	12836	RExC_end_op += size;
	12837
	12838	while (src > REGNODE_p(operand)) {
	12839	StructCopy(--src, --dst, regnode);
	12840	}
	12841
	12842	place = REGNODE_p(operand); /* Op node, where operand used to be. */
	12843	src = place + 1; /* NOT REGNODE_AFTER! */
	12844	FLAGS(place) = 0;
	12845	FILL_NODE(operand, op);
	12846
	12847	/* Zero out any arguments in the new node */
	12848	Zero(src, offset, regnode);
	12849	}
	12850
	12851	/*
	12852	- regtail - set the next-pointer at the end of a node chain of p to val. If
	12853	that value won't fit in the space available, instead returns FALSE.
	12854	(Except asserts if we can't fit in the largest space the regex
	12855	engine is designed for.)
	12856	- SEE ALSO: regtail_study
	12857	*/
	12858	STATIC bool
	12859	S_regtail(pTHX_ RExC_state_t * pRExC_state,
	12860	const regnode_offset p,
	12861	const regnode_offset val,
	12862	const U32 depth)
	12863	{
	12864	regnode_offset scan;
	12865	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	12866
	12867	PERL_ARGS_ASSERT_REGTAIL;
	12868	#ifndef DEBUGGING
	12869	PERL_UNUSED_ARG(depth);
	12870	#endif
	12871
	12872	/* The final node in the chain is the first one with a nonzero next pointer
	12873	* */
	12874	scan = (regnode_offset) p;
	12875	for (;;) {
	12876	regnode * const temp = regnext(REGNODE_p(scan));
	12877	DEBUG_PARSE_r({
	12878	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	12879	regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
	12880	Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
	12881	SvPV_nolen_const(RExC_mysv), scan,
	12882	(temp == NULL ? "->" : ""),
	12883	(temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
	12884	);
	12885	});
	12886	if (temp == NULL)
	12887	break;
	12888	scan = REGNODE_OFFSET(temp);
	12889	}
	12890
	12891	/* Populate this node's next pointer */
	12892	assert(val >= scan);
	12893	if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
	12894	assert((UV) (val - scan) <= U32_MAX);
	12895	ARG1u_SET(REGNODE_p(scan), val - scan);
	12896	}
	12897	else {
	12898	if (val - scan > U16_MAX) {
	12899	/* Populate this with something that won't loop and will likely
	12900	* lead to a crash if the caller ignores the failure return, and
	12901	* execution continues */
	12902	NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
	12903	return FALSE;
	12904	}
	12905	NEXT_OFF(REGNODE_p(scan)) = val - scan;
	12906	}
	12907
	12908	return TRUE;
	12909	}
	12910
	12911	#ifdef DEBUGGING
	12912	/*
	12913	- regtail_study - set the next-pointer at the end of a node chain of p to val.
	12914	- Look for optimizable sequences at the same time.
	12915	- currently only looks for EXACT chains.
	12916
	12917	This is experimental code. The idea is to use this routine to perform
	12918	in place optimizations on branches and groups as they are constructed,
	12919	with the long term intention of removing optimization from study_chunk so
	12920	that it is purely analytical.
	12921
	12922	Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	12923	to control which is which.
	12924
	12925	This used to return a value that was ignored. It was a problem that it is
	12926	#ifdef'd to be another function that didn't return a value. khw has changed it
	12927	so both currently return a pass/fail return.
	12928
	12929	*/
	12930	/* TODO: All four parms should be const */
	12931
	12932	STATIC bool
	12933	S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
	12934	const regnode_offset val, U32 depth)
	12935	{
	12936	regnode_offset scan;
	12937	U8 exact = PSEUDO;
	12938	#ifdef EXPERIMENTAL_INPLACESCAN
	12939	I32 min = 0;
	12940	#endif
	12941	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	12942
	12943	PERL_ARGS_ASSERT_REGTAIL_STUDY;
	12944
	12945
	12946	/* Find last node. */
	12947
	12948	scan = p;
	12949	for (;;) {
	12950	regnode * const temp = regnext(REGNODE_p(scan));
	12951	#ifdef EXPERIMENTAL_INPLACESCAN
	12952	if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
	12953	bool unfolded_multi_char; /* Unexamined in this routine */
	12954	if (join_exact(pRExC_state, scan, &min,
	12955	&unfolded_multi_char, 1, REGNODE_p(val), depth+1))
	12956	return TRUE; /* Was return EXACT */
	12957	}
	12958	#endif
	12959	if ( exact ) {
	12960	if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
	12961	if (exact == PSEUDO )
	12962	exact= OP(REGNODE_p(scan));
	12963	else if (exact != OP(REGNODE_p(scan)) )
	12964	exact= 0;
	12965	}
	12966	else if (OP(REGNODE_p(scan)) != NOTHING) {
	12967	exact= 0;
	12968	}
	12969	}
	12970	DEBUG_PARSE_r({
	12971	DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	12972	regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
	12973	Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
	12974	SvPV_nolen_const(RExC_mysv),
	12975	scan,
	12976	REGNODE_NAME(exact));
	12977	});
	12978	if (temp == NULL)
	12979	break;
	12980	scan = REGNODE_OFFSET(temp);
	12981	}
	12982	DEBUG_PARSE_r({
	12983	DEBUG_PARSE_MSG("");
	12984	regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
	12985	Perl_re_printf( aTHX_
	12986	"~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
	12987	SvPV_nolen_const(RExC_mysv),
	12988	(IV)val,
	12989	(IV)(val - scan)
	12990	);
	12991	});
	12992	if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
	12993	assert((UV) (val - scan) <= U32_MAX);
	12994	ARG1u_SET(REGNODE_p(scan), val - scan);
	12995	}
	12996	else {
	12997	if (val - scan > U16_MAX) {
	12998	/* Populate this with something that won't loop and will likely
	12999	* lead to a crash if the caller ignores the failure return, and
	13000	* execution continues */
	13001	NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
	13002	return FALSE;
	13003	}
	13004	NEXT_OFF(REGNODE_p(scan)) = val - scan;
	13005	}
	13006
	13007	return TRUE; /* Was 'return exact' */
	13008	}
	13009	#endif
	13010
	13011
	13012	#ifdef PERL_RE_BUILD_AUX
	13013	SV*
	13014	Perl_get_ANYOFM_contents(pTHX_ const regnode * n) {
	13015
	13016	/* Returns an inversion list of all the code points matched by the
	13017	* ANYOFM/NANYOFM node 'n' */
	13018
	13019	SV * cp_list = _new_invlist(-1);
	13020	const U8 lowest = (U8) ARG1u(n);
	13021	unsigned int i;
	13022	U8 count = 0;
	13023	U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
	13024
	13025	PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
	13026
	13027	/* Starting with the lowest code point, any code point that ANDed with the
	13028	* mask yields the lowest code point is in the set */
	13029	for (i = lowest; i <= 0xFF; i++) {
	13030	if ((i & FLAGS(n)) == ARG1u(n)) {
	13031	cp_list = add_cp_to_invlist(cp_list, i);
	13032	count++;
	13033
	13034	/* We know how many code points (a power of two) that are in the
	13035	* set. No use looking once we've got that number */
	13036	if (count >= needed) break;
	13037	}
	13038	}
	13039
	13040	if (OP(n) == NANYOFM) {
	13041	_invlist_invert(cp_list);
	13042	}
	13043	return cp_list;
	13044	}
	13045
	13046	SV *
	13047	Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
	13048	PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
	13049
	13050	SV * cp_list = NULL;
	13051	populate_invlist_from_bitmap(
	13052	((struct regnode_bbm *) n)->bitmap,
	13053	REGNODE_BBM_BITMAP_LEN * CHARBITS,
	13054	&cp_list,
	13055
	13056	/* The base cp is from the start byte plus a zero continuation */
	13057	TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n),
	13058	UTF_CONTINUATION_MARK \| 0));
	13059	return cp_list;
	13060	}
	13061	#endif /* PERL_RE_BUILD_AUX */
	13062
	13063
	13064	SV *
	13065	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	13066	{ /* Assume that RE_INTUIT is set */
	13067	/* Returns an SV containing a string that must appear in the target for it
	13068	* to match, or NULL if nothing is known that must match.
	13069	*
	13070	* CAUTION: the SV can be freed during execution of the regex engine */
	13071
	13072	struct regexp *const prog = ReANY(r);
	13073	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	13074
	13075	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	13076	PERL_UNUSED_CONTEXT;
	13077
	13078	DEBUG_COMPILE_r(
	13079	{
	13080	if (prog->maxlen > 0 && (prog->check_utf8 \|\| prog->check_substr)) {
	13081	const char * const s = SvPV_nolen_const(RX_UTF8(r)
	13082	? prog->check_utf8 : prog->check_substr);
	13083
	13084	if (!PL_colorset) reginitcolors();
	13085	Perl_re_printf( aTHX_
	13086	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	13087	PL_colors[4],
	13088	RX_UTF8(r) ? "utf8 " : "",
	13089	PL_colors[5], PL_colors[0],
	13090	s,
	13091	PL_colors[1],
	13092	(strlen(s) > PL_dump_re_max_len ? "..." : ""));
	13093	}
	13094	} );
	13095
	13096	/* use UTF8 check substring if regexp pattern itself is in UTF8 */
	13097	return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
	13098	}
	13099
	13100	/*
	13101	pregfree()
	13102
	13103	handles refcounting and freeing the perl core regexp structure. When
	13104	it is necessary to actually free the structure the first thing it
	13105	does is call the 'free' method of the regexp_engine associated to
	13106	the regexp, allowing the handling of the void *pprivate; member
	13107	first. (This routine is not overridable by extensions, which is why
	13108	the extensions free is called first.)
	13109
	13110	See regdupe and regdupe_internal if you change anything here.
	13111	*/
	13112	#ifndef PERL_IN_XSUB_RE
	13113	void
	13114	Perl_pregfree(pTHX_ REGEXP *r)
	13115	{
	13116	SvREFCNT_dec(r);
	13117	}
	13118
	13119	void
	13120	Perl_pregfree2(pTHX_ REGEXP *rx)
	13121	{
	13122	struct regexp *const r = ReANY(rx);
	13123	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	13124
	13125	PERL_ARGS_ASSERT_PREGFREE2;
	13126
	13127	if (! r)
	13128	return;
	13129
	13130	if (r->mother_re) {
	13131	ReREFCNT_dec(r->mother_re);
	13132	} else {
	13133	CALLREGFREE_PVT(rx); /* free the private data */
	13134	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	13135	}
	13136	if (r->substrs) {
	13137	int i;
	13138	for (i = 0; i < 2; i++) {
	13139	SvREFCNT_dec(r->substrs->data[i].substr);
	13140	SvREFCNT_dec(r->substrs->data[i].utf8_substr);
	13141	}
	13142	Safefree(r->substrs);
	13143	}
	13144	RX_MATCH_COPY_FREE(rx);
	13145	#ifdef PERL_ANY_COW
	13146	SvREFCNT_dec(r->saved_copy);
	13147	#endif
	13148	Safefree(RXp_OFFSp(r));
	13149	if (r->logical_to_parno) {
	13150	Safefree(r->logical_to_parno);
	13151	Safefree(r->parno_to_logical);
	13152	Safefree(r->parno_to_logical_next);
	13153	}
	13154
	13155	SvREFCNT_dec(r->qr_anoncv);
	13156	if (r->recurse_locinput)
	13157	Safefree(r->recurse_locinput);
	13158	}
	13159
	13160
	13161	/* reg_temp_copy()
	13162
	13163	Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
	13164	except that dsv will be created if NULL.
	13165
	13166	This function is used in two main ways. First to implement
	13167	$r = qr/....; $s = $$r;
	13168
	13169	Secondly, it is used as a hacky workaround to the structural issue of
	13170	match results
	13171	being stored in the regexp structure which is in turn stored in
	13172	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	13173	could be PL_curpm in multiple contexts, and could require multiple
	13174	result sets being associated with the pattern simultaneously, such
	13175	as when doing a recursive match with (??{$qr})
	13176
	13177	The solution is to make a lightweight copy of the regexp structure
	13178	when a qr// is returned from the code executed by (??{$qr}) this
	13179	lightweight copy doesn't actually own any of its data except for
	13180	the starp/end and the actual regexp structure itself.
	13181
	13182	*/
	13183
	13184
	13185	REGEXP *
	13186	Perl_reg_temp_copy(pTHX_ REGEXP dsv, REGEXP ssv)
	13187	{
	13188	struct regexp *drx;
	13189	struct regexp *const srx = ReANY(ssv);
	13190	const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
	13191
	13192	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	13193
	13194	if (!dsv)
	13195	dsv = (REGEXP*) newSV_type(SVt_REGEXP);
	13196	else {
	13197	assert(SvTYPE(dsv) == SVt_REGEXP \|\| (SvTYPE(dsv) == SVt_PVLV));
	13198
	13199	/* our only valid caller, sv_setsv_flags(), should have done
	13200	* a SV_CHECK_THINKFIRST_COW_DROP() by now */
	13201	assert(!SvOOK(dsv));
	13202	assert(!SvIsCOW(dsv));
	13203	assert(!SvROK(dsv));
	13204
	13205	if (SvPVX_const(dsv)) {
	13206	if (SvLEN(dsv))
	13207	Safefree(SvPVX(dsv));
	13208	SvPVX(dsv) = NULL;
	13209	}
	13210	SvLEN_set(dsv, 0);
	13211	SvCUR_set(dsv, 0);
	13212	SvOK_off((SV *)dsv);
	13213
	13214	if (islv) {
	13215	/* For PVLVs, the head (sv_any) points to an XPVLV, while
	13216	* the LV's xpvlenu_rx will point to a regexp body, which
	13217	* we allocate here */
	13218	REGEXP temp = (REGEXP )newSV_type(SVt_REGEXP);
	13219	assert(!SvPVX(dsv));
	13220	/* We "steal" the body from the newly allocated SV temp, changing
	13221	* the pointer in its HEAD to NULL. We then change its type to
	13222	* SVt_NULL so that when we immediately release its only reference,
	13223	* no memory deallocation happens.
	13224	*
	13225	* The body will eventually be freed (from the PVLV) either in
	13226	* Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
	13227	* the regexp body needs to be removed)
	13228	* or in Perl_sv_clear() (if the PVLV still holds the pointer until
	13229	* the PVLV itself is deallocated). */
	13230	((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
	13231	temp->sv_any = NULL;
	13232	SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) \| SVt_NULL;
	13233	SvREFCNT_dec_NN(temp);
	13234	/* SvCUR still resides in the xpvlv struct, so the regexp copy-
	13235	ing below will not set it. */
	13236	SvCUR_set(dsv, SvCUR(ssv));
	13237	}
	13238	}
	13239	/* This ensures that SvTHINKFIRST(sv) is true, and hence that
	13240	sv_force_normal(sv) is called. */
	13241	SvFAKE_on(dsv);
	13242	drx = ReANY(dsv);
	13243
	13244	SvFLAGS(dsv) \|= SvFLAGS(ssv) & (SVf_POK\|SVp_POK\|SVf_UTF8);
	13245	SvPV_set(dsv, RX_WRAPPED(ssv));
	13246	/* We share the same string buffer as the original regexp, on which we
	13247	hold a reference count, incremented when mother_re is set below.
	13248	The string pointer is copied here, being part of the regexp struct.
	13249	*/
	13250	memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
	13251	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	13252
	13253	if (!islv)
	13254	SvLEN_set(dsv, 0);
	13255	if (RXp_OFFSp(srx)) {
	13256	const I32 npar = srx->nparens+1;
	13257	NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair);
	13258	}
	13259	if (srx->substrs) {
	13260	int i;
	13261	Newx(drx->substrs, 1, struct reg_substr_data);
	13262	StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
	13263
	13264	for (i = 0; i < 2; i++) {
	13265	SvREFCNT_inc_void(drx->substrs->data[i].substr);
	13266	SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
	13267	}
	13268
	13269	/* check_substr and check_utf8, if non-NULL, point to either their
	13270	anchored or float namesakes, and don't hold a second reference. */
	13271	}
	13272	if (srx->logical_to_parno) {
	13273	NewCopy(srx->logical_to_parno,
	13274	drx->logical_to_parno,
	13275	srx->nparens+1, I32);
	13276	NewCopy(srx->parno_to_logical,
	13277	drx->parno_to_logical,
	13278	srx->nparens+1, I32);
	13279	NewCopy(srx->parno_to_logical_next,
	13280	drx->parno_to_logical_next,
	13281	srx->nparens+1, I32);
	13282	} else {
	13283	drx->logical_to_parno = NULL;
	13284	drx->parno_to_logical = NULL;
	13285	drx->parno_to_logical_next = NULL;
	13286	}
	13287	drx->logical_nparens = srx->logical_nparens;
	13288
	13289	RX_MATCH_COPIED_off(dsv);
	13290	#ifdef PERL_ANY_COW
	13291	RXp_SAVED_COPY(drx) = NULL;
	13292	#endif
	13293	drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
	13294	SvREFCNT_inc_void(drx->qr_anoncv);
	13295	if (srx->recurse_locinput)
	13296	Newx(drx->recurse_locinput, srx->nparens + 1, char *);
	13297
	13298	return dsv;
	13299	}
	13300	#endif
	13301
	13302
	13303	/* regfree_internal()
	13304
	13305	Free the private data in a regexp. This is overloadable by
	13306	extensions. Perl takes care of the regexp structure in pregfree(),
	13307	this covers the *pprivate pointer which technically perl doesn't
	13308	know about, however of course we have to handle the
	13309	regexp_internal structure when no extension is in use.
	13310
	13311	Note this is called before freeing anything in the regexp
	13312	structure.
	13313	*/
	13314
	13315	void
	13316	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	13317	{
	13318	struct regexp *const r = ReANY(rx);
	13319	RXi_GET_DECL(r, ri);
	13320	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	13321
	13322	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	13323
	13324	if (! ri) {
	13325	return;
	13326	}
	13327
	13328	DEBUG_COMPILE_r({
	13329	if (!PL_colorset)
	13330	reginitcolors();
	13331	{
	13332	SV *dsv= sv_newmortal();
	13333	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	13334	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
	13335	Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
	13336	PL_colors[4], PL_colors[5], s);
	13337	}
	13338	});
	13339
	13340	if (ri->code_blocks)
	13341	S_free_codeblocks(aTHX_ ri->code_blocks);
	13342
	13343	if (ri->data) {
	13344	int n = ri->data->count;
	13345
	13346	while (--n >= 0) {
	13347	/* If you add a ->what type here, update the comment in regcomp.h */
	13348	switch (ri->data->what[n]) {
	13349	case 'a':
	13350	case 'r':
	13351	case 's':
	13352	case 'S':
	13353	case 'u':
	13354	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	13355	break;
	13356	case 'f':
	13357	Safefree(ri->data->data[n]);
	13358	break;
	13359	case 'l':
	13360	case 'L':
	13361	break;
	13362	case 'T':
	13363	{ /* Aho Corasick add-on structure for a trie node.
	13364	Used in stclass optimization only */
	13365	U32 refcount;
	13366	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	13367	OP_REFCNT_LOCK;
	13368	refcount = --aho->refcount;
	13369	OP_REFCNT_UNLOCK;
	13370	if ( !refcount ) {
	13371	PerlMemShared_free(aho->states);
	13372	PerlMemShared_free(aho->fail);
	13373	/* do this last!!!! */
	13374	PerlMemShared_free(ri->data->data[n]);
	13375	/* we should only ever get called once, so
	13376	* assert as much, and also guard the free
	13377	* which /might/ happen twice. At the least
	13378	* it will make code anlyzers happy and it
	13379	* doesn't cost much. - Yves */
	13380	assert(ri->regstclass);
	13381	if (ri->regstclass) {
	13382	PerlMemShared_free(ri->regstclass);
	13383	ri->regstclass = 0;
	13384	}
	13385	}
	13386	}
	13387	break;
	13388	case 't':
	13389	{
	13390	/* trie structure. */
	13391	U32 refcount;
	13392	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	13393	OP_REFCNT_LOCK;
	13394	refcount = --trie->refcount;
	13395	OP_REFCNT_UNLOCK;
	13396	if ( !refcount ) {
	13397	PerlMemShared_free(trie->charmap);
	13398	PerlMemShared_free(trie->states);
	13399	PerlMemShared_free(trie->trans);
	13400	if (trie->bitmap)
	13401	PerlMemShared_free(trie->bitmap);
	13402	if (trie->jump)
	13403	PerlMemShared_free(trie->jump);
	13404	if (trie->j_before_paren)
	13405	PerlMemShared_free(trie->j_before_paren);
	13406	if (trie->j_after_paren)
	13407	PerlMemShared_free(trie->j_after_paren);
	13408	PerlMemShared_free(trie->wordinfo);
	13409	/* do this last!!!! */
	13410	PerlMemShared_free(ri->data->data[n]);
	13411	}
	13412	}
	13413	break;
	13414	case '%':
	13415	/* NO-OP a '%' data contains a null pointer, so that reg_add_data
	13416	* always returns non-zero, this should only ever happen in the
	13417	* 0 index */
	13418	assert(n==0);
	13419	break;
	13420	default:
	13421	Perl_croak(aTHX_ "panic: regfree data code '%c'",
	13422	ri->data->what[n]);
	13423	}
	13424	}
	13425	Safefree(ri->data->what);
	13426	Safefree(ri->data);
	13427	}
	13428
	13429	Safefree(ri);
	13430	}
	13431
	13432	#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
	13433
	13434	/*
	13435	=for apidoc re_dup_guts
	13436	Duplicate a regexp.
	13437
	13438	This routine is expected to clone a given regexp structure. It is only
	13439	compiled under USE_ITHREADS.
	13440
	13441	After all of the core data stored in struct regexp is duplicated
	13442	the C<regexp_engine.dupe> method is used to copy any private data
	13443	stored in the *pprivate pointer. This allows extensions to handle
	13444	any duplication they need to do.
	13445
	13446	=cut
	13447
	13448	See pregfree() and regfree_internal() if you change anything here.
	13449	*/
	13450	#if defined(USE_ITHREADS)
	13451	#ifndef PERL_IN_XSUB_RE
	13452	void
	13453	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	13454	{
	13455	I32 npar;
	13456	const struct regexp *r = ReANY(sstr);
	13457	struct regexp *ret = ReANY(dstr);
	13458
	13459	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	13460
	13461	npar = r->nparens+1;
	13462	NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair);
	13463
	13464	if (ret->substrs) {
	13465	/* Do it this way to avoid reading from *r after the StructCopy().
	13466	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	13467	cache, it doesn't matter. */
	13468	int i;
	13469	const bool anchored = r->check_substr
	13470	? r->check_substr == r->substrs->data[0].substr
	13471	: r->check_utf8 == r->substrs->data[0].utf8_substr;
	13472	Newx(ret->substrs, 1, struct reg_substr_data);
	13473	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	13474
	13475	for (i = 0; i < 2; i++) {
	13476	ret->substrs->data[i].substr =
	13477	sv_dup_inc(ret->substrs->data[i].substr, param);
	13478	ret->substrs->data[i].utf8_substr =
	13479	sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
	13480	}
	13481
	13482	/* check_substr and check_utf8, if non-NULL, point to either their
	13483	anchored or float namesakes, and don't hold a second reference. */
	13484
	13485	if (ret->check_substr) {
	13486	if (anchored) {
	13487	assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
	13488
	13489	ret->check_substr = ret->substrs->data[0].substr;
	13490	ret->check_utf8 = ret->substrs->data[0].utf8_substr;
	13491	} else {
	13492	assert(r->check_substr == r->substrs->data[1].substr);
	13493	assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
	13494
	13495	ret->check_substr = ret->substrs->data[1].substr;
	13496	ret->check_utf8 = ret->substrs->data[1].utf8_substr;
	13497	}
	13498	} else if (ret->check_utf8) {
	13499	if (anchored) {
	13500	ret->check_utf8 = ret->substrs->data[0].utf8_substr;
	13501	} else {
	13502	ret->check_utf8 = ret->substrs->data[1].utf8_substr;
	13503	}
	13504	}
	13505	}
	13506
	13507	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	13508	ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
	13509	if (r->recurse_locinput)
	13510	Newx(ret->recurse_locinput, r->nparens + 1, char *);
	13511
	13512	if (ret->pprivate)
	13513	RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
	13514
	13515	if (RX_MATCH_COPIED(dstr))
	13516	RXp_SUBBEG(ret) = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret));
	13517	else
	13518	RXp_SUBBEG(ret) = NULL;
	13519	#ifdef PERL_ANY_COW
	13520	RXp_SAVED_COPY(ret) = NULL;
	13521	#endif
	13522
	13523	if (r->logical_to_parno) {
	13524	/* we use total_parens for all three just for symmetry */
	13525	ret->logical_to_parno = (I32)SAVEPVN((char)(r->logical_to_parno), (1+r->nparens) * sizeof(I32));
	13526	ret->parno_to_logical = (I32)SAVEPVN((char)(r->parno_to_logical), (1+r->nparens) * sizeof(I32));
	13527	ret->parno_to_logical_next = (I32)SAVEPVN((char)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32));
	13528	} else {
	13529	ret->logical_to_parno = NULL;
	13530	ret->parno_to_logical = NULL;
	13531	ret->parno_to_logical_next = NULL;
	13532	}
	13533
	13534	ret->logical_nparens = r->logical_nparens;
	13535
	13536	/* Whether mother_re be set or no, we need to copy the string. We
	13537	cannot refrain from copying it when the storage points directly to
	13538	our mother regexp, because that's
	13539	1: a buffer in a different thread
	13540	2: something we no longer hold a reference on
	13541	so we need to copy it locally. */
	13542	RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
	13543	/* set malloced length to a non-zero value so it will be freed
	13544	* (otherwise in combination with SVf_FAKE it looks like an alien
	13545	* buffer). It doesn't have to be the actual malloced size, since it
	13546	* should never be grown */
	13547	SvLEN_set(dstr, SvCUR(sstr)+1);
	13548	ret->mother_re = NULL;
	13549	}
	13550	#endif /* PERL_IN_XSUB_RE */
	13551
	13552	/*
	13553	regdupe_internal()
	13554
	13555	This is the internal complement to regdupe() which is used to copy
	13556	the structure pointed to by the *pprivate pointer in the regexp.
	13557	This is the core version of the extension overridable cloning hook.
	13558	The regexp structure being duplicated will be copied by perl prior
	13559	to this and will be provided as the regexp *r argument, however
	13560	with the /old/ structures pprivate pointer value. Thus this routine
	13561	may override any copying normally done by perl.
	13562
	13563	It returns a pointer to the new regexp_internal structure.
	13564	*/
	13565
	13566	void *
	13567	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	13568	{
	13569	struct regexp *const r = ReANY(rx);
	13570	regexp_internal *reti;
	13571	int len;
	13572	RXi_GET_DECL(r, ri);
	13573
	13574	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	13575
	13576	len = ProgLen(ri);
	13577
	13578	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
	13579	char, regexp_internal);
	13580	Copy(ri->program, reti->program, len+1, regnode);
	13581
	13582
	13583	if (ri->code_blocks) {
	13584	int n;
	13585	Newx(reti->code_blocks, 1, struct reg_code_blocks);
	13586	Newx(reti->code_blocks->cb, ri->code_blocks->count,
	13587	struct reg_code_block);
	13588	Copy(ri->code_blocks->cb, reti->code_blocks->cb,
	13589	ri->code_blocks->count, struct reg_code_block);
	13590	for (n = 0; n < ri->code_blocks->count; n++)
	13591	reti->code_blocks->cb[n].src_regex = (REGEXP*)
	13592	sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
	13593	reti->code_blocks->count = ri->code_blocks->count;
	13594	reti->code_blocks->refcnt = 1;
	13595	}
	13596	else
	13597	reti->code_blocks = NULL;
	13598
	13599	reti->regstclass = NULL;
	13600
	13601	if (ri->data) {
	13602	struct reg_data *d;
	13603	const int count = ri->data->count;
	13604	int i;
	13605
	13606	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	13607	char, struct reg_data);
	13608	Newx(d->what, count, U8);
	13609
	13610	d->count = count;
	13611	for (i = 0; i < count; i++) {
	13612	d->what[i] = ri->data->what[i];
	13613	switch (d->what[i]) {
	13614	/* see also regcomp.h and regfree_internal() */
	13615	case 'a': /* actually an AV, but the dup function is identical.
	13616	values seem to be "plain sv's" generally. */
	13617	case 'r': /* a compiled regex (but still just another SV) */
	13618	case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
	13619	this use case should go away, the code could have used
	13620	'a' instead - see S_set_ANYOF_arg() for array contents. */
	13621	case 'S': /* actually an SV, but the dup function is identical. */
	13622	case 'u': /* actually an HV, but the dup function is identical.
	13623	values are "plain sv's" */
	13624	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	13625	break;
	13626	case 'f':
	13627	/* Synthetic Start Class - "Fake" charclass we generate to optimize
	13628	* patterns which could start with several different things. Pre-TRIE
	13629	* this was more important than it is now, however this still helps
	13630	* in some places, for instance /x?a+/ might produce a SSC equivalent
	13631	* to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
	13632	* in regexec.c
	13633	*/
	13634	/* This is cheating. */
	13635	Newx(d->data[i], 1, regnode_ssc);
	13636	StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
	13637	reti->regstclass = (regnode*)d->data[i];
	13638	break;
	13639	case 'T':
	13640	/* AHO-CORASICK fail table */
	13641	/* Trie stclasses are readonly and can thus be shared
	13642	* without duplication. We free the stclass in pregfree
	13643	* when the corresponding reg_ac_data struct is freed.
	13644	*/
	13645	reti->regstclass= ri->regstclass;
	13646	/* FALLTHROUGH */
	13647	case 't':
	13648	/* TRIE transition table */
	13649	OP_REFCNT_LOCK;
	13650	((reg_trie_data*)ri->data->data[i])->refcount++;
	13651	OP_REFCNT_UNLOCK;
	13652	/* FALLTHROUGH */
	13653	case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
	13654	case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
	13655	is not from another regexp */
	13656	d->data[i] = ri->data->data[i];
	13657	break;
	13658	case '%':
	13659	/* this is a placeholder type, it exists purely so that
	13660	* reg_add_data always returns a non-zero value, this type of
	13661	* entry should ONLY be present in the 0 slot of the array */
	13662	assert(i == 0);
	13663	d->data[i]= ri->data->data[i];
	13664	break;
	13665	default:
	13666	Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
	13667	ri->data->what[i]);
	13668	}
	13669	}
	13670
	13671	reti->data = d;
	13672	}
	13673	else
	13674	reti->data = NULL;
	13675
	13676	if (ri->regstclass && !reti->regstclass) {
	13677	/* Assume that the regstclass is a regnode which is inside of the
	13678	* program which we have to copy over */
	13679	regnode *node= ri->regstclass;
	13680	assert(node >= ri->program && (node - ri->program) < len);
	13681	reti->regstclass = reti->program + (node - ri->program);
	13682	}
	13683
	13684
	13685	reti->name_list_idx = ri->name_list_idx;
	13686
	13687	SetProgLen(reti, len);
	13688
	13689	return (void*)reti;
	13690	}
	13691
	13692	#endif /* USE_ITHREADS */
	13693
	13694	STATIC void
	13695	S_re_croak(pTHX_ bool utf8, const char* pat,...)
	13696	{
	13697	va_list args;
	13698	STRLEN len = strlen(pat);
	13699	char buf[512];
	13700	SV *msv;
	13701	const char *message;
	13702
	13703	PERL_ARGS_ASSERT_RE_CROAK;
	13704
	13705	if (len > 510)
	13706	len = 510;
	13707	Copy(pat, buf, len , char);
	13708	buf[len] = '\n';
	13709	buf[len + 1] = '\0';
	13710	va_start(args, pat);
	13711	msv = vmess(buf, &args);
	13712	va_end(args);
	13713	message = SvPV_const(msv, len);
	13714	if (len > 512)
	13715	len = 512;
	13716	Copy(message, buf, len , char);
	13717	/* len-1 to avoid \n */
	13718	Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
	13719	}
	13720
	13721	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	13722
	13723	#ifndef PERL_IN_XSUB_RE
	13724	void
	13725	Perl_save_re_context(pTHX)
	13726	{
	13727	I32 nparens = -1;
	13728	I32 i;
	13729
	13730	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	13731
	13732	if (PL_curpm) {
	13733	const REGEXP * const rx = PM_GETRE(PL_curpm);
	13734	if (rx)
	13735	nparens = RX_NPARENS(rx);
	13736	}
	13737
	13738	/* RT #124109. This is a complete hack; in the SWASHNEW case we know
	13739	* that PL_curpm will be null, but that utf8.pm and the modules it
	13740	* loads will only use $1..$3.
	13741	* The t/porting/re_context.t test file checks this assumption.
	13742	*/
	13743	if (nparens == -1)
	13744	nparens = 3;
	13745
	13746	for (i = 1; i <= nparens; i++) {
	13747	char digits[TYPE_CHARS(long)];
	13748	const STRLEN len = my_snprintf(digits, sizeof(digits),
	13749	"%lu", (long)i);
	13750	GV const const gvp
	13751	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	13752
	13753	if (gvp) {
	13754	GV * const gv = *gvp;
	13755	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	13756	save_scalar(gv);
	13757	}
	13758	}
	13759	}
	13760	#endif
	13761
	13762	#ifndef PERL_IN_XSUB_RE
	13763
	13764	# include "uni_keywords.h"
	13765
	13766	void
	13767	Perl_init_uniprops(pTHX)
	13768	{
	13769
	13770	# ifdef DEBUGGING
	13771	char * dump_len_string;
	13772
	13773	dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
	13774	if ( ! dump_len_string
	13775	\|\| ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
	13776	{
	13777	PL_dump_re_max_len = 60; /* A reasonable default */
	13778	}
	13779	# endif
	13780
	13781	PL_user_def_props = newHV();
	13782
	13783	# ifdef USE_ITHREADS
	13784
	13785	HvSHAREKEYS_off(PL_user_def_props);
	13786	PL_user_def_props_aTHX = aTHX;
	13787
	13788	# endif
	13789
	13790	/* Set up the inversion list interpreter-level variables */
	13791
	13792	PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
	13793	PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
	13794	PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
	13795	PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
	13796	PL_XPosix_ptrs[CC_CASED_] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
	13797	PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
	13798	PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
	13799	PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
	13800	PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
	13801	PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
	13802	PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
	13803	PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
	13804	PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
	13805	PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
	13806	PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
	13807	PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
	13808
	13809	PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
	13810	PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
	13811	PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
	13812	PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
	13813	PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
	13814	PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
	13815	PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
	13816	PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
	13817	PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
	13818	PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
	13819	PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
	13820	PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
	13821	PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
	13822	PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
	13823	PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
	13824	PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
	13825
	13826	PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
	13827	PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
	13828	PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
	13829	PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
	13830	PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
	13831
	13832	PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
	13833	PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
	13834	PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
	13835	PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
	13836
	13837	PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
	13838
	13839	PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
	13840	PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
	13841
	13842	PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
	13843	PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
	13844
	13845	PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
	13846	PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
	13847	UNI__PERL_FOLDS_TO_MULTI_CHAR]);
	13848	PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
	13849	UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
	13850	PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
	13851	PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
	13852	PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
	13853	PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
	13854	PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
	13855	PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
	13856	PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
	13857	PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
	13858	PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
	13859
	13860	# ifdef UNI_XIDC
	13861	/* The below are used only by deprecated functions. They could be removed */
	13862	PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
	13863	PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
	13864	PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
	13865	# endif
	13866	}
	13867
	13868	/* These four functions are compiled only in regcomp.c, where they have access
	13869	* to the data they return. They are a way for re_comp.c to get access to that
	13870	* data without having to compile the whole data structures. */
	13871
	13872	I16
	13873	Perl_do_uniprop_match(const char * const key, const U16 key_len)
	13874	{
	13875	PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
	13876
	13877	return match_uniprop((U8 *) key, key_len);
	13878	}
	13879
	13880	SV *
	13881	Perl_get_prop_definition(pTHX_ const int table_index)
	13882	{
	13883	PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
	13884
	13885	/* Create and return the inversion list */
	13886	return _new_invlist_C_array(uni_prop_ptrs[table_index]);
	13887	}
	13888
	13889	const char * const *
	13890	Perl_get_prop_values(const int table_index)
	13891	{
	13892	PERL_ARGS_ASSERT_GET_PROP_VALUES;
	13893
	13894	return UNI_prop_value_ptrs[table_index];
	13895	}
	13896
	13897	const char *
	13898	Perl_get_deprecated_property_msg(const Size_t warning_offset)
	13899	{
	13900	PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
	13901
	13902	return deprecated_property_msgs[warning_offset];
	13903	}
	13904
	13905	# if 0
	13906
	13907	This code was mainly added for backcompat to give a warning for non-portable
	13908	code points in user-defined properties. But experiments showed that the
	13909	warning in earlier perls were only omitted on overflow, which should be an
	13910	error, so there really isnt a backcompat issue, and actually adding the
	13911	warning when none was present before might cause breakage, for little gain. So
	13912	khw left this code in, but not enabled. Tests were never added.
	13913
	13914	embed.fnc entry:
	13915	Ei \|const char *\|get_extended_utf8_msg\|const UV cp
	13916
	13917	PERL_STATIC_INLINE const char *
	13918	S_get_extended_utf8_msg(pTHX_ const UV cp)
	13919	{
	13920	U8 dummy[UTF8_MAXBYTES + 1];
	13921	HV *msgs;
	13922	SV **msg;
	13923
	13924	uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
	13925	&msgs);
	13926
	13927	msg = hv_fetchs(msgs, "text", 0);
	13928	assert(msg);
	13929
	13930	(void) sv_2mortal((SV *) msgs);
	13931
	13932	return SvPVX(*msg);
	13933	}
	13934
	13935	# endif
	13936	#endif /* end of ! PERL_IN_XSUB_RE */
	13937
	13938	STATIC REGEXP *
	13939	S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
	13940	const bool ignore_case)
	13941	{
	13942	/* Pretends that the input subpattern is qr/subpattern/aam, compiling it
	13943	* possibly with /i if the 'ignore_case' parameter is true. Use /aa
	13944	* because nothing outside of ASCII will match. Use /m because the input
	13945	* string may be a bunch of lines strung together.
	13946	*
	13947	* Also sets up the debugging info */
	13948
	13949	U32 flags = PMf_MULTILINE\|PMf_WILDCARD;
	13950	U32 rx_flags;
	13951	SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
	13952	REGEXP * subpattern_re;
	13953	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	13954
	13955	PERL_ARGS_ASSERT_COMPILE_WILDCARD;
	13956
	13957	if (ignore_case) {
	13958	flags \|= PMf_FOLD;
	13959	}
	13960	set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
	13961
	13962	/* Like in op.c, we copy the compile time pm flags to the rx ones */
	13963	rx_flags = flags & RXf_PMf_COMPILETIME;
	13964
	13965	#ifndef PERL_IN_XSUB_RE
	13966	/* Use the core engine if this file is regcomp.c. That means no
	13967	* 'use re "Debug ..." is in effect, so the core engine is sufficient */
	13968	subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
	13969	&PL_core_reg_engine,
	13970	NULL, NULL,
	13971	rx_flags, flags);
	13972	#else
	13973	if (isDEBUG_WILDCARD) {
	13974	/* Use the special debugging engine if this file is re_comp.c and wants
	13975	* to output the wildcard matching. This uses whatever
	13976	* 'use re "Debug ..." is in effect */
	13977	subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
	13978	&my_reg_engine,
	13979	NULL, NULL,
	13980	rx_flags, flags);
	13981	}
	13982	else {
	13983	/* Use the special wildcard engine if this file is re_comp.c and
	13984	* doesn't want to output the wildcard matching. This uses whatever
	13985	* 'use re "Debug ..." is in effect for compilation, but this engine
	13986	* structure has been set up so that it uses the core engine for
	13987	* execution, so no execution debugging as a result of re.pm will be
	13988	* displayed. */
	13989	subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
	13990	&wild_reg_engine,
	13991	NULL, NULL,
	13992	rx_flags, flags);
	13993	/* XXX The above has the effect that any user-supplied regex engine
	13994	* won't be called for matching wildcards. That might be good, or bad.
	13995	* It could be changed in several ways. The reason it is done the
	13996	* current way is to avoid having to save and restore
	13997	* ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
	13998	* could be used. Another suggestion is to keep the authoritative
	13999	* value of the debug flags in a thread-local variable and add set/get
	14000	* magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
	14001	* Still another is to pass a flag, say in the engine's intflags that
	14002	* would be checked each time before doing the debug output */
	14003	}
	14004	#endif
	14005
	14006	assert(subpattern_re); /* Should have died if didn't compile successfully */
	14007	return subpattern_re;
	14008	}
	14009
	14010	STATIC I32
	14011	S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
	14012	char strbeg, SSize_t minend, SV screamer, U32 nosave)
	14013	{
	14014	I32 result;
	14015	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	14016
	14017	PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
	14018
	14019	ENTER;
	14020
	14021	/* The compilation has set things up so that if the program doesn't want to
	14022	* see the wildcard matching procedure, it will get the core execution
	14023	* engine, which is subject only to -Dr. So we have to turn that off
	14024	* around this procedure */
	14025	if (! isDEBUG_WILDCARD) {
	14026	/* Note! Casts away 'volatile' */
	14027	SAVEI32(PL_debug);
	14028	PL_debug &= ~ DEBUG_r_FLAG;
	14029	}
	14030
	14031	result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
	14032	NULL, nosave);
	14033	LEAVE;
	14034
	14035	return result;
	14036	}
	14037
	14038	SV *
	14039	S_handle_user_defined_property(pTHX_
	14040
	14041	/* Parses the contents of a user-defined property definition; returning the
	14042	* expanded definition if possible. If so, the return is an inversion
	14043	* list.
	14044	*
	14045	* If there are subroutines that are part of the expansion and which aren't
	14046	* known at the time of the call to this function, this returns what
	14047	* parse_uniprop_string() returned for the first one encountered.
	14048	*
	14049	* If an error was found, NULL is returned, and 'msg' gets a suitable
	14050	* message appended to it. (Appending allows the back trace of how we got
	14051	* to the faulty definition to be displayed through nested calls of
	14052	* user-defined subs.)
	14053	*
	14054	* The caller IS responsible for freeing any returned SV.
	14055	*
	14056	* The syntax of the contents is pretty much described in perlunicode.pod,
	14057	* but we also allow comments on each line */
	14058
	14059	const char * name, /* Name of property */
	14060	const STRLEN name_len, /* The name's length in bytes */
	14061	const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
	14062	const bool to_fold, /* ? Is this under /i */
	14063	const bool runtime, /* ? Are we in compile- or run-time */
	14064	const bool deferrable, /* Is it ok for this property's full definition
	14065	to be deferred until later? */
	14066	SV* contents, /* The property's definition */
	14067	bool user_defined_ptr, / This will be set TRUE as we wouldn't be
	14068	getting called unless this is thought to be
	14069	a user-defined property */
	14070	SV * msg, /* Any error or warning msg(s) are appended to
	14071	this */
	14072	const STRLEN level) /* Recursion level of this call */
	14073	{
	14074	STRLEN len;
	14075	const char * string = SvPV_const(contents, len);
	14076	const char * const e = string + len;
	14077	const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
	14078	const STRLEN msgs_length_on_entry = SvCUR(msg);
	14079
	14080	const char * s0 = string; /* Points to first byte in the current line
	14081	being parsed in 'string' */
	14082	const char overflow_msg[] = "Code point too large in \"";
	14083	SV* running_definition = NULL;
	14084
	14085	PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
	14086
	14087	*user_defined_ptr = TRUE;
	14088
	14089	/* Look at each line */
	14090	while (s0 < e) {
	14091	const char * s; /* Current byte */
	14092	char op = '+'; /* Default operation is 'union' */
	14093	IV min = 0; /* range begin code point */
	14094	IV max = -1; /* and range end */
	14095	SV* this_definition;
	14096
	14097	/* Skip comment lines */
	14098	if (*s0 == '#') {
	14099	s0 = strchr(s0, '\n');
	14100	if (s0 == NULL) {
	14101	break;
	14102	}
	14103	s0++;
	14104	continue;
	14105	}
	14106
	14107	/* For backcompat, allow an empty first line */
	14108	if (*s0 == '\n') {
	14109	s0++;
	14110	continue;
	14111	}
	14112
	14113	/* First character in the line may optionally be the operation */
	14114	if ( *s0 == '+'
	14115	\|\| *s0 == '!'
	14116	\|\| *s0 == '-'
	14117	\|\| *s0 == '&')
	14118	{
	14119	op = *s0++;
	14120	}
	14121
	14122	/* If the line is one or two hex digits separated by blank space, its
	14123	* a range; otherwise it is either another user-defined property or an
	14124	* error */
	14125
	14126	s = s0;
	14127
	14128	if (! isXDIGIT(*s)) {
	14129	goto check_if_property;
	14130	}
	14131
	14132	do { /* Each new hex digit will add 4 bits. */
	14133	if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
	14134	s = strchr(s, '\n');
	14135	if (s == NULL) {
	14136	s = e;
	14137	}
	14138	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	14139	sv_catpv(msg, overflow_msg);
	14140	Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
	14141	UTF8fARG(is_contents_utf8, s - s0, s0));
	14142	sv_catpvs(msg, "\"");
	14143	goto return_failure;
	14144	}
	14145
	14146	/* Accumulate this digit into the value */
	14147	min = (min << 4) + READ_XDIGIT(s);
	14148	} while (isXDIGIT(*s));
	14149
	14150	while (isBLANK(*s)) { s++; }
	14151
	14152	/* We allow comments at the end of the line */
	14153	if (*s == '#') {
	14154	s = strchr(s, '\n');
	14155	if (s == NULL) {
	14156	s = e;
	14157	}
	14158	s++;
	14159	}
	14160	else if (s < e && *s != '\n') {
	14161	if (! isXDIGIT(*s)) {
	14162	goto check_if_property;
	14163	}
	14164
	14165	/* Look for the high point of the range */
	14166	max = 0;
	14167	do {
	14168	if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
	14169	s = strchr(s, '\n');
	14170	if (s == NULL) {
	14171	s = e;
	14172	}
	14173	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	14174	sv_catpv(msg, overflow_msg);
	14175	Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
	14176	UTF8fARG(is_contents_utf8, s - s0, s0));
	14177	sv_catpvs(msg, "\"");
	14178	goto return_failure;
	14179	}
	14180
	14181	max = (max << 4) + READ_XDIGIT(s);
	14182	} while (isXDIGIT(*s));
	14183
	14184	while (isBLANK(*s)) { s++; }
	14185
	14186	if (*s == '#') {
	14187	s = strchr(s, '\n');
	14188	if (s == NULL) {
	14189	s = e;
	14190	}
	14191	}
	14192	else if (s < e && *s != '\n') {
	14193	goto check_if_property;
	14194	}
	14195	}
	14196
	14197	if (max == -1) { /* The line only had one entry */
	14198	max = min;
	14199	}
	14200	else if (max < min) {
	14201	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	14202	sv_catpvs(msg, "Illegal range in \"");
	14203	Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
	14204	UTF8fARG(is_contents_utf8, s - s0, s0));
	14205	sv_catpvs(msg, "\"");
	14206	goto return_failure;
	14207	}
	14208
	14209	# if 0 /* See explanation at definition above of get_extended_utf8_msg() */
	14210
	14211	if ( UNICODE_IS_PERL_EXTENDED(min)
	14212	\|\| UNICODE_IS_PERL_EXTENDED(max))
	14213	{
	14214	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	14215
	14216	/* If both code points are non-portable, warn only on the lower
	14217	* one. */
	14218	sv_catpv(msg, get_extended_utf8_msg(
	14219	(UNICODE_IS_PERL_EXTENDED(min))
	14220	? min : max));
	14221	sv_catpvs(msg, " in \"");
	14222	Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
	14223	UTF8fARG(is_contents_utf8, s - s0, s0));
	14224	sv_catpvs(msg, "\"");
	14225	}
	14226
	14227	# endif
	14228
	14229	/* Here, this line contains a legal range */
	14230	this_definition = sv_2mortal(_new_invlist(2));
	14231	this_definition = _add_range_to_invlist(this_definition, min, max);
	14232	goto calculate;
	14233
	14234	check_if_property:
	14235
	14236	/* Here it isn't a legal range line. See if it is a legal property
	14237	* line. First find the end of the meat of the line */
	14238	s = strpbrk(s, "#\n");
	14239	if (s == NULL) {
	14240	s = e;
	14241	}
	14242
	14243	/* Ignore trailing blanks in keeping with the requirements of
	14244	* parse_uniprop_string() */
	14245	s--;
	14246	while (s > s0 && isBLANK_A(*s)) {
	14247	s--;
	14248	}
	14249	s++;
	14250
	14251	this_definition = parse_uniprop_string(s0, s - s0,
	14252	is_utf8, to_fold, runtime,
	14253	deferrable,
	14254	NULL,
	14255	user_defined_ptr, msg,
	14256	(name_len == 0)
	14257	? level /* Don't increase level
	14258	if input is empty */
	14259	: level + 1
	14260	);
	14261	if (this_definition == NULL) {
	14262	goto return_failure; /* 'msg' should have had the reason
	14263	appended to it by the above call */
	14264	}
	14265
	14266	if (! is_invlist(this_definition)) { /* Unknown at this time */
	14267	return newSVsv(this_definition);
	14268	}
	14269
	14270	if (*s != '\n') {
	14271	s = strchr(s, '\n');
	14272	if (s == NULL) {
	14273	s = e;
	14274	}
	14275	}
	14276
	14277	calculate:
	14278
	14279	switch (op) {
	14280	case '+':
	14281	_invlist_union(running_definition, this_definition,
	14282	&running_definition);
	14283	break;
	14284	case '-':
	14285	_invlist_subtract(running_definition, this_definition,
	14286	&running_definition);
	14287	break;
	14288	case '&':
	14289	_invlist_intersection(running_definition, this_definition,
	14290	&running_definition);
	14291	break;
	14292	case '!':
	14293	_invlist_union_complement_2nd(running_definition,
	14294	this_definition, &running_definition);
	14295	break;
	14296	default:
	14297	Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
	14298	__FILE__, __LINE__, op);
	14299	break;
	14300	}
	14301
	14302	/* Position past the '\n' */
	14303	s0 = s + 1;
	14304	} /* End of loop through the lines of 'contents' */
	14305
	14306	/* Here, we processed all the lines in 'contents' without error. If we
	14307	* didn't add any warnings, simply return success */
	14308	if (msgs_length_on_entry == SvCUR(msg)) {
	14309
	14310	/* If the expansion was empty, the answer isn't nothing: its an empty
	14311	* inversion list */
	14312	if (running_definition == NULL) {
	14313	running_definition = _new_invlist(1);
	14314	}
	14315
	14316	return running_definition;
	14317	}
	14318
	14319	/* Otherwise, add some explanatory text, but we will return success */
	14320	goto return_msg;
	14321
	14322	return_failure:
	14323	running_definition = NULL;
	14324
	14325	return_msg:
	14326
	14327	if (name_len > 0) {
	14328	sv_catpvs(msg, " in expansion of ");
	14329	Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
	14330	}
	14331
	14332	return running_definition;
	14333	}
	14334
	14335	/* As explained below, certain operations need to take place in the first
	14336	* thread created. These macros switch contexts */
	14337	# ifdef USE_ITHREADS
	14338	# define DECLARATION_FOR_GLOBAL_CONTEXT \
	14339	PerlInterpreter * save_aTHX = aTHX;
	14340	# define SWITCH_TO_GLOBAL_CONTEXT \
	14341	PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
	14342	# define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
	14343	# define CUR_CONTEXT aTHX
	14344	# define ORIGINAL_CONTEXT save_aTHX
	14345	# else
	14346	# define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
	14347	# define SWITCH_TO_GLOBAL_CONTEXT NOOP
	14348	# define RESTORE_CONTEXT NOOP
	14349	# define CUR_CONTEXT NULL
	14350	# define ORIGINAL_CONTEXT NULL
	14351	# endif
	14352
	14353	STATIC void
	14354	S_delete_recursion_entry(pTHX_ void *key)
	14355	{
	14356	/* Deletes the entry used to detect recursion when expanding user-defined
	14357	* properties. This is a function so it can be set up to be called even if
	14358	* the program unexpectedly quits */
	14359
	14360	SV ** current_entry;
	14361	const STRLEN key_len = strlen((const char *) key);
	14362	DECLARATION_FOR_GLOBAL_CONTEXT;
	14363
	14364	SWITCH_TO_GLOBAL_CONTEXT;
	14365
	14366	/* If the entry is one of these types, it is a permanent entry, and not the
	14367	* one used to detect recursions. This function should delete only the
	14368	* recursion entry */
	14369	current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
	14370	if ( current_entry
	14371	&& ! is_invlist(*current_entry)
	14372	&& ! SvPOK(*current_entry))
	14373	{
	14374	(void) hv_delete(PL_user_def_props, (const char *) key, key_len,
	14375	G_DISCARD);
	14376	}
	14377
	14378	RESTORE_CONTEXT;
	14379	}
	14380
	14381	STATIC SV *
	14382	S_get_fq_name(pTHX_
	14383	const char * const name, /* The first non-blank in the \p{}, \P{} */
	14384	const Size_t name_len, /* Its length in bytes, not including any trailing space */
	14385	const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
	14386	const bool has_colon_colon
	14387	)
	14388	{
	14389	/* Returns a mortal SV containing the fully qualified version of the input
	14390	* name */
	14391
	14392	SV * fq_name;
	14393
	14394	fq_name = newSVpvs_flags("", SVs_TEMP);
	14395
	14396	/* Use the current package if it wasn't included in our input */
	14397	if (! has_colon_colon) {
	14398	const HV * pkg = (IN_PERL_COMPILETIME)
	14399	? PL_curstash
	14400	: CopSTASH(PL_curcop);
	14401	const char* pkgname = HvNAME(pkg);
	14402
	14403	Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
	14404	UTF8fARG(is_utf8, strlen(pkgname), pkgname));
	14405	sv_catpvs(fq_name, "::");
	14406	}
	14407
	14408	Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
	14409	UTF8fARG(is_utf8, name_len, name));
	14410	return fq_name;
	14411	}
	14412
	14413	STATIC SV *
	14414	S_parse_uniprop_string(pTHX_
	14415
	14416	/* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
	14417	* now. If so, the return is an inversion list.
	14418	*
	14419	* If the property is user-defined, it is a subroutine, which in turn
	14420	* may call other subroutines. This function will call the whole nest of
	14421	* them to get the definition they return; if some aren't known at the time
	14422	* of the call to this function, the fully qualified name of the highest
	14423	* level sub is returned. It is an error to call this function at runtime
	14424	* without every sub defined.
	14425	*
	14426	* If an error was found, NULL is returned, and 'msg' gets a suitable
	14427	* message appended to it. (Appending allows the back trace of how we got
	14428	* to the faulty definition to be displayed through nested calls of
	14429	* user-defined subs.)
	14430	*
	14431	* The caller should NOT try to free any returned inversion list.
	14432	*
	14433	* Other parameters will be set on return as described below */
	14434
	14435	const char * const name, /* The first non-blank in the \p{}, \P{} */
	14436	Size_t name_len, /* Its length in bytes, not including any
	14437	trailing space */
	14438	const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
	14439	const bool to_fold, /* ? Is this under /i */
	14440	const bool runtime, /* TRUE if this is being called at run time */
	14441	const bool deferrable, /* TRUE if it's ok for the definition to not be
	14442	known at this call */
	14443	AV ** strings, /* To return string property values, like named
	14444	sequences */
	14445	bool user_defined_ptr, / Upon return from this function it will be
	14446	set to TRUE if any component is a
	14447	user-defined property */
	14448	SV * msg, /* Any error or warning msg(s) are appended to
	14449	this */
	14450	const STRLEN level) /* Recursion level of this call */
	14451	{
	14452	char* lookup_name; /* normalized name for lookup in our tables */
	14453	unsigned lookup_len; /* Its length */
	14454	enum { Not_Strict = 0, /* Some properties have stricter name */
	14455	Strict, /* normalization rules, which we decide */
	14456	As_Is /* upon based on parsing */
	14457	} stricter = Not_Strict;
	14458
	14459	/* nv= or numeric_value=, or possibly one of the cjk numeric properties
	14460	* (though it requires extra effort to download them from Unicode and
	14461	* compile perl to know about them) */
	14462	bool is_nv_type = FALSE;
	14463
	14464	unsigned int i = 0, i_zero = 0, j = 0;
	14465	int equals_pos = -1; /* Where the '=' is found, or negative if none */
	14466	int slash_pos = -1; /* Where the '/' is found, or negative if none */
	14467	int table_index = 0; /* The entry number for this property in the table
	14468	of all Unicode property names */
	14469	bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
	14470	Size_t lookup_offset = 0; /* Used to ignore the first few characters of
	14471	the normalized name in certain situations */
	14472	Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
	14473	part of a package name */
	14474	Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
	14475	bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
	14476	property rather than a Unicode
	14477	one. */
	14478	SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
	14479	if an error. If it is an inversion list,
	14480	it is the definition. Otherwise it is a
	14481	string containing the fully qualified sub
	14482	name of 'name' */
	14483	SV * fq_name = NULL; /* For user-defined properties, the fully
	14484	qualified name */
	14485	bool invert_return = FALSE; /* ? Do we need to complement the result before
	14486	returning it */
	14487	bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
	14488	explicit utf8:: package that we strip
	14489	off */
	14490	/* The expansion of properties that could be either user-defined or
	14491	* official unicode ones is deferred until runtime, including a marker for
	14492	* those that might be in the latter category. This boolean indicates if
	14493	* we've seen that marker. If not, what we're parsing can't be such an
	14494	* official Unicode property whose expansion was deferred */
	14495	bool could_be_deferred_official = FALSE;
	14496
	14497	PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
	14498
	14499	/* The input will be normalized into 'lookup_name' */
	14500	Newx(lookup_name, name_len, char);
	14501	SAVEFREEPV(lookup_name);
	14502
	14503	/* Parse the input. */
	14504	for (i = 0; i < name_len; i++) {
	14505	char cur = name[i];
	14506
	14507	/* Most of the characters in the input will be of this ilk, being parts
	14508	* of a name */
	14509	if (isIDCONT_A(cur)) {
	14510
	14511	/* Case differences are ignored. Our lookup routine assumes
	14512	* everything is lowercase, so normalize to that */
	14513	if (isUPPER_A(cur)) {
	14514	lookup_name[j++] = toLOWER_A(cur);
	14515	continue;
	14516	}
	14517
	14518	if (cur == '_') { /* Don't include these in the normalized name */
	14519	continue;
	14520	}
	14521
	14522	lookup_name[j++] = cur;
	14523
	14524	/* The first character in a user-defined name must be of this type.
	14525	* */
	14526	if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
	14527	could_be_user_defined = FALSE;
	14528	}
	14529
	14530	continue;
	14531	}
	14532
	14533	/* Here, the character is not something typically in a name, But these
	14534	* two types of characters (and the '_' above) can be freely ignored in
	14535	* most situations. Later it may turn out we shouldn't have ignored
	14536	* them, and we have to reparse, but we don't have enough information
	14537	* yet to make that decision */
	14538	if (cur == '-' \|\| isSPACE_A(cur)) {
	14539	could_be_user_defined = FALSE;
	14540	continue;
	14541	}
	14542
	14543	/* An equals sign or single colon mark the end of the first part of
	14544	* the property name */
	14545	if ( cur == '='
	14546	\|\| (cur == ':' && (i >= name_len - 1 \|\| name[i+1] != ':')))
	14547	{
	14548	lookup_name[j++] = '='; /* Treat the colon as an '=' */
	14549	equals_pos = j; /* Note where it occurred in the input */
	14550	could_be_user_defined = FALSE;
	14551	break;
	14552	}
	14553
	14554	/* If this looks like it is a marker we inserted at compile time,
	14555	* set a flag and otherwise ignore it. If it isn't in the final
	14556	* position, keep it as it would have been user input. */
	14557	if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
	14558	&& ! deferrable
	14559	&& could_be_user_defined
	14560	&& i == name_len - 1)
	14561	{
	14562	name_len--;
	14563	could_be_deferred_official = TRUE;
	14564	continue;
	14565	}
	14566
	14567	/* Otherwise, this character is part of the name. */
	14568	lookup_name[j++] = cur;
	14569
	14570	/* Here it isn't a single colon, so if it is a colon, it must be a
	14571	* double colon */
	14572	if (cur == ':') {
	14573
	14574	/* A double colon should be a package qualifier. We note its
	14575	* position and continue. Note that one could have
	14576	* pkg1::pkg2::...::foo
	14577	* so that the position at the end of the loop will be just after
	14578	* the final qualifier */
	14579
	14580	i++;
	14581	non_pkg_begin = i + 1;
	14582	lookup_name[j++] = ':';
	14583	lun_non_pkg_begin = j;
	14584	}
	14585	else { /* Only word chars (and '::') can be in a user-defined name */
	14586	could_be_user_defined = FALSE;
	14587	}
	14588	} /* End of parsing through the lhs of the property name (or all of it if
	14589	no rhs) */
	14590
	14591	/* If there is a single package name 'utf8::', it is ambiguous. It could
	14592	* be for a user-defined property, or it could be a Unicode property, as
	14593	* all of them are considered to be for that package. For the purposes of
	14594	* parsing the rest of the property, strip it off */
	14595	if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
	14596	lookup_name += STRLENs("utf8::");
	14597	j -= STRLENs("utf8::");
	14598	equals_pos -= STRLENs("utf8::");
	14599	i_zero = STRLENs("utf8::"); /* When resetting 'i' to reparse
	14600	from the beginning, it has to be
	14601	set past what we're stripping
	14602	off */
	14603	stripped_utf8_pkg = TRUE;
	14604	}
	14605
	14606	/* Here, we are either done with the whole property name, if it was simple;
	14607	* or are positioned just after the '=' if it is compound. */
	14608
	14609	if (equals_pos >= 0) {
	14610	assert(stricter == Not_Strict); /* We shouldn't have set this yet */
	14611
	14612	/* Space immediately after the '=' is ignored */
	14613	i++;
	14614	for (; i < name_len; i++) {
	14615	if (! isSPACE_A(name[i])) {
	14616	break;
	14617	}
	14618	}
	14619
	14620	/* Most punctuation after the equals indicates a subpattern, like
	14621	* \p{foo=/bar/} */
	14622	if ( isPUNCT_A(name[i])
	14623	&& name[i] != '-'
	14624	&& name[i] != '+'
	14625	&& name[i] != '_'
	14626	&& name[i] != '{'
	14627	/* A backslash means the real delimiter is the next character,
	14628	* but it must be punctuation */
	14629	&& (name[i] != '\\' \|\| (i < name_len && isPUNCT_A(name[i+1]))))
	14630	{
	14631	bool special_property = memEQs(lookup_name, j - 1, "name")
	14632	\|\| memEQs(lookup_name, j - 1, "na");
	14633	if (! special_property) {
	14634	/* Find the property. The table includes the equals sign, so
	14635	* we use 'j' as-is */
	14636	table_index = do_uniprop_match(lookup_name, j);
	14637	}
	14638	if (special_property \|\| table_index) {
	14639	REGEXP * subpattern_re;
	14640	char open = name[i++];
	14641	char close;
	14642	const char * pos_in_brackets;
	14643	const char * const * prop_values;
	14644	bool escaped = 0;
	14645
	14646	/* Backslash => delimiter is the character following. We
	14647	* already checked that it is punctuation */
	14648	if (open == '\\') {
	14649	open = name[i++];
	14650	escaped = 1;
	14651	}
	14652
	14653	/* This data structure is constructed so that the matching
	14654	* closing bracket is 3 past its matching opening. The second
	14655	* set of closing is so that if the opening is something like
	14656	* ']', the closing will be that as well. Something similar is
	14657	* done in toke.c */
	14658	pos_in_brackets = memCHRs("([<)]>)]>", open);
	14659	close = (pos_in_brackets) ? pos_in_brackets[3] : open;
	14660
	14661	if ( i >= name_len
	14662	\|\| name[name_len-1] != close
	14663	\|\| (escaped && name[name_len-2] != '\\')
	14664	/* Also make sure that there are enough characters.
	14665	* e.g., '\\\' would show up incorrectly as legal even
	14666	* though it is too short */
	14667	\|\| (SSize_t) (name_len - i - 1 - escaped) < 0)
	14668	{
	14669	sv_catpvs(msg, "Unicode property wildcard not terminated");
	14670	goto append_name_to_msg;
	14671	}
	14672
	14673	Perl_ck_warner_d(aTHX_
	14674	packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
	14675	"The Unicode property wildcards feature is experimental");
	14676
	14677	if (special_property) {
	14678	const char * error_msg;
	14679	const char * revised_name = name + i;
	14680	Size_t revised_name_len = name_len - (i + 1 + escaped);
	14681
	14682	/* Currently, the only 'special_property' is name, which we
	14683	* lookup in _charnames.pm */
	14684
	14685	if (! load_charnames(newSVpvs("placeholder"),
	14686	revised_name, revised_name_len,
	14687	&error_msg))
	14688	{
	14689	sv_catpv(msg, error_msg);
	14690	goto append_name_to_msg;
	14691	}
	14692
	14693	/* Farm this out to a function just to make the current
	14694	* function less unwieldy */
	14695	if (handle_names_wildcard(revised_name, revised_name_len,
	14696	&prop_definition,
	14697	strings))
	14698	{
	14699	return prop_definition;
	14700	}
	14701
	14702	goto failed;
	14703	}
	14704
	14705	prop_values = get_prop_values(table_index);
	14706
	14707	/* Now create and compile the wildcard subpattern. Use /i
	14708	* because the property values are supposed to match with case
	14709	* ignored. */
	14710	subpattern_re = compile_wildcard(name + i,
	14711	name_len - i - 1 - escaped,
	14712	TRUE /* /i */
	14713	);
	14714
	14715	/* For each legal property value, see if the supplied pattern
	14716	* matches it. */
	14717	while (*prop_values) {
	14718	const char * const entry = *prop_values;
	14719	const Size_t len = strlen(entry);
	14720	SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
	14721
	14722	if (execute_wildcard(subpattern_re,
	14723	(char *) entry,
	14724	(char *) entry + len,
	14725	(char *) entry, 0,
	14726	entry_sv,
	14727	0))
	14728	{ /* Here, matched. Add to the returned list */
	14729	Size_t total_len = j + len;
	14730	SV * sub_invlist = NULL;
	14731	char * this_string;
	14732
	14733	/* We know this is a legal \p{property=value}. Call
	14734	* the function to return the list of code points that
	14735	* match it */
	14736	Newxz(this_string, total_len + 1, char);
	14737	Copy(lookup_name, this_string, j, char);
	14738	my_strlcat(this_string, entry, total_len + 1);
	14739	SAVEFREEPV(this_string);
	14740	sub_invlist = parse_uniprop_string(this_string,
	14741	total_len,
	14742	is_utf8,
	14743	to_fold,
	14744	runtime,
	14745	deferrable,
	14746	NULL,
	14747	user_defined_ptr,
	14748	msg,
	14749	level + 1);
	14750	_invlist_union(prop_definition, sub_invlist,
	14751	&prop_definition);
	14752	}
	14753
	14754	prop_values++; /* Next iteration, look at next propvalue */
	14755	} /* End of looking through property values; (the data
	14756	structure is terminated by a NULL ptr) */
	14757
	14758	SvREFCNT_dec_NN(subpattern_re);
	14759
	14760	if (prop_definition) {
	14761	return prop_definition;
	14762	}
	14763
	14764	sv_catpvs(msg, "No Unicode property value wildcard matches:");
	14765	goto append_name_to_msg;
	14766	}
	14767
	14768	/* Here's how khw thinks we should proceed to handle the properties
	14769	* not yet done: Bidi Mirroring Glyph can map to ""
	14770	Bidi Paired Bracket can map to ""
	14771	Case Folding (both full and simple)
	14772	Shouldn't /i be good enough for Full
	14773	Decomposition Mapping
	14774	Equivalent Unified Ideograph can map to ""
	14775	Lowercase Mapping (both full and simple)
	14776	NFKC Case Fold can map to ""
	14777	Titlecase Mapping (both full and simple)
	14778	Uppercase Mapping (both full and simple)
	14779	* Handle these the same way Name is done, using say, _wild.pm, but
	14780	* having both loose and full, like in charclass_invlists.h.
	14781	* Perhaps move block and script to that as they are somewhat large
	14782	* in charclass_invlists.h.
	14783	* For properties where the default is the code point itself, such
	14784	* as any of the case changing mappings, the string would otherwise
	14785	* consist of all Unicode code points in UTF-8 strung together.
	14786	* This would be impractical. So instead, examine their compiled
	14787	* pattern, looking at the ssc. If none, reject the pattern as an
	14788	* error. Otherwise run the pattern against every code point in
	14789	* the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
	14790	* And it might be good to create an API to return the ssc.
	14791	* Or handle them like the algorithmic names are done
	14792	*/
	14793	} /* End of is a wildcard subppattern */
	14794
	14795	/* \p{name=...} is handled specially. Instead of using the normal
	14796	* mechanism involving charclass_invlists.h, it uses _charnames.pm
	14797	* which has the necessary (huge) data accessible to it, and which
	14798	* doesn't get loaded unless necessary. The legal syntax for names is
	14799	* somewhat different than other properties due both to the vagaries of
	14800	* a few outlier official names, and the fact that only a few ASCII
	14801	* characters are permitted in them */
	14802	if ( memEQs(lookup_name, j - 1, "name")
	14803	\|\| memEQs(lookup_name, j - 1, "na"))
	14804	{
	14805	dSP;
	14806	HV * table;
	14807	SV * character;
	14808	const char * error_msg;
	14809	CV* lookup_loose;
	14810	SV * character_name;
	14811	STRLEN character_len;
	14812	UV cp;
	14813
	14814	stricter = As_Is;
	14815
	14816	/* Since the RHS (after skipping initial space) is passed unchanged
	14817	* to charnames, and there are different criteria for what are
	14818	* legal characters in the name, just parse it here. A character
	14819	* name must begin with an ASCII alphabetic */
	14820	if (! isALPHA(name[i])) {
	14821	goto failed;
	14822	}
	14823	lookup_name[j++] = name[i];
	14824
	14825	for (++i; i < name_len; i++) {
	14826	/* Official names can only be in the ASCII range, and only
	14827	* certain characters */
	14828	if (! isASCII(name[i]) \|\| ! isCHARNAME_CONT(name[i])) {
	14829	goto failed;
	14830	}
	14831	lookup_name[j++] = name[i];
	14832	}
	14833
	14834	/* Finished parsing, save the name into an SV */
	14835	character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
	14836
	14837	/* Make sure _charnames is loaded. (The parameters give context
	14838	* for any errors generated */
	14839	table = load_charnames(character_name, name, name_len, &error_msg);
	14840	if (table == NULL) {
	14841	sv_catpv(msg, error_msg);
	14842	goto append_name_to_msg;
	14843	}
	14844
	14845	lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
	14846	if (! lookup_loose) {
	14847	Perl_croak(aTHX_
	14848	"panic: Can't find '_charnames::_loose_regcomp_lookup");
	14849	}
	14850
	14851	PUSHSTACKi(PERLSI_REGCOMP);
	14852	ENTER ;
	14853	SAVETMPS;
	14854	save_re_context();
	14855
	14856	PUSHMARK(SP) ;
	14857	XPUSHs(character_name);
	14858	PUTBACK;
	14859	call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
	14860
	14861	SPAGAIN ;
	14862
	14863	character = POPs;
	14864	SvREFCNT_inc_simple_void_NN(character);
	14865
	14866	PUTBACK ;
	14867	FREETMPS ;
	14868	LEAVE ;
	14869	POPSTACK;
	14870
	14871	if (! SvOK(character)) {
	14872	goto failed;
	14873	}
	14874
	14875	cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
	14876	if (character_len == SvCUR(character)) {
	14877	prop_definition = add_cp_to_invlist(NULL, cp);
	14878	}
	14879	else {
	14880	AV * this_string;
	14881
	14882	/* First of the remaining characters in the string. */
	14883	char * remaining = SvPVX(character) + character_len;
	14884
	14885	if (strings == NULL) {
	14886	goto failed; /* XXX Perhaps a specific msg instead, like
	14887	'not available here' */
	14888	}
	14889
	14890	if (*strings == NULL) {
	14891	*strings = newAV();
	14892	}
	14893
	14894	this_string = newAV();
	14895	av_push_simple(this_string, newSVuv(cp));
	14896
	14897	do {
	14898	cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
	14899	av_push_simple(this_string, newSVuv(cp));
	14900	remaining += character_len;
	14901	} while (remaining < SvEND(character));
	14902
	14903	av_push_simple(strings, (SV ) this_string);
	14904	}
	14905
	14906	return prop_definition;
	14907	}
	14908
	14909	/* Certain properties whose values are numeric need special handling.
	14910	* They may optionally be prefixed by 'is'. Ignore that prefix for the
	14911	* purposes of checking if this is one of those properties */
	14912	if (memBEGINPs(lookup_name, j, "is")) {
	14913	lookup_offset = 2;
	14914	}
	14915
	14916	/* Then check if it is one of these specially-handled properties. The
	14917	* possibilities are hard-coded because easier this way, and the list
	14918	* is unlikely to change.
	14919	*
	14920	* All numeric value type properties are of this ilk, and are also
	14921	* special in a different way later on. So find those first. There
	14922	* are several numeric value type properties in the Unihan DB (which is
	14923	* unlikely to be compiled with perl, but we handle it here in case it
	14924	* does get compiled). They all end with 'numeric'. The interiors
	14925	* aren't checked for the precise property. This would stop working if
	14926	* a cjk property were to be created that ended with 'numeric' and
	14927	* wasn't a numeric type */
	14928	is_nv_type = memEQs(lookup_name + lookup_offset,
	14929	j - 1 - lookup_offset, "numericvalue")
	14930	\|\| memEQs(lookup_name + lookup_offset,
	14931	j - 1 - lookup_offset, "nv")
	14932	\|\| ( memENDPs(lookup_name + lookup_offset,
	14933	j - 1 - lookup_offset, "numeric")
	14934	&& ( memBEGINPs(lookup_name + lookup_offset,
	14935	j - 1 - lookup_offset, "cjk")
	14936	\|\| memBEGINPs(lookup_name + lookup_offset,
	14937	j - 1 - lookup_offset, "k")));
	14938	if ( is_nv_type
	14939	\|\| memEQs(lookup_name + lookup_offset,
	14940	j - 1 - lookup_offset, "canonicalcombiningclass")
	14941	\|\| memEQs(lookup_name + lookup_offset,
	14942	j - 1 - lookup_offset, "ccc")
	14943	\|\| memEQs(lookup_name + lookup_offset,
	14944	j - 1 - lookup_offset, "age")
	14945	\|\| memEQs(lookup_name + lookup_offset,
	14946	j - 1 - lookup_offset, "in")
	14947	\|\| memEQs(lookup_name + lookup_offset,
	14948	j - 1 - lookup_offset, "presentin"))
	14949	{
	14950	unsigned int k;
	14951
	14952	/* Since the stuff after the '=' is a number, we can't throw away
	14953	* '-' willy-nilly, as those could be a minus sign. Other stricter
	14954	* rules also apply. However, these properties all can have the
	14955	* rhs not be a number, in which case they contain at least one
	14956	* alphabetic. In those cases, the stricter rules don't apply.
	14957	* But the numeric type properties can have the alphas [Ee] to
	14958	* signify an exponent, and it is still a number with stricter
	14959	* rules. So look for an alpha that signifies not-strict */
	14960	stricter = Strict;
	14961	for (k = i; k < name_len; k++) {
	14962	if ( isALPHA_A(name[k])
	14963	&& (! is_nv_type \|\| ! isALPHA_FOLD_EQ(name[k], 'E')))
	14964	{
	14965	stricter = Not_Strict;
	14966	break;
	14967	}
	14968	}
	14969	}
	14970
	14971	if (stricter) {
	14972
	14973	/* A number may have a leading '+' or '-'. The latter is retained
	14974	* */
	14975	if (name[i] == '+') {
	14976	i++;
	14977	}
	14978	else if (name[i] == '-') {
	14979	lookup_name[j++] = '-';
	14980	i++;
	14981	}
	14982
	14983	/* Skip leading zeros including single underscores separating the
	14984	* zeros, or between the final leading zero and the first other
	14985	* digit */
	14986	for (; i < name_len - 1; i++) {
	14987	if ( name[i] != '0'
	14988	&& (name[i] != '_' \|\| ! isDIGIT_A(name[i+1])))
	14989	{
	14990	break;
	14991	}
	14992	}
	14993
	14994	/* Turn nv=-0 into nv=0. These should be equivalent, but vary by
	14995	* underling libc implementation. */
	14996	if ( i == name_len - 1
	14997	&& name[name_len-1] == '0'
	14998	&& lookup_name[j-1] == '-')
	14999	{
	15000	j--;
	15001	}
	15002	}
	15003	}
	15004	else { /* No '=' */
	15005
	15006	/* Only a few properties without an '=' should be parsed with stricter
	15007	* rules. The list is unlikely to change. */
	15008	if ( memBEGINPs(lookup_name, j, "perl")
	15009	&& memNEs(lookup_name + 4, j - 4, "space")
	15010	&& memNEs(lookup_name + 4, j - 4, "word"))
	15011	{
	15012	stricter = Strict;
	15013
	15014	/* We set the inputs back to 0 and the code below will reparse,
	15015	* using strict */
	15016	i = i_zero;
	15017	j = 0;
	15018	}
	15019	}
	15020
	15021	/* Here, we have either finished the property, or are positioned to parse
	15022	* the remainder, and we know if stricter rules apply. Finish out, if not
	15023	* already done */
	15024	for (; i < name_len; i++) {
	15025	char cur = name[i];
	15026
	15027	/* In all instances, case differences are ignored, and we normalize to
	15028	* lowercase */
	15029	if (isUPPER_A(cur)) {
	15030	lookup_name[j++] = toLOWER(cur);
	15031	continue;
	15032	}
	15033
	15034	/* An underscore is skipped, but not under strict rules unless it
	15035	* separates two digits */
	15036	if (cur == '_') {
	15037	if ( stricter
	15038	&& ( i == i_zero \|\| (int) i == equals_pos \|\| i == name_len- 1
	15039	\|\| ! isDIGIT_A(name[i-1]) \|\| ! isDIGIT_A(name[i+1])))
	15040	{
	15041	lookup_name[j++] = '_';
	15042	}
	15043	continue;
	15044	}
	15045
	15046	/* Hyphens are skipped except under strict */
	15047	if (cur == '-' && ! stricter) {
	15048	continue;
	15049	}
	15050
	15051	/* XXX Bug in documentation. It says white space skipped adjacent to
	15052	* non-word char. Maybe we should, but shouldn't skip it next to a dot
	15053	* in a number */
	15054	if (isSPACE_A(cur) && ! stricter) {
	15055	continue;
	15056	}
	15057
	15058	lookup_name[j++] = cur;
	15059
	15060	/* Unless this is a non-trailing slash, we are done with it */
	15061	if (i >= name_len - 1 \|\| cur != '/') {
	15062	continue;
	15063	}
	15064
	15065	slash_pos = j;
	15066
	15067	/* A slash in the 'numeric value' property indicates that what follows
	15068	* is a denominator. It can have a leading '+' and '0's that should be
	15069	* skipped. But we have never allowed a negative denominator, so treat
	15070	* a minus like every other character. (No need to rule out a second
	15071	* '/', as that won't match anything anyway */
	15072	if (is_nv_type) {
	15073	i++;
	15074	if (i < name_len && name[i] == '+') {
	15075	i++;
	15076	}
	15077
	15078	/* Skip leading zeros including underscores separating digits */
	15079	for (; i < name_len - 1; i++) {
	15080	if ( name[i] != '0'
	15081	&& (name[i] != '_' \|\| ! isDIGIT_A(name[i+1])))
	15082	{
	15083	break;
	15084	}
	15085	}
	15086
	15087	/* Store the first real character in the denominator */
	15088	if (i < name_len) {
	15089	lookup_name[j++] = name[i];
	15090	}
	15091	}
	15092	}
	15093
	15094	/* Here are completely done parsing the input 'name', and 'lookup_name'
	15095	* contains a copy, normalized.
	15096	*
	15097	* This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
	15098	* different from without the underscores. */
	15099	if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
	15100	\|\| UNLIKELY(memEQs(lookup_name, j, "gc=l")))
	15101	&& UNLIKELY(name[name_len-1] == '_'))
	15102	{
	15103	lookup_name[j++] = '&';
	15104	}
	15105
	15106	/* If the original input began with 'In' or 'Is', it could be a subroutine
	15107	* call to a user-defined property instead of a Unicode property name. */
	15108	if ( name_len - non_pkg_begin > 2
	15109	&& name[non_pkg_begin+0] == 'I'
	15110	&& (name[non_pkg_begin+1] == 'n' \|\| name[non_pkg_begin+1] == 's'))
	15111	{
	15112	/* Names that start with In have different characteristics than those
	15113	* that start with Is */
	15114	if (name[non_pkg_begin+1] == 's') {
	15115	starts_with_Is = TRUE;
	15116	}
	15117	}
	15118	else {
	15119	could_be_user_defined = FALSE;
	15120	}
	15121
	15122	if (could_be_user_defined) {
	15123	CV* user_sub;
	15124
	15125	/* If the user defined property returns the empty string, it could
	15126	* easily be because the pattern is being compiled before the data it
	15127	* actually needs to compile is available. This could be argued to be
	15128	* a bug in the perl code, but this is a change of behavior for Perl,
	15129	* so we handle it. This means that intentionally returning nothing
	15130	* will not be resolved until runtime */
	15131	bool empty_return = FALSE;
	15132
	15133	/* Here, the name could be for a user defined property, which are
	15134	* implemented as subs. */
	15135	user_sub = get_cvn_flags(name, name_len, 0);
	15136	if (! user_sub) {
	15137
	15138	/* Here, the property name could be a user-defined one, but there
	15139	* is no subroutine to handle it (as of now). Defer handling it
	15140	* until runtime. Otherwise, a block defined by Unicode in a later
	15141	* release would get the synonym InFoo added for it, and existing
	15142	* code that used that name would suddenly break if it referred to
	15143	* the property before the sub was declared. See [perl #134146] */
	15144	if (deferrable) {
	15145	goto definition_deferred;
	15146	}
	15147
	15148	/* Here, we are at runtime, and didn't find the user property. It
	15149	* could be an official property, but only if no package was
	15150	* specified, or just the utf8:: package. */
	15151	if (could_be_deferred_official) {
	15152	lookup_name += lun_non_pkg_begin;
	15153	j -= lun_non_pkg_begin;
	15154	}
	15155	else if (! stripped_utf8_pkg) {
	15156	goto unknown_user_defined;
	15157	}
	15158
	15159	/* Drop down to look up in the official properties */
	15160	}
	15161	else {
	15162	const char insecure[] = "Insecure user-defined property";
	15163
	15164	/* Here, there is a sub by the correct name. Normally we call it
	15165	* to get the property definition */
	15166	dSP;
	15167	SV * user_sub_sv = MUTABLE_SV(user_sub);
	15168	SV * error; /* Any error returned by calling 'user_sub' */
	15169	SV * key; /* The key into the hash of user defined sub names
	15170	*/
	15171	SV * placeholder;
	15172	SV ** saved_user_prop_ptr; /* Hash entry for this property */
	15173
	15174	/* How many times to retry when another thread is in the middle of
	15175	* expanding the same definition we want */
	15176	PERL_INT_FAST8_T retry_countdown = 10;
	15177
	15178	DECLARATION_FOR_GLOBAL_CONTEXT;
	15179
	15180	/* If we get here, we know this property is user-defined */
	15181	*user_defined_ptr = TRUE;
	15182
	15183	/* We refuse to call a potentially tainted subroutine; returning an
	15184	* error instead */
	15185	if (TAINT_get) {
	15186	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	15187	sv_catpvn(msg, insecure, sizeof(insecure) - 1);
	15188	goto append_name_to_msg;
	15189	}
	15190
	15191	/* In principal, we only call each subroutine property definition
	15192	* once during the life of the program. This guarantees that the
	15193	* property definition never changes. The results of the single
	15194	* sub call are stored in a hash, which is used instead for future
	15195	* references to this property. The property definition is thus
	15196	* immutable. But, to allow the user to have a /i-dependent
	15197	* definition, we call the sub once for non-/i, and once for /i,
	15198	* should the need arise, passing the /i status as a parameter.
	15199	*
	15200	* We start by constructing the hash key name, consisting of the
	15201	* fully qualified subroutine name, preceded by the /i status, so
	15202	* that there is a key for /i and a different key for non-/i */
	15203	key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
	15204	fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
	15205	non_pkg_begin != 0);
	15206	sv_catsv(key, fq_name);
	15207
	15208	/* We only call the sub once throughout the life of the program
	15209	* (with the /i, non-/i exception noted above). That means the
	15210	* hash must be global and accessible to all threads. It is
	15211	* created at program start-up, before any threads are created, so
	15212	* is accessible to all children. But this creates some
	15213	* complications.
	15214	*
	15215	* 1) The keys can't be shared, or else problems arise; sharing is
	15216	* turned off at hash creation time
	15217	* 2) All SVs in it are there for the remainder of the life of the
	15218	* program, and must be created in the same interpreter context
	15219	* as the hash, or else they will be freed from the wrong pool
	15220	* at global destruction time. This is handled by switching to
	15221	* the hash's context to create each SV going into it, and then
	15222	* immediately switching back
	15223	* 3) All accesses to the hash must be controlled by a mutex, to
	15224	* prevent two threads from getting an unstable state should
	15225	* they simultaneously be accessing it. The code below is
	15226	* crafted so that the mutex is locked whenever there is an
	15227	* access and unlocked only when the next stable state is
	15228	* achieved.
	15229	*
	15230	* The hash stores either the definition of the property if it was
	15231	* valid, or, if invalid, the error message that was raised. We
	15232	* use the type of SV to distinguish.
	15233	*
	15234	* There's also the need to guard against the definition expansion
	15235	* from infinitely recursing. This is handled by storing the aTHX
	15236	* of the expanding thread during the expansion. Again the SV type
	15237	* is used to distinguish this from the other two cases. If we
	15238	* come to here and the hash entry for this property is our aTHX,
	15239	* it means we have recursed, and the code assumes that we would
	15240	* infinitely recurse, so instead stops and raises an error.
	15241	* (Any recursion has always been treated as infinite recursion in
	15242	* this feature.)
	15243	*
	15244	* If instead, the entry is for a different aTHX, it means that
	15245	* that thread has gotten here first, and hasn't finished expanding
	15246	* the definition yet. We just have to wait until it is done. We
	15247	* sleep and retry a few times, returning an error if the other
	15248	* thread doesn't complete. */
	15249
	15250	re_fetch:
	15251	USER_PROP_MUTEX_LOCK;
	15252
	15253	/* If we have an entry for this key, the subroutine has already
	15254	* been called once with this /i status. */
	15255	saved_user_prop_ptr = hv_fetch(PL_user_def_props,
	15256	SvPVX(key), SvCUR(key), 0);
	15257	if (saved_user_prop_ptr) {
	15258
	15259	/* If the saved result is an inversion list, it is the valid
	15260	* definition of this property */
	15261	if (is_invlist(*saved_user_prop_ptr)) {
	15262	prop_definition = *saved_user_prop_ptr;
	15263
	15264	/* The SV in the hash won't be removed until global
	15265	* destruction, so it is stable and we can unlock */
	15266	USER_PROP_MUTEX_UNLOCK;
	15267
	15268	/* The caller shouldn't try to free this SV */
	15269	return prop_definition;
	15270	}
	15271
	15272	/* Otherwise, if it is a string, it is the error message
	15273	* that was returned when we first tried to evaluate this
	15274	* property. Fail, and append the message */
	15275	if (SvPOK(*saved_user_prop_ptr)) {
	15276	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	15277	sv_catsv(msg, *saved_user_prop_ptr);
	15278
	15279	/* The SV in the hash won't be removed until global
	15280	* destruction, so it is stable and we can unlock */
	15281	USER_PROP_MUTEX_UNLOCK;
	15282
	15283	return NULL;
	15284	}
	15285
	15286	assert(SvIOK(*saved_user_prop_ptr));
	15287
	15288	/* Here, we have an unstable entry in the hash. Either another
	15289	* thread is in the middle of expanding the property's
	15290	* definition, or we are ourselves recursing. We use the aTHX
	15291	* in it to distinguish */
	15292	if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
	15293
	15294	/* Here, it's another thread doing the expanding. We've
	15295	* looked as much as we are going to at the contents of the
	15296	* hash entry. It's safe to unlock. */
	15297	USER_PROP_MUTEX_UNLOCK;
	15298
	15299	/* Retry a few times */
	15300	if (retry_countdown-- > 0) {
	15301	PerlProc_sleep(1);
	15302	goto re_fetch;
	15303	}
	15304
	15305	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	15306	sv_catpvs(msg, "Timeout waiting for another thread to "
	15307	"define");
	15308	goto append_name_to_msg;
	15309	}
	15310
	15311	/* Here, we are recursing; don't dig any deeper */
	15312	USER_PROP_MUTEX_UNLOCK;
	15313
	15314	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	15315	sv_catpvs(msg,
	15316	"Infinite recursion in user-defined property");
	15317	goto append_name_to_msg;
	15318	}
	15319
	15320	/* Here, this thread has exclusive control, and there is no entry
	15321	* for this property in the hash. So we have the go ahead to
	15322	* expand the definition ourselves. */
	15323
	15324	PUSHSTACKi(PERLSI_REGCOMP);
	15325	ENTER;
	15326
	15327	/* Create a temporary placeholder in the hash to detect recursion
	15328	* */
	15329	SWITCH_TO_GLOBAL_CONTEXT;
	15330	placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
	15331	(void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
	15332	RESTORE_CONTEXT;
	15333
	15334	/* Now that we have a placeholder, we can let other threads
	15335	* continue */
	15336	USER_PROP_MUTEX_UNLOCK;
	15337
	15338	/* Make sure the placeholder always gets destroyed */
	15339	SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
	15340
	15341	PUSHMARK(SP);
	15342	SAVETMPS;
	15343
	15344	/* Call the user's function, with the /i status as a parameter.
	15345	* Note that we have gone to a lot of trouble to keep this call
	15346	* from being within the locked mutex region. */
	15347	XPUSHs(boolSV(to_fold));
	15348	PUTBACK;
	15349
	15350	/* The following block was taken from swash_init(). Presumably
	15351	* they apply to here as well, though we no longer use a swash --
	15352	* khw */
	15353	SAVEHINTS();
	15354	save_re_context();
	15355	/* We might get here via a subroutine signature which uses a utf8
	15356	* parameter name, at which point PL_subname will have been set
	15357	* but not yet used. */
	15358	save_item(PL_subname);
	15359
	15360	/* G_SCALAR guarantees a single return value */
	15361	(void) call_sv(user_sub_sv, G_EVAL\|G_SCALAR);
	15362
	15363	SPAGAIN;
	15364
	15365	error = ERRSV;
	15366	if (TAINT_get \|\| SvTRUE(error)) {
	15367	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	15368	if (SvTRUE(error)) {
	15369	sv_catpvs(msg, "Error \"");
	15370	sv_catsv(msg, error);
	15371	sv_catpvs(msg, "\"");
	15372	}
	15373	if (TAINT_get) {
	15374	if (SvTRUE(error)) sv_catpvs(msg, "; ");
	15375	sv_catpvn(msg, insecure, sizeof(insecure) - 1);
	15376	}
	15377
	15378	if (name_len > 0) {
	15379	sv_catpvs(msg, " in expansion of ");
	15380	Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
	15381	name_len,
	15382	name));
	15383	}
	15384
	15385	(void) POPs;
	15386	prop_definition = NULL;
	15387	}
	15388	else {
	15389	SV * contents = POPs;
	15390
	15391	/* The contents is supposed to be the expansion of the property
	15392	* definition. If the definition is deferrable, and we got an
	15393	* empty string back, set a flag to later defer it (after clean
	15394	* up below). */
	15395	if ( deferrable
	15396	&& (! SvPOK(contents) \|\| SvCUR(contents) == 0))
	15397	{
	15398	empty_return = TRUE;
	15399	}
	15400	else { /* Otherwise, call a function to check for valid syntax,
	15401	and handle it */
	15402
	15403	prop_definition = handle_user_defined_property(
	15404	name, name_len,
	15405	is_utf8, to_fold, runtime,
	15406	deferrable,
	15407	contents, user_defined_ptr,
	15408	msg,
	15409	level);
	15410	}
	15411	}
	15412
	15413	/* Here, we have the results of the expansion. Delete the
	15414	* placeholder, and if the definition is now known, replace it with
	15415	* that definition. We need exclusive access to the hash, and we
	15416	* can't let anyone else in, between when we delete the placeholder
	15417	* and add the permanent entry */
	15418	USER_PROP_MUTEX_LOCK;
	15419
	15420	S_delete_recursion_entry(aTHX_ SvPVX(key));
	15421
	15422	if ( ! empty_return
	15423	&& (! prop_definition \|\| is_invlist(prop_definition)))
	15424	{
	15425	/* If we got success we use the inversion list defining the
	15426	* property; otherwise use the error message */
	15427	SWITCH_TO_GLOBAL_CONTEXT;
	15428	(void) hv_store_ent(PL_user_def_props,
	15429	key,
	15430	((prop_definition)
	15431	? newSVsv(prop_definition)
	15432	: newSVsv(msg)),
	15433	0);
	15434	RESTORE_CONTEXT;
	15435	}
	15436
	15437	/* All done, and the hash now has a permanent entry for this
	15438	* property. Give up exclusive control */
	15439	USER_PROP_MUTEX_UNLOCK;
	15440
	15441	FREETMPS;
	15442	LEAVE;
	15443	POPSTACK;
	15444
	15445	if (empty_return) {
	15446	goto definition_deferred;
	15447	}
	15448
	15449	if (prop_definition) {
	15450
	15451	/* If the definition is for something not known at this time,
	15452	* we toss it, and go return the main property name, as that's
	15453	* the one the user will be aware of */
	15454	if (! is_invlist(prop_definition)) {
	15455	SvREFCNT_dec_NN(prop_definition);
	15456	goto definition_deferred;
	15457	}
	15458
	15459	sv_2mortal(prop_definition);
	15460	}
	15461
	15462	/* And return */
	15463	return prop_definition;
	15464
	15465	} /* End of calling the subroutine for the user-defined property */
	15466	} /* End of it could be a user-defined property */
	15467
	15468	/* Here it wasn't a user-defined property that is known at this time. See
	15469	* if it is a Unicode property */
	15470
	15471	lookup_len = j; /* This is a more mnemonic name than 'j' */
	15472
	15473	/* Get the index into our pointer table of the inversion list corresponding
	15474	* to the property */
	15475	table_index = do_uniprop_match(lookup_name, lookup_len);
	15476
	15477	/* If it didn't find the property ... */
	15478	if (table_index == 0) {
	15479
	15480	/* Try again stripping off any initial 'Is'. This is because we
	15481	* promise that an initial Is is optional. The same isn't true of
	15482	* names that start with 'In'. Those can match only blocks, and the
	15483	* lookup table already has those accounted for. The lookup table also
	15484	* has already accounted for Perl extensions (without and = sign)
	15485	* starting with 'i's'. */
	15486	if (starts_with_Is && equals_pos >= 0) {
	15487	lookup_name += 2;
	15488	lookup_len -= 2;
	15489	equals_pos -= 2;
	15490	slash_pos -= 2;
	15491
	15492	table_index = do_uniprop_match(lookup_name, lookup_len);
	15493	}
	15494
	15495	if (table_index == 0) {
	15496	char * canonical;
	15497
	15498	/* Here, we didn't find it. If not a numeric type property, and
	15499	* can't be a user-defined one, it isn't a legal property */
	15500	if (! is_nv_type) {
	15501	if (! could_be_user_defined) {
	15502	goto failed;
	15503	}
	15504
	15505	/* Here, the property name is legal as a user-defined one. At
	15506	* compile time, it might just be that the subroutine for that
	15507	* property hasn't been encountered yet, but at runtime, it's
	15508	* an error to try to use an undefined one */
	15509	if (! deferrable) {
	15510	goto unknown_user_defined;;
	15511	}
	15512
	15513	goto definition_deferred;
	15514	} /* End of isn't a numeric type property */
	15515
	15516	/* The numeric type properties need more work to decide. What we
	15517	* do is make sure we have the number in canonical form and look
	15518	* that up. */
	15519
	15520	if (slash_pos < 0) { /* No slash */
	15521
	15522	/* When it isn't a rational, take the input, convert it to a
	15523	* NV, then create a canonical string representation of that
	15524	* NV. */
	15525
	15526	NV value;
	15527	SSize_t value_len = lookup_len - equals_pos;
	15528
	15529	/* Get the value */
	15530	if ( value_len <= 0
	15531	\|\| my_atof3(lookup_name + equals_pos, &value,
	15532	value_len)
	15533	!= lookup_name + lookup_len)
	15534	{
	15535	goto failed;
	15536	}
	15537
	15538	/* If the value is an integer, the canonical value is integral
	15539	* */
	15540	if (Perl_ceil(value) == value) {
	15541	canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
	15542	equals_pos, lookup_name, value);
	15543	}
	15544	else { /* Otherwise, it is %e with a known precision */
	15545	char * exp_ptr;
	15546
	15547	canonical = Perl_form(aTHX_ "%.s%." NVef,
	15548	equals_pos, lookup_name,
	15549	PL_E_FORMAT_PRECISION, value);
	15550
	15551	/* The exponent generated is expecting two digits, whereas
	15552	* %e on some systems will generate three. Remove leading
	15553	* zeros in excess of 2 from the exponent. We start
	15554	* looking for them after the '=' */
	15555	exp_ptr = strchr(canonical + equals_pos, 'e');
	15556	if (exp_ptr) {
	15557	char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
	15558	SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
	15559
	15560	assert((cur_ptr - 1) == '-' \|\| (cur_ptr - 1) == '+');
	15561
	15562	if (excess_exponent_len > 0) {
	15563	SSize_t leading_zeros = strspn(cur_ptr, "0");
	15564	SSize_t excess_leading_zeros
	15565	= MIN(leading_zeros, excess_exponent_len);
	15566	if (excess_leading_zeros > 0) {
	15567	Move(cur_ptr + excess_leading_zeros,
	15568	cur_ptr,
	15569	strlen(cur_ptr) - excess_leading_zeros
	15570	+ 1, /* Copy the NUL as well */
	15571	char);
	15572	}
	15573	}
	15574	}
	15575	}
	15576	}
	15577	else { /* Has a slash. Create a rational in canonical form */
	15578	UV numerator, denominator, gcd, trial;
	15579	const char * end_ptr;
	15580	const char * sign = "";
	15581
	15582	/* We can't just find the numerator, denominator, and do the
	15583	* division, then use the method above, because that is
	15584	* inexact. And the input could be a rational that is within
	15585	* epsilon (given our precision) of a valid rational, and would
	15586	* then incorrectly compare valid.
	15587	*
	15588	* We're only interested in the part after the '=' */
	15589	const char * this_lookup_name = lookup_name + equals_pos;
	15590	lookup_len -= equals_pos;
	15591	slash_pos -= equals_pos;
	15592
	15593	/* Handle any leading minus */
	15594	if (this_lookup_name[0] == '-') {
	15595	sign = "-";
	15596	this_lookup_name++;
	15597	lookup_len--;
	15598	slash_pos--;
	15599	}
	15600
	15601	/* Convert the numerator to numeric */
	15602	end_ptr = this_lookup_name + slash_pos;
	15603	if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
	15604	goto failed;
	15605	}
	15606
	15607	/* It better have included all characters before the slash */
	15608	if (*end_ptr != '/') {
	15609	goto failed;
	15610	}
	15611
	15612	/* Set to look at just the denominator */
	15613	this_lookup_name += slash_pos;
	15614	lookup_len -= slash_pos;
	15615	end_ptr = this_lookup_name + lookup_len;
	15616
	15617	/* Convert the denominator to numeric */
	15618	if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
	15619	goto failed;
	15620	}
	15621
	15622	/* It better be the rest of the characters, and don't divide by
	15623	* 0 */
	15624	if ( end_ptr != this_lookup_name + lookup_len
	15625	\|\| denominator == 0)
	15626	{
	15627	goto failed;
	15628	}
	15629
	15630	/* Get the greatest common denominator using
	15631	https://en.wikipedia.org/wiki/Euclidean_algorithm */
	15632	gcd = numerator;
	15633	trial = denominator;
	15634	while (trial != 0) {
	15635	UV temp = trial;
	15636	trial = gcd % trial;
	15637	gcd = temp;
	15638	}
	15639
	15640	/* If already in lowest possible terms, we have already tried
	15641	* looking this up */
	15642	if (gcd == 1) {
	15643	goto failed;
	15644	}
	15645
	15646	/* Reduce the rational, which should put it in canonical form
	15647	* */
	15648	numerator /= gcd;
	15649	denominator /= gcd;
	15650
	15651	canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
	15652	equals_pos, lookup_name, sign, numerator, denominator);
	15653	}
	15654
	15655	/* Here, we have the number in canonical form. Try that */
	15656	table_index = do_uniprop_match(canonical, strlen(canonical));
	15657	if (table_index == 0) {
	15658	goto failed;
	15659	}
	15660	} /* End of still didn't find the property in our table */
	15661	} /* End of didn't find the property in our table */
	15662
	15663	/* Here, we have a non-zero return, which is an index into a table of ptrs.
	15664	* A negative return signifies that the real index is the absolute value,
	15665	* but the result needs to be inverted */
	15666	if (table_index < 0) {
	15667	invert_return = TRUE;
	15668	table_index = -table_index;
	15669	}
	15670
	15671	/* Out-of band indices indicate a deprecated property. The proper index is
	15672	* modulo it with the table size. And dividing by the table size yields
	15673	* an offset into a table constructed by regen/mk_invlists.pl to contain
	15674	* the corresponding warning message */
	15675	if (table_index > MAX_UNI_KEYWORD_INDEX) {
	15676	Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
	15677	table_index %= MAX_UNI_KEYWORD_INDEX;
	15678	Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME),
	15679	"Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
	15680	(int) name_len, name,
	15681	get_deprecated_property_msg(warning_offset));
	15682	}
	15683
	15684	/* In a few properties, a different property is used under /i. These are
	15685	* unlikely to change, so are hard-coded here. */
	15686	if (to_fold) {
	15687	if ( table_index == UNI_XPOSIXUPPER
	15688	\|\| table_index == UNI_XPOSIXLOWER
	15689	\|\| table_index == UNI_TITLE)
	15690	{
	15691	table_index = UNI_CASED;
	15692	}
	15693	else if ( table_index == UNI_UPPERCASELETTER
	15694	\|\| table_index == UNI_LOWERCASELETTER
	15695	# ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
	15696	\|\| table_index == UNI_TITLECASELETTER
	15697	# endif
	15698	) {
	15699	table_index = UNI_CASEDLETTER;
	15700	}
	15701	else if ( table_index == UNI_POSIXUPPER
	15702	\|\| table_index == UNI_POSIXLOWER)
	15703	{
	15704	table_index = UNI_POSIXALPHA;
	15705	}
	15706	}
	15707
	15708	/* Create and return the inversion list */
	15709	prop_definition = get_prop_definition(table_index);
	15710	sv_2mortal(prop_definition);
	15711
	15712	/* See if there is a private use override to add to this definition */
	15713	{
	15714	COPHH * hinthash = (IN_PERL_COMPILETIME)
	15715	? CopHINTHASH_get(&PL_compiling)
	15716	: CopHINTHASH_get(PL_curcop);
	15717	SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
	15718
	15719	if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
	15720
	15721	/* See if there is an element in the hints hash for this table */
	15722	SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
	15723	const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
	15724
	15725	if (pos) {
	15726	bool dummy;
	15727	SV * pu_definition;
	15728	SV * pu_invlist;
	15729	SV * expanded_prop_definition =
	15730	sv_2mortal(invlist_clone(prop_definition, NULL));
	15731
	15732	/* If so, it's definition is the string from here to the next
	15733	* \a character. And its format is the same as a user-defined
	15734	* property */
	15735	pos += SvCUR(pu_lookup);
	15736	pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
	15737	pu_invlist = handle_user_defined_property(lookup_name,
	15738	lookup_len,
	15739	0, /* Not UTF-8 */
	15740	0, /* Not folded */
	15741	runtime,
	15742	deferrable,
	15743	pu_definition,
	15744	&dummy,
	15745	msg,
	15746	level);
	15747	if (TAINT_get) {
	15748	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	15749	sv_catpvs(msg, "Insecure private-use override");
	15750	goto append_name_to_msg;
	15751	}
	15752
	15753	/* For now, as a safety measure, make sure that it doesn't
	15754	* override non-private use code points */
	15755	_invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
	15756
	15757	/* Add it to the list to be returned */
	15758	_invlist_union(prop_definition, pu_invlist,
	15759	&expanded_prop_definition);
	15760	prop_definition = expanded_prop_definition;
	15761	Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
	15762	}
	15763	}
	15764	}
	15765
	15766	if (invert_return) {
	15767	_invlist_invert(prop_definition);
	15768	}
	15769	return prop_definition;
	15770
	15771	unknown_user_defined:
	15772	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	15773	sv_catpvs(msg, "Unknown user-defined property name");
	15774	goto append_name_to_msg;
	15775
	15776	failed:
	15777	if (non_pkg_begin != 0) {
	15778	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	15779	sv_catpvs(msg, "Illegal user-defined property name");
	15780	}
	15781	else {
	15782	if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
	15783	sv_catpvs(msg, "Can't find Unicode property definition");
	15784	}
	15785	/* FALLTHROUGH */
	15786
	15787	append_name_to_msg:
	15788	{
	15789	const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
	15790	const char * suffix = (runtime && level == 0) ? "}" : "\"";
	15791
	15792	sv_catpv(msg, prefix);
	15793	Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
	15794	sv_catpv(msg, suffix);
	15795	}
	15796
	15797	return NULL;
	15798
	15799	definition_deferred:
	15800
	15801	{
	15802	bool is_qualified = non_pkg_begin != 0; /* If has "::" */
	15803
	15804	/* Here it could yet to be defined, so defer evaluation of this until
	15805	* its needed at runtime. We need the fully qualified property name to
	15806	* avoid ambiguity */
	15807	if (! fq_name) {
	15808	fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
	15809	is_qualified);
	15810	}
	15811
	15812	/* If it didn't come with a package, or the package is utf8::, this
	15813	* actually could be an official Unicode property whose inclusion we
	15814	* are deferring until runtime to make sure that it isn't overridden by
	15815	* a user-defined property of the same name (which we haven't
	15816	* encountered yet). Add a marker to indicate this possibility, for
	15817	* use at such time when we first need the definition during pattern
	15818	* matching execution */
	15819	if (! is_qualified \|\| memBEGINPs(name, non_pkg_begin, "utf8::")) {
	15820	sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
	15821	}
	15822
	15823	/* We also need a trailing newline */
	15824	sv_catpvs(fq_name, "\n");
	15825
	15826	*user_defined_ptr = TRUE;
	15827	return fq_name;
	15828	}
	15829	}
	15830
	15831	STATIC bool
	15832	S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
	15833	const STRLEN wname_len, /* Its length */
	15834	SV ** prop_definition,
	15835	AV ** strings)
	15836	{
	15837	/* Deal with Name property wildcard subpatterns; returns TRUE if there were
	15838	* any matches, adding them to prop_definition */
	15839
	15840	dSP;
	15841
	15842	CV * get_names_info; /* entry to charnames.pm to get info we need */
	15843	SV * names_string; /* Contains all character names, except algo */
	15844	SV * algorithmic_names; /* Contains info about algorithmically
	15845	generated character names */
	15846	REGEXP * subpattern_re; /* The user's pattern to match with */
	15847	struct regexp * prog; /* The compiled pattern */
	15848	char * all_names_start; /* lib/unicore/Name.pl string of every
	15849	(non-algorithmic) character name */
	15850	char * cur_pos; /* We match, effectively using /gc; this is
	15851	where we are now */
	15852	bool found_matches = FALSE; /* Did any name match so far? */
	15853	SV * empty; /* For matching zero length names */
	15854	SV * must_sv; /* Contains the substring, if any, that must be
	15855	in a name for the subpattern to match */
	15856	const char * must; /* The PV of 'must' */
	15857	STRLEN must_len; /* And its length */
	15858	SV * syllable_name = NULL; /* For Hangul syllables */
	15859	const char hangul_prefix[] = "HANGUL SYLLABLE ";
	15860	const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
	15861
	15862	/* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
	15863	* syllable name, and these are immutable and guaranteed by the Unicode
	15864	* standard to never be extended */
	15865	const STRLEN syl_max_len = hangul_prefix_len + 7;
	15866
	15867	IV i;
	15868
	15869	PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
	15870
	15871	/* Make sure _charnames is loaded. (The parameters give context
	15872	* for any errors generated */
	15873	get_names_info = get_cv("_charnames::_get_names_info", 0);
	15874	if (! get_names_info) {
	15875	Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
	15876	}
	15877
	15878	/* Get the charnames data */
	15879	PUSHSTACKi(PERLSI_REGCOMP);
	15880	ENTER ;
	15881	SAVETMPS;
	15882	save_re_context();
	15883
	15884	PUSHMARK(SP) ;
	15885	PUTBACK;
	15886
	15887	/* Special _charnames entry point that returns the info this routine
	15888	* requires */
	15889	call_sv(MUTABLE_SV(get_names_info), G_LIST);
	15890
	15891	SPAGAIN ;
	15892
	15893	/* Data structure for names which end in their very own code points */
	15894	algorithmic_names = POPs;
	15895	SvREFCNT_inc_simple_void_NN(algorithmic_names);
	15896
	15897	/* The lib/unicore/Name.pl string */
	15898	names_string = POPs;
	15899	SvREFCNT_inc_simple_void_NN(names_string);
	15900
	15901	PUTBACK ;
	15902	FREETMPS ;
	15903	LEAVE ;
	15904	POPSTACK;
	15905
	15906	if ( ! SvROK(names_string)
	15907	\|\| ! SvROK(algorithmic_names))
	15908	{ /* Perhaps should panic instead XXX */
	15909	SvREFCNT_dec(names_string);
	15910	SvREFCNT_dec(algorithmic_names);
	15911	return FALSE;
	15912	}
	15913
	15914	names_string = sv_2mortal(SvRV(names_string));
	15915	all_names_start = SvPVX(names_string);
	15916	cur_pos = all_names_start;
	15917
	15918	algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
	15919
	15920	/* Compile the subpattern consisting of the name being looked for */
	15921	subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
	15922
	15923	must_sv = re_intuit_string(subpattern_re);
	15924	if (must_sv) {
	15925	/* regexec.c can free the re_intuit_string() return. GH #17734 */
	15926	must_sv = sv_2mortal(newSVsv(must_sv));
	15927	must = SvPV(must_sv, must_len);
	15928	}
	15929	else {
	15930	must = "";
	15931	must_len = 0;
	15932	}
	15933
	15934	/* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
	15935	* This works because the NUL causes the function to return early, thus
	15936	* showing that there are characters in it other than the acceptable ones,
	15937	* which is our desired result.) */
	15938
	15939	prog = ReANY(subpattern_re);
	15940
	15941	/* If only nothing is matched, skip to where empty names are looked for */
	15942	if (prog->maxlen == 0) {
	15943	goto check_empty;
	15944	}
	15945
	15946	/* And match against the string of all names /gc. Don't even try if it
	15947	* must match a character not found in any name. */
	15948	if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
	15949	{
	15950	while (execute_wildcard(subpattern_re,
	15951	cur_pos,
	15952	SvEND(names_string),
	15953	all_names_start, 0,
	15954	names_string,
	15955	0))
	15956	{ /* Here, matched. */
	15957
	15958	/* Note the string entries look like
	15959	* 00001\nSTART OF HEADING\n\n
	15960	* so we could match anywhere in that string. We have to rule out
	15961	* matching a code point line */
	15962	char * this_name_start = all_names_start
	15963	+ RX_OFFS_START(subpattern_re,0);
	15964	char * this_name_end = all_names_start
	15965	+ RX_OFFS_END(subpattern_re,0);
	15966	char * cp_start;
	15967	char * cp_end;
	15968	UV cp = 0; /* Silences some compilers */
	15969	AV * this_string = NULL;
	15970	bool is_multi = FALSE;
	15971
	15972	/* If matched nothing, advance to next possible match */
	15973	if (this_name_start == this_name_end) {
	15974	cur_pos = (char *) memchr(this_name_end + 1, '\n',
	15975	SvEND(names_string) - this_name_end);
	15976	if (cur_pos == NULL) {
	15977	break;
	15978	}
	15979	}
	15980	else {
	15981	/* Position the next match to start beyond the current returned
	15982	* entry */
	15983	cur_pos = (char *) memchr(this_name_end, '\n',
	15984	SvEND(names_string) - this_name_end);
	15985	}
	15986
	15987	/* Back up to the \n just before the beginning of the character. */
	15988	cp_end = (char *) my_memrchr(all_names_start,
	15989	'\n',
	15990	this_name_start - all_names_start);
	15991
	15992	/* If we didn't find a \n, it means it matched somewhere in the
	15993	* initial '00000' in the string, so isn't a real match */
	15994	if (cp_end == NULL) {
	15995	continue;
	15996	}
	15997
	15998	this_name_start = cp_end + 1; /* The name starts just after */
	15999	cp_end--; /* the \n, and the code point */
	16000	/* ends just before it */
	16001
	16002	/* All code points are 5 digits long */
	16003	cp_start = cp_end - 4;
	16004
	16005	/* This shouldn't happen, as we found a \n, and the first \n is
	16006	* further along than what we subtracted */
	16007	assert(cp_start >= all_names_start);
	16008
	16009	if (cp_start == all_names_start) {
	16010	prop_definition = add_cp_to_invlist(prop_definition, 0);
	16011	continue;
	16012	}
	16013
	16014	/* If the character is a blank, we either have a named sequence, or
	16015	* something is wrong */
	16016	if (*(cp_start - 1) == ' ') {
	16017	cp_start = (char *) my_memrchr(all_names_start,
	16018	'\n',
	16019	cp_start - all_names_start);
	16020	cp_start++;
	16021	}
	16022
	16023	assert(cp_start != NULL && cp_start >= all_names_start + 2);
	16024
	16025	/* Except for the first line in the string, the sequence before the
	16026	* code point is \n\n. If that isn't the case here, we didn't
	16027	* match the name of a character. (We could have matched a named
	16028	* sequence, not currently handled */
	16029	if ((cp_start - 1) != '\n' \|\| (cp_start - 2) != '\n') {
	16030	continue;
	16031	}
	16032
	16033	/* We matched! Add this to the list */
	16034	found_matches = TRUE;
	16035
	16036	/* Loop through all the code points in the sequence */
	16037	while (cp_start < cp_end) {
	16038
	16039	/* Calculate this code point from its 5 digits */
	16040	cp = (XDIGIT_VALUE(cp_start[0]) << 16)
	16041	+ (XDIGIT_VALUE(cp_start[1]) << 12)
	16042	+ (XDIGIT_VALUE(cp_start[2]) << 8)
	16043	+ (XDIGIT_VALUE(cp_start[3]) << 4)
	16044	+ XDIGIT_VALUE(cp_start[4]);
	16045
	16046	cp_start += 6; /* Go past any blank */
	16047
	16048	if (cp_start < cp_end \|\| is_multi) {
	16049	if (this_string == NULL) {
	16050	this_string = newAV();
	16051	}
	16052
	16053	is_multi = TRUE;
	16054	av_push_simple(this_string, newSVuv(cp));
	16055	}
	16056	}
	16057
	16058	if (is_multi) { /* Was more than one code point */
	16059	if (*strings == NULL) {
	16060	*strings = newAV();
	16061	}
	16062
	16063	av_push_simple(strings, (SV ) this_string);
	16064	}
	16065	else { /* Only a single code point */
	16066	prop_definition = add_cp_to_invlist(prop_definition, cp);
	16067	}
	16068	} /* End of loop through the non-algorithmic names string */
	16069	}
	16070
	16071	/* There are also character names not in 'names_string'. These are
	16072	* algorithmically generatable. Try this pattern on each possible one.
	16073	* (khw originally planned to leave this out given the large number of
	16074	* matches attempted; but the speed turned out to be quite acceptable
	16075	*
	16076	* There are plenty of opportunities to optimize to skip many of the tests.
	16077	* beyond the rudimentary ones already here */
	16078
	16079	/* First see if the subpattern matches any of the algorithmic generatable
	16080	* Hangul syllable names.
	16081	*
	16082	* We know none of these syllable names will match if the input pattern
	16083	* requires more bytes than any syllable has, or if the input pattern only
	16084	* matches an empty name, or if the pattern has something it must match and
	16085	* one of the characters in that isn't in any Hangul syllable. */
	16086	if ( prog->minlen <= (SSize_t) syl_max_len
	16087	&& prog->maxlen > 0
	16088	&& (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
	16089	{
	16090	/* These constants, names, values, and algorithm are adapted from the
	16091	* Unicode standard, version 5.1, section 3.12, and should never
	16092	* change. */
	16093	const char * JamoL[] = {
	16094	"G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
	16095	"S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
	16096	};
	16097	const int LCount = C_ARRAY_LENGTH(JamoL);
	16098
	16099	const char * JamoV[] = {
	16100	"A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
	16101	"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
	16102	"I"
	16103	};
	16104	const int VCount = C_ARRAY_LENGTH(JamoV);
	16105
	16106	const char * JamoT[] = {
	16107	"", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
	16108	"LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
	16109	"BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
	16110	};
	16111	const int TCount = C_ARRAY_LENGTH(JamoT);
	16112
	16113	int L, V, T;
	16114
	16115	/* This is the initial Hangul syllable code point; each time through the
	16116	* inner loop, it maps to the next higher code point. For more info,
	16117	* see the Hangul syllable section of the Unicode standard. */
	16118	int cp = 0xAC00;
	16119
	16120	syllable_name = sv_2mortal(newSV(syl_max_len));
	16121	sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
	16122
	16123	for (L = 0; L < LCount; L++) {
	16124	for (V = 0; V < VCount; V++) {
	16125	for (T = 0; T < TCount; T++) {
	16126
	16127	/* Truncate back to the prefix, which is unvarying */
	16128	SvCUR_set(syllable_name, hangul_prefix_len);
	16129
	16130	sv_catpv(syllable_name, JamoL[L]);
	16131	sv_catpv(syllable_name, JamoV[V]);
	16132	sv_catpv(syllable_name, JamoT[T]);
	16133
	16134	if (execute_wildcard(subpattern_re,
	16135	SvPVX(syllable_name),
	16136	SvEND(syllable_name),
	16137	SvPVX(syllable_name), 0,
	16138	syllable_name,
	16139	0))
	16140	{
	16141	prop_definition = add_cp_to_invlist(prop_definition,
	16142	cp);
	16143	found_matches = TRUE;
	16144	}
	16145
	16146	cp++;
	16147	}
	16148	}
	16149	}
	16150	}
	16151
	16152	/* The rest of the algorithmically generatable names are of the form
	16153	* "PREFIX-code_point". The prefixes and the code point limits of each
	16154	* were returned to us in the array 'algorithmic_names' from data in
	16155	* lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
	16156	for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
	16157	IV j;
	16158
	16159	/* Each element of the array is a hash, giving the details for the
	16160	* series of names it covers. There is the base name of the characters
	16161	* in the series, and the low and high code points in the series. And,
	16162	* for optimization purposes a string containing all the legal
	16163	* characters that could possibly be in a name in this series. */
	16164	HV * this_series = (HV ) SvRV( av_fetch((AV *) algorithmic_names, i, 0));
	16165	SV * prefix = * hv_fetchs(this_series, "name", 0);
	16166	IV low = SvIV(* hv_fetchs(this_series, "low", 0));
	16167	IV high = SvIV(* hv_fetchs(this_series, "high", 0));
	16168	char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
	16169
	16170	/* Pre-allocate an SV with enough space */
	16171	SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
	16172	SvPVX(prefix)));
	16173	if (high >= 0x10000) {
	16174	sv_catpvs(algo_name, "0");
	16175	}
	16176
	16177	/* This series can be skipped entirely if the pattern requires
	16178	* something longer than any name in the series, or can only match an
	16179	* empty name, or contains a character not found in any name in the
	16180	* series */
	16181	if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
	16182	&& prog->maxlen > 0
	16183	&& (strspn(must, legal) == must_len))
	16184	{
	16185	for (j = low; j <= high; j++) { /* For each code point in the series */
	16186
	16187	/* Get its name, and see if it matches the subpattern */
	16188	Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
	16189	(unsigned) j);
	16190
	16191	if (execute_wildcard(subpattern_re,
	16192	SvPVX(algo_name),
	16193	SvEND(algo_name),
	16194	SvPVX(algo_name), 0,
	16195	algo_name,
	16196	0))
	16197	{
	16198	prop_definition = add_cp_to_invlist(prop_definition, j);
	16199	found_matches = TRUE;
	16200	}
	16201	}
	16202	}
	16203	}
	16204
	16205	check_empty:
	16206	/* Finally, see if the subpattern matches an empty string */
	16207	empty = newSVpvs("");
	16208	if (execute_wildcard(subpattern_re,
	16209	SvPVX(empty),
	16210	SvEND(empty),
	16211	SvPVX(empty), 0,
	16212	empty,
	16213	0))
	16214	{
	16215	/* Many code points have empty names. Currently these are the \p{GC=C}
	16216	* ones, minus CC and CF */
	16217
	16218	SV * empty_names_ref = get_prop_definition(UNI_C);
	16219	SV * empty_names = invlist_clone(empty_names_ref, NULL);
	16220
	16221	SV * subtract = get_prop_definition(UNI_CC);
	16222
	16223	_invlist_subtract(empty_names, subtract, &empty_names);
	16224	SvREFCNT_dec_NN(empty_names_ref);
	16225	SvREFCNT_dec_NN(subtract);
	16226
	16227	subtract = get_prop_definition(UNI_CF);
	16228	_invlist_subtract(empty_names, subtract, &empty_names);
	16229	SvREFCNT_dec_NN(subtract);
	16230
	16231	_invlist_union(*prop_definition, empty_names, prop_definition);
	16232	found_matches = TRUE;
	16233	SvREFCNT_dec_NN(empty_names);
	16234	}
	16235	SvREFCNT_dec_NN(empty);
	16236
	16237	#if 0
	16238	/* If we ever were to accept aliases for, say private use names, we would
	16239	* need to do something fancier to find empty names. The code below works
	16240	* (at the time it was written), and is slower than the above */
	16241	const char empties_pat[] = "^.";
	16242	if (strNE(name, empties_pat)) {
	16243	SV * empty = newSVpvs("");
	16244	if (execute_wildcard(subpattern_re,
	16245	SvPVX(empty),
	16246	SvEND(empty),
	16247	SvPVX(empty), 0,
	16248	empty,
	16249	0))
	16250	{
	16251	SV * empties = NULL;
	16252
	16253	(void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
	16254
	16255	_invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
	16256	SvREFCNT_dec_NN(empties);
	16257
	16258	found_matches = TRUE;
	16259	}
	16260	SvREFCNT_dec_NN(empty);
	16261	}
	16262	#endif
	16263
	16264	SvREFCNT_dec_NN(subpattern_re);
	16265	return found_matches;
	16266	}
	16267
	16268	/*
	16269	* ex: set ts=8 sts=4 sw=4 et:
	16270	*/