perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	#else
	85	# include "regcomp.h"
	86	#endif
	87
	88	#include "dquote_static.c"
	89	#include "charclass_invlists.h"
	90
	91	#ifdef op
	92	#undef op
	93	#endif /* op */
	94
	95	#ifdef MSDOS
	96	# if defined(BUGGY_MSC6)
	97	/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
	98	# pragma optimize("a",off)
	99	/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
	100	# pragma optimize("w",on )
	101	# endif /* BUGGY_MSC6 */
	102	#endif /* MSDOS */
	103
	104	#ifndef STATIC
	105	#define STATIC static
	106	#endif
	107
	108	typedef struct RExC_state_t {
	109	U32 flags; /* are we folding, multilining? */
	110	char precomp; / uncompiled string. */
	111	REGEXP rx_sv; / The SV that is the regexp. */
	112	regexp rx; / perl core regexp structure */
	113	regexp_internal rxi; / internal data for regexp object pprivate field */
	114	char start; / Start of input for compile */
	115	char end; / End of input for compile */
	116	char parse; / Input-scan pointer. */
	117	I32 whilem_seen; /* number of WHILEM in this expr */
	118	regnode emit_start; / Start of emitted-code area */
	119	regnode emit_bound; / First regnode outside of the allocated space */
	120	regnode emit; / Code-emit pointer; &regdummy = don't = compiling */
	121	I32 naughty; /* How bad is this pattern? */
	122	I32 sawback; /* Did we see \1, ...? */
	123	U32 seen;
	124	I32 size; /* Code size. */
	125	I32 npar; /* Capture buffer count, (OPEN). */
	126	I32 cpar; /* Capture buffer count, (CLOSE). */
	127	I32 nestroot; /* root parens we are in - used by accept */
	128	I32 extralen;
	129	I32 seen_zerolen;
	130	I32 seen_evals;
	131	regnode *open_parens; / pointers to open parens */
	132	regnode *close_parens; / pointers to close parens */
	133	regnode opend; / END node in program */
	134	I32 utf8; /* whether the pattern is utf8 or not */
	135	I32 orig_utf8; /* whether the pattern was originally in utf8 */
	136	/* XXX use this for future optimisation of case
	137	* where pattern must be upgraded to utf8. */
	138	I32 uni_semantics; /* If a d charset modifier should use unicode
	139	rules, even if the pattern is not in
	140	utf8 */
	141	HV paren_names; / Paren names */
	142
	143	regnode *recurse; / Recurse regops */
	144	I32 recurse_count; /* Number of recurse regops */
	145	I32 in_lookbehind;
	146	I32 contains_locale;
	147	I32 override_recoding;
	148	#if ADD_TO_REGEXEC
	149	char starttry; / -Dr: where regtry was called. */
	150	#define RExC_starttry (pRExC_state->starttry)
	151	#endif
	152	#ifdef DEBUGGING
	153	const char *lastparse;
	154	I32 lastnum;
	155	AV paren_name_list; / idx -> name */
	156	#define RExC_lastparse (pRExC_state->lastparse)
	157	#define RExC_lastnum (pRExC_state->lastnum)
	158	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	159	#endif
	160	} RExC_state_t;
	161
	162	#define RExC_flags (pRExC_state->flags)
	163	#define RExC_precomp (pRExC_state->precomp)
	164	#define RExC_rx_sv (pRExC_state->rx_sv)
	165	#define RExC_rx (pRExC_state->rx)
	166	#define RExC_rxi (pRExC_state->rxi)
	167	#define RExC_start (pRExC_state->start)
	168	#define RExC_end (pRExC_state->end)
	169	#define RExC_parse (pRExC_state->parse)
	170	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	171	#ifdef RE_TRACK_PATTERN_OFFSETS
	172	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
	173	#endif
	174	#define RExC_emit (pRExC_state->emit)
	175	#define RExC_emit_start (pRExC_state->emit_start)
	176	#define RExC_emit_bound (pRExC_state->emit_bound)
	177	#define RExC_naughty (pRExC_state->naughty)
	178	#define RExC_sawback (pRExC_state->sawback)
	179	#define RExC_seen (pRExC_state->seen)
	180	#define RExC_size (pRExC_state->size)
	181	#define RExC_npar (pRExC_state->npar)
	182	#define RExC_nestroot (pRExC_state->nestroot)
	183	#define RExC_extralen (pRExC_state->extralen)
	184	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	185	#define RExC_seen_evals (pRExC_state->seen_evals)
	186	#define RExC_utf8 (pRExC_state->utf8)
	187	#define RExC_uni_semantics (pRExC_state->uni_semantics)
	188	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	189	#define RExC_open_parens (pRExC_state->open_parens)
	190	#define RExC_close_parens (pRExC_state->close_parens)
	191	#define RExC_opend (pRExC_state->opend)
	192	#define RExC_paren_names (pRExC_state->paren_names)
	193	#define RExC_recurse (pRExC_state->recurse)
	194	#define RExC_recurse_count (pRExC_state->recurse_count)
	195	#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
	196	#define RExC_contains_locale (pRExC_state->contains_locale)
	197	#define RExC_override_recoding (pRExC_state->override_recoding)
	198
	199
	200	#define ISMULT1(c) ((c) == '*' \|\| (c) == '+' \|\| (c) == '?')
	201	#define ISMULT2(s) ((s) == '' \|\| (s) == '+' \|\| (s) == '?' \|\| \
	202	((*s) == '{' && regcurly(s)))
	203
	204	#ifdef SPSTART
	205	#undef SPSTART /* dratted cpp namespace... */
	206	#endif
	207	/*
	208	* Flags to be passed up and down.
	209	*/
	210	#define WORST 0 /* Worst case. */
	211	#define HASWIDTH 0x01 /* Known to match non-null strings. */
	212
	213	/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
	214	* character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
	215	#define SIMPLE 0x02
	216	#define SPSTART 0x04 /* Starts with * or +. */
	217	#define TRYAGAIN 0x08 /* Weeded out a declaration. */
	218	#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
	219
	220	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
	221
	222	/* whether trie related optimizations are enabled */
	223	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	224	#define TRIE_STUDY_OPT
	225	#define FULL_TRIE_STUDY
	226	#define TRIE_STCLASS
	227	#endif
	228
	229
	230
	231	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
	232	#define PBITVAL(paren) (1 << ((paren) & 7))
	233	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
	234	#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) \|= PBITVAL(paren)
	235	#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
	236
	237	/* If not already in utf8, do a longjmp back to the beginning */
	238	#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
	239	#define REQUIRE_UTF8 STMT_START { \
	240	if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
	241	} STMT_END
	242
	243	/* About scan_data_t.
	244
	245	During optimisation we recurse through the regexp program performing
	246	various inplace (keyhole style) optimisations. In addition study_chunk
	247	and scan_commit populate this data structure with information about
	248	what strings MUST appear in the pattern. We look for the longest
	249	string that must appear at a fixed location, and we look for the
	250	longest string that may appear at a floating location. So for instance
	251	in the pattern:
	252
	253	/FOO[xX]A.*B[xX]BAR/
	254
	255	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	256	strings (because they follow a .* construct). study_chunk will identify
	257	both FOO and BAR as being the longest fixed and floating strings respectively.
	258
	259	The strings can be composites, for instance
	260
	261	/(f)(o)(o)/
	262
	263	will result in a composite fixed substring 'foo'.
	264
	265	For each string some basic information is maintained:
	266
	267	- offset or min_offset
	268	This is the position the string must appear at, or not before.
	269	It also implicitly (when combined with minlenp) tells us how many
	270	characters must match before the string we are searching for.
	271	Likewise when combined with minlenp and the length of the string it
	272	tells us how many characters must appear after the string we have
	273	found.
	274
	275	- max_offset
	276	Only used for floating strings. This is the rightmost point that
	277	the string can appear at. If set to I32 max it indicates that the
	278	string can occur infinitely far to the right.
	279
	280	- minlenp
	281	A pointer to the minimum length of the pattern that the string
	282	was found inside. This is important as in the case of positive
	283	lookahead or positive lookbehind we can have multiple patterns
	284	involved. Consider
	285
	286	/(?=FOO).*F/
	287
	288	The minimum length of the pattern overall is 3, the minimum length
	289	of the lookahead part is 3, but the minimum length of the part that
	290	will actually match is 1. So 'FOO's minimum length is 3, but the
	291	minimum length for the F is 1. This is important as the minimum length
	292	is used to determine offsets in front of and behind the string being
	293	looked for. Since strings can be composites this is the length of the
	294	pattern at the time it was committed with a scan_commit. Note that
	295	the length is calculated by study_chunk, so that the minimum lengths
	296	are not known until the full pattern has been compiled, thus the
	297	pointer to the value.
	298
	299	- lookbehind
	300
	301	In the case of lookbehind the string being searched for can be
	302	offset past the start point of the final matching string.
	303	If this value was just blithely removed from the min_offset it would
	304	invalidate some of the calculations for how many chars must match
	305	before or after (as they are derived from min_offset and minlen and
	306	the length of the string being searched for).
	307	When the final pattern is compiled and the data is moved from the
	308	scan_data_t structure into the regexp structure the information
	309	about lookbehind is factored in, with the information that would
	310	have been lost precalculated in the end_shift field for the
	311	associated string.
	312
	313	The fields pos_min and pos_delta are used to store the minimum offset
	314	and the delta to the maximum offset at the current point in the pattern.
	315
	316	*/
	317
	318	typedef struct scan_data_t {
	319	/I32 len_min; unused /
	320	/I32 len_delta; unused /
	321	I32 pos_min;
	322	I32 pos_delta;
	323	SV *last_found;
	324	I32 last_end; /* min value, <0 unless valid. */
	325	I32 last_start_min;
	326	I32 last_start_max;
	327	SV *longest; / Either &l_fixed, or &l_float. */
	328	SV longest_fixed; / longest fixed string found in pattern */
	329	I32 offset_fixed; /* offset where it starts */
	330	I32 minlen_fixed; / pointer to the minlen relevant to the string */
	331	I32 lookbehind_fixed; /* is the position of the string modfied by LB */
	332	SV longest_float; / longest floating string found in pattern */
	333	I32 offset_float_min; /* earliest point in string it can appear */
	334	I32 offset_float_max; /* latest point in string it can appear */
	335	I32 minlen_float; / pointer to the minlen relevant to the string */
	336	I32 lookbehind_float; /* is the position of the string modified by LB */
	337	I32 flags;
	338	I32 whilem_c;
	339	I32 *last_closep;
	340	struct regnode_charclass_class *start_class;
	341	} scan_data_t;
	342
	343	/*
	344	* Forward declarations for pregcomp()'s friends.
	345	*/
	346
	347	static const scan_data_t zero_scan_data =
	348	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	349
	350	#define SF_BEFORE_EOL (SF_BEFORE_SEOL\|SF_BEFORE_MEOL)
	351	#define SF_BEFORE_SEOL 0x0001
	352	#define SF_BEFORE_MEOL 0x0002
	353	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL\|SF_FIX_BEFORE_MEOL)
	354	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL\|SF_FL_BEFORE_MEOL)
	355
	356	#ifdef NO_UNARY_PLUS
	357	# define SF_FIX_SHIFT_EOL (0+2)
	358	# define SF_FL_SHIFT_EOL (0+4)
	359	#else
	360	# define SF_FIX_SHIFT_EOL (+2)
	361	# define SF_FL_SHIFT_EOL (+4)
	362	#endif
	363
	364	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
	365	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
	366
	367	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
	368	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	369	#define SF_IS_INF 0x0040
	370	#define SF_HAS_PAR 0x0080
	371	#define SF_IN_PAR 0x0100
	372	#define SF_HAS_EVAL 0x0200
	373	#define SCF_DO_SUBSTR 0x0400
	374	#define SCF_DO_STCLASS_AND 0x0800
	375	#define SCF_DO_STCLASS_OR 0x1000
	376	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND\|SCF_DO_STCLASS_OR)
	377	#define SCF_WHILEM_VISITED_POS 0x2000
	378
	379	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	380	#define SCF_SEEN_ACCEPT 0x8000
	381
	382	#define UTF cBOOL(RExC_utf8)
	383	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	384	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	385	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
	386	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
	387	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
	388	#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	389	#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
	390
	391	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
	392
	393	#define OOB_UNICODE 12345678
	394	#define OOB_NAMEDCLASS -1
	395
	396	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
	397	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
	398
	399
	400	/* length of regex to show in messages that don't mark a position within */
	401	#define RegexLengthToShowInErrorMessages 127
	402
	403	/*
	404	* If MARKER[12] are adjusted, be sure to adjust the constants at the top
	405	* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	406	* op/pragma/warn/regcomp.
	407	*/
	408	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	409	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
	410
	411	#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
	412
	413	/*
	414	* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	415	* arg. Show regex, up to a maximum length. If it's too long, chop and add
	416	* "...".
	417	*/
	418	#define _FAIL(code) STMT_START { \
	419	const char *ellipses = ""; \
	420	IV len = RExC_end - RExC_precomp; \
	421	\
	422	if (!SIZE_ONLY) \
	423	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	424	if (len > RegexLengthToShowInErrorMessages) { \
	425	/* chop 10 shorter than the max, to ensure meaning of "..." */ \
	426	len = RegexLengthToShowInErrorMessages - 10; \
	427	ellipses = "..."; \
	428	} \
	429	code; \
	430	} STMT_END
	431
	432	#define FAIL(msg) _FAIL( \
	433	Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
	434	msg, (int)len, RExC_precomp, ellipses))
	435
	436	#define FAIL2(msg,arg) _FAIL( \
	437	Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
	438	arg, (int)len, RExC_precomp, ellipses))
	439
	440	/*
	441	* Simple_vFAIL -- like FAIL, but marks the current location in the scan
	442	*/
	443	#define Simple_vFAIL(m) STMT_START { \
	444	const IV offset = RExC_parse - RExC_precomp; \
	445	Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	446	m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	447	} STMT_END
	448
	449	/*
	450	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	451	*/
	452	#define vFAIL(m) STMT_START { \
	453	if (!SIZE_ONLY) \
	454	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	455	Simple_vFAIL(m); \
	456	} STMT_END
	457
	458	/*
	459	* Like Simple_vFAIL(), but accepts two arguments.
	460	*/
	461	#define Simple_vFAIL2(m,a1) STMT_START { \
	462	const IV offset = RExC_parse - RExC_precomp; \
	463	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
	464	(int)offset, RExC_precomp, RExC_precomp + offset); \
	465	} STMT_END
	466
	467	/*
	468	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	469	*/
	470	#define vFAIL2(m,a1) STMT_START { \
	471	if (!SIZE_ONLY) \
	472	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	473	Simple_vFAIL2(m, a1); \
	474	} STMT_END
	475
	476
	477	/*
	478	* Like Simple_vFAIL(), but accepts three arguments.
	479	*/
	480	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	481	const IV offset = RExC_parse - RExC_precomp; \
	482	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
	483	(int)offset, RExC_precomp, RExC_precomp + offset); \
	484	} STMT_END
	485
	486	/*
	487	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	488	*/
	489	#define vFAIL3(m,a1,a2) STMT_START { \
	490	if (!SIZE_ONLY) \
	491	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	492	Simple_vFAIL3(m, a1, a2); \
	493	} STMT_END
	494
	495	/*
	496	* Like Simple_vFAIL(), but accepts four arguments.
	497	*/
	498	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	499	const IV offset = RExC_parse - RExC_precomp; \
	500	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
	501	(int)offset, RExC_precomp, RExC_precomp + offset); \
	502	} STMT_END
	503
	504	#define ckWARNreg(loc,m) STMT_START { \
	505	const IV offset = loc - RExC_precomp; \
	506	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	507	(int)offset, RExC_precomp, RExC_precomp + offset); \
	508	} STMT_END
	509
	510	#define ckWARNregdep(loc,m) STMT_START { \
	511	const IV offset = loc - RExC_precomp; \
	512	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	513	m REPORT_LOCATION, \
	514	(int)offset, RExC_precomp, RExC_precomp + offset); \
	515	} STMT_END
	516
	517	#define ckWARN2regdep(loc,m, a1) STMT_START { \
	518	const IV offset = loc - RExC_precomp; \
	519	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	520	m REPORT_LOCATION, \
	521	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	522	} STMT_END
	523
	524	#define ckWARN2reg(loc, m, a1) STMT_START { \
	525	const IV offset = loc - RExC_precomp; \
	526	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	527	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	528	} STMT_END
	529
	530	#define vWARN3(loc, m, a1, a2) STMT_START { \
	531	const IV offset = loc - RExC_precomp; \
	532	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	533	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	534	} STMT_END
	535
	536	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	537	const IV offset = loc - RExC_precomp; \
	538	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	539	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	540	} STMT_END
	541
	542	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	543	const IV offset = loc - RExC_precomp; \
	544	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	545	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	546	} STMT_END
	547
	548	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	549	const IV offset = loc - RExC_precomp; \
	550	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	551	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	552	} STMT_END
	553
	554	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	555	const IV offset = loc - RExC_precomp; \
	556	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	557	a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
	558	} STMT_END
	559
	560
	561	/* Allow for side effects in s */
	562	#define REGC(c,s) STMT_START { \
	563	if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	564	} STMT_END
	565
	566	/* Macros for recording node offsets. 20001227 mjd@plover.com
	567	* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	568	* element 2*n-1 of the array. Element #2n holds the byte length node #n.
	569	* Element 0 holds the number n.
	570	* Position is 1 indexed.
	571	*/
	572	#ifndef RE_TRACK_PATTERN_OFFSETS
	573	#define Set_Node_Offset_To_R(node,byte)
	574	#define Set_Node_Offset(node,byte)
	575	#define Set_Cur_Node_Offset
	576	#define Set_Node_Length_To_R(node,len)
	577	#define Set_Node_Length(node,len)
	578	#define Set_Node_Cur_Length(node)
	579	#define Node_Offset(n)
	580	#define Node_Length(n)
	581	#define Set_Node_Offset_Length(node,offset,len)
	582	#define ProgLen(ri) ri->u.proglen
	583	#define SetProgLen(ri,x) ri->u.proglen = x
	584	#else
	585	#define ProgLen(ri) ri->u.offsets[0]
	586	#define SetProgLen(ri,x) ri->u.offsets[0] = x
	587	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	588	if (! SIZE_ONLY) { \
	589	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	590	__LINE__, (int)(node), (int)(byte))); \
	591	if((node) < 0) { \
	592	Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
	593	} else { \
	594	RExC_offsets[2*(node)-1] = (byte); \
	595	} \
	596	} \
	597	} STMT_END
	598
	599	#define Set_Node_Offset(node,byte) \
	600	Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	601	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
	602
	603	#define Set_Node_Length_To_R(node,len) STMT_START { \
	604	if (! SIZE_ONLY) { \
	605	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	606	__LINE__, (int)(node), (int)(len))); \
	607	if((node) < 0) { \
	608	Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
	609	} else { \
	610	RExC_offsets[2*(node)] = (len); \
	611	} \
	612	} \
	613	} STMT_END
	614
	615	#define Set_Node_Length(node,len) \
	616	Set_Node_Length_To_R((node)-RExC_emit_start, len)
	617	#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
	618	#define Set_Node_Cur_Length(node) \
	619	Set_Node_Length(node, RExC_parse - parse_start)
	620
	621	/* Get offsets and lengths */
	622	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	623	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
	624
	625	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	626	Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	627	Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	628	} STMT_END
	629	#endif
	630
	631	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	632	#define EXPERIMENTAL_INPLACESCAN
	633	#endif /PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS/
	634
	635	#define DEBUG_STUDYDATA(str,data,depth) \
	636	DEBUG_OPTIMISE_MORE_r(if(data){ \
	637	PerlIO_printf(Perl_debug_log, \
	638	"%*s" str "Pos:%"IVdf"/%"IVdf \
	639	" Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	640	(int)(depth)*2, "", \
	641	(IV)((data)->pos_min), \
	642	(IV)((data)->pos_delta), \
	643	(UV)((data)->flags), \
	644	(IV)((data)->whilem_c), \
	645	(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	646	is_inf ? "INF " : "" \
	647	); \
	648	if ((data)->last_found) \
	649	PerlIO_printf(Perl_debug_log, \
	650	"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	651	" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	652	SvPVX_const((data)->last_found), \
	653	(IV)((data)->last_end), \
	654	(IV)((data)->last_start_min), \
	655	(IV)((data)->last_start_max), \
	656	((data)->longest && \
	657	(data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	658	SvPVX_const((data)->longest_fixed), \
	659	(IV)((data)->offset_fixed), \
	660	((data)->longest && \
	661	(data)->longest==&((data)->longest_float)) ? "*" : "", \
	662	SvPVX_const((data)->longest_float), \
	663	(IV)((data)->offset_float_min), \
	664	(IV)((data)->offset_float_max) \
	665	); \
	666	PerlIO_printf(Perl_debug_log,"\n"); \
	667	});
	668
	669	static void clear_re(pTHX_ void *r);
	670
	671	/* Mark that we cannot extend a found fixed substring at this point.
	672	Update the longest found anchored substring and the longest found
	673	floating substrings if needed. */
	674
	675	STATIC void
	676	S_scan_commit(pTHX_ const RExC_state_t pRExC_state, scan_data_t data, I32 *minlenp, int is_inf)
	677	{
	678	const STRLEN l = CHR_SVLEN(data->last_found);
	679	const STRLEN old_l = CHR_SVLEN(*data->longest);
	680	GET_RE_DEBUG_FLAGS_DECL;
	681
	682	PERL_ARGS_ASSERT_SCAN_COMMIT;
	683
	684	if ((l >= old_l) && ((l > old_l) \|\| (data->flags & SF_BEFORE_EOL))) {
	685	SvSetMagicSV(*data->longest, data->last_found);
	686	if (*data->longest == data->longest_fixed) {
	687	data->offset_fixed = l ? data->last_start_min : data->pos_min;
	688	if (data->flags & SF_BEFORE_EOL)
	689	data->flags
	690	\|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
	691	else
	692	data->flags &= ~SF_FIX_BEFORE_EOL;
	693	data->minlen_fixed=minlenp;
	694	data->lookbehind_fixed=0;
	695	}
	696	else { /* data->longest == data->longest_float /
	697	data->offset_float_min = l ? data->last_start_min : data->pos_min;
	698	data->offset_float_max = (l
	699	? data->last_start_max
	700	: data->pos_min + data->pos_delta);
	701	if (is_inf \|\| (U32)data->offset_float_max > (U32)I32_MAX)
	702	data->offset_float_max = I32_MAX;
	703	if (data->flags & SF_BEFORE_EOL)
	704	data->flags
	705	\|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
	706	else
	707	data->flags &= ~SF_FL_BEFORE_EOL;
	708	data->minlen_float=minlenp;
	709	data->lookbehind_float=0;
	710	}
	711	}
	712	SvCUR_set(data->last_found, 0);
	713	{
	714	SV * const sv = data->last_found;
	715	if (SvUTF8(sv) && SvMAGICAL(sv)) {
	716	MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	717	if (mg)
	718	mg->mg_len = 0;
	719	}
	720	}
	721	data->last_end = -1;
	722	data->flags &= ~SF_BEFORE_EOL;
	723	DEBUG_STUDYDATA("commit: ",data,0);
	724	}
	725
	726	/* Can match anything (initialization) */
	727	STATIC void
	728	S_cl_anything(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	729	{
	730	PERL_ARGS_ASSERT_CL_ANYTHING;
	731
	732	ANYOF_BITMAP_SETALL(cl);
	733	cl->flags = ANYOF_CLASS\|ANYOF_EOS\|ANYOF_UNICODE_ALL
	734	\|ANYOF_LOC_NONBITMAP_FOLD\|ANYOF_NON_UTF8_LATIN1_ALL;
	735
	736	/* If any portion of the regex is to operate under locale rules,
	737	* initialization includes it. The reason this isn't done for all regexes
	738	* is that the optimizer was written under the assumption that locale was
	739	* all-or-nothing. Given the complexity and lack of documentation in the
	740	* optimizer, and that there are inadequate test cases for locale, so many
	741	* parts of it may not work properly, it is safest to avoid locale unless
	742	* necessary. */
	743	if (RExC_contains_locale) {
	744	ANYOF_CLASS_SETALL(cl); /* /l uses class */
	745	cl->flags \|= ANYOF_LOCALE;
	746	}
	747	else {
	748	ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
	749	}
	750	}
	751
	752	/* Can match anything (initialization) */
	753	STATIC int
	754	S_cl_is_anything(const struct regnode_charclass_class *cl)
	755	{
	756	int value;
	757
	758	PERL_ARGS_ASSERT_CL_IS_ANYTHING;
	759
	760	for (value = 0; value <= ANYOF_MAX; value += 2)
	761	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
	762	return 1;
	763	if (!(cl->flags & ANYOF_UNICODE_ALL))
	764	return 0;
	765	if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
	766	return 0;
	767	return 1;
	768	}
	769
	770	/* Can match anything (initialization) */
	771	STATIC void
	772	S_cl_init(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	773	{
	774	PERL_ARGS_ASSERT_CL_INIT;
	775
	776	Zero(cl, 1, struct regnode_charclass_class);
	777	cl->type = ANYOF;
	778	cl_anything(pRExC_state, cl);
	779	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	780	}
	781
	782	/* These two functions currently do the exact same thing */
	783	#define cl_init_zero S_cl_init
	784
	785	/* 'AND' a given class with another one. Can create false positives. 'cl'
	786	* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
	787	* 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
	788	STATIC void
	789	S_cl_and(struct regnode_charclass_class *cl,
	790	const struct regnode_charclass_class *and_with)
	791	{
	792	PERL_ARGS_ASSERT_CL_AND;
	793
	794	assert(and_with->type == ANYOF);
	795
	796	/* I (khw) am not sure all these restrictions are necessary XXX */
	797	if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
	798	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
	799	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	800	&& !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	801	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
	802	int i;
	803
	804	if (and_with->flags & ANYOF_INVERT)
	805	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	806	cl->bitmap[i] &= ~and_with->bitmap[i];
	807	else
	808	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	809	cl->bitmap[i] &= and_with->bitmap[i];
	810	} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
	811
	812	if (and_with->flags & ANYOF_INVERT) {
	813
	814	/* Here, the and'ed node is inverted. Get the AND of the flags that
	815	* aren't affected by the inversion. Those that are affected are
	816	* handled individually below */
	817	U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
	818	cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
	819	cl->flags \|= affected_flags;
	820
	821	/* We currently don't know how to deal with things that aren't in the
	822	* bitmap, but we know that the intersection is no greater than what
	823	* is already in cl, so let there be false positives that get sorted
	824	* out after the synthetic start class succeeds, and the node is
	825	* matched for real. */
	826
	827	/* The inversion of these two flags indicate that the resulting
	828	* intersection doesn't have them */
	829	if (and_with->flags & ANYOF_UNICODE_ALL) {
	830	cl->flags &= ~ANYOF_UNICODE_ALL;
	831	}
	832	if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	833	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
	834	}
	835	}
	836	else { /* and'd node is not inverted */
	837	U8 outside_bitmap_but_not_utf8; /* Temp variable */
	838
	839	if (! ANYOF_NONBITMAP(and_with)) {
	840
	841	/* Here 'and_with' doesn't match anything outside the bitmap
	842	* (except possibly ANYOF_UNICODE_ALL), which means the
	843	* intersection can't either, except for ANYOF_UNICODE_ALL, in
	844	* which case we don't know what the intersection is, but it's no
	845	* greater than what cl already has, so can just leave it alone,
	846	* with possible false positives */
	847	if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
	848	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	849	cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
	850	}
	851	}
	852	else if (! ANYOF_NONBITMAP(cl)) {
	853
	854	/* Here, 'and_with' does match something outside the bitmap, and cl
	855	* doesn't have a list of things to match outside the bitmap. If
	856	* cl can match all code points above 255, the intersection will
	857	* be those above-255 code points that 'and_with' matches. If cl
	858	* can't match all Unicode code points, it means that it can't
	859	* match anything outside the bitmap (since the 'if' that got us
	860	* into this block tested for that), so we leave the bitmap empty.
	861	*/
	862	if (cl->flags & ANYOF_UNICODE_ALL) {
	863	ARG_SET(cl, ARG(and_with));
	864
	865	/* and_with's ARG may match things that don't require UTF8.
	866	* And now cl's will too, in spite of this being an 'and'. See
	867	* the comments below about the kludge */
	868	cl->flags \|= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
	869	}
	870	}
	871	else {
	872	/* Here, both 'and_with' and cl match something outside the
	873	* bitmap. Currently we do not do the intersection, so just match
	874	* whatever cl had at the beginning. */
	875	}
	876
	877
	878	/* Take the intersection of the two sets of flags. However, the
	879	* ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
	880	* kludge around the fact that this flag is not treated like the others
	881	* which are initialized in cl_anything(). The way the optimizer works
	882	* is that the synthetic start class (SSC) is initialized to match
	883	* anything, and then the first time a real node is encountered, its
	884	* values are AND'd with the SSC's with the result being the values of
	885	* the real node. However, there are paths through the optimizer where
	886	* the AND never gets called, so those initialized bits are set
	887	* inappropriately, which is not usually a big deal, as they just cause
	888	* false positives in the SSC, which will just mean a probably
	889	* imperceptible slow down in execution. However this bit has a
	890	* higher false positive consequence in that it can cause utf8.pm,
	891	* utf8_heavy.pl ... to be loaded when not necessary, which is a much
	892	* bigger slowdown and also causes significant extra memory to be used.
	893	* In order to prevent this, the code now takes a different tack. The
	894	* bit isn't set unless some part of the regular expression needs it,
	895	* but once set it won't get cleared. This means that these extra
	896	* modules won't get loaded unless there was some path through the
	897	* pattern that would have required them anyway, and so any false
	898	* positives that occur by not ANDing them out when they could be
	899	* aren't as severe as they would be if we treated this bit like all
	900	* the others */
	901	outside_bitmap_but_not_utf8 = (cl->flags \| and_with->flags)
	902	& ANYOF_NONBITMAP_NON_UTF8;
	903	cl->flags &= and_with->flags;
	904	cl->flags \|= outside_bitmap_but_not_utf8;
	905	}
	906	}
	907
	908	/* 'OR' a given class with another one. Can create false positives. 'cl'
	909	* should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
	910	* 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
	911	STATIC void
	912	S_cl_or(const RExC_state_t pRExC_state, struct regnode_charclass_class cl, const struct regnode_charclass_class *or_with)
	913	{
	914	PERL_ARGS_ASSERT_CL_OR;
	915
	916	if (or_with->flags & ANYOF_INVERT) {
	917
	918	/* Here, the or'd node is to be inverted. This means we take the
	919	* complement of everything not in the bitmap, but currently we don't
	920	* know what that is, so give up and match anything */
	921	if (ANYOF_NONBITMAP(or_with)) {
	922	cl_anything(pRExC_state, cl);
	923	}
	924	/* We do not use
	925	* (B1 \| CL1) \| (!B2 & !CL2) = (B1 \| !B2 & !CL2) \| (CL1 \| (!B2 & !CL2))
	926	* <= (B1 \| !B2) \| (CL1 \| !CL2)
	927	* which is wasteful if CL2 is small, but we ignore CL2:
	928	* (B1 \| CL1) \| (!B2 & !CL2) <= (B1 \| CL1) \| !B2 = (B1 \| !B2) \| CL1
	929	* XXXX Can we handle case-fold? Unclear:
	930	* (OK1(i) \| OK1(i')) \| !(OK1(i) \| OK1(i')) =
	931	* (OK1(i) \| OK1(i')) \| (!OK1(i) & !OK1(i'))
	932	*/
	933	else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	934	&& !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	935	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
	936	int i;
	937
	938	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	939	cl->bitmap[i] \|= ~or_with->bitmap[i];
	940	} /* XXXX: logic is complicated otherwise */
	941	else {
	942	cl_anything(pRExC_state, cl);
	943	}
	944
	945	/* And, we can just take the union of the flags that aren't affected
	946	* by the inversion */
	947	cl->flags \|= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
	948
	949	/* For the remaining flags:
	950	ANYOF_UNICODE_ALL and inverted means to not match anything above
	951	255, which means that the union with cl should just be
	952	what cl has in it, so can ignore this flag
	953	ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
	954	is 127-255 to match them, but then invert that, so the
	955	union with cl should just be what cl has in it, so can
	956	ignore this flag
	957	*/
	958	} else { /* 'or_with' is not inverted */
	959	/* (B1 \| CL1) \| (B2 \| CL2) = (B1 \| B2) \| (CL1 \| CL2)) */
	960	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	961	&& (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	962	\|\| (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
	963	int i;
	964
	965	/* OR char bitmap and class bitmap separately */
	966	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	967	cl->bitmap[i] \|= or_with->bitmap[i];
	968	if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
	969	for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
	970	cl->classflags[i] \|= or_with->classflags[i];
	971	cl->flags \|= ANYOF_CLASS;
	972	}
	973	}
	974	else { /* XXXX: logic is complicated, leave it along for a moment. */
	975	cl_anything(pRExC_state, cl);
	976	}
	977
	978	if (ANYOF_NONBITMAP(or_with)) {
	979
	980	/* Use the added node's outside-the-bit-map match if there isn't a
	981	* conflict. If there is a conflict (both nodes match something
	982	* outside the bitmap, but what they match outside is not the same
	983	* pointer, and hence not easily compared until XXX we extend
	984	* inversion lists this far), give up and allow the start class to
	985	* match everything outside the bitmap. If that stuff is all above
	986	* 255, can just set UNICODE_ALL, otherwise caould be anything. */
	987	if (! ANYOF_NONBITMAP(cl)) {
	988	ARG_SET(cl, ARG(or_with));
	989	}
	990	else if (ARG(cl) != ARG(or_with)) {
	991
	992	if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
	993	cl_anything(pRExC_state, cl);
	994	}
	995	else {
	996	cl->flags \|= ANYOF_UNICODE_ALL;
	997	}
	998	}
	999	}
	1000
	1001	/* Take the union */
	1002	cl->flags \|= or_with->flags;
	1003	}
	1004	}
	1005
	1006	#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
	1007	#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
	1008	#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
	1009	#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
	1010
	1011
	1012	#ifdef DEBUGGING
	1013	/*
	1014	dump_trie(trie,widecharmap,revcharmap)
	1015	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	1016	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	1017
	1018	These routines dump out a trie in a somewhat readable format.
	1019	The _interim_ variants are used for debugging the interim
	1020	tables that are used to generate the final compressed
	1021	representation which is what dump_trie expects.
	1022
	1023	Part of the reason for their existence is to provide a form
	1024	of documentation as to how the different representations function.
	1025
	1026	*/
	1027
	1028	/*
	1029	Dumps the final compressed table form of the trie to Perl_debug_log.
	1030	Used for debugging make_trie().
	1031	*/
	1032
	1033	STATIC void
	1034	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	1035	AV *revcharmap, U32 depth)
	1036	{
	1037	U32 state;
	1038	SV *sv=sv_newmortal();
	1039	int colwidth= widecharmap ? 6 : 4;
	1040	U16 word;
	1041	GET_RE_DEBUG_FLAGS_DECL;
	1042
	1043	PERL_ARGS_ASSERT_DUMP_TRIE;
	1044
	1045	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	1046	(int)depth * 2 + 2,"",
	1047	"Match","Base","Ofs" );
	1048
	1049	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	1050	SV ** const tmp = av_fetch( revcharmap, state, 0);
	1051	if ( tmp ) {
	1052	PerlIO_printf( Perl_debug_log, "%*s",
	1053	colwidth,
	1054	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1055	PL_colors[0], PL_colors[1],
	1056	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1057	PERL_PV_ESCAPE_FIRSTCHAR
	1058	)
	1059	);
	1060	}
	1061	}
	1062	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	1063	(int)depth * 2 + 2,"");
	1064
	1065	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	1066	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	1067	PerlIO_printf( Perl_debug_log, "\n");
	1068
	1069	for( state = 1 ; state < trie->statecount ; state++ ) {
	1070	const U32 base = trie->states[ state ].trans.base;
	1071
	1072	PerlIO_printf( Perl_debug_log, "%s#%4"UVXf"\|", (int)depth 2 + 2,"", (UV)state);
	1073
	1074	if ( trie->states[ state ].wordnum ) {
	1075	PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
	1076	} else {
	1077	PerlIO_printf( Perl_debug_log, "%6s", "" );
	1078	}
	1079
	1080	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	1081
	1082	if ( base ) {
	1083	U32 ofs = 0;
	1084
	1085	while( ( base + ofs < trie->uniquecharcount ) \|\|
	1086	( base + ofs - trie->uniquecharcount < trie->lasttrans
	1087	&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
	1088	ofs++;
	1089
	1090	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	1091
	1092	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1093	if ( ( base + ofs >= trie->uniquecharcount ) &&
	1094	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	1095	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	1096	{
	1097	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	1098	colwidth,
	1099	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
	1100	} else {
	1101	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	1102	}
	1103	}
	1104
	1105	PerlIO_printf( Perl_debug_log, "]");
	1106
	1107	}
	1108	PerlIO_printf( Perl_debug_log, "\n" );
	1109	}
	1110	PerlIO_printf(Perl_debug_log, "%sword_info N:(prev,len)=", (int)depth2, "");
	1111	for (word=1; word <= trie->wordcount; word++) {
	1112	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	1113	(int)word, (int)(trie->wordinfo[word].prev),
	1114	(int)(trie->wordinfo[word].len));
	1115	}
	1116	PerlIO_printf(Perl_debug_log, "\n" );
	1117	}
	1118	/*
	1119	Dumps a fully constructed but uncompressed trie in list form.
	1120	List tries normally only are used for construction when the number of
	1121	possible chars (trie->uniquecharcount) is very high.
	1122	Used for debugging make_trie().
	1123	*/
	1124	STATIC void
	1125	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	1126	HV widecharmap, AV revcharmap, U32 next_alloc,
	1127	U32 depth)
	1128	{
	1129	U32 state;
	1130	SV *sv=sv_newmortal();
	1131	int colwidth= widecharmap ? 6 : 4;
	1132	GET_RE_DEBUG_FLAGS_DECL;
	1133
	1134	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1135
	1136	/* print out the table precompression. */
	1137	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1138	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1139	"------:-----+-----------------\n" );
	1140
	1141	for( state=1 ; state < next_alloc ; state ++ ) {
	1142	U16 charid;
	1143
	1144	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1145	(int)depth * 2 + 2,"", (UV)state );
	1146	if ( ! trie->states[ state ].wordnum ) {
	1147	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1148	} else {
	1149	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1150	trie->states[ state ].wordnum
	1151	);
	1152	}
	1153	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1154	SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
	1155	if ( tmp ) {
	1156	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1157	colwidth,
	1158	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1159	PL_colors[0], PL_colors[1],
	1160	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1161	PERL_PV_ESCAPE_FIRSTCHAR
	1162	) ,
	1163	TRIE_LIST_ITEM(state,charid).forid,
	1164	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1165	);
	1166	if (!(charid % 10))
	1167	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1168	(int)((depth * 2) + 14), "");
	1169	}
	1170	}
	1171	PerlIO_printf( Perl_debug_log, "\n");
	1172	}
	1173	}
	1174
	1175	/*
	1176	Dumps a fully constructed but uncompressed trie in table form.
	1177	This is the normal DFA style state transition table, with a few
	1178	twists to facilitate compression later.
	1179	Used for debugging make_trie().
	1180	*/
	1181	STATIC void
	1182	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1183	HV widecharmap, AV revcharmap, U32 next_alloc,
	1184	U32 depth)
	1185	{
	1186	U32 state;
	1187	U16 charid;
	1188	SV *sv=sv_newmortal();
	1189	int colwidth= widecharmap ? 6 : 4;
	1190	GET_RE_DEBUG_FLAGS_DECL;
	1191
	1192	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1193
	1194	/*
	1195	print out the table precompression so that we can do a visual check
	1196	that they are identical.
	1197	*/
	1198
	1199	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1200
	1201	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1202	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1203	if ( tmp ) {
	1204	PerlIO_printf( Perl_debug_log, "%*s",
	1205	colwidth,
	1206	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1207	PL_colors[0], PL_colors[1],
	1208	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1209	PERL_PV_ESCAPE_FIRSTCHAR
	1210	)
	1211	);
	1212	}
	1213	}
	1214
	1215	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1216
	1217	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1218	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1219	}
	1220
	1221	PerlIO_printf( Perl_debug_log, "\n" );
	1222
	1223	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1224
	1225	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1226	(int)depth * 2 + 2,"",
	1227	(UV)TRIE_NODENUM( state ) );
	1228
	1229	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1230	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1231	if (v)
	1232	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1233	else
	1234	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1235	}
	1236	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1237	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
	1238	} else {
	1239	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
	1240	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1241	}
	1242	}
	1243	}
	1244
	1245	#endif
	1246
	1247
	1248	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1249	startbranch: the first branch in the whole branch sequence
	1250	first : start branch of sequence of branch-exact nodes.
	1251	May be the same as startbranch
	1252	last : Thing following the last branch.
	1253	May be the same as tail.
	1254	tail : item following the branch sequence
	1255	count : words in the sequence
	1256	flags : currently the OP() type we will be building one of /EXACT(\|F\|Fl)/
	1257	depth : indent depth
	1258
	1259	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1260
	1261	A trie is an N'ary tree where the branches are determined by digital
	1262	decomposition of the key. IE, at the root node you look up the 1st character and
	1263	follow that branch repeat until you find the end of the branches. Nodes can be
	1264	marked as "accepting" meaning they represent a complete word. Eg:
	1265
	1266	/he\|she\|his\|hers/
	1267
	1268	would convert into the following structure. Numbers represent states, letters
	1269	following numbers represent valid transitions on the letter from that state, if
	1270	the number is in square brackets it represents an accepting state, otherwise it
	1271	will be in parenthesis.
	1272
	1273	+-h->+-e->[3]-+-r->(8)-+-s->[9]
	1274	\| \|
	1275	\| (2)
	1276	\| \|
	1277	(1) +-i->(6)-+-s->[7]
	1278	\|
	1279	+-s->(3)-+-h->(4)-+-e->[5]
	1280
	1281	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1282
	1283	This shows that when matching against the string 'hers' we will begin at state 1
	1284	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1285	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1286	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1287	single traverse. We store a mapping from accepting to state to which word was
	1288	matched, and then when we have multiple possibilities we try to complete the
	1289	rest of the regex in the order in which they occured in the alternation.
	1290
	1291	The only prior NFA like behaviour that would be changed by the TRIE support is
	1292	the silent ignoring of duplicate alternations which are of the form:
	1293
	1294	/ (DUPE\|DUPE) X? (?{ ... }) Y /x
	1295
	1296	Thus EVAL blocks following a trie may be called a different number of times with
	1297	and without the optimisation. With the optimisations dupes will be silently
	1298	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1299	the following demonstrates:
	1300
	1301	'words'=~/(word\|word\|word)(?{ print $1 })[xyz]/
	1302
	1303	which prints out 'word' three times, but
	1304
	1305	'words'=~/(word\|word\|word)(?{ print $1 })S/
	1306
	1307	which doesnt print it out at all. This is due to other optimisations kicking in.
	1308
	1309	Example of what happens on a structural level:
	1310
	1311	The regexp /(ac\|ad\|ab)+/ will produce the following debug output:
	1312
	1313	1: CURLYM[1] {1,32767}(18)
	1314	5: BRANCH(8)
	1315	6: EXACT <ac>(16)
	1316	8: BRANCH(11)
	1317	9: EXACT <ad>(16)
	1318	11: BRANCH(14)
	1319	12: EXACT <ab>(16)
	1320	16: SUCCEED(0)
	1321	17: NOTHING(18)
	1322	18: END(0)
	1323
	1324	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1325	and should turn into:
	1326
	1327	1: CURLYM[1] {1,32767}(18)
	1328	5: TRIE(16)
	1329	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1330	<ac>
	1331	<ad>
	1332	<ab>
	1333	16: SUCCEED(0)
	1334	17: NOTHING(18)
	1335	18: END(0)
	1336
	1337	Cases where tail != last would be like /(?foo\|bar)baz/:
	1338
	1339	1: BRANCH(4)
	1340	2: EXACT <foo>(8)
	1341	4: BRANCH(7)
	1342	5: EXACT <bar>(8)
	1343	7: TAIL(8)
	1344	8: EXACT <baz>(10)
	1345	10: END(0)
	1346
	1347	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1348	and would end up looking like:
	1349
	1350	1: TRIE(8)
	1351	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1352	<foo>
	1353	<bar>
	1354	7: TAIL(8)
	1355	8: EXACT <baz>(10)
	1356	10: END(0)
	1357
	1358	d = uvuni_to_utf8_flags(d, uv, 0);
	1359
	1360	is the recommended Unicode-aware way of saying
	1361
	1362	*(d++) = uv;
	1363	*/
	1364
	1365	#define TRIE_STORE_REVCHAR \
	1366	STMT_START { \
	1367	if (UTF) { \
	1368	SV *zlopp = newSV(2); \
	1369	unsigned char flrbbbbb = (unsigned char ) SvPVX(zlopp); \
	1370	unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
	1371	SvCUR_set(zlopp, kapow - flrbbbbb); \
	1372	SvPOK_on(zlopp); \
	1373	SvUTF8_on(zlopp); \
	1374	av_push(revcharmap, zlopp); \
	1375	} else { \
	1376	char ooooff = (char)uvc; \
	1377	av_push(revcharmap, newSVpvn(&ooooff, 1)); \
	1378	} \
	1379	} STMT_END
	1380
	1381	#define TRIE_READ_CHAR STMT_START { \
	1382	wordlen++; \
	1383	if ( UTF ) { \
	1384	if ( folder ) { \
	1385	if ( foldlen > 0 ) { \
	1386	uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
	1387	foldlen -= len; \
	1388	scan += len; \
	1389	len = 0; \
	1390	} else { \
	1391	len = UTF8SKIP(uc);\
	1392	uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
	1393	foldlen -= UNISKIP( uvc ); \
	1394	scan = foldbuf + UNISKIP( uvc ); \
	1395	} \
	1396	} else { \
	1397	uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
	1398	} \
	1399	} else { \
	1400	uvc = (U32)*uc; \
	1401	len = 1; \
	1402	} \
	1403	} STMT_END
	1404
	1405
	1406
	1407	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
	1408	if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
	1409	U32 ging = TRIE_LIST_LEN( state ) *= 2; \
	1410	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
	1411	} \
	1412	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
	1413	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
	1414	TRIE_LIST_CUR( state )++; \
	1415	} STMT_END
	1416
	1417	#define TRIE_LIST_NEW(state) STMT_START { \
	1418	Newxz( trie->states[ state ].trans.list, \
	1419	4, reg_trie_trans_le ); \
	1420	TRIE_LIST_CUR( state ) = 1; \
	1421	TRIE_LIST_LEN( state ) = 4; \
	1422	} STMT_END
	1423
	1424	#define TRIE_HANDLE_WORD(state) STMT_START { \
	1425	U16 dupe= trie->states[ state ].wordnum; \
	1426	regnode * const noper_next = regnext( noper ); \
	1427	\
	1428	DEBUG_r({ \
	1429	/* store the word for dumping */ \
	1430	SV* tmp; \
	1431	if (OP(noper) != NOTHING) \
	1432	tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
	1433	else \
	1434	tmp = newSVpvn_utf8( "", 0, UTF ); \
	1435	av_push( trie_words, tmp ); \
	1436	}); \
	1437	\
	1438	curword++; \
	1439	trie->wordinfo[curword].prev = 0; \
	1440	trie->wordinfo[curword].len = wordlen; \
	1441	trie->wordinfo[curword].accept = state; \
	1442	\
	1443	if ( noper_next < tail ) { \
	1444	if (!trie->jump) \
	1445	trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
	1446	trie->jump[curword] = (U16)(noper_next - convert); \
	1447	if (!jumper) \
	1448	jumper = noper_next; \
	1449	if (!nextbranch) \
	1450	nextbranch= regnext(cur); \
	1451	} \
	1452	\
	1453	if ( dupe ) { \
	1454	/* It's a dupe. Pre-insert into the wordinfo[].prev */\
	1455	/* chain, so that when the bits of chain are later */\
	1456	/* linked together, the dups appear in the chain */\
	1457	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
	1458	trie->wordinfo[dupe].prev = curword; \
	1459	} else { \
	1460	/* we haven't inserted this word yet. */ \
	1461	trie->states[ state ].wordnum = curword; \
	1462	} \
	1463	} STMT_END
	1464
	1465
	1466	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
	1467	( ( base + charid >= ucharcount \
	1468	&& base + charid < ubound \
	1469	&& state == trie->trans[ base - ucharcount + charid ].check \
	1470	&& trie->trans[ base - ucharcount + charid ].next ) \
	1471	? trie->trans[ base - ucharcount + charid ].next \
	1472	: ( state==1 ? special : 0 ) \
	1473	)
	1474
	1475	#define MADE_TRIE 1
	1476	#define MADE_JUMP_TRIE 2
	1477	#define MADE_EXACT_TRIE 4
	1478
	1479	STATIC I32
	1480	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch, regnode first, regnode last, regnode *tail, U32 word_count, U32 flags, U32 depth)
	1481	{
	1482	dVAR;
	1483	/* first pass, loop through and scan words */
	1484	reg_trie_data *trie;
	1485	HV *widecharmap = NULL;
	1486	AV *revcharmap = newAV();
	1487	regnode *cur;
	1488	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1489	STRLEN len = 0;
	1490	UV uvc = 0;
	1491	U16 curword = 0;
	1492	U32 next_alloc = 0;
	1493	regnode *jumper = NULL;
	1494	regnode *nextbranch = NULL;
	1495	regnode *convert = NULL;
	1496	U32 prev_states; / temp array mapping each state to previous one */
	1497	/* we just use folder as a flag in utf8 */
	1498	const U8 * folder = NULL;
	1499
	1500	#ifdef DEBUGGING
	1501	const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
	1502	AV *trie_words = NULL;
	1503	/* along with revcharmap, this only used during construction but both are
	1504	* useful during debugging so we store them in the struct when debugging.
	1505	*/
	1506	#else
	1507	const U32 data_slot = add_data( pRExC_state, 2, "tu" );
	1508	STRLEN trie_charcount=0;
	1509	#endif
	1510	SV *re_trie_maxbuff;
	1511	GET_RE_DEBUG_FLAGS_DECL;
	1512
	1513	PERL_ARGS_ASSERT_MAKE_TRIE;
	1514	#ifndef DEBUGGING
	1515	PERL_UNUSED_ARG(depth);
	1516	#endif
	1517
	1518	switch (flags) {
	1519	case EXACT: break;
	1520	case EXACTFA:
	1521	case EXACTFU: folder = PL_fold_latin1; break;
	1522	case EXACTF: folder = PL_fold; break;
	1523	case EXACTFL: folder = PL_fold_locale; break;
	1524	default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
	1525	}
	1526
	1527	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1528	trie->refcount = 1;
	1529	trie->startstate = 1;
	1530	trie->wordcount = word_count;
	1531	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1532	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1533	if (!(UTF && folder))
	1534	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1535	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1536	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1537
	1538	DEBUG_r({
	1539	trie_words = newAV();
	1540	});
	1541
	1542	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1543	if (!SvIOK(re_trie_maxbuff)) {
	1544	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1545	}
	1546	DEBUG_OPTIMISE_r({
	1547	PerlIO_printf( Perl_debug_log,
	1548	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	1549	(int)depth * 2 + 2, "",
	1550	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	1551	REG_NODE_NUM(last), REG_NODE_NUM(tail),
	1552	(int)depth);
	1553	});
	1554
	1555	/* Find the node we are going to overwrite */
	1556	if ( first == startbranch && OP( last ) != BRANCH ) {
	1557	/* whole branch chain */
	1558	convert = first;
	1559	} else {
	1560	/* branch sub-chain */
	1561	convert = NEXTOPER( first );
	1562	}
	1563
	1564	/* -- First loop and Setup --
	1565
	1566	We first traverse the branches and scan each word to determine if it
	1567	contains widechars, and how many unique chars there are, this is
	1568	important as we have to build a table with at least as many columns as we
	1569	have unique chars.
	1570
	1571	We use an array of integers to represent the character codes 0..255
	1572	(trie->charmap) and we use a an HV* to store Unicode characters. We use the
	1573	native representation of the character value as the key and IV's for the
	1574	coded index.
	1575
	1576	TODO If we keep track of how many times each character is used we can
	1577	remap the columns so that the table compression later on is more
	1578	efficient in terms of memory by ensuring the most common value is in the
	1579	middle and the least common are on the outside. IMO this would be better
	1580	than a most to least common mapping as theres a decent chance the most
	1581	common letter will share a node with the least common, meaning the node
	1582	will not be compressible. With a middle is most common approach the worst
	1583	case is when we have the least common nodes twice.
	1584
	1585	*/
	1586
	1587	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1588	regnode * const noper = NEXTOPER( cur );
	1589	const U8 uc = (U8)STRING( noper );
	1590	const U8 * const e = uc + STR_LEN( noper );
	1591	STRLEN foldlen = 0;
	1592	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1593	const U8 scan = (U8)NULL;
	1594	U32 wordlen = 0; /* required init */
	1595	STRLEN chars = 0;
	1596	bool set_bit = trie->bitmap ? 1 : 0; /store the first char in the bitmap?/
	1597
	1598	if (OP(noper) == NOTHING) {
	1599	trie->minlen= 0;
	1600	continue;
	1601	}
	1602	if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
	1603	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	1604	regardless of encoding */
	1605
	1606	for ( ; uc < e ; uc += len ) {
	1607	TRIE_CHARCOUNT(trie)++;
	1608	TRIE_READ_CHAR;
	1609	chars++;
	1610	if ( uvc < 256 ) {
	1611	if ( !trie->charmap[ uvc ] ) {
	1612	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	1613	if ( folder )
	1614	trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
	1615	TRIE_STORE_REVCHAR;
	1616	}
	1617	if ( set_bit ) {
	1618	/* store the codepoint in the bitmap, and its folded
	1619	* equivalent. */
	1620	TRIE_BITMAP_SET(trie,uvc);
	1621
	1622	/* store the folded codepoint */
	1623	if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
	1624
	1625	if ( !UTF ) {
	1626	/* store first byte of utf8 representation of
	1627	variant codepoints */
	1628	if (! UNI_IS_INVARIANT(uvc)) {
	1629	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	1630	}
	1631	}
	1632	set_bit = 0; /* We've done our bit :-) */
	1633	}
	1634	} else {
	1635	SV** svpp;
	1636	if ( !widecharmap )
	1637	widecharmap = newHV();
	1638
	1639	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	1640
	1641	if ( !svpp )
	1642	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	1643
	1644	if ( !SvTRUE( *svpp ) ) {
	1645	sv_setiv( *svpp, ++trie->uniquecharcount );
	1646	TRIE_STORE_REVCHAR;
	1647	}
	1648	}
	1649	}
	1650	if( cur == first ) {
	1651	trie->minlen=chars;
	1652	trie->maxlen=chars;
	1653	} else if (chars < trie->minlen) {
	1654	trie->minlen=chars;
	1655	} else if (chars > trie->maxlen) {
	1656	trie->maxlen=chars;
	1657	}
	1658
	1659	} /* end first pass */
	1660	DEBUG_TRIE_COMPILE_r(
	1661	PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	1662	(int)depth * 2 + 2,"",
	1663	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	1664	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	1665	(int)trie->minlen, (int)trie->maxlen )
	1666	);
	1667
	1668	/*
	1669	We now know what we are dealing with in terms of unique chars and
	1670	string sizes so we can calculate how much memory a naive
	1671	representation using a flat table will take. If it's over a reasonable
	1672	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	1673	conservative but potentially much slower representation using an array
	1674	of lists.
	1675
	1676	At the end we convert both representations into the same compressed
	1677	form that will be used in regexec.c for matching with. The latter
	1678	is a form that cannot be used to construct with but has memory
	1679	properties similar to the list form and access properties similar
	1680	to the table form making it both suitable for fast searches and
	1681	small enough that its feasable to store for the duration of a program.
	1682
	1683	See the comment in the code where the compressed table is produced
	1684	inplace from the flat tabe representation for an explanation of how
	1685	the compression works.
	1686
	1687	*/
	1688
	1689
	1690	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	1691	prev_states[1] = 0;
	1692
	1693	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
	1694	/*
	1695	Second Pass -- Array Of Lists Representation
	1696
	1697	Each state will be represented by a list of charid:state records
	1698	(reg_trie_trans_le) the first such element holds the CUR and LEN
	1699	points of the allocated array. (See defines above).
	1700
	1701	We build the initial structure using the lists, and then convert
	1702	it into the compressed table form which allows faster lookups
	1703	(but cant be modified once converted).
	1704	*/
	1705
	1706	STRLEN transcount = 1;
	1707
	1708	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1709	"%*sCompiling trie using list compiler\n",
	1710	(int)depth * 2 + 2, ""));
	1711
	1712	trie->states = (reg_trie_state *)
	1713	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1714	sizeof(reg_trie_state) );
	1715	TRIE_LIST_NEW(1);
	1716	next_alloc = 2;
	1717
	1718	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1719
	1720	regnode * const noper = NEXTOPER( cur );
	1721	U8 uc = (U8)STRING( noper );
	1722	const U8 * const e = uc + STR_LEN( noper );
	1723	U32 state = 1; /* required init */
	1724	U16 charid = 0; /* sanity init */
	1725	U8 scan = (U8)NULL; /* sanity init */
	1726	STRLEN foldlen = 0; /* required init */
	1727	U32 wordlen = 0; /* required init */
	1728	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1729
	1730	if (OP(noper) != NOTHING) {
	1731	for ( ; uc < e ; uc += len ) {
	1732
	1733	TRIE_READ_CHAR;
	1734
	1735	if ( uvc < 256 ) {
	1736	charid = trie->charmap[ uvc ];
	1737	} else {
	1738	SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1739	if ( !svpp ) {
	1740	charid = 0;
	1741	} else {
	1742	charid=(U16)SvIV( *svpp );
	1743	}
	1744	}
	1745	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1746	if ( charid ) {
	1747
	1748	U16 check;
	1749	U32 newstate = 0;
	1750
	1751	charid--;
	1752	if ( !trie->states[ state ].trans.list ) {
	1753	TRIE_LIST_NEW( state );
	1754	}
	1755	for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
	1756	if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
	1757	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	1758	break;
	1759	}
	1760	}
	1761	if ( ! newstate ) {
	1762	newstate = next_alloc++;
	1763	prev_states[newstate] = state;
	1764	TRIE_LIST_PUSH( state, charid, newstate );
	1765	transcount++;
	1766	}
	1767	state = newstate;
	1768	} else {
	1769	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1770	}
	1771	}
	1772	}
	1773	TRIE_HANDLE_WORD(state);
	1774
	1775	} /* end second pass */
	1776
	1777	/* next alloc is the NEXT state to be allocated */
	1778	trie->statecount = next_alloc;
	1779	trie->states = (reg_trie_state *)
	1780	PerlMemShared_realloc( trie->states,
	1781	next_alloc
	1782	* sizeof(reg_trie_state) );
	1783
	1784	/* and now dump it out before we compress it */
	1785	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	1786	revcharmap, next_alloc,
	1787	depth+1)
	1788	);
	1789
	1790	trie->trans = (reg_trie_trans *)
	1791	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	1792	{
	1793	U32 state;
	1794	U32 tp = 0;
	1795	U32 zp = 0;
	1796
	1797
	1798	for( state=1 ; state < next_alloc ; state ++ ) {
	1799	U32 base=0;
	1800
	1801	/*
	1802	DEBUG_TRIE_COMPILE_MORE_r(
	1803	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	1804	);
	1805	*/
	1806
	1807	if (trie->states[state].trans.list) {
	1808	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1809	U16 maxid=minid;
	1810	U16 idx;
	1811
	1812	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1813	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1814	if ( forid < minid ) {
	1815	minid=forid;
	1816	} else if ( forid > maxid ) {
	1817	maxid=forid;
	1818	}
	1819	}
	1820	if ( transcount < tp + maxid - minid + 1) {
	1821	transcount *= 2;
	1822	trie->trans = (reg_trie_trans *)
	1823	PerlMemShared_realloc( trie->trans,
	1824	transcount
	1825	* sizeof(reg_trie_trans) );
	1826	Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
	1827	}
	1828	base = trie->uniquecharcount + tp - minid;
	1829	if ( maxid == minid ) {
	1830	U32 set = 0;
	1831	for ( ; zp < tp ; zp++ ) {
	1832	if ( ! trie->trans[ zp ].next ) {
	1833	base = trie->uniquecharcount + zp - minid;
	1834	trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1835	trie->trans[ zp ].check = state;
	1836	set = 1;
	1837	break;
	1838	}
	1839	}
	1840	if ( !set ) {
	1841	trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1842	trie->trans[ tp ].check = state;
	1843	tp++;
	1844	zp = tp;
	1845	}
	1846	} else {
	1847	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1848	const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
	1849	trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
	1850	trie->trans[ tid ].check = state;
	1851	}
	1852	tp += ( maxid - minid + 1 );
	1853	}
	1854	Safefree(trie->states[ state ].trans.list);
	1855	}
	1856	/*
	1857	DEBUG_TRIE_COMPILE_MORE_r(
	1858	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	1859	);
	1860	*/
	1861	trie->states[ state ].trans.base=base;
	1862	}
	1863	trie->lasttrans = tp + 1;
	1864	}
	1865	} else {
	1866	/*
	1867	Second Pass -- Flat Table Representation.
	1868
	1869	we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
	1870	We know that we will need Charcount+1 trans at most to store the data
	1871	(one row per char at worst case) So we preallocate both structures
	1872	assuming worst case.
	1873
	1874	We then construct the trie using only the .next slots of the entry
	1875	structs.
	1876
	1877	We use the .check field of the first entry of the node temporarily to
	1878	make compression both faster and easier by keeping track of how many non
	1879	zero fields are in the node.
	1880
	1881	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	1882	transition.
	1883
	1884	There are two terms at use here: state as a TRIE_NODEIDX() which is a
	1885	number representing the first entry of the node, and state as a
	1886	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
	1887	TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
	1888	are 2 entrys per node. eg:
	1889
	1890	A B A B
	1891	1. 2 4 1. 3 7
	1892	2. 0 3 3. 0 5
	1893	3. 0 0 5. 0 0
	1894	4. 0 0 7. 0 0
	1895
	1896	The table is internally in the right hand, idx form. However as we also
	1897	have to deal with the states array which is indexed by nodenum we have to
	1898	use TRIE_NODENUM() to convert.
	1899
	1900	*/
	1901	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1902	"%*sCompiling trie using table compiler\n",
	1903	(int)depth * 2 + 2, ""));
	1904
	1905	trie->trans = (reg_trie_trans *)
	1906	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	1907	* trie->uniquecharcount + 1,
	1908	sizeof(reg_trie_trans) );
	1909	trie->states = (reg_trie_state *)
	1910	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1911	sizeof(reg_trie_state) );
	1912	next_alloc = trie->uniquecharcount + 1;
	1913
	1914
	1915	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1916
	1917	regnode * const noper = NEXTOPER( cur );
	1918	const U8 uc = (U8)STRING( noper );
	1919	const U8 * const e = uc + STR_LEN( noper );
	1920
	1921	U32 state = 1; /* required init */
	1922
	1923	U16 charid = 0; /* sanity init */
	1924	U32 accept_state = 0; /* sanity init */
	1925	U8 scan = (U8)NULL; /* sanity init */
	1926
	1927	STRLEN foldlen = 0; /* required init */
	1928	U32 wordlen = 0; /* required init */
	1929	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1930
	1931	if ( OP(noper) != NOTHING ) {
	1932	for ( ; uc < e ; uc += len ) {
	1933
	1934	TRIE_READ_CHAR;
	1935
	1936	if ( uvc < 256 ) {
	1937	charid = trie->charmap[ uvc ];
	1938	} else {
	1939	SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1940	charid = svpp ? (U16)SvIV(*svpp) : 0;
	1941	}
	1942	if ( charid ) {
	1943	charid--;
	1944	if ( !trie->trans[ state + charid ].next ) {
	1945	trie->trans[ state + charid ].next = next_alloc;
	1946	trie->trans[ state ].check++;
	1947	prev_states[TRIE_NODENUM(next_alloc)]
	1948	= TRIE_NODENUM(state);
	1949	next_alloc += trie->uniquecharcount;
	1950	}
	1951	state = trie->trans[ state + charid ].next;
	1952	} else {
	1953	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1954	}
	1955	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1956	}
	1957	}
	1958	accept_state = TRIE_NODENUM( state );
	1959	TRIE_HANDLE_WORD(accept_state);
	1960
	1961	} /* end second pass */
	1962
	1963	/* and now dump it out before we compress it */
	1964	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	1965	revcharmap,
	1966	next_alloc, depth+1));
	1967
	1968	{
	1969	/*
	1970	* Inplace compress the table.*
	1971
	1972	For sparse data sets the table constructed by the trie algorithm will
	1973	be mostly 0/FAIL transitions or to put it another way mostly empty.
	1974	(Note that leaf nodes will not contain any transitions.)
	1975
	1976	This algorithm compresses the tables by eliminating most such
	1977	transitions, at the cost of a modest bit of extra work during lookup:
	1978
	1979	- Each states[] entry contains a .base field which indicates the
	1980	index in the state[] array wheres its transition data is stored.
	1981
	1982	- If .base is 0 there are no valid transitions from that node.
	1983
	1984	- If .base is nonzero then charid is added to it to find an entry in
	1985	the trans array.
	1986
	1987	-If trans[states[state].base+charid].check!=state then the
	1988	transition is taken to be a 0/Fail transition. Thus if there are fail
	1989	transitions at the front of the node then the .base offset will point
	1990	somewhere inside the previous nodes data (or maybe even into a node
	1991	even earlier), but the .check field determines if the transition is
	1992	valid.
	1993
	1994	XXX - wrong maybe?
	1995	The following process inplace converts the table to the compressed
	1996	table: We first do not compress the root node 1,and mark all its
	1997	.check pointers as 1 and set its .base pointer as 1 as well. This
	1998	allows us to do a DFA construction from the compressed table later,
	1999	and ensures that any .base pointers we calculate later are greater
	2000	than 0.
	2001
	2002	- We set 'pos' to indicate the first entry of the second node.
	2003
	2004	- We then iterate over the columns of the node, finding the first and
	2005	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	2006	and set the .check pointers accordingly, and advance pos
	2007	appropriately and repreat for the next node. Note that when we copy
	2008	the next pointers we have to convert them from the original
	2009	NODEIDX form to NODENUM form as the former is not valid post
	2010	compression.
	2011
	2012	- If a node has no transitions used we mark its base as 0 and do not
	2013	advance the pos pointer.
	2014
	2015	- If a node only has one transition we use a second pointer into the
	2016	structure to fill in allocated fail transitions from other states.
	2017	This pointer is independent of the main pointer and scans forward
	2018	looking for null transitions that are allocated to a state. When it
	2019	finds one it writes the single transition into the "hole". If the
	2020	pointer doesnt find one the single transition is appended as normal.
	2021
	2022	- Once compressed we can Renew/realloc the structures to release the
	2023	excess space.
	2024
	2025	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	2026	specifically Fig 3.47 and the associated pseudocode.
	2027
	2028	demq
	2029	*/
	2030	const U32 laststate = TRIE_NODENUM( next_alloc );
	2031	U32 state, charid;
	2032	U32 pos = 0, zp=0;
	2033	trie->statecount = laststate;
	2034
	2035	for ( state = 1 ; state < laststate ; state++ ) {
	2036	U8 flag = 0;
	2037	const U32 stateidx = TRIE_NODEIDX( state );
	2038	const U32 o_used = trie->trans[ stateidx ].check;
	2039	U32 used = trie->trans[ stateidx ].check;
	2040	trie->trans[ stateidx ].check = 0;
	2041
	2042	for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
	2043	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	2044	if ( trie->trans[ stateidx + charid ].next ) {
	2045	if (o_used == 1) {
	2046	for ( ; zp < pos ; zp++ ) {
	2047	if ( ! trie->trans[ zp ].next ) {
	2048	break;
	2049	}
	2050	}
	2051	trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
	2052	trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2053	trie->trans[ zp ].check = state;
	2054	if ( ++zp > pos ) pos = zp;
	2055	break;
	2056	}
	2057	used--;
	2058	}
	2059	if ( !flag ) {
	2060	flag = 1;
	2061	trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
	2062	}
	2063	trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2064	trie->trans[ pos ].check = state;
	2065	pos++;
	2066	}
	2067	}
	2068	}
	2069	trie->lasttrans = pos + 1;
	2070	trie->states = (reg_trie_state *)
	2071	PerlMemShared_realloc( trie->states, laststate
	2072	* sizeof(reg_trie_state) );
	2073	DEBUG_TRIE_COMPILE_MORE_r(
	2074	PerlIO_printf( Perl_debug_log,
	2075	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	2076	(int)depth * 2 + 2,"",
	2077	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
	2078	(IV)next_alloc,
	2079	(IV)pos,
	2080	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	2081	);
	2082
	2083	} /* end table compress */
	2084	}
	2085	DEBUG_TRIE_COMPILE_MORE_r(
	2086	PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	2087	(int)depth * 2 + 2, "",
	2088	(UV)trie->statecount,
	2089	(UV)trie->lasttrans)
	2090	);
	2091	/* resize the trans array to remove unused space */
	2092	trie->trans = (reg_trie_trans *)
	2093	PerlMemShared_realloc( trie->trans, trie->lasttrans
	2094	* sizeof(reg_trie_trans) );
	2095
	2096	{ /* Modify the program and insert the new TRIE node */
	2097	U8 nodetype =(U8)(flags & 0xFF);
	2098	char *str=NULL;
	2099
	2100	#ifdef DEBUGGING
	2101	regnode *optimize = NULL;
	2102	#ifdef RE_TRACK_PATTERN_OFFSETS
	2103
	2104	U32 mjd_offset = 0;
	2105	U32 mjd_nodelen = 0;
	2106	#endif /* RE_TRACK_PATTERN_OFFSETS */
	2107	#endif /* DEBUGGING */
	2108	/*
	2109	This means we convert either the first branch or the first Exact,
	2110	depending on whether the thing following (in 'last') is a branch
	2111	or not and whther first is the startbranch (ie is it a sub part of
	2112	the alternation or is it the whole thing.)
	2113	Assuming its a sub part we convert the EXACT otherwise we convert
	2114	the whole branch sequence, including the first.
	2115	*/
	2116	/* Find the node we are going to overwrite */
	2117	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	2118	/* branch sub-chain */
	2119	NEXT_OFF( first ) = (U16)(last - first);
	2120	#ifdef RE_TRACK_PATTERN_OFFSETS
	2121	DEBUG_r({
	2122	mjd_offset= Node_Offset((convert));
	2123	mjd_nodelen= Node_Length((convert));
	2124	});
	2125	#endif
	2126	/* whole branch chain */
	2127	}
	2128	#ifdef RE_TRACK_PATTERN_OFFSETS
	2129	else {
	2130	DEBUG_r({
	2131	const regnode *nop = NEXTOPER( convert );
	2132	mjd_offset= Node_Offset((nop));
	2133	mjd_nodelen= Node_Length((nop));
	2134	});
	2135	}
	2136	DEBUG_OPTIMISE_r(
	2137	PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2138	(int)depth * 2 + 2, "",
	2139	(UV)mjd_offset, (UV)mjd_nodelen)
	2140	);
	2141	#endif
	2142	/* But first we check to see if there is a common prefix we can
	2143	split out as an EXACT and put in front of the TRIE node. */
	2144	trie->startstate= 1;
	2145	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2146	U32 state;
	2147	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2148	U32 ofs = 0;
	2149	I32 idx = -1;
	2150	U32 count = 0;
	2151	const U32 base = trie->states[ state ].trans.base;
	2152
	2153	if ( trie->states[state].wordnum )
	2154	count = 1;
	2155
	2156	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2157	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2158	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2159	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2160	{
	2161	if ( ++count > 1 ) {
	2162	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2163	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2164	if ( state == 1 ) break;
	2165	if ( count == 2 ) {
	2166	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2167	DEBUG_OPTIMISE_r(
	2168	PerlIO_printf(Perl_debug_log,
	2169	"%*sNew Start State=%"UVuf" Class: [",
	2170	(int)depth * 2 + 2, "",
	2171	(UV)state));
	2172	if (idx >= 0) {
	2173	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2174	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2175
	2176	TRIE_BITMAP_SET(trie,*ch);
	2177	if ( folder )
	2178	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2179	DEBUG_OPTIMISE_r(
	2180	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2181	);
	2182	}
	2183	}
	2184	TRIE_BITMAP_SET(trie,*ch);
	2185	if ( folder )
	2186	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2187	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2188	}
	2189	idx = ofs;
	2190	}
	2191	}
	2192	if ( count == 1 ) {
	2193	SV **tmp = av_fetch( revcharmap, idx, 0);
	2194	STRLEN len;
	2195	char ch = SvPV( tmp, len );
	2196	DEBUG_OPTIMISE_r({
	2197	SV *sv=sv_newmortal();
	2198	PerlIO_printf( Perl_debug_log,
	2199	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2200	(int)depth * 2 + 2, "",
	2201	(UV)state, (UV)idx,
	2202	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2203	PL_colors[0], PL_colors[1],
	2204	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2205	PERL_PV_ESCAPE_FIRSTCHAR
	2206	)
	2207	);
	2208	});
	2209	if ( state==1 ) {
	2210	OP( convert ) = nodetype;
	2211	str=STRING(convert);
	2212	STR_LEN(convert)=0;
	2213	}
	2214	STR_LEN(convert) += len;
	2215	while (len--)
	2216	str++ = ch++;
	2217	} else {
	2218	#ifdef DEBUGGING
	2219	if (state>1)
	2220	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2221	#endif
	2222	break;
	2223	}
	2224	}
	2225	trie->prefixlen = (state-1);
	2226	if (str) {
	2227	regnode *n = convert+NODE_SZ_STR(convert);
	2228	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2229	trie->startstate = state;
	2230	trie->minlen -= (state - 1);
	2231	trie->maxlen -= (state - 1);
	2232	#ifdef DEBUGGING
	2233	/* At least the UNICOS C compiler choked on this
	2234	* being argument to DEBUG_r(), so let's just have
	2235	* it right here. */
	2236	if (
	2237	#ifdef PERL_EXT_RE_BUILD
	2238	1
	2239	#else
	2240	DEBUG_r_TEST
	2241	#endif
	2242	) {
	2243	regnode *fix = convert;
	2244	U32 word = trie->wordcount;
	2245	mjd_nodelen++;
	2246	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2247	while( ++fix < n ) {
	2248	Set_Node_Offset_Length(fix, 0, 0);
	2249	}
	2250	while (word--) {
	2251	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2252	if (tmp) {
	2253	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2254	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2255	else
	2256	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2257	}
	2258	}
	2259	}
	2260	#endif
	2261	if (trie->maxlen) {
	2262	convert = n;
	2263	} else {
	2264	NEXT_OFF(convert) = (U16)(tail - convert);
	2265	DEBUG_r(optimize= n);
	2266	}
	2267	}
	2268	}
	2269	if (!jumper)
	2270	jumper = last;
	2271	if ( trie->maxlen ) {
	2272	NEXT_OFF( convert ) = (U16)(tail - convert);
	2273	ARG_SET( convert, data_slot );
	2274	/* Store the offset to the first unabsorbed branch in
	2275	jump[0], which is otherwise unused by the jump logic.
	2276	We use this when dumping a trie and during optimisation. */
	2277	if (trie->jump)
	2278	trie->jump[0] = (U16)(nextbranch - convert);
	2279
	2280	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2281	* and there is a bitmap
	2282	* and the first "jump target" node we found leaves enough room
	2283	* then convert the TRIE node into a TRIEC node, with the bitmap
	2284	* embedded inline in the opcode - this is hypothetically faster.
	2285	*/
	2286	if ( !trie->states[trie->startstate].wordnum
	2287	&& trie->bitmap
	2288	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2289	{
	2290	OP( convert ) = TRIEC;
	2291	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2292	PerlMemShared_free(trie->bitmap);
	2293	trie->bitmap= NULL;
	2294	} else
	2295	OP( convert ) = TRIE;
	2296
	2297	/* store the type in the flags */
	2298	convert->flags = nodetype;
	2299	DEBUG_r({
	2300	optimize = convert
	2301	+ NODE_STEP_REGNODE
	2302	+ regarglen[ OP( convert ) ];
	2303	});
	2304	/* XXX We really should free up the resource in trie now,
	2305	as we won't use them - (which resources?) dmq */
	2306	}
	2307	/* needed for dumping*/
	2308	DEBUG_r(if (optimize) {
	2309	regnode *opt = convert;
	2310
	2311	while ( ++opt < optimize) {
	2312	Set_Node_Offset_Length(opt,0,0);
	2313	}
	2314	/*
	2315	Try to clean up some of the debris left after the
	2316	optimisation.
	2317	*/
	2318	while( optimize < jumper ) {
	2319	mjd_nodelen += Node_Length((optimize));
	2320	OP( optimize ) = OPTIMIZED;
	2321	Set_Node_Offset_Length(optimize,0,0);
	2322	optimize++;
	2323	}
	2324	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2325	});
	2326	} /* end node insert */
	2327
	2328	/* Finish populating the prev field of the wordinfo array. Walk back
	2329	* from each accept state until we find another accept state, and if
	2330	* so, point the first word's .prev field at the second word. If the
	2331	* second already has a .prev field set, stop now. This will be the
	2332	* case either if we've already processed that word's accept state,
	2333	* or that state had multiple words, and the overspill words were
	2334	* already linked up earlier.
	2335	*/
	2336	{
	2337	U16 word;
	2338	U32 state;
	2339	U16 prev;
	2340
	2341	for (word=1; word <= trie->wordcount; word++) {
	2342	prev = 0;
	2343	if (trie->wordinfo[word].prev)
	2344	continue;
	2345	state = trie->wordinfo[word].accept;
	2346	while (state) {
	2347	state = prev_states[state];
	2348	if (!state)
	2349	break;
	2350	prev = trie->states[state].wordnum;
	2351	if (prev)
	2352	break;
	2353	}
	2354	trie->wordinfo[word].prev = prev;
	2355	}
	2356	Safefree(prev_states);
	2357	}
	2358
	2359
	2360	/* and now dump out the compressed format */
	2361	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2362
	2363	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2364	#ifdef DEBUGGING
	2365	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2366	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2367	#else
	2368	SvREFCNT_dec(revcharmap);
	2369	#endif
	2370	return trie->jump
	2371	? MADE_JUMP_TRIE
	2372	: trie->startstate>1
	2373	? MADE_EXACT_TRIE
	2374	: MADE_TRIE;
	2375	}
	2376
	2377	STATIC void
	2378	S_make_trie_failtable(pTHX_ RExC_state_t pRExC_state, regnode source, regnode *stclass, U32 depth)
	2379	{
	2380	/* The Trie is constructed and compressed now so we can build a fail array if it's needed
	2381
	2382	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
	2383	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
	2384	ISBN 0-201-10088-6
	2385
	2386	We find the fail state for each state in the trie, this state is the longest proper
	2387	suffix of the current state's 'word' that is also a proper prefix of another word in our
	2388	trie. State 1 represents the word '' and is thus the default fail state. This allows
	2389	the DFA not to have to restart after its tried and failed a word at a given point, it
	2390	simply continues as though it had been matching the other word in the first place.
	2391	Consider
	2392	'abcdgu'=~/abcdefg\|cdgu/
	2393	When we get to 'd' we are still matching the first word, we would encounter 'g' which would
	2394	fail, which would bring us to the state representing 'd' in the second word where we would
	2395	try 'g' and succeed, proceeding to match 'cdgu'.
	2396	*/
	2397	/* add a fail transition */
	2398	const U32 trie_offset = ARG(source);
	2399	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	2400	U32 *q;
	2401	const U32 ucharcount = trie->uniquecharcount;
	2402	const U32 numstates = trie->statecount;
	2403	const U32 ubound = trie->lasttrans + ucharcount;
	2404	U32 q_read = 0;
	2405	U32 q_write = 0;
	2406	U32 charid;
	2407	U32 base = trie->states[ 1 ].trans.base;
	2408	U32 *fail;
	2409	reg_ac_data *aho;
	2410	const U32 data_slot = add_data( pRExC_state, 1, "T" );
	2411	GET_RE_DEBUG_FLAGS_DECL;
	2412
	2413	PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
	2414	#ifndef DEBUGGING
	2415	PERL_UNUSED_ARG(depth);
	2416	#endif
	2417
	2418
	2419	ARG_SET( stclass, data_slot );
	2420	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	2421	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	2422	aho->trie=trie_offset;
	2423	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	2424	Copy( trie->states, aho->states, numstates, reg_trie_state );
	2425	Newxz( q, numstates, U32);
	2426	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	2427	aho->refcount = 1;
	2428	fail = aho->fail;
	2429	/* initialize fail[0..1] to be 1 so that we always have
	2430	a valid final fail state */
	2431	fail[ 0 ] = fail[ 1 ] = 1;
	2432
	2433	for ( charid = 0; charid < ucharcount ; charid++ ) {
	2434	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	2435	if ( newstate ) {
	2436	q[ q_write ] = newstate;
	2437	/* set to point at the root */
	2438	fail[ q[ q_write++ ] ]=1;
	2439	}
	2440	}
	2441	while ( q_read < q_write) {
	2442	const U32 cur = q[ q_read++ % numstates ];
	2443	base = trie->states[ cur ].trans.base;
	2444
	2445	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	2446	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	2447	if (ch_state) {
	2448	U32 fail_state = cur;
	2449	U32 fail_base;
	2450	do {
	2451	fail_state = fail[ fail_state ];
	2452	fail_base = aho->states[ fail_state ].trans.base;
	2453	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	2454
	2455	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	2456	fail[ ch_state ] = fail_state;
	2457	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	2458	{
	2459	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	2460	}
	2461	q[ q_write++ % numstates] = ch_state;
	2462	}
	2463	}
	2464	}
	2465	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	2466	when we fail in state 1, this allows us to use the
	2467	charclass scan to find a valid start char. This is based on the principle
	2468	that theres a good chance the string being searched contains lots of stuff
	2469	that cant be a start char.
	2470	*/
	2471	fail[ 0 ] = fail[ 1 ] = 0;
	2472	DEBUG_TRIE_COMPILE_r({
	2473	PerlIO_printf(Perl_debug_log,
	2474	"%*sStclass Failtable (%"UVuf" states): 0",
	2475	(int)(depth * 2), "", (UV)numstates
	2476	);
	2477	for( q_read=1; q_read<numstates; q_read++ ) {
	2478	PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	2479	}
	2480	PerlIO_printf(Perl_debug_log, "\n");
	2481	});
	2482	Safefree(q);
	2483	/RExC_seen \|= REG_SEEN_TRIEDFA;/
	2484	}
	2485
	2486
	2487	/*
	2488	* There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
	2489	* These need to be revisited when a newer toolchain becomes available.
	2490	*/
	2491	#if defined(__sparc64__) && defined(__GNUC__)
	2492	# if __GNUC__ < 2 \|\| (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
	2493	# undef SPARC64_GCC_WORKAROUND
	2494	# define SPARC64_GCC_WORKAROUND 1
	2495	# endif
	2496	#endif
	2497
	2498	#define DEBUG_PEEP(str,scan,depth) \
	2499	DEBUG_OPTIMISE_r({if (scan){ \
	2500	SV * const mysv=sv_newmortal(); \
	2501	regnode *Next = regnext(scan); \
	2502	regprop(RExC_rx, mysv, scan); \
	2503	PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	2504	(int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	2505	Next ? (REG_NODE_NUM(Next)) : 0 ); \
	2506	}});
	2507
	2508
	2509	/* The below joins as many adjacent EXACTish nodes as possible into a single
	2510	* one, and looks for problematic sequences of characters whose folds vs.
	2511	* non-folds have sufficiently different lengths, that the optimizer would be
	2512	* fooled into rejecting legitimate matches of them, and the trie construction
	2513	* code can't cope with them. The joining is only done if:
	2514	* 1) there is room in the current conglomerated node to entirely contain the
	2515	* next one.
	2516	* 2) they are the exact same node type
	2517	*
	2518	* The adjacent nodes actually may be separated by NOTHING kind nodes, and
	2519	* these get optimized out
	2520	*
	2521	* If there are problematic code sequences, *min_subtract is set to the delta
	2522	* that the minimum size of the node can be less than its actual size. And,
	2523	* the node type of the result is changed to reflect that it contains these
	2524	* sequences.
	2525	*
	2526	* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
	2527	* and contains LATIN SMALL LETTER SHARP S
	2528	*
	2529	* This is as good a place as any to discuss the design of handling these
	2530	* problematic sequences. It's been wrong in Perl for a very long time. There
	2531	* are three code points in Unicode whose folded lengths differ so much from
	2532	* the un-folded lengths that it causes problems for the optimizer and trie
	2533	* construction. Why only these are problematic, and not others where lengths
	2534	* also differ is something I (khw) do not understand. New versions of Unicode
	2535	* might add more such code points. Hopefully the logic in fold_grind.t that
	2536	* figures out what to test (in part by verifying that each size-combination
	2537	* gets tested) will catch any that do come along, so they can be added to the
	2538	* special handling below. The chances of new ones are actually rather small,
	2539	* as most, if not all, of the world's scripts that have casefolding have
	2540	* already been encoded by Unicode. Also, a number of Unicode's decisions were
	2541	* made to allow compatibility with pre-existing standards, and almost all of
	2542	* those have already been dealt with. These would otherwise be the most
	2543	* likely candidates for generating further tricky sequences. In other words,
	2544	* Unicode by itself is unlikely to add new ones unless it is for compatibility
	2545	* with pre-existing standards, and there aren't many of those left.
	2546	*
	2547	* The previous designs for dealing with these involved assigning a special
	2548	* node for them. This approach doesn't work, as evidenced by this example:
	2549	* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
	2550	* Both these fold to "sss", but if the pattern is parsed to create a node of
	2551	* that would match just the \xDF, it won't be able to handle the case where a
	2552	* successful match would have to cross the node's boundary. The new approach
	2553	* that hopefully generally solves the problem generates an EXACTFU_SS node
	2554	* that is "sss".
	2555	*
	2556	* There are a number of components to the approach (a lot of work for just
	2557	* three code points!):
	2558	* 1) This routine examines each EXACTFish node that could contain the
	2559	* problematic sequences. It returns in *min_subtract how much to
	2560	* subtract from the the actual length of the string to get a real minimum
	2561	* for one that could match it. This number is usually 0 except for the
	2562	* problematic sequences. This delta is used by the caller to adjust the
	2563	* min length of the match, and the delta between min and max, so that the
	2564	* optimizer doesn't reject these possibilities based on size constraints.
	2565	* 2) These sequences are not currently correctly handled by the trie code
	2566	* either, so it changes the joined node type to ops that are not handled
	2567	* by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
	2568	* 3) This is sufficient for the two Greek sequences (described below), but
	2569	* the one involving the Sharp s (\xDF) needs more. The node type
	2570	* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
	2571	* sequence in it. For non-UTF-8 patterns and strings, this is the only
	2572	* case where there is a possible fold length change. That means that a
	2573	* regular EXACTFU node without UTF-8 involvement doesn't have to concern
	2574	* itself with length changes, and so can be processed faster. regexec.c
	2575	* takes advantage of this. Generally, an EXACTFish node that is in UTF-8
	2576	* is pre-folded by regcomp.c. This saves effort in regex matching.
	2577	* However, probably mostly for historical reasons, the pre-folding isn't
	2578	* done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
	2579	* nodes, as what they fold to isn't known until runtime.) The fold
	2580	* possibilities for the non-UTF8 patterns are quite simple, except for
	2581	* the sharp s. All the ones that don't involve a UTF-8 target string
	2582	* are members of a fold-pair, and arrays are set up for all of them
	2583	* that quickly find the other member of the pair. It might actually
	2584	* be faster to pre-fold these, but it isn't currently done, except for
	2585	* the sharp s. Code elsewhere in this file makes sure that it gets
	2586	* folded to 'ss', even if the pattern isn't UTF-8. This avoids the
	2587	* issues described in the next item.
	2588	* 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
	2589	* 'ss' or not is not knowable at compile time. It will match iff the
	2590	* target string is in UTF-8, unlike the EXACTFU nodes, where it always
	2591	* matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
	2592	* it can't be folded to "ss" at compile time, unlike EXACTFU does as
	2593	* described in item 3). An assumption that the optimizer part of
	2594	* regexec.c (probably unwittingly) makes is that a character in the
	2595	* pattern corresponds to at most a single character in the target string.
	2596	* (And I do mean character, and not byte here, unlike other parts of the
	2597	* documentation that have never been updated to account for multibyte
	2598	* Unicode.) This assumption is wrong only in this case, as all other
	2599	* cases are either 1-1 folds when no UTF-8 is involved; or is true by
	2600	* virtue of having this file pre-fold UTF-8 patterns. I'm
	2601	* reluctant to try to change this assumption, so instead the code punts.
	2602	* This routine examines EXACTF nodes for the sharp s, and returns a
	2603	* boolean indicating whether or not the node is an EXACTF node that
	2604	* contains a sharp s. When it is true, the caller sets a flag that later
	2605	* causes the optimizer in this file to not set values for the floating
	2606	* and fixed string lengths, and thus avoids the optimizer code in
	2607	* regexec.c that makes the invalid assumption. Thus, there is no
	2608	* optimization based on string lengths for EXACTF nodes that contain the
	2609	* sharp s. This only happens for /id rules (which means the pattern
	2610	* isn't in UTF-8).
	2611	*/
	2612
	2613	#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
	2614	if (PL_regkind[OP(scan)] == EXACT) \
	2615	join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
	2616
	2617	STATIC U32
	2618	S_join_exact(pTHX_ RExC_state_t pRExC_state, regnode scan, UV min_subtract, bool has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
	2619	/* Merge several consecutive EXACTish nodes into one. */
	2620	regnode *n = regnext(scan);
	2621	U32 stringok = 1;
	2622	regnode *next = scan + NODE_SZ_STR(scan);
	2623	U32 merged = 0;
	2624	U32 stopnow = 0;
	2625	#ifdef DEBUGGING
	2626	regnode *stop = scan;
	2627	GET_RE_DEBUG_FLAGS_DECL;
	2628	#else
	2629	PERL_UNUSED_ARG(depth);
	2630	#endif
	2631
	2632	PERL_ARGS_ASSERT_JOIN_EXACT;
	2633	#ifndef EXPERIMENTAL_INPLACESCAN
	2634	PERL_UNUSED_ARG(flags);
	2635	PERL_UNUSED_ARG(val);
	2636	#endif
	2637	DEBUG_PEEP("join",scan,depth);
	2638
	2639	/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
	2640	* EXACT ones that are mergeable to the current one. */
	2641	while (n
	2642	&& (PL_regkind[OP(n)] == NOTHING
	2643	\|\| (stringok && OP(n) == OP(scan)))
	2644	&& NEXT_OFF(n)
	2645	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
	2646	{
	2647
	2648	if (OP(n) == TAIL \|\| n > next)
	2649	stringok = 0;
	2650	if (PL_regkind[OP(n)] == NOTHING) {
	2651	DEBUG_PEEP("skip:",n,depth);
	2652	NEXT_OFF(scan) += NEXT_OFF(n);
	2653	next = n + NODE_STEP_REGNODE;
	2654	#ifdef DEBUGGING
	2655	if (stringok)
	2656	stop = n;
	2657	#endif
	2658	n = regnext(n);
	2659	}
	2660	else if (stringok) {
	2661	const unsigned int oldl = STR_LEN(scan);
	2662	regnode * const nnext = regnext(n);
	2663
	2664	if (oldl + STR_LEN(n) > U8_MAX)
	2665	break;
	2666
	2667	DEBUG_PEEP("merg",n,depth);
	2668	merged++;
	2669
	2670	NEXT_OFF(scan) += NEXT_OFF(n);
	2671	STR_LEN(scan) += STR_LEN(n);
	2672	next = n + NODE_SZ_STR(n);
	2673	/* Now we can overwrite n : /
	2674	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	2675	#ifdef DEBUGGING
	2676	stop = next - 1;
	2677	#endif
	2678	n = nnext;
	2679	if (stopnow) break;
	2680	}
	2681
	2682	#ifdef EXPERIMENTAL_INPLACESCAN
	2683	if (flags && !NEXT_OFF(n)) {
	2684	DEBUG_PEEP("atch", val, depth);
	2685	if (reg_off_by_arg[OP(n)]) {
	2686	ARG_SET(n, val - n);
	2687	}
	2688	else {
	2689	NEXT_OFF(n) = val - n;
	2690	}
	2691	stopnow = 1;
	2692	}
	2693	#endif
	2694	}
	2695
	2696	*min_subtract = 0;
	2697	*has_exactf_sharp_s = FALSE;
	2698
	2699	/* Here, all the adjacent mergeable EXACTish nodes have been merged. We
	2700	* can now analyze for sequences of problematic code points. (Prior to
	2701	* this final joining, sequences could have been split over boundaries, and
	2702	* hence missed). The sequences only happen in folding, hence for any
	2703	* non-EXACT EXACTish node */
	2704	if (OP(scan) != EXACT) {
	2705	U8 *s;
	2706	U8 * s0 = (U8*) STRING(scan);
	2707	U8 * const s_end = s0 + STR_LEN(scan);
	2708
	2709	/* The below is perhaps overboard, but this allows us to save a test
	2710	* each time through the loop at the expense of a mask. This is
	2711	* because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
	2712	* single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
	2713	* This uses an exclusive 'or' to find that bit and then inverts it to
	2714	* form a mask, with just a single 0, in the bit position where 'S' and
	2715	* 's' differ. */
	2716	const U8 S_or_s_mask = ~ ('S' ^ 's');
	2717	const U8 s_masked = 's' & S_or_s_mask;
	2718
	2719	/* One pass is made over the node's string looking for all the
	2720	* possibilities. to avoid some tests in the loop, there are two main
	2721	* cases, for UTF-8 patterns (which can't have EXACTF nodes) and
	2722	* non-UTF-8 */
	2723	if (UTF) {
	2724
	2725	/* There are two problematic Greek code points in Unicode
	2726	* casefolding
	2727	*
	2728	* U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	2729	* U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
	2730	*
	2731	* which casefold to
	2732	*
	2733	* Unicode UTF-8
	2734	*
	2735	* U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
	2736	* U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
	2737	*
	2738	* This means that in case-insensitive matching (or "loose
	2739	* matching", as Unicode calls it), an EXACTF of length six (the
	2740	* UTF-8 encoded byte length of the above casefolded versions) can
	2741	* match a target string of length two (the byte length of UTF-8
	2742	* encoded U+0390 or U+03B0). This would rather mess up the
	2743	* minimum length computation. (there are other code points that
	2744	* also fold to these two sequences, but the delta is smaller)
	2745	*
	2746	* If these sequences are found, the minimum length is decreased by
	2747	* four (six minus two).
	2748	*
	2749	* Similarly, 'ss' may match the single char and byte LATIN SMALL
	2750	* LETTER SHARP S. We decrease the min length by 1 for each
	2751	* occurrence of 'ss' found */
	2752
	2753	#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
	2754	# define U390_first_byte 0xb4
	2755	const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
	2756	# define U3B0_first_byte 0xb5
	2757	const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
	2758	#else
	2759	# define U390_first_byte 0xce
	2760	const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
	2761	# define U3B0_first_byte 0xcf
	2762	const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
	2763	#endif
	2764	const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
	2765	yields a net of 0 */
	2766	/* Examine the string for one of the problematic sequences */
	2767	for (s = s0;
	2768	s < s_end - 1; /* Can stop 1 before the end, as minimum length
	2769	* sequence we are looking for is 2 */
	2770	s += UTF8SKIP(s))
	2771	{
	2772
	2773	/* Look for the first byte in each problematic sequence */
	2774	switch (*s) {
	2775	/* We don't have to worry about other things that fold to
	2776	* 's' (such as the long s, U+017F), as all above-latin1
	2777	* code points have been pre-folded */
	2778	case 's':
	2779	case 'S':
	2780
	2781	/* Current character is an 's' or 'S'. If next one is
	2782	* as well, we have the dreaded sequence */
	2783	if (((*(s+1) & S_or_s_mask) == s_masked)
	2784	/* These two node types don't have special handling
	2785	* for 'ss' */
	2786	&& OP(scan) != EXACTFL && OP(scan) != EXACTFA)
	2787	{
	2788	*min_subtract += 1;
	2789	OP(scan) = EXACTFU_SS;
	2790	s++; /* No need to look at this character again */
	2791	}
	2792	break;
	2793
	2794	case U390_first_byte:
	2795	if (s_end - s >= len
	2796
	2797	/* The 1's are because are skipping comparing the
	2798	* first byte */
	2799	&& memEQ(s + 1, U390_tail, len - 1))
	2800	{
	2801	goto greek_sequence;
	2802	}
	2803	break;
	2804
	2805	case U3B0_first_byte:
	2806	if (! (s_end - s >= len
	2807	&& memEQ(s + 1, U3B0_tail, len - 1)))
	2808	{
	2809	break;
	2810	}
	2811	greek_sequence:
	2812	*min_subtract += 4;
	2813
	2814	/* This can't currently be handled by trie's, so change
	2815	* the node type to indicate this. If EXACTFA and
	2816	* EXACTFL were ever to be handled by trie's, this
	2817	* would have to be changed. If this node has already
	2818	* been changed to EXACTFU_SS in this loop, leave it as
	2819	* is. (I (khw) think it doesn't matter in regexec.c
	2820	* for UTF patterns, but no need to change it */
	2821	if (OP(scan) == EXACTFU) {
	2822	OP(scan) = EXACTFU_NO_TRIE;
	2823	}
	2824	s += 6; /* We already know what this sequence is. Skip
	2825	the rest of it */
	2826	break;
	2827	}
	2828	}
	2829	}
	2830	else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
	2831
	2832	/* Here, the pattern is not UTF-8. We need to look only for the
	2833	* 'ss' sequence, and in the EXACTF case, the sharp s, which can be
	2834	* in the final position. Otherwise we can stop looking 1 byte
	2835	* earlier because have to find both the first and second 's' */
	2836	const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
	2837
	2838	for (s = s0; s < upper; s++) {
	2839	switch (*s) {
	2840	case 'S':
	2841	case 's':
	2842	if (s_end - s > 1
	2843	&& ((*(s+1) & S_or_s_mask) == s_masked))
	2844	{
	2845	*min_subtract += 1;
	2846
	2847	/* EXACTF nodes need to know that the minimum
	2848	* length changed so that a sharp s in the string
	2849	* can match this ss in the pattern, but they
	2850	* remain EXACTF nodes, as they are not trie'able,
	2851	* so don't have to invent a new node type to
	2852	* exclude them from the trie code */
	2853	if (OP(scan) != EXACTF) {
	2854	OP(scan) = EXACTFU_SS;
	2855	}
	2856	s++;
	2857	}
	2858	break;
	2859	case LATIN_SMALL_LETTER_SHARP_S:
	2860	if (OP(scan) == EXACTF) {
	2861	*has_exactf_sharp_s = TRUE;
	2862	}
	2863	break;
	2864	}
	2865	}
	2866	}
	2867	}
	2868
	2869	#ifdef DEBUGGING
	2870	/* Allow dumping but overwriting the collection of skipped
	2871	* ops and/or strings with fake optimized ops */
	2872	n = scan + NODE_SZ_STR(scan);
	2873	while (n <= stop) {
	2874	OP(n) = OPTIMIZED;
	2875	FLAGS(n) = 0;
	2876	NEXT_OFF(n) = 0;
	2877	n++;
	2878	}
	2879	#endif
	2880	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	2881	return stopnow;
	2882	}
	2883
	2884	/* REx optimizer. Converts nodes into quicker variants "in place".
	2885	Finds fixed substrings. */
	2886
	2887	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	2888	to the position after last scanned or to NULL. */
	2889
	2890	#define INIT_AND_WITHP \
	2891	assert(!and_withp); \
	2892	Newx(and_withp,1,struct regnode_charclass_class); \
	2893	SAVEFREEPV(and_withp)
	2894
	2895	/* this is a chain of data about sub patterns we are processing that
	2896	need to be handled separately/specially in study_chunk. Its so
	2897	we can simulate recursion without losing state. */
	2898	struct scan_frame;
	2899	typedef struct scan_frame {
	2900	regnode last; / last node to process in this frame */
	2901	regnode next; / next node to process when last is reached */
	2902	struct scan_frame prev; /previous frame*/
	2903	I32 stop; /* what stopparen do we use */
	2904	} scan_frame;
	2905
	2906
	2907	#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
	2908
	2909	#define CASE_SYNST_FNC(nAmE) \
	2910	case nAmE: \
	2911	if (flags & SCF_DO_STCLASS_AND) { \
	2912	for (value = 0; value < 256; value++) \
	2913	if (!is_ ## nAmE ## _cp(value)) \
	2914	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2915	} \
	2916	else { \
	2917	for (value = 0; value < 256; value++) \
	2918	if (is_ ## nAmE ## _cp(value)) \
	2919	ANYOF_BITMAP_SET(data->start_class, value); \
	2920	} \
	2921	break; \
	2922	case N ## nAmE: \
	2923	if (flags & SCF_DO_STCLASS_AND) { \
	2924	for (value = 0; value < 256; value++) \
	2925	if (is_ ## nAmE ## _cp(value)) \
	2926	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2927	} \
	2928	else { \
	2929	for (value = 0; value < 256; value++) \
	2930	if (!is_ ## nAmE ## _cp(value)) \
	2931	ANYOF_BITMAP_SET(data->start_class, value); \
	2932	} \
	2933	break
	2934
	2935
	2936
	2937	STATIC I32
	2938	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	2939	I32 minlenp, I32 deltap,
	2940	regnode *last,
	2941	scan_data_t *data,
	2942	I32 stopparen,
	2943	U8* recursed,
	2944	struct regnode_charclass_class *and_withp,
	2945	U32 flags, U32 depth)
	2946	/* scanp: Start here (read-write). */
	2947	/* deltap: Write maxlen-minlen here. */
	2948	/* last: Stop before this one. */
	2949	/* data: string data about the pattern */
	2950	/* stopparen: treat close N as END */
	2951	/* recursed: which subroutines have we recursed into */
	2952	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	2953	{
	2954	dVAR;
	2955	I32 min = 0, pars = 0, code;
	2956	regnode scan = scanp, *next;
	2957	I32 delta = 0;
	2958	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	2959	int is_inf_internal = 0; /* The studied chunk is infinite */
	2960	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	2961	scan_data_t data_fake;
	2962	SV *re_trie_maxbuff = NULL;
	2963	regnode *first_non_open = scan;
	2964	I32 stopmin = I32_MAX;
	2965	scan_frame *frame = NULL;
	2966	GET_RE_DEBUG_FLAGS_DECL;
	2967
	2968	PERL_ARGS_ASSERT_STUDY_CHUNK;
	2969
	2970	#ifdef DEBUGGING
	2971	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	2972	#endif
	2973
	2974	if ( depth == 0 ) {
	2975	while (first_non_open && OP(first_non_open) == OPEN)
	2976	first_non_open=regnext(first_non_open);
	2977	}
	2978
	2979
	2980	fake_study_recurse:
	2981	while ( scan && OP(scan) != END && scan < last ){
	2982	UV min_subtract = 0; /* How much to subtract from the minimum node
	2983	length to get a real minimum (because the
	2984	folded version may be shorter) */
	2985	bool has_exactf_sharp_s = FALSE;
	2986	/* Peephole optimizer: */
	2987	DEBUG_STUDYDATA("Peep:", data,depth);
	2988	DEBUG_PEEP("Peep",scan,depth);
	2989
	2990	/* Its not clear to khw or hv why this is done here, and not in the
	2991	* clauses that deal with EXACT nodes. khw's guess is that it's
	2992	* because of a previous design */
	2993	JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
	2994
	2995	/* Follow the next-chain of the current node and optimize
	2996	away all the NOTHINGs from it. */
	2997	if (OP(scan) != CURLYX) {
	2998	const int max = (reg_off_by_arg[OP(scan)]
	2999	? I32_MAX
	3000	/* I32 may be smaller than U16 on CRAYs! */
	3001	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	3002	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	3003	int noff;
	3004	regnode *n = scan;
	3005
	3006	/* Skip NOTHING and LONGJMP. */
	3007	while ((n = regnext(n))
	3008	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	3009	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	3010	&& off + noff < max)
	3011	off += noff;
	3012	if (reg_off_by_arg[OP(scan)])
	3013	ARG(scan) = off;
	3014	else
	3015	NEXT_OFF(scan) = off;
	3016	}
	3017
	3018
	3019
	3020	/* The principal pseudo-switch. Cannot be a switch, since we
	3021	look into several different things. */
	3022	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	3023	\|\| OP(scan) == IFTHEN) {
	3024	next = regnext(scan);
	3025	code = OP(scan);
	3026	/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
	3027
	3028	if (OP(next) == code \|\| code == IFTHEN) {
	3029	/* NOTE - There is similar code to this block below for handling
	3030	TRIE nodes on a re-study. If you change stuff here check there
	3031	too. */
	3032	I32 max1 = 0, min1 = I32_MAX, num = 0;
	3033	struct regnode_charclass_class accum;
	3034	regnode * const startbranch=scan;
	3035
	3036	if (flags & SCF_DO_SUBSTR)
	3037	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
	3038	if (flags & SCF_DO_STCLASS)
	3039	cl_init_zero(pRExC_state, &accum);
	3040
	3041	while (OP(scan) == code) {
	3042	I32 deltanext, minnext, f = 0, fake;
	3043	struct regnode_charclass_class this_class;
	3044
	3045	num++;
	3046	data_fake.flags = 0;
	3047	if (data) {
	3048	data_fake.whilem_c = data->whilem_c;
	3049	data_fake.last_closep = data->last_closep;
	3050	}
	3051	else
	3052	data_fake.last_closep = &fake;
	3053
	3054	data_fake.pos_delta = delta;
	3055	next = regnext(scan);
	3056	scan = NEXTOPER(scan);
	3057	if (code != BRANCH)
	3058	scan = NEXTOPER(scan);
	3059	if (flags & SCF_DO_STCLASS) {
	3060	cl_init(pRExC_state, &this_class);
	3061	data_fake.start_class = &this_class;
	3062	f = SCF_DO_STCLASS_AND;
	3063	}
	3064	if (flags & SCF_WHILEM_VISITED_POS)
	3065	f \|= SCF_WHILEM_VISITED_POS;
	3066
	3067	/* we suppose the run is continuous, last=next...*/
	3068	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3069	next, &data_fake,
	3070	stopparen, recursed, NULL, f,depth+1);
	3071	if (min1 > minnext)
	3072	min1 = minnext;
	3073	if (max1 < minnext + deltanext)
	3074	max1 = minnext + deltanext;
	3075	if (deltanext == I32_MAX)
	3076	is_inf = is_inf_internal = 1;
	3077	scan = next;
	3078	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3079	pars++;
	3080	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	3081	if ( stopmin > minnext)
	3082	stopmin = min + min1;
	3083	flags &= ~SCF_DO_SUBSTR;
	3084	if (data)
	3085	data->flags \|= SCF_SEEN_ACCEPT;
	3086	}
	3087	if (data) {
	3088	if (data_fake.flags & SF_HAS_EVAL)
	3089	data->flags \|= SF_HAS_EVAL;
	3090	data->whilem_c = data_fake.whilem_c;
	3091	}
	3092	if (flags & SCF_DO_STCLASS)
	3093	cl_or(pRExC_state, &accum, &this_class);
	3094	}
	3095	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	3096	min1 = 0;
	3097	if (flags & SCF_DO_SUBSTR) {
	3098	data->pos_min += min1;
	3099	data->pos_delta += max1 - min1;
	3100	if (max1 != min1 \|\| is_inf)
	3101	data->longest = &(data->longest_float);
	3102	}
	3103	min += min1;
	3104	delta += max1 - min1;
	3105	if (flags & SCF_DO_STCLASS_OR) {
	3106	cl_or(pRExC_state, data->start_class, &accum);
	3107	if (min1) {
	3108	cl_and(data->start_class, and_withp);
	3109	flags &= ~SCF_DO_STCLASS;
	3110	}
	3111	}
	3112	else if (flags & SCF_DO_STCLASS_AND) {
	3113	if (min1) {
	3114	cl_and(data->start_class, &accum);
	3115	flags &= ~SCF_DO_STCLASS;
	3116	}
	3117	else {
	3118	/* Switch to OR mode: cache the old value of
	3119	* data->start_class */
	3120	INIT_AND_WITHP;
	3121	StructCopy(data->start_class, and_withp,
	3122	struct regnode_charclass_class);
	3123	flags &= ~SCF_DO_STCLASS_AND;
	3124	StructCopy(&accum, data->start_class,
	3125	struct regnode_charclass_class);
	3126	flags \|= SCF_DO_STCLASS_OR;
	3127	data->start_class->flags \|= ANYOF_EOS;
	3128	}
	3129	}
	3130
	3131	if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
	3132	/* demq.
	3133
	3134	Assuming this was/is a branch we are dealing with: 'scan' now
	3135	points at the item that follows the branch sequence, whatever
	3136	it is. We now start at the beginning of the sequence and look
	3137	for subsequences of
	3138
	3139	BRANCH->EXACT=>x1
	3140	BRANCH->EXACT=>x2
	3141	tail
	3142
	3143	which would be constructed from a pattern like /A\|LIST\|OF\|WORDS/
	3144
	3145	If we can find such a subsequence we need to turn the first
	3146	element into a trie and then add the subsequent branch exact
	3147	strings to the trie.
	3148
	3149	We have two cases
	3150
	3151	1. patterns where the whole set of branches can be converted.
	3152
	3153	2. patterns where only a subset can be converted.
	3154
	3155	In case 1 we can replace the whole set with a single regop
	3156	for the trie. In case 2 we need to keep the start and end
	3157	branches so
	3158
	3159	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	3160	becomes BRANCH TRIE; BRANCH X;
	3161
	3162	There is an additional case, that being where there is a
	3163	common prefix, which gets split out into an EXACT like node
	3164	preceding the TRIE node.
	3165
	3166	If x(1..n)==tail then we can do a simple trie, if not we make
	3167	a "jump" trie, such that when we match the appropriate word
	3168	we "jump" to the appropriate tail node. Essentially we turn
	3169	a nested if into a case structure of sorts.
	3170
	3171	*/
	3172
	3173	int made=0;
	3174	if (!re_trie_maxbuff) {
	3175	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	3176	if (!SvIOK(re_trie_maxbuff))
	3177	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	3178	}
	3179	if ( SvIV(re_trie_maxbuff)>=0 ) {
	3180	regnode *cur;
	3181	regnode first = (regnode )NULL;
	3182	regnode last = (regnode )NULL;
	3183	regnode *tail = scan;
	3184	U8 optype = 0;
	3185	U32 count=0;
	3186
	3187	#ifdef DEBUGGING
	3188	SV * const mysv = sv_newmortal(); /* for dumping */
	3189	#endif
	3190	/* var tail is used because there may be a TAIL
	3191	regop in the way. Ie, the exacts will point to the
	3192	thing following the TAIL, but the last branch will
	3193	point at the TAIL. So we advance tail. If we
	3194	have nested (?:) we may have to move through several
	3195	tails.
	3196	*/
	3197
	3198	while ( OP( tail ) == TAIL ) {
	3199	/* this is the TAIL generated by (?:) */
	3200	tail = regnext( tail );
	3201	}
	3202
	3203
	3204	DEBUG_OPTIMISE_r({
	3205	regprop(RExC_rx, mysv, tail );
	3206	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	3207	(int)depth * 2 + 2, "",
	3208	"Looking for TRIE'able sequences. Tail node is: ",
	3209	SvPV_nolen_const( mysv )
	3210	);
	3211	});
	3212
	3213	/*
	3214
	3215	step through the branches, cur represents each
	3216	branch, noper is the first thing to be matched
	3217	as part of that branch and noper_next is the
	3218	regnext() of that node. if noper is an EXACT
	3219	and noper_next is the same as scan (our current
	3220	position in the regex) then the EXACT branch is
	3221	a possible optimization target. Once we have
	3222	two or more consecutive such branches we can
	3223	create a trie of the EXACT's contents and stich
	3224	it in place. If the sequence represents all of
	3225	the branches we eliminate the whole thing and
	3226	replace it with a single TRIE. If it is a
	3227	subsequence then we need to stitch it in. This
	3228	means the first branch has to remain, and needs
	3229	to be repointed at the item on the branch chain
	3230	following the last branch optimized. This could
	3231	be either a BRANCH, in which case the
	3232	subsequence is internal, or it could be the
	3233	item following the branch sequence in which
	3234	case the subsequence is at the end.
	3235
	3236	*/
	3237
	3238	/* dont use tail as the end marker for this traverse */
	3239	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	3240	regnode * const noper = NEXTOPER( cur );
	3241	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	3242	regnode * const noper_next = regnext( noper );
	3243	#endif
	3244
	3245	DEBUG_OPTIMISE_r({
	3246	regprop(RExC_rx, mysv, cur);
	3247	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	3248	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	3249
	3250	regprop(RExC_rx, mysv, noper);
	3251	PerlIO_printf( Perl_debug_log, " -> %s",
	3252	SvPV_nolen_const(mysv));
	3253
	3254	if ( noper_next ) {
	3255	regprop(RExC_rx, mysv, noper_next );
	3256	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	3257	SvPV_nolen_const(mysv));
	3258	}
	3259	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
	3260	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
	3261	});
	3262	if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
	3263	: PL_regkind[ OP( noper ) ] == EXACT )
	3264	\|\| OP(noper) == NOTHING )
	3265	#ifdef NOJUMPTRIE
	3266	&& noper_next == tail
	3267	#endif
	3268	&& count < U16_MAX)
	3269	{
	3270	count++;
	3271	if ( !first \|\| optype == NOTHING ) {
	3272	if (!first) first = cur;
	3273	optype = OP( noper );
	3274	} else {
	3275	last = cur;
	3276	}
	3277	} else {
	3278	/*
	3279	Currently the trie logic handles case insensitive matching properly only
	3280	when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
	3281	semantics).
	3282
	3283	If/when this is fixed the following define can be swapped
	3284	in below to fully enable trie logic.
	3285
	3286	#define TRIE_TYPE_IS_SAFE 1
	3287
	3288	Note that join_exact() assumes that the other types of EXACTFish nodes are not
	3289	used in tries, so that would have to be updated if this changed
	3290
	3291	*/
	3292	#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) \|\| optype==EXACT)
	3293
	3294	if ( last && TRIE_TYPE_IS_SAFE ) {
	3295	make_trie( pRExC_state,
	3296	startbranch, first, cur, tail, count,
	3297	optype, depth+1 );
	3298	}
	3299	if ( PL_regkind[ OP( noper ) ] == EXACT
	3300	#ifdef NOJUMPTRIE
	3301	&& noper_next == tail
	3302	#endif
	3303	){
	3304	count = 1;
	3305	first = cur;
	3306	optype = OP( noper );
	3307	} else {
	3308	count = 0;
	3309	first = NULL;
	3310	optype = 0;
	3311	}
	3312	last = NULL;
	3313	}
	3314	}
	3315	DEBUG_OPTIMISE_r({
	3316	regprop(RExC_rx, mysv, cur);
	3317	PerlIO_printf( Perl_debug_log,
	3318	"%s- %s (%d) <SCAN FINISHED>\n", (int)depth 2 + 2,
	3319	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3320
	3321	});
	3322
	3323	if ( last && TRIE_TYPE_IS_SAFE ) {
	3324	made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
	3325	#ifdef TRIE_STUDY_OPT
	3326	if ( ((made == MADE_EXACT_TRIE &&
	3327	startbranch == first)
	3328	\|\| ( first_non_open == first )) &&
	3329	depth==0 ) {
	3330	flags \|= SCF_TRIE_RESTUDY;
	3331	if ( startbranch == first
	3332	&& scan == tail )
	3333	{
	3334	RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
	3335	}
	3336	}
	3337	#endif
	3338	}
	3339	}
	3340
	3341	} /* do trie */
	3342
	3343	}
	3344	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	3345	scan = NEXTOPER(NEXTOPER(scan));
	3346	} else /* single branch is optimized. */
	3347	scan = NEXTOPER(scan);
	3348	continue;
	3349	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	3350	scan_frame *newframe = NULL;
	3351	I32 paren;
	3352	regnode *start;
	3353	regnode *end;
	3354
	3355	if (OP(scan) != SUSPEND) {
	3356	/* set the pointer */
	3357	if (OP(scan) == GOSUB) {
	3358	paren = ARG(scan);
	3359	RExC_recurse[ARG2L(scan)] = scan;
	3360	start = RExC_open_parens[paren-1];
	3361	end = RExC_close_parens[paren-1];
	3362	} else {
	3363	paren = 0;
	3364	start = RExC_rxi->program + 1;
	3365	end = RExC_opend;
	3366	}
	3367	if (!recursed) {
	3368	Newxz(recursed, (((RExC_npar)>>3) +1), U8);
	3369	SAVEFREEPV(recursed);
	3370	}
	3371	if (!PAREN_TEST(recursed,paren+1)) {
	3372	PAREN_SET(recursed,paren+1);
	3373	Newx(newframe,1,scan_frame);
	3374	} else {
	3375	if (flags & SCF_DO_SUBSTR) {
	3376	SCAN_COMMIT(pRExC_state,data,minlenp);
	3377	data->longest = &(data->longest_float);
	3378	}
	3379	is_inf = is_inf_internal = 1;
	3380	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3381	cl_anything(pRExC_state, data->start_class);
	3382	flags &= ~SCF_DO_STCLASS;
	3383	}
	3384	} else {
	3385	Newx(newframe,1,scan_frame);
	3386	paren = stopparen;
	3387	start = scan+2;
	3388	end = regnext(scan);
	3389	}
	3390	if (newframe) {
	3391	assert(start);
	3392	assert(end);
	3393	SAVEFREEPV(newframe);
	3394	newframe->next = regnext(scan);
	3395	newframe->last = last;
	3396	newframe->stop = stopparen;
	3397	newframe->prev = frame;
	3398
	3399	frame = newframe;
	3400	scan = start;
	3401	stopparen = paren;
	3402	last = end;
	3403
	3404	continue;
	3405	}
	3406	}
	3407	else if (OP(scan) == EXACT) {
	3408	I32 l = STR_LEN(scan);
	3409	UV uc;
	3410	if (UTF) {
	3411	const U8 * const s = (U8*)STRING(scan);
	3412	l = utf8_length(s, s + l);
	3413	uc = utf8_to_uvchr(s, NULL);
	3414	} else {
	3415	uc = ((U8)STRING(scan));
	3416	}
	3417	min += l;
	3418	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	3419	/* The code below prefers earlier match for fixed
	3420	offset, later match for variable offset. */
	3421	if (data->last_end == -1) { /* Update the start info. */
	3422	data->last_start_min = data->pos_min;
	3423	data->last_start_max = is_inf
	3424	? I32_MAX : data->pos_min + data->pos_delta;
	3425	}
	3426	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	3427	if (UTF)
	3428	SvUTF8_on(data->last_found);
	3429	{
	3430	SV * const sv = data->last_found;
	3431	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3432	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3433	if (mg && mg->mg_len >= 0)
	3434	mg->mg_len += utf8_length((U8*)STRING(scan),
	3435	(U8*)STRING(scan)+STR_LEN(scan));
	3436	}
	3437	data->last_end = data->pos_min + l;
	3438	data->pos_min += l; /* As in the first entry. */
	3439	data->flags &= ~SF_BEFORE_EOL;
	3440	}
	3441	if (flags & SCF_DO_STCLASS_AND) {
	3442	/* Check whether it is compatible with what we know already! */
	3443	int compat = 1;
	3444
	3445
	3446	/* If compatible, we or it in below. It is compatible if is
	3447	* in the bitmp and either 1) its bit or its fold is set, or 2)
	3448	* it's for a locale. Even if there isn't unicode semantics
	3449	* here, at runtime there may be because of matching against a
	3450	* utf8 string, so accept a possible false positive for
	3451	* latin1-range folds */
	3452	if (uc >= 0x100 \|\|
	3453	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3454	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3455	&& (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
	3456	\|\| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3457	)
	3458	{
	3459	compat = 0;
	3460	}
	3461	ANYOF_CLASS_ZERO(data->start_class);
	3462	ANYOF_BITMAP_ZERO(data->start_class);
	3463	if (compat)
	3464	ANYOF_BITMAP_SET(data->start_class, uc);
	3465	else if (uc >= 0x100) {
	3466	int i;
	3467
	3468	/* Some Unicode code points fold to the Latin1 range; as
	3469	* XXX temporary code, instead of figuring out if this is
	3470	* one, just assume it is and set all the start class bits
	3471	* that could be some such above 255 code point's fold
	3472	* which will generate fals positives. As the code
	3473	* elsewhere that does compute the fold settles down, it
	3474	* can be extracted out and re-used here */
	3475	for (i = 0; i < 256; i++){
	3476	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3477	ANYOF_BITMAP_SET(data->start_class, i);
	3478	}
	3479	}
	3480	}
	3481	data->start_class->flags &= ~ANYOF_EOS;
	3482	if (uc < 0x100)
	3483	data->start_class->flags &= ~ANYOF_UNICODE_ALL;
	3484	}
	3485	else if (flags & SCF_DO_STCLASS_OR) {
	3486	/* false positive possible if the class is case-folded */
	3487	if (uc < 0x100)
	3488	ANYOF_BITMAP_SET(data->start_class, uc);
	3489	else
	3490	data->start_class->flags \|= ANYOF_UNICODE_ALL;
	3491	data->start_class->flags &= ~ANYOF_EOS;
	3492	cl_and(data->start_class, and_withp);
	3493	}
	3494	flags &= ~SCF_DO_STCLASS;
	3495	}
	3496	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
	3497	I32 l = STR_LEN(scan);
	3498	UV uc = ((U8)STRING(scan));
	3499
	3500	/* Search for fixed substrings supports EXACT only. */
	3501	if (flags & SCF_DO_SUBSTR) {
	3502	assert(data);
	3503	SCAN_COMMIT(pRExC_state, data, minlenp);
	3504	}
	3505	if (UTF) {
	3506	const U8 * const s = (U8 *)STRING(scan);
	3507	l = utf8_length(s, s + l);
	3508	uc = utf8_to_uvchr(s, NULL);
	3509	}
	3510	else if (has_exactf_sharp_s) {
	3511	RExC_seen \|= REG_SEEN_EXACTF_SHARP_S;
	3512	}
	3513	min += l - min_subtract;
	3514	if (min < 0) {
	3515	min = 0;
	3516	}
	3517	delta += min_subtract;
	3518	if (flags & SCF_DO_SUBSTR) {
	3519	data->pos_min += l - min_subtract;
	3520	if (data->pos_min < 0) {
	3521	data->pos_min = 0;
	3522	}
	3523	data->pos_delta += min_subtract;
	3524	if (min_subtract) {
	3525	data->longest = &(data->longest_float);
	3526	}
	3527	}
	3528	if (flags & SCF_DO_STCLASS_AND) {
	3529	/* Check whether it is compatible with what we know already! */
	3530	int compat = 1;
	3531	if (uc >= 0x100 \|\|
	3532	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3533	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3534	&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3535	{
	3536	compat = 0;
	3537	}
	3538	ANYOF_CLASS_ZERO(data->start_class);
	3539	ANYOF_BITMAP_ZERO(data->start_class);
	3540	if (compat) {
	3541	ANYOF_BITMAP_SET(data->start_class, uc);
	3542	data->start_class->flags &= ~ANYOF_EOS;
	3543	data->start_class->flags \|= ANYOF_LOC_NONBITMAP_FOLD;
	3544	if (OP(scan) == EXACTFL) {
	3545	/* XXX This set is probably no longer necessary, and
	3546	* probably wrong as LOCALE now is on in the initial
	3547	* state */
	3548	data->start_class->flags \|= ANYOF_LOCALE;
	3549	}
	3550	else {
	3551
	3552	/* Also set the other member of the fold pair. In case
	3553	* that unicode semantics is called for at runtime, use
	3554	* the full latin1 fold. (Can't do this for locale,
	3555	* because not known until runtime) */
	3556	ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
	3557
	3558	/* All other (EXACTFL handled above) folds except under
	3559	* /iaa that include s, S, and sharp_s also may include
	3560	* the others */
	3561	if (OP(scan) != EXACTFA) {
	3562	if (uc == 's' \|\| uc == 'S') {
	3563	ANYOF_BITMAP_SET(data->start_class,
	3564	LATIN_SMALL_LETTER_SHARP_S);
	3565	}
	3566	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3567	ANYOF_BITMAP_SET(data->start_class, 's');
	3568	ANYOF_BITMAP_SET(data->start_class, 'S');
	3569	}
	3570	}
	3571	}
	3572	}
	3573	else if (uc >= 0x100) {
	3574	int i;
	3575	for (i = 0; i < 256; i++){
	3576	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3577	ANYOF_BITMAP_SET(data->start_class, i);
	3578	}
	3579	}
	3580	}
	3581	}
	3582	else if (flags & SCF_DO_STCLASS_OR) {
	3583	if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
	3584	/* false positive possible if the class is case-folded.
	3585	Assume that the locale settings are the same... */
	3586	if (uc < 0x100) {
	3587	ANYOF_BITMAP_SET(data->start_class, uc);
	3588	if (OP(scan) != EXACTFL) {
	3589
	3590	/* And set the other member of the fold pair, but
	3591	* can't do that in locale because not known until
	3592	* run-time */
	3593	ANYOF_BITMAP_SET(data->start_class,
	3594	PL_fold_latin1[uc]);
	3595
	3596	/* All folds except under /iaa that include s, S,
	3597	* and sharp_s also may include the others */
	3598	if (OP(scan) != EXACTFA) {
	3599	if (uc == 's' \|\| uc == 'S') {
	3600	ANYOF_BITMAP_SET(data->start_class,
	3601	LATIN_SMALL_LETTER_SHARP_S);
	3602	}
	3603	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3604	ANYOF_BITMAP_SET(data->start_class, 's');
	3605	ANYOF_BITMAP_SET(data->start_class, 'S');
	3606	}
	3607	}
	3608	}
	3609	}
	3610	data->start_class->flags &= ~ANYOF_EOS;
	3611	}
	3612	cl_and(data->start_class, and_withp);
	3613	}
	3614	flags &= ~SCF_DO_STCLASS;
	3615	}
	3616	else if (REGNODE_VARIES(OP(scan))) {
	3617	I32 mincount, maxcount, minnext, deltanext, fl = 0;
	3618	I32 f = flags, pos_before = 0;
	3619	regnode * const oscan = scan;
	3620	struct regnode_charclass_class this_class;
	3621	struct regnode_charclass_class *oclass = NULL;
	3622	I32 next_is_eval = 0;
	3623
	3624	switch (PL_regkind[OP(scan)]) {
	3625	case WHILEM: /* End of (?:...)* . */
	3626	scan = NEXTOPER(scan);
	3627	goto finish;
	3628	case PLUS:
	3629	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	3630	next = NEXTOPER(scan);
	3631	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	3632	mincount = 1;
	3633	maxcount = REG_INFTY;
	3634	next = regnext(scan);
	3635	scan = NEXTOPER(scan);
	3636	goto do_curly;
	3637	}
	3638	}
	3639	if (flags & SCF_DO_SUBSTR)
	3640	data->pos_min++;
	3641	min++;
	3642	/* Fall through. */
	3643	case STAR:
	3644	if (flags & SCF_DO_STCLASS) {
	3645	mincount = 0;
	3646	maxcount = REG_INFTY;
	3647	next = regnext(scan);
	3648	scan = NEXTOPER(scan);
	3649	goto do_curly;
	3650	}
	3651	is_inf = is_inf_internal = 1;
	3652	scan = regnext(scan);
	3653	if (flags & SCF_DO_SUBSTR) {
	3654	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
	3655	data->longest = &(data->longest_float);
	3656	}
	3657	goto optimize_curly_tail;
	3658	case CURLY:
	3659	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	3660	&& (scan->flags == stopparen))
	3661	{
	3662	mincount = 1;
	3663	maxcount = 1;
	3664	} else {
	3665	mincount = ARG1(scan);
	3666	maxcount = ARG2(scan);
	3667	}
	3668	next = regnext(scan);
	3669	if (OP(scan) == CURLYX) {
	3670	I32 lp = (data ? *(data->last_closep) : 0);
	3671	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	3672	}
	3673	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	3674	next_is_eval = (OP(scan) == EVAL);
	3675	do_curly:
	3676	if (flags & SCF_DO_SUBSTR) {
	3677	if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
	3678	pos_before = data->pos_min;
	3679	}
	3680	if (data) {
	3681	fl = data->flags;
	3682	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	3683	if (is_inf)
	3684	data->flags \|= SF_IS_INF;
	3685	}
	3686	if (flags & SCF_DO_STCLASS) {
	3687	cl_init(pRExC_state, &this_class);
	3688	oclass = data->start_class;
	3689	data->start_class = &this_class;
	3690	f \|= SCF_DO_STCLASS_AND;
	3691	f &= ~SCF_DO_STCLASS_OR;
	3692	}
	3693	/* Exclude from super-linear cache processing any {n,m}
	3694	regops for which the combination of input pos and regex
	3695	pos is not enough information to determine if a match
	3696	will be possible.
	3697
	3698	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	3699	regex pos at the \s*, the prospects for a match depend not
	3700	only on the input position but also on how many (bar\s*)
	3701	repeats into the {4,8} we are. */
	3702	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	3703	f &= ~SCF_WHILEM_VISITED_POS;
	3704
	3705	/* This will finish on WHILEM, setting scan, or on NULL: */
	3706	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3707	last, data, stopparen, recursed, NULL,
	3708	(mincount == 0
	3709	? (f & ~SCF_DO_SUBSTR) : f),depth+1);
	3710
	3711	if (flags & SCF_DO_STCLASS)
	3712	data->start_class = oclass;
	3713	if (mincount == 0 \|\| minnext == 0) {
	3714	if (flags & SCF_DO_STCLASS_OR) {
	3715	cl_or(pRExC_state, data->start_class, &this_class);
	3716	}
	3717	else if (flags & SCF_DO_STCLASS_AND) {
	3718	/* Switch to OR mode: cache the old value of
	3719	* data->start_class */
	3720	INIT_AND_WITHP;
	3721	StructCopy(data->start_class, and_withp,
	3722	struct regnode_charclass_class);
	3723	flags &= ~SCF_DO_STCLASS_AND;
	3724	StructCopy(&this_class, data->start_class,
	3725	struct regnode_charclass_class);
	3726	flags \|= SCF_DO_STCLASS_OR;
	3727	data->start_class->flags \|= ANYOF_EOS;
	3728	}
	3729	} else { /* Non-zero len */
	3730	if (flags & SCF_DO_STCLASS_OR) {
	3731	cl_or(pRExC_state, data->start_class, &this_class);
	3732	cl_and(data->start_class, and_withp);
	3733	}
	3734	else if (flags & SCF_DO_STCLASS_AND)
	3735	cl_and(data->start_class, &this_class);
	3736	flags &= ~SCF_DO_STCLASS;
	3737	}
	3738	if (!scan) /* It was not CURLYX, but CURLY. */
	3739	scan = next;
	3740	if ( /* ? quantifier ok, except for (?{ ... }) */
	3741	(next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	3742	&& (minnext == 0) && (deltanext == 0)
	3743	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	3744	&& maxcount <= REG_INFTY/3) /* Complement check for big count */
	3745	{
	3746	ckWARNreg(RExC_parse,
	3747	"Quantifier unexpected on zero-length expression");
	3748	}
	3749
	3750	min += minnext * mincount;
	3751	is_inf_internal \|= ((maxcount == REG_INFTY
	3752	&& (minnext + deltanext) > 0)
	3753	\|\| deltanext == I32_MAX);
	3754	is_inf \|= is_inf_internal;
	3755	delta += (minnext + deltanext) * maxcount - minnext * mincount;
	3756
	3757	/* Try powerful optimization CURLYX => CURLYN. */
	3758	if ( OP(oscan) == CURLYX && data
	3759	&& data->flags & SF_IN_PAR
	3760	&& !(data->flags & SF_HAS_EVAL)
	3761	&& !deltanext && minnext == 1 ) {
	3762	/* Try to optimize to CURLYN. */
	3763	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	3764	regnode * const nxt1 = nxt;
	3765	#ifdef DEBUGGING
	3766	regnode *nxt2;
	3767	#endif
	3768
	3769	/* Skip open. */
	3770	nxt = regnext(nxt);
	3771	if (!REGNODE_SIMPLE(OP(nxt))
	3772	&& !(PL_regkind[OP(nxt)] == EXACT
	3773	&& STR_LEN(nxt) == 1))
	3774	goto nogo;
	3775	#ifdef DEBUGGING
	3776	nxt2 = nxt;
	3777	#endif
	3778	nxt = regnext(nxt);
	3779	if (OP(nxt) != CLOSE)
	3780	goto nogo;
	3781	if (RExC_open_parens) {
	3782	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3783	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	3784	}
	3785	/* Now we know that nxt2 is the only contents: */
	3786	oscan->flags = (U8)ARG(nxt);
	3787	OP(oscan) = CURLYN;
	3788	OP(nxt1) = NOTHING; /* was OPEN. */
	3789
	3790	#ifdef DEBUGGING
	3791	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3792	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	3793	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	3794	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3795	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3796	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	3797	#endif
	3798	}
	3799	nogo:
	3800
	3801	/* Try optimization CURLYX => CURLYM. */
	3802	if ( OP(oscan) == CURLYX && data
	3803	&& !(data->flags & SF_HAS_PAR)
	3804	&& !(data->flags & SF_HAS_EVAL)
	3805	&& !deltanext /* atom is fixed width */
	3806	&& minnext != 0 /* CURLYM can't handle zero width */
	3807	) {
	3808	/* XXXX How to optimize if data == 0? */
	3809	/* Optimize to a simpler form. */
	3810	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	3811	regnode *nxt2;
	3812
	3813	OP(oscan) = CURLYM;
	3814	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	3815	&& (OP(nxt2) != WHILEM))
	3816	nxt = nxt2;
	3817	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	3818	/* Need to optimize away parenths. */
	3819	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	3820	/* Set the parenth number. */
	3821	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	3822
	3823	oscan->flags = (U8)ARG(nxt);
	3824	if (RExC_open_parens) {
	3825	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3826	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	3827	}
	3828	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	3829	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3830
	3831	#ifdef DEBUGGING
	3832	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3833	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3834	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	3835	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	3836	#endif
	3837	#if 0
	3838	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	3839	regnode *nnxt = regnext(nxt1);
	3840	if (nnxt == nxt) {
	3841	if (reg_off_by_arg[OP(nxt1)])
	3842	ARG_SET(nxt1, nxt2 - nxt1);
	3843	else if (nxt2 - nxt1 < U16_MAX)
	3844	NEXT_OFF(nxt1) = nxt2 - nxt1;
	3845	else
	3846	OP(nxt) = NOTHING; /* Cannot beautify */
	3847	}
	3848	nxt1 = nnxt;
	3849	}
	3850	#endif
	3851	/* Optimize again: */
	3852	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	3853	NULL, stopparen, recursed, NULL, 0,depth+1);
	3854	}
	3855	else
	3856	oscan->flags = 0;
	3857	}
	3858	else if ((OP(oscan) == CURLYX)
	3859	&& (flags & SCF_WHILEM_VISITED_POS)
	3860	/* See the comment on a similar expression above.
	3861	However, this time it's not a subexpression
	3862	we care about, but the expression itself. */
	3863	&& (maxcount == REG_INFTY)
	3864	&& data && ++data->whilem_c < 16) {
	3865	/* This stays as CURLYX, we can put the count/of pair. */
	3866	/* Find WHILEM (as in regexec.c) */
	3867	regnode *nxt = oscan + NEXT_OFF(oscan);
	3868
	3869	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	3870	nxt += ARG(nxt);
	3871	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	3872	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	3873	}
	3874	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	3875	pars++;
	3876	if (flags & SCF_DO_SUBSTR) {
	3877	SV *last_str = NULL;
	3878	int counted = mincount != 0;
	3879
	3880	if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
	3881	#if defined(SPARC64_GCC_WORKAROUND)
	3882	I32 b = 0;
	3883	STRLEN l = 0;
	3884	const char *s = NULL;
	3885	I32 old = 0;
	3886
	3887	if (pos_before >= data->last_start_min)
	3888	b = pos_before;
	3889	else
	3890	b = data->last_start_min;
	3891
	3892	l = 0;
	3893	s = SvPV_const(data->last_found, l);
	3894	old = b - data->last_start_min;
	3895
	3896	#else
	3897	I32 b = pos_before >= data->last_start_min
	3898	? pos_before : data->last_start_min;
	3899	STRLEN l;
	3900	const char * const s = SvPV_const(data->last_found, l);
	3901	I32 old = b - data->last_start_min;
	3902	#endif
	3903
	3904	if (UTF)
	3905	old = utf8_hop((U8)s, old) - (U8)s;
	3906	l -= old;
	3907	/* Get the added string: */
	3908	last_str = newSVpvn_utf8(s + old, l, UTF);
	3909	if (deltanext == 0 && pos_before == b) {
	3910	/* What was added is a constant string */
	3911	if (mincount > 1) {
	3912	SvGROW(last_str, (mincount * l) + 1);
	3913	repeatcpy(SvPVX(last_str) + l,
	3914	SvPVX_const(last_str), l, mincount - 1);
	3915	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	3916	/* Add additional parts. */
	3917	SvCUR_set(data->last_found,
	3918	SvCUR(data->last_found) - l);
	3919	sv_catsv(data->last_found, last_str);
	3920	{
	3921	SV * sv = data->last_found;
	3922	MAGIC *mg =
	3923	SvUTF8(sv) && SvMAGICAL(sv) ?
	3924	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3925	if (mg && mg->mg_len >= 0)
	3926	mg->mg_len += CHR_SVLEN(last_str) - l;
	3927	}
	3928	data->last_end += l * (mincount - 1);
	3929	}
	3930	} else {
	3931	/* start offset must point into the last copy */
	3932	data->last_start_min += minnext * (mincount - 1);
	3933	data->last_start_max += is_inf ? I32_MAX
	3934	: (maxcount - 1) * (minnext + data->pos_delta);
	3935	}
	3936	}
	3937	/* It is counted once already... */
	3938	data->pos_min += minnext * (mincount - counted);
	3939	data->pos_delta += - counted * deltanext +
	3940	(minnext + deltanext) * maxcount - minnext * mincount;
	3941	if (mincount != maxcount) {
	3942	/* Cannot extend fixed substrings found inside
	3943	the group. */
	3944	SCAN_COMMIT(pRExC_state,data,minlenp);
	3945	if (mincount && last_str) {
	3946	SV * const sv = data->last_found;
	3947	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3948	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3949
	3950	if (mg)
	3951	mg->mg_len = -1;
	3952	sv_setsv(sv, last_str);
	3953	data->last_end = data->pos_min;
	3954	data->last_start_min =
	3955	data->pos_min - CHR_SVLEN(last_str);
	3956	data->last_start_max = is_inf
	3957	? I32_MAX
	3958	: data->pos_min + data->pos_delta
	3959	- CHR_SVLEN(last_str);
	3960	}
	3961	data->longest = &(data->longest_float);
	3962	}
	3963	SvREFCNT_dec(last_str);
	3964	}
	3965	if (data && (fl & SF_HAS_EVAL))
	3966	data->flags \|= SF_HAS_EVAL;
	3967	optimize_curly_tail:
	3968	if (OP(oscan) != CURLYX) {
	3969	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	3970	&& NEXT_OFF(next))
	3971	NEXT_OFF(oscan) += NEXT_OFF(next);
	3972	}
	3973	continue;
	3974	default: /* REF, ANYOFV, and CLUMP only? */
	3975	if (flags & SCF_DO_SUBSTR) {
	3976	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3977	data->longest = &(data->longest_float);
	3978	}
	3979	is_inf = is_inf_internal = 1;
	3980	if (flags & SCF_DO_STCLASS_OR)
	3981	cl_anything(pRExC_state, data->start_class);
	3982	flags &= ~SCF_DO_STCLASS;
	3983	break;
	3984	}
	3985	}
	3986	else if (OP(scan) == LNBREAK) {
	3987	if (flags & SCF_DO_STCLASS) {
	3988	int value = 0;
	3989	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3990	if (flags & SCF_DO_STCLASS_AND) {
	3991	for (value = 0; value < 256; value++)
	3992	if (!is_VERTWS_cp(value))
	3993	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3994	}
	3995	else {
	3996	for (value = 0; value < 256; value++)
	3997	if (is_VERTWS_cp(value))
	3998	ANYOF_BITMAP_SET(data->start_class, value);
	3999	}
	4000	if (flags & SCF_DO_STCLASS_OR)
	4001	cl_and(data->start_class, and_withp);
	4002	flags &= ~SCF_DO_STCLASS;
	4003	}
	4004	min += 1;
	4005	delta += 1;
	4006	if (flags & SCF_DO_SUBSTR) {
	4007	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4008	data->pos_min += 1;
	4009	data->pos_delta += 1;
	4010	data->longest = &(data->longest_float);
	4011	}
	4012	}
	4013	else if (REGNODE_SIMPLE(OP(scan))) {
	4014	int value = 0;
	4015
	4016	if (flags & SCF_DO_SUBSTR) {
	4017	SCAN_COMMIT(pRExC_state,data,minlenp);
	4018	data->pos_min++;
	4019	}
	4020	min++;
	4021	if (flags & SCF_DO_STCLASS) {
	4022	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	4023
	4024	/* Some of the logic below assumes that switching
	4025	locale on will only add false positives. */
	4026	switch (PL_regkind[OP(scan)]) {
	4027	case SANY:
	4028	default:
	4029	do_default:
	4030	/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
	4031	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4032	cl_anything(pRExC_state, data->start_class);
	4033	break;
	4034	case REG_ANY:
	4035	if (OP(scan) == SANY)
	4036	goto do_default;
	4037	if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
	4038	value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
	4039	\|\| ANYOF_CLASS_TEST_ANY_SET(data->start_class));
	4040	cl_anything(pRExC_state, data->start_class);
	4041	}
	4042	if (flags & SCF_DO_STCLASS_AND \|\| !value)
	4043	ANYOF_BITMAP_CLEAR(data->start_class,'\n');
	4044	break;
	4045	case ANYOF:
	4046	if (flags & SCF_DO_STCLASS_AND)
	4047	cl_and(data->start_class,
	4048	(struct regnode_charclass_class*)scan);
	4049	else
	4050	cl_or(pRExC_state, data->start_class,
	4051	(struct regnode_charclass_class*)scan);
	4052	break;
	4053	case ALNUM:
	4054	if (flags & SCF_DO_STCLASS_AND) {
	4055	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4056	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
	4057	if (OP(scan) == ALNUMU) {
	4058	for (value = 0; value < 256; value++) {
	4059	if (!isWORDCHAR_L1(value)) {
	4060	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4061	}
	4062	}
	4063	} else {
	4064	for (value = 0; value < 256; value++) {
	4065	if (!isALNUM(value)) {
	4066	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4067	}
	4068	}
	4069	}
	4070	}
	4071	}
	4072	else {
	4073	if (data->start_class->flags & ANYOF_LOCALE)
	4074	ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
	4075
	4076	/* Even if under locale, set the bits for non-locale
	4077	* in case it isn't a true locale-node. This will
	4078	* create false positives if it truly is locale */
	4079	if (OP(scan) == ALNUMU) {
	4080	for (value = 0; value < 256; value++) {
	4081	if (isWORDCHAR_L1(value)) {
	4082	ANYOF_BITMAP_SET(data->start_class, value);
	4083	}
	4084	}
	4085	} else {
	4086	for (value = 0; value < 256; value++) {
	4087	if (isALNUM(value)) {
	4088	ANYOF_BITMAP_SET(data->start_class, value);
	4089	}
	4090	}
	4091	}
	4092	}
	4093	break;
	4094	case NALNUM:
	4095	if (flags & SCF_DO_STCLASS_AND) {
	4096	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4097	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
	4098	if (OP(scan) == NALNUMU) {
	4099	for (value = 0; value < 256; value++) {
	4100	if (isWORDCHAR_L1(value)) {
	4101	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4102	}
	4103	}
	4104	} else {
	4105	for (value = 0; value < 256; value++) {
	4106	if (isALNUM(value)) {
	4107	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4108	}
	4109	}
	4110	}
	4111	}
	4112	}
	4113	else {
	4114	if (data->start_class->flags & ANYOF_LOCALE)
	4115	ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
	4116
	4117	/* Even if under locale, set the bits for non-locale in
	4118	* case it isn't a true locale-node. This will create
	4119	* false positives if it truly is locale */
	4120	if (OP(scan) == NALNUMU) {
	4121	for (value = 0; value < 256; value++) {
	4122	if (! isWORDCHAR_L1(value)) {
	4123	ANYOF_BITMAP_SET(data->start_class, value);
	4124	}
	4125	}
	4126	} else {
	4127	for (value = 0; value < 256; value++) {
	4128	if (! isALNUM(value)) {
	4129	ANYOF_BITMAP_SET(data->start_class, value);
	4130	}
	4131	}
	4132	}
	4133	}
	4134	break;
	4135	case SPACE:
	4136	if (flags & SCF_DO_STCLASS_AND) {
	4137	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4138	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
	4139	if (OP(scan) == SPACEU) {
	4140	for (value = 0; value < 256; value++) {
	4141	if (!isSPACE_L1(value)) {
	4142	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4143	}
	4144	}
	4145	} else {
	4146	for (value = 0; value < 256; value++) {
	4147	if (!isSPACE(value)) {
	4148	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4149	}
	4150	}
	4151	}
	4152	}
	4153	}
	4154	else {
	4155	if (data->start_class->flags & ANYOF_LOCALE) {
	4156	ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
	4157	}
	4158	if (OP(scan) == SPACEU) {
	4159	for (value = 0; value < 256; value++) {
	4160	if (isSPACE_L1(value)) {
	4161	ANYOF_BITMAP_SET(data->start_class, value);
	4162	}
	4163	}
	4164	} else {
	4165	for (value = 0; value < 256; value++) {
	4166	if (isSPACE(value)) {
	4167	ANYOF_BITMAP_SET(data->start_class, value);
	4168	}
	4169	}
	4170	}
	4171	}
	4172	break;
	4173	case NSPACE:
	4174	if (flags & SCF_DO_STCLASS_AND) {
	4175	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4176	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
	4177	if (OP(scan) == NSPACEU) {
	4178	for (value = 0; value < 256; value++) {
	4179	if (isSPACE_L1(value)) {
	4180	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4181	}
	4182	}
	4183	} else {
	4184	for (value = 0; value < 256; value++) {
	4185	if (isSPACE(value)) {
	4186	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4187	}
	4188	}
	4189	}
	4190	}
	4191	}
	4192	else {
	4193	if (data->start_class->flags & ANYOF_LOCALE)
	4194	ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
	4195	if (OP(scan) == NSPACEU) {
	4196	for (value = 0; value < 256; value++) {
	4197	if (!isSPACE_L1(value)) {
	4198	ANYOF_BITMAP_SET(data->start_class, value);
	4199	}
	4200	}
	4201	}
	4202	else {
	4203	for (value = 0; value < 256; value++) {
	4204	if (!isSPACE(value)) {
	4205	ANYOF_BITMAP_SET(data->start_class, value);
	4206	}
	4207	}
	4208	}
	4209	}
	4210	break;
	4211	case DIGIT:
	4212	if (flags & SCF_DO_STCLASS_AND) {
	4213	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4214	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
	4215	for (value = 0; value < 256; value++)
	4216	if (!isDIGIT(value))
	4217	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4218	}
	4219	}
	4220	else {
	4221	if (data->start_class->flags & ANYOF_LOCALE)
	4222	ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
	4223	for (value = 0; value < 256; value++)
	4224	if (isDIGIT(value))
	4225	ANYOF_BITMAP_SET(data->start_class, value);
	4226	}
	4227	break;
	4228	case NDIGIT:
	4229	if (flags & SCF_DO_STCLASS_AND) {
	4230	if (!(data->start_class->flags & ANYOF_LOCALE))
	4231	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
	4232	for (value = 0; value < 256; value++)
	4233	if (isDIGIT(value))
	4234	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4235	}
	4236	else {
	4237	if (data->start_class->flags & ANYOF_LOCALE)
	4238	ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
	4239	for (value = 0; value < 256; value++)
	4240	if (!isDIGIT(value))
	4241	ANYOF_BITMAP_SET(data->start_class, value);
	4242	}
	4243	break;
	4244	CASE_SYNST_FNC(VERTWS);
	4245	CASE_SYNST_FNC(HORIZWS);
	4246
	4247	}
	4248	if (flags & SCF_DO_STCLASS_OR)
	4249	cl_and(data->start_class, and_withp);
	4250	flags &= ~SCF_DO_STCLASS;
	4251	}
	4252	}
	4253	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	4254	data->flags \|= (OP(scan) == MEOL
	4255	? SF_BEFORE_MEOL
	4256	: SF_BEFORE_SEOL);
	4257	}
	4258	else if ( PL_regkind[OP(scan)] == BRANCHJ
	4259	/* Lookbehind, or need to calculate parens/evals/stclass: */
	4260	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	4261	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM)) {
	4262	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4263	\|\| OP(scan) == UNLESSM )
	4264	{
	4265	/* Negative Lookahead/lookbehind
	4266	In this case we can't do fixed string optimisation.
	4267	*/
	4268
	4269	I32 deltanext, minnext, fake = 0;
	4270	regnode *nscan;
	4271	struct regnode_charclass_class intrnl;
	4272	int f = 0;
	4273
	4274	data_fake.flags = 0;
	4275	if (data) {
	4276	data_fake.whilem_c = data->whilem_c;
	4277	data_fake.last_closep = data->last_closep;
	4278	}
	4279	else
	4280	data_fake.last_closep = &fake;
	4281	data_fake.pos_delta = delta;
	4282	if ( flags & SCF_DO_STCLASS && !scan->flags
	4283	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4284	cl_init(pRExC_state, &intrnl);
	4285	data_fake.start_class = &intrnl;
	4286	f \|= SCF_DO_STCLASS_AND;
	4287	}
	4288	if (flags & SCF_WHILEM_VISITED_POS)
	4289	f \|= SCF_WHILEM_VISITED_POS;
	4290	next = regnext(scan);
	4291	nscan = NEXTOPER(NEXTOPER(scan));
	4292	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	4293	last, &data_fake, stopparen, recursed, NULL, f, depth+1);
	4294	if (scan->flags) {
	4295	if (deltanext) {
	4296	FAIL("Variable length lookbehind not implemented");
	4297	}
	4298	else if (minnext > (I32)U8_MAX) {
	4299	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4300	}
	4301	scan->flags = (U8)minnext;
	4302	}
	4303	if (data) {
	4304	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4305	pars++;
	4306	if (data_fake.flags & SF_HAS_EVAL)
	4307	data->flags \|= SF_HAS_EVAL;
	4308	data->whilem_c = data_fake.whilem_c;
	4309	}
	4310	if (f & SCF_DO_STCLASS_AND) {
	4311	if (flags & SCF_DO_STCLASS_OR) {
	4312	/* OR before, AND after: ideally we would recurse with
	4313	* data_fake to get the AND applied by study of the
	4314	* remainder of the pattern, and then derecurse;
	4315	* * HACK * for now just treat as "no information".
	4316	* See [perl #56690].
	4317	*/
	4318	cl_init(pRExC_state, data->start_class);
	4319	} else {
	4320	/* AND before and after: combine and continue */
	4321	const int was = (data->start_class->flags & ANYOF_EOS);
	4322
	4323	cl_and(data->start_class, &intrnl);
	4324	if (was)
	4325	data->start_class->flags \|= ANYOF_EOS;
	4326	}
	4327	}
	4328	}
	4329	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4330	else {
	4331	/* Positive Lookahead/lookbehind
	4332	In this case we can do fixed string optimisation,
	4333	but we must be careful about it. Note in the case of
	4334	lookbehind the positions will be offset by the minimum
	4335	length of the pattern, something we won't know about
	4336	until after the recurse.
	4337	*/
	4338	I32 deltanext, fake = 0;
	4339	regnode *nscan;
	4340	struct regnode_charclass_class intrnl;
	4341	int f = 0;
	4342	/* We use SAVEFREEPV so that when the full compile
	4343	is finished perl will clean up the allocated
	4344	minlens when it's all done. This way we don't
	4345	have to worry about freeing them when we know
	4346	they wont be used, which would be a pain.
	4347	*/
	4348	I32 *minnextp;
	4349	Newx( minnextp, 1, I32 );
	4350	SAVEFREEPV(minnextp);
	4351
	4352	if (data) {
	4353	StructCopy(data, &data_fake, scan_data_t);
	4354	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	4355	f \|= SCF_DO_SUBSTR;
	4356	if (scan->flags)
	4357	SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
	4358	data_fake.last_found=newSVsv(data->last_found);
	4359	}
	4360	}
	4361	else
	4362	data_fake.last_closep = &fake;
	4363	data_fake.flags = 0;
	4364	data_fake.pos_delta = delta;
	4365	if (is_inf)
	4366	data_fake.flags \|= SF_IS_INF;
	4367	if ( flags & SCF_DO_STCLASS && !scan->flags
	4368	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4369	cl_init(pRExC_state, &intrnl);
	4370	data_fake.start_class = &intrnl;
	4371	f \|= SCF_DO_STCLASS_AND;
	4372	}
	4373	if (flags & SCF_WHILEM_VISITED_POS)
	4374	f \|= SCF_WHILEM_VISITED_POS;
	4375	next = regnext(scan);
	4376	nscan = NEXTOPER(NEXTOPER(scan));
	4377
	4378	*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
	4379	last, &data_fake, stopparen, recursed, NULL, f,depth+1);
	4380	if (scan->flags) {
	4381	if (deltanext) {
	4382	FAIL("Variable length lookbehind not implemented");
	4383	}
	4384	else if (*minnextp > (I32)U8_MAX) {
	4385	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4386	}
	4387	scan->flags = (U8)*minnextp;
	4388	}
	4389
	4390	*minnextp += min;
	4391
	4392	if (f & SCF_DO_STCLASS_AND) {
	4393	const int was = (data->start_class->flags & ANYOF_EOS);
	4394
	4395	cl_and(data->start_class, &intrnl);
	4396	if (was)
	4397	data->start_class->flags \|= ANYOF_EOS;
	4398	}
	4399	if (data) {
	4400	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4401	pars++;
	4402	if (data_fake.flags & SF_HAS_EVAL)
	4403	data->flags \|= SF_HAS_EVAL;
	4404	data->whilem_c = data_fake.whilem_c;
	4405	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	4406	if (RExC_rx->minlen<*minnextp)
	4407	RExC_rx->minlen=*minnextp;
	4408	SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
	4409	SvREFCNT_dec(data_fake.last_found);
	4410
	4411	if ( data_fake.minlen_fixed != minlenp )
	4412	{
	4413	data->offset_fixed= data_fake.offset_fixed;
	4414	data->minlen_fixed= data_fake.minlen_fixed;
	4415	data->lookbehind_fixed+= scan->flags;
	4416	}
	4417	if ( data_fake.minlen_float != minlenp )
	4418	{
	4419	data->minlen_float= data_fake.minlen_float;
	4420	data->offset_float_min=data_fake.offset_float_min;
	4421	data->offset_float_max=data_fake.offset_float_max;
	4422	data->lookbehind_float+= scan->flags;
	4423	}
	4424	}
	4425	}
	4426
	4427
	4428	}
	4429	#endif
	4430	}
	4431	else if (OP(scan) == OPEN) {
	4432	if (stopparen != (I32)ARG(scan))
	4433	pars++;
	4434	}
	4435	else if (OP(scan) == CLOSE) {
	4436	if (stopparen == (I32)ARG(scan)) {
	4437	break;
	4438	}
	4439	if ((I32)ARG(scan) == is_par) {
	4440	next = regnext(scan);
	4441
	4442	if ( next && (OP(next) != WHILEM) && next < last)
	4443	is_par = 0; /* Disable optimization */
	4444	}
	4445	if (data)
	4446	*(data->last_closep) = ARG(scan);
	4447	}
	4448	else if (OP(scan) == EVAL) {
	4449	if (data)
	4450	data->flags \|= SF_HAS_EVAL;
	4451	}
	4452	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	4453	if (flags & SCF_DO_SUBSTR) {
	4454	SCAN_COMMIT(pRExC_state,data,minlenp);
	4455	flags &= ~SCF_DO_SUBSTR;
	4456	}
	4457	if (data && OP(scan)==ACCEPT) {
	4458	data->flags \|= SCF_SEEN_ACCEPT;
	4459	if (stopmin > min)
	4460	stopmin = min;
	4461	}
	4462	}
	4463	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	4464	{
	4465	if (flags & SCF_DO_SUBSTR) {
	4466	SCAN_COMMIT(pRExC_state,data,minlenp);
	4467	data->longest = &(data->longest_float);
	4468	}
	4469	is_inf = is_inf_internal = 1;
	4470	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4471	cl_anything(pRExC_state, data->start_class);
	4472	flags &= ~SCF_DO_STCLASS;
	4473	}
	4474	else if (OP(scan) == GPOS) {
	4475	if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
	4476	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	4477	{
	4478	if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
	4479	RExC_rx->extflags \|= RXf_ANCH_GPOS;
	4480	if (RExC_rx->gofs < (U32)min)
	4481	RExC_rx->gofs = min;
	4482	} else {
	4483	RExC_rx->extflags \|= RXf_GPOS_FLOAT;
	4484	RExC_rx->gofs = 0;
	4485	}
	4486	}
	4487	#ifdef TRIE_STUDY_OPT
	4488	#ifdef FULL_TRIE_STUDY
	4489	else if (PL_regkind[OP(scan)] == TRIE) {
	4490	/* NOTE - There is similar code to this block above for handling
	4491	BRANCH nodes on the initial study. If you change stuff here
	4492	check there too. */
	4493	regnode *trie_node= scan;
	4494	regnode *tail= regnext(scan);
	4495	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4496	I32 max1 = 0, min1 = I32_MAX;
	4497	struct regnode_charclass_class accum;
	4498
	4499	if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
	4500	SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
	4501	if (flags & SCF_DO_STCLASS)
	4502	cl_init_zero(pRExC_state, &accum);
	4503
	4504	if (!trie->jump) {
	4505	min1= trie->minlen;
	4506	max1= trie->maxlen;
	4507	} else {
	4508	const regnode *nextbranch= NULL;
	4509	U32 word;
	4510
	4511	for ( word=1 ; word <= trie->wordcount ; word++)
	4512	{
	4513	I32 deltanext=0, minnext=0, f = 0, fake;
	4514	struct regnode_charclass_class this_class;
	4515
	4516	data_fake.flags = 0;
	4517	if (data) {
	4518	data_fake.whilem_c = data->whilem_c;
	4519	data_fake.last_closep = data->last_closep;
	4520	}
	4521	else
	4522	data_fake.last_closep = &fake;
	4523	data_fake.pos_delta = delta;
	4524	if (flags & SCF_DO_STCLASS) {
	4525	cl_init(pRExC_state, &this_class);
	4526	data_fake.start_class = &this_class;
	4527	f = SCF_DO_STCLASS_AND;
	4528	}
	4529	if (flags & SCF_WHILEM_VISITED_POS)
	4530	f \|= SCF_WHILEM_VISITED_POS;
	4531
	4532	if (trie->jump[word]) {
	4533	if (!nextbranch)
	4534	nextbranch = trie_node + trie->jump[0];
	4535	scan= trie_node + trie->jump[word];
	4536	/* We go from the jump point to the branch that follows
	4537	it. Note this means we need the vestigal unused branches
	4538	even though they arent otherwise used.
	4539	*/
	4540	minnext = study_chunk(pRExC_state, &scan, minlenp,
	4541	&deltanext, (regnode *)nextbranch, &data_fake,
	4542	stopparen, recursed, NULL, f,depth+1);
	4543	}
	4544	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	4545	nextbranch= regnext((regnode*)nextbranch);
	4546
	4547	if (min1 > (I32)(minnext + trie->minlen))
	4548	min1 = minnext + trie->minlen;
	4549	if (max1 < (I32)(minnext + deltanext + trie->maxlen))
	4550	max1 = minnext + deltanext + trie->maxlen;
	4551	if (deltanext == I32_MAX)
	4552	is_inf = is_inf_internal = 1;
	4553
	4554	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4555	pars++;
	4556	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	4557	if ( stopmin > min + min1)
	4558	stopmin = min + min1;
	4559	flags &= ~SCF_DO_SUBSTR;
	4560	if (data)
	4561	data->flags \|= SCF_SEEN_ACCEPT;
	4562	}
	4563	if (data) {
	4564	if (data_fake.flags & SF_HAS_EVAL)
	4565	data->flags \|= SF_HAS_EVAL;
	4566	data->whilem_c = data_fake.whilem_c;
	4567	}
	4568	if (flags & SCF_DO_STCLASS)
	4569	cl_or(pRExC_state, &accum, &this_class);
	4570	}
	4571	}
	4572	if (flags & SCF_DO_SUBSTR) {
	4573	data->pos_min += min1;
	4574	data->pos_delta += max1 - min1;
	4575	if (max1 != min1 \|\| is_inf)
	4576	data->longest = &(data->longest_float);
	4577	}
	4578	min += min1;
	4579	delta += max1 - min1;
	4580	if (flags & SCF_DO_STCLASS_OR) {
	4581	cl_or(pRExC_state, data->start_class, &accum);
	4582	if (min1) {
	4583	cl_and(data->start_class, and_withp);
	4584	flags &= ~SCF_DO_STCLASS;
	4585	}
	4586	}
	4587	else if (flags & SCF_DO_STCLASS_AND) {
	4588	if (min1) {
	4589	cl_and(data->start_class, &accum);
	4590	flags &= ~SCF_DO_STCLASS;
	4591	}
	4592	else {
	4593	/* Switch to OR mode: cache the old value of
	4594	* data->start_class */
	4595	INIT_AND_WITHP;
	4596	StructCopy(data->start_class, and_withp,
	4597	struct regnode_charclass_class);
	4598	flags &= ~SCF_DO_STCLASS_AND;
	4599	StructCopy(&accum, data->start_class,
	4600	struct regnode_charclass_class);
	4601	flags \|= SCF_DO_STCLASS_OR;
	4602	data->start_class->flags \|= ANYOF_EOS;
	4603	}
	4604	}
	4605	scan= tail;
	4606	continue;
	4607	}
	4608	#else
	4609	else if (PL_regkind[OP(scan)] == TRIE) {
	4610	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4611	U8*bang=NULL;
	4612
	4613	min += trie->minlen;
	4614	delta += (trie->maxlen - trie->minlen);
	4615	flags &= ~SCF_DO_STCLASS; /* xxx */
	4616	if (flags & SCF_DO_SUBSTR) {
	4617	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4618	data->pos_min += trie->minlen;
	4619	data->pos_delta += (trie->maxlen - trie->minlen);
	4620	if (trie->maxlen != trie->minlen)
	4621	data->longest = &(data->longest_float);
	4622	}
	4623	if (trie->jump) /* no more substrings -- for now /grr*/
	4624	flags &= ~SCF_DO_SUBSTR;
	4625	}
	4626	#endif /* old or new */
	4627	#endif /* TRIE_STUDY_OPT */
	4628
	4629	/* Else: zero-length, ignore. */
	4630	scan = regnext(scan);
	4631	}
	4632	if (frame) {
	4633	last = frame->last;
	4634	scan = frame->next;
	4635	stopparen = frame->stop;
	4636	frame = frame->prev;
	4637	goto fake_study_recurse;
	4638	}
	4639
	4640	finish:
	4641	assert(!frame);
	4642	DEBUG_STUDYDATA("pre-fin:",data,depth);
	4643
	4644	*scanp = scan;
	4645	*deltap = is_inf_internal ? I32_MAX : delta;
	4646	if (flags & SCF_DO_SUBSTR && is_inf)
	4647	data->pos_delta = I32_MAX - data->pos_min;
	4648	if (is_par > (I32)U8_MAX)
	4649	is_par = 0;
	4650	if (is_par && pars==1 && data) {
	4651	data->flags \|= SF_IN_PAR;
	4652	data->flags &= ~SF_HAS_PAR;
	4653	}
	4654	else if (pars && data) {
	4655	data->flags \|= SF_HAS_PAR;
	4656	data->flags &= ~SF_IN_PAR;
	4657	}
	4658	if (flags & SCF_DO_STCLASS_OR)
	4659	cl_and(data->start_class, and_withp);
	4660	if (flags & SCF_TRIE_RESTUDY)
	4661	data->flags \|= SCF_TRIE_RESTUDY;
	4662
	4663	DEBUG_STUDYDATA("post-fin:",data,depth);
	4664
	4665	return min < stopmin ? min : stopmin;
	4666	}
	4667
	4668	STATIC U32
	4669	S_add_data(RExC_state_t pRExC_state, U32 n, const char s)
	4670	{
	4671	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	4672
	4673	PERL_ARGS_ASSERT_ADD_DATA;
	4674
	4675	Renewc(RExC_rxi->data,
	4676	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	4677	char, struct reg_data);
	4678	if(count)
	4679	Renew(RExC_rxi->data->what, count + n, U8);
	4680	else
	4681	Newx(RExC_rxi->data->what, n, U8);
	4682	RExC_rxi->data->count = count + n;
	4683	Copy(s, RExC_rxi->data->what + count, n, U8);
	4684	return count;
	4685	}
	4686
	4687	/XXX: todo make this not included in a non debugging perl /
	4688	#ifndef PERL_IN_XSUB_RE
	4689	void
	4690	Perl_reginitcolors(pTHX)
	4691	{
	4692	dVAR;
	4693	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	4694	if (s) {
	4695	char *t = savepv(s);
	4696	int i = 0;
	4697	PL_colors[0] = t;
	4698	while (++i < 6) {
	4699	t = strchr(t, '\t');
	4700	if (t) {
	4701	*t = '\0';
	4702	PL_colors[i] = ++t;
	4703	}
	4704	else
	4705	PL_colors[i] = t = (char *)"";
	4706	}
	4707	} else {
	4708	int i = 0;
	4709	while (i < 6)
	4710	PL_colors[i++] = (char *)"";
	4711	}
	4712	PL_colorset = 1;
	4713	}
	4714	#endif
	4715
	4716
	4717	#ifdef TRIE_STUDY_OPT
	4718	#define CHECK_RESTUDY_GOTO \
	4719	if ( \
	4720	(data.flags & SCF_TRIE_RESTUDY) \
	4721	&& ! restudied++ \
	4722	) goto reStudy
	4723	#else
	4724	#define CHECK_RESTUDY_GOTO
	4725	#endif
	4726
	4727	/*
	4728	- pregcomp - compile a regular expression into internal code
	4729	*
	4730	* We can't allocate space until we know how big the compiled form will be,
	4731	* but we can't compile it (and thus know how big it is) until we've got a
	4732	* place to put the code. So we cheat: we compile it twice, once with code
	4733	* generation turned off and size counting turned on, and once "for real".
	4734	* This also means that we don't allocate space until we are sure that the
	4735	* thing really will compile successfully, and we never have to move the
	4736	* code and thus invalidate pointers into it. (Note that it has to be in
	4737	* one piece because free() must be able to free it all.) [NB: not true in perl]
	4738	*
	4739	* Beware that the optimization-preparation code in here knows about some
	4740	* of the structure of the compiled regexp. [I'll say.]
	4741	*/
	4742
	4743
	4744
	4745	#ifndef PERL_IN_XSUB_RE
	4746	#define RE_ENGINE_PTR &PL_core_reg_engine
	4747	#else
	4748	extern const struct regexp_engine my_reg_engine;
	4749	#define RE_ENGINE_PTR &my_reg_engine
	4750	#endif
	4751
	4752	#ifndef PERL_IN_XSUB_RE
	4753	REGEXP *
	4754	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	4755	{
	4756	dVAR;
	4757	HV * const table = GvHV(PL_hintgv);
	4758
	4759	PERL_ARGS_ASSERT_PREGCOMP;
	4760
	4761	/* Dispatch a request to compile a regexp to correct
	4762	regexp engine. */
	4763	if (table) {
	4764	SV **ptr= hv_fetchs(table, "regcomp", FALSE);
	4765	GET_RE_DEBUG_FLAGS_DECL;
	4766	if (ptr && SvIOK(ptr) && SvIV(ptr)) {
	4767	const regexp_engine eng=INT2PTR(regexp_engine,SvIV(*ptr));
	4768	DEBUG_COMPILE_r({
	4769	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	4770	SvIV(*ptr));
	4771	});
	4772	return CALLREGCOMP_ENG(eng, pattern, flags);
	4773	}
	4774	}
	4775	return Perl_re_compile(aTHX_ pattern, flags);
	4776	}
	4777	#endif
	4778
	4779	REGEXP *
	4780	Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
	4781	{
	4782	dVAR;
	4783	REGEXP *rx;
	4784	struct regexp *r;
	4785	register regexp_internal *ri;
	4786	STRLEN plen;
	4787	char* VOL exp;
	4788	char* xend;
	4789	regnode *scan;
	4790	I32 flags;
	4791	I32 minlen = 0;
	4792	U32 pm_flags;
	4793
	4794	/* these are all flags - maybe they should be turned
	4795	* into a single int with different bit masks */
	4796	I32 sawlookahead = 0;
	4797	I32 sawplus = 0;
	4798	I32 sawopen = 0;
	4799	bool used_setjump = FALSE;
	4800	regex_charset initial_charset = get_regex_charset(orig_pm_flags);
	4801
	4802	U8 jump_ret = 0;
	4803	dJMPENV;
	4804	scan_data_t data;
	4805	RExC_state_t RExC_state;
	4806	RExC_state_t * const pRExC_state = &RExC_state;
	4807	#ifdef TRIE_STUDY_OPT
	4808	int restudied;
	4809	RExC_state_t copyRExC_state;
	4810	#endif
	4811	GET_RE_DEBUG_FLAGS_DECL;
	4812
	4813	PERL_ARGS_ASSERT_RE_COMPILE;
	4814
	4815	DEBUG_r(if (!PL_colorset) reginitcolors());
	4816
	4817	/* Initialize these here instead of as-needed, as is quick and avoids
	4818	* having to test them each time otherwise */
	4819	if (! PL_AboveLatin1) {
	4820	PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
	4821	PL_ASCII = _new_invlist_C_array(ASCII_invlist);
	4822	PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
	4823
	4824	PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
	4825	PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
	4826
	4827	PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
	4828	PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
	4829
	4830	PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
	4831	PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
	4832
	4833	PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
	4834
	4835	PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
	4836	PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
	4837
	4838	PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
	4839
	4840	PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
	4841	PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
	4842
	4843	PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
	4844	PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
	4845
	4846	PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
	4847	PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
	4848
	4849	PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
	4850	PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
	4851
	4852	PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
	4853	PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
	4854
	4855	PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
	4856	PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
	4857
	4858	PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
	4859	PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
	4860
	4861	PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
	4862	PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
	4863
	4864	PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
	4865
	4866	PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
	4867	PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
	4868
	4869	PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
	4870	PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
	4871	}
	4872
	4873	exp = SvPV(pattern, plen);
	4874
	4875	if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
	4876	RExC_utf8 = RExC_orig_utf8 = 0;
	4877	}
	4878	else {
	4879	RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
	4880	}
	4881	RExC_uni_semantics = 0;
	4882	RExC_contains_locale = 0;
	4883
	4884	/**************** LONG JUMP TARGET HERE*********************/
	4885	/* Longjmp back to here if have to switch in midstream to utf8 */
	4886	if (! RExC_orig_utf8) {
	4887	JMPENV_PUSH(jump_ret);
	4888	used_setjump = TRUE;
	4889	}
	4890
	4891	if (jump_ret == 0) { /* First time through */
	4892	xend = exp + plen;
	4893
	4894	DEBUG_COMPILE_r({
	4895	SV *dsv= sv_newmortal();
	4896	RE_PV_QUOTED_DECL(s, RExC_utf8,
	4897	dsv, exp, plen, 60);
	4898	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	4899	PL_colors[4],PL_colors[5],s);
	4900	});
	4901	}
	4902	else { /* longjumped back */
	4903	STRLEN len = plen;
	4904
	4905	/* If the cause for the longjmp was other than changing to utf8, pop
	4906	* our own setjmp, and longjmp to the correct handler */
	4907	if (jump_ret != UTF8_LONGJMP) {
	4908	JMPENV_POP;
	4909	JMPENV_JUMP(jump_ret);
	4910	}
	4911
	4912	GET_RE_DEBUG_FLAGS;
	4913
	4914	/* It's possible to write a regexp in ascii that represents Unicode
	4915	codepoints outside of the byte range, such as via \x{100}. If we
	4916	detect such a sequence we have to convert the entire pattern to utf8
	4917	and then recompile, as our sizing calculation will have been based
	4918	on 1 byte == 1 character, but we will need to use utf8 to encode
	4919	at least some part of the pattern, and therefore must convert the whole
	4920	thing.
	4921	-- dmq */
	4922	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	4923	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	4924	exp = (char*)Perl_bytes_to_utf8(aTHX_
	4925	(U8*)SvPV_nomg(pattern, plen),
	4926	&len);
	4927	xend = exp + len;
	4928	RExC_orig_utf8 = RExC_utf8 = 1;
	4929	SAVEFREEPV(exp);
	4930	}
	4931
	4932	#ifdef TRIE_STUDY_OPT
	4933	restudied = 0;
	4934	#endif
	4935
	4936	pm_flags = orig_pm_flags;
	4937
	4938	if (initial_charset == REGEX_LOCALE_CHARSET) {
	4939	RExC_contains_locale = 1;
	4940	}
	4941	else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
	4942
	4943	/* Set to use unicode semantics if the pattern is in utf8 and has the
	4944	* 'depends' charset specified, as it means unicode when utf8 */
	4945	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4946	}
	4947
	4948	RExC_precomp = exp;
	4949	RExC_flags = pm_flags;
	4950	RExC_sawback = 0;
	4951
	4952	RExC_seen = 0;
	4953	RExC_in_lookbehind = 0;
	4954	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	4955	RExC_seen_evals = 0;
	4956	RExC_extralen = 0;
	4957	RExC_override_recoding = 0;
	4958
	4959	/* First pass: determine size, legality. */
	4960	RExC_parse = exp;
	4961	RExC_start = exp;
	4962	RExC_end = xend;
	4963	RExC_naughty = 0;
	4964	RExC_npar = 1;
	4965	RExC_nestroot = 0;
	4966	RExC_size = 0L;
	4967	RExC_emit = &PL_regdummy;
	4968	RExC_whilem_seen = 0;
	4969	RExC_open_parens = NULL;
	4970	RExC_close_parens = NULL;
	4971	RExC_opend = NULL;
	4972	RExC_paren_names = NULL;
	4973	#ifdef DEBUGGING
	4974	RExC_paren_name_list = NULL;
	4975	#endif
	4976	RExC_recurse = NULL;
	4977	RExC_recurse_count = 0;
	4978
	4979	#if 0 /* REGC() is (currently) a NOP at the first pass.
	4980	* Clever compilers notice this and complain. --jhi */
	4981	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	4982	#endif
	4983	DEBUG_PARSE_r(
	4984	PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
	4985	RExC_lastnum=0;
	4986	RExC_lastparse=NULL;
	4987	);
	4988	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4989	RExC_precomp = NULL;
	4990	return(NULL);
	4991	}
	4992
	4993	/* Here, finished first pass. Get rid of any added setjmp */
	4994	if (used_setjump) {
	4995	JMPENV_POP;
	4996	}
	4997
	4998	DEBUG_PARSE_r({
	4999	PerlIO_printf(Perl_debug_log,
	5000	"Required size %"IVdf" nodes\n"
	5001	"Starting second pass (creation)\n",
	5002	(IV)RExC_size);
	5003	RExC_lastnum=0;
	5004	RExC_lastparse=NULL;
	5005	});
	5006
	5007	/* The first pass could have found things that force Unicode semantics */
	5008	if ((RExC_utf8 \|\| RExC_uni_semantics)
	5009	&& get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
	5010	{
	5011	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	5012	}
	5013
	5014	/* Small enough for pointer-storage convention?
	5015	If extralen==0, this means that we will not need long jumps. */
	5016	if (RExC_size >= 0x10000L && RExC_extralen)
	5017	RExC_size += RExC_extralen;
	5018	else
	5019	RExC_extralen = 0;
	5020	if (RExC_whilem_seen > 15)
	5021	RExC_whilem_seen = 15;
	5022
	5023	/* Allocate space and zero-initialize. Note, the two step process
	5024	of zeroing when in debug mode, thus anything assigned has to
	5025	happen after that */
	5026	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	5027	r = (struct regexp*)SvANY(rx);
	5028	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	5029	char, regexp_internal);
	5030	if ( r == NULL \|\| ri == NULL )
	5031	FAIL("Regexp out of space");
	5032	#ifdef DEBUGGING
	5033	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	5034	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
	5035	#else
	5036	/* bulk initialize base fields with 0. */
	5037	Zero(ri, sizeof(regexp_internal), char);
	5038	#endif
	5039
	5040	/* non-zero initialization begins here */
	5041	RXi_SET( r, ri );
	5042	r->engine= RE_ENGINE_PTR;
	5043	r->extflags = pm_flags;
	5044	{
	5045	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	5046	bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
	5047
	5048	/* The caret is output if there are any defaults: if not all the STD
	5049	* flags are set, or if no character set specifier is needed */
	5050	bool has_default =
	5051	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	5052	\|\| ! has_charset);
	5053	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
	5054	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	5055	>> RXf_PMf_STD_PMMOD_SHIFT);
	5056	const char fptr = STD_PAT_MODS; /"msix"*/
	5057	char *p;
	5058	/* Allocate for the worst case, which is all the std flags are turned
	5059	* on. If more precision is desired, we could do a population count of
	5060	* the flags set. This could be done with a small lookup table, or by
	5061	* shifting, masking and adding, or even, when available, assembly
	5062	* language for a machine-language population count.
	5063	* We never output a minus, as all those are defaults, so are
	5064	* covered by the caret */
	5065	const STRLEN wraplen = plen + has_p + has_runon
	5066	+ has_default /* If needs a caret */
	5067
	5068	/* If needs a character set specifier */
	5069	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	5070	+ (sizeof(STD_PAT_MODS) - 1)
	5071	+ (sizeof("(?:)") - 1);
	5072
	5073	p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
	5074	SvPOK_on(rx);
	5075	SvFLAGS(rx) \|= SvUTF8(pattern);
	5076	p++='('; p++='?';
	5077
	5078	/* If a default, cover it using the caret */
	5079	if (has_default) {
	5080	*p++= DEFAULT_PAT_MOD;
	5081	}
	5082	if (has_charset) {
	5083	STRLEN len;
	5084	const char* const name = get_regex_charset_name(r->extflags, &len);
	5085	Copy(name, p, len, char);
	5086	p += len;
	5087	}
	5088	if (has_p)
	5089	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	5090	{
	5091	char ch;
	5092	while((ch = *fptr++)) {
	5093	if(reganch & 1)
	5094	*p++ = ch;
	5095	reganch >>= 1;
	5096	}
	5097	}
	5098
	5099	*p++ = ':';
	5100	Copy(RExC_precomp, p, plen, char);
	5101	assert ((RX_WRAPPED(rx) - p) < 16);
	5102	r->pre_prefix = p - RX_WRAPPED(rx);
	5103	p += plen;
	5104	if (has_runon)
	5105	*p++ = '\n';
	5106	*p++ = ')';
	5107	*p = 0;
	5108	SvCUR_set(rx, p - SvPVX_const(rx));
	5109	}
	5110
	5111	r->intflags = 0;
	5112	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	5113
	5114	if (RExC_seen & REG_SEEN_RECURSE) {
	5115	Newxz(RExC_open_parens, RExC_npar,regnode *);
	5116	SAVEFREEPV(RExC_open_parens);
	5117	Newxz(RExC_close_parens,RExC_npar,regnode *);
	5118	SAVEFREEPV(RExC_close_parens);
	5119	}
	5120
	5121	/* Useful during FAIL. */
	5122	#ifdef RE_TRACK_PATTERN_OFFSETS
	5123	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	5124	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	5125	"%s %"UVuf" bytes for offset annotations.\n",
	5126	ri->u.offsets ? "Got" : "Couldn't get",
	5127	(UV)((2RExC_size+1) sizeof(U32))));
	5128	#endif
	5129	SetProgLen(ri,RExC_size);
	5130	RExC_rx_sv = rx;
	5131	RExC_rx = r;
	5132	RExC_rxi = ri;
	5133
	5134	/* Second pass: emit code. */
	5135	RExC_flags = pm_flags; /* don't let top level (?i) bleed */
	5136	RExC_parse = exp;
	5137	RExC_end = xend;
	5138	RExC_naughty = 0;
	5139	RExC_npar = 1;
	5140	RExC_emit_start = ri->program;
	5141	RExC_emit = ri->program;
	5142	RExC_emit_bound = ri->program + RExC_size + 1;
	5143
	5144	/* Store the count of eval-groups for security checks: */
	5145	RExC_rx->seen_evals = RExC_seen_evals;
	5146	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	5147	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	5148	ReREFCNT_dec(rx);
	5149	return(NULL);
	5150	}
	5151	/* XXXX To minimize changes to RE engine we always allocate
	5152	3-units-long substrs field. */
	5153	Newx(r->substrs, 1, struct reg_substr_data);
	5154	if (RExC_recurse_count) {
	5155	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	5156	SAVEFREEPV(RExC_recurse);
	5157	}
	5158
	5159	reStudy:
	5160	r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
	5161	Zero(r->substrs, 1, struct reg_substr_data);
	5162
	5163	#ifdef TRIE_STUDY_OPT
	5164	if (!restudied) {
	5165	StructCopy(&zero_scan_data, &data, scan_data_t);
	5166	copyRExC_state = RExC_state;
	5167	} else {
	5168	U32 seen=RExC_seen;
	5169	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	5170
	5171	RExC_state = copyRExC_state;
	5172	if (seen & REG_TOP_LEVEL_BRANCHES)
	5173	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	5174	else
	5175	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
	5176	if (data.last_found) {
	5177	SvREFCNT_dec(data.longest_fixed);
	5178	SvREFCNT_dec(data.longest_float);
	5179	SvREFCNT_dec(data.last_found);
	5180	}
	5181	StructCopy(&zero_scan_data, &data, scan_data_t);
	5182	}
	5183	#else
	5184	StructCopy(&zero_scan_data, &data, scan_data_t);
	5185	#endif
	5186
	5187	/* Dig out information for optimizations. */
	5188	r->extflags = RExC_flags; /* was pm_op */
	5189	/dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; /
	5190
	5191	if (UTF)
	5192	SvUTF8_on(rx); /* Unicode in it? */
	5193	ri->regstclass = NULL;
	5194	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	5195	r->intflags \|= PREGf_NAUGHTY;
	5196	scan = ri->program + 1; /* First BRANCH. */
	5197
	5198	/* testing for BRANCH here tells us whether there is "must appear"
	5199	data in the pattern. If there is then we can use it for optimisations */
	5200	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
	5201	I32 fake;
	5202	STRLEN longest_float_length, longest_fixed_length;
	5203	struct regnode_charclass_class ch_class; /* pointed to by data */
	5204	int stclass_flag;
	5205	I32 last_close = 0; /* pointed to by data */
	5206	regnode *first= scan;
	5207	regnode *first_next= regnext(first);
	5208	/*
	5209	* Skip introductions and multiplicators >= 1
	5210	* so that we can extract the 'meat' of the pattern that must
	5211	* match in the large if() sequence following.
	5212	* NOTE that EXACT is NOT covered here, as it is normally
	5213	* picked up by the optimiser separately.
	5214	*
	5215	* This is unfortunate as the optimiser isnt handling lookahead
	5216	* properly currently.
	5217	*
	5218	*/
	5219	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	5220	/* An OR of one alternative - should not happen now. */
	5221	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	5222	/* for now we can't handle lookbehind IFMATCH*/
	5223	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	5224	(OP(first) == PLUS) \|\|
	5225	(OP(first) == MINMOD) \|\|
	5226	/* An {n,m} with n>0 */
	5227	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	5228	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	5229	{
	5230	/*
	5231	* the only op that could be a regnode is PLUS, all the rest
	5232	* will be regnode_1 or regnode_2.
	5233	*
	5234	*/
	5235	if (OP(first) == PLUS)
	5236	sawplus = 1;
	5237	else
	5238	first += regarglen[OP(first)];
	5239
	5240	first = NEXTOPER(first);
	5241	first_next= regnext(first);
	5242	}
	5243
	5244	/* Starting-point info. */
	5245	again:
	5246	DEBUG_PEEP("first:",first,0);
	5247	/* Ignore EXACT as we deal with it later. */
	5248	if (PL_regkind[OP(first)] == EXACT) {
	5249	if (OP(first) == EXACT)
	5250	NOOP; /* Empty, get anchored substr later. */
	5251	else
	5252	ri->regstclass = first;
	5253	}
	5254	#ifdef TRIE_STCLASS
	5255	else if (PL_regkind[OP(first)] == TRIE &&
	5256	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	5257	{
	5258	regnode *trie_op;
	5259	/* this can happen only on restudy */
	5260	if ( OP(first) == TRIE ) {
	5261	struct regnode_1 trieop = (struct regnode_1 )
	5262	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	5263	StructCopy(first,trieop,struct regnode_1);
	5264	trie_op=(regnode *)trieop;
	5265	} else {
	5266	struct regnode_charclass trieop = (struct regnode_charclass )
	5267	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	5268	StructCopy(first,trieop,struct regnode_charclass);
	5269	trie_op=(regnode *)trieop;
	5270	}
	5271	OP(trie_op)+=2;
	5272	make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
	5273	ri->regstclass = trie_op;
	5274	}
	5275	#endif
	5276	else if (REGNODE_SIMPLE(OP(first)))
	5277	ri->regstclass = first;
	5278	else if (PL_regkind[OP(first)] == BOUND \|\|
	5279	PL_regkind[OP(first)] == NBOUND)
	5280	ri->regstclass = first;
	5281	else if (PL_regkind[OP(first)] == BOL) {
	5282	r->extflags \|= (OP(first) == MBOL
	5283	? RXf_ANCH_MBOL
	5284	: (OP(first) == SBOL
	5285	? RXf_ANCH_SBOL
	5286	: RXf_ANCH_BOL));
	5287	first = NEXTOPER(first);
	5288	goto again;
	5289	}
	5290	else if (OP(first) == GPOS) {
	5291	r->extflags \|= RXf_ANCH_GPOS;
	5292	first = NEXTOPER(first);
	5293	goto again;
	5294	}
	5295	else if ((!sawopen \|\| !RExC_sawback) &&
	5296	(OP(first) == STAR &&
	5297	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	5298	!(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
	5299	{
	5300	/* turn .* into ^.* with an implied $=1 /
	5301	const int type =
	5302	(OP(NEXTOPER(first)) == REG_ANY)
	5303	? RXf_ANCH_MBOL
	5304	: RXf_ANCH_SBOL;
	5305	r->extflags \|= type;
	5306	r->intflags \|= PREGf_IMPLICIT;
	5307	first = NEXTOPER(first);
	5308	goto again;
	5309	}
	5310	if (sawplus && !sawlookahead && (!sawopen \|\| !RExC_sawback)
	5311	&& !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
	5312	/* x+ must match at the 1st pos of run of x's */
	5313	r->intflags \|= PREGf_SKIP;
	5314
	5315	/* Scan is after the zeroth branch, first is atomic matcher. */
	5316	#ifdef TRIE_STUDY_OPT
	5317	DEBUG_PARSE_r(
	5318	if (!restudied)
	5319	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	5320	(IV)(first - scan + 1))
	5321	);
	5322	#else
	5323	DEBUG_PARSE_r(
	5324	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	5325	(IV)(first - scan + 1))
	5326	);
	5327	#endif
	5328
	5329
	5330	/*
	5331	* If there's something expensive in the r.e., find the
	5332	* longest literal string that must appear and make it the
	5333	* regmust. Resolve ties in favor of later strings, since
	5334	* the regstart check works with the beginning of the r.e.
	5335	* and avoiding duplication strengthens checking. Not a
	5336	* strong reason, but sufficient in the absence of others.
	5337	* [Now we resolve ties in favor of the earlier string if
	5338	* it happens that c_offset_min has been invalidated, since the
	5339	* earlier string may buy us something the later one won't.]
	5340	*/
	5341
	5342	data.longest_fixed = newSVpvs("");
	5343	data.longest_float = newSVpvs("");
	5344	data.last_found = newSVpvs("");
	5345	data.longest = &(data.longest_fixed);
	5346	first = scan;
	5347	if (!ri->regstclass) {
	5348	cl_init(pRExC_state, &ch_class);
	5349	data.start_class = &ch_class;
	5350	stclass_flag = SCF_DO_STCLASS_AND;
	5351	} else /* XXXX Check for BOUND? */
	5352	stclass_flag = 0;
	5353	data.last_closep = &last_close;
	5354
	5355	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
	5356	&data, -1, NULL, NULL,
	5357	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag,0);
	5358
	5359
	5360	CHECK_RESTUDY_GOTO;
	5361
	5362
	5363	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	5364	&& data.last_start_min == 0 && data.last_end > 0
	5365	&& !RExC_seen_zerolen
	5366	&& !(RExC_seen & REG_SEEN_VERBARG)
	5367	&& (!(RExC_seen & REG_SEEN_GPOS) \|\| (r->extflags & RXf_ANCH_GPOS)))
	5368	r->extflags \|= RXf_CHECK_ALL;
	5369	scan_commit(pRExC_state, &data,&minlen,0);
	5370	SvREFCNT_dec(data.last_found);
	5371
	5372	/* Note that code very similar to this but for anchored string
	5373	follows immediately below, changes may need to be made to both.
	5374	Be careful.
	5375	*/
	5376	longest_float_length = CHR_SVLEN(data.longest_float);
	5377	if (longest_float_length
	5378	\|\| (data.flags & SF_FL_BEFORE_EOL
	5379	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5380	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	5381	{
	5382	I32 t,ml;
	5383
	5384	/* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
	5385	if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
	5386	\|\| (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	5387	&& data.offset_fixed == data.offset_float_min
	5388	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
	5389	goto remove_float; /* As in (a)+. */
	5390
	5391	/* copy the information about the longest float from the reg_scan_data
	5392	over to the program. */
	5393	if (SvUTF8(data.longest_float)) {
	5394	r->float_utf8 = data.longest_float;
	5395	r->float_substr = NULL;
	5396	} else {
	5397	r->float_substr = data.longest_float;
	5398	r->float_utf8 = NULL;
	5399	}
	5400	/* float_end_shift is how many chars that must be matched that
	5401	follow this item. We calculate it ahead of time as once the
	5402	lookbehind offset is added in we lose the ability to correctly
	5403	calculate it.*/
	5404	ml = data.minlen_float ? *(data.minlen_float)
	5405	: (I32)longest_float_length;
	5406	r->float_end_shift = ml - data.offset_float_min
	5407	- longest_float_length + (SvTAIL(data.longest_float) != 0)
	5408	+ data.lookbehind_float;
	5409	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	5410	r->float_max_offset = data.offset_float_max;
	5411	if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
	5412	r->float_max_offset -= data.lookbehind_float;
	5413
	5414	t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
	5415	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5416	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5417	fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
	5418	}
	5419	else {
	5420	remove_float:
	5421	r->float_substr = r->float_utf8 = NULL;
	5422	SvREFCNT_dec(data.longest_float);
	5423	longest_float_length = 0;
	5424	}
	5425
	5426	/* Note that code very similar to this but for floating string
	5427	is immediately above, changes may need to be made to both.
	5428	Be careful.
	5429	*/
	5430	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	5431
	5432	/* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
	5433	if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
	5434	&& (longest_fixed_length
	5435	\|\| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
	5436	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5437	\|\| (RExC_flags & RXf_PMf_MULTILINE)))) )
	5438	{
	5439	I32 t,ml;
	5440
	5441	/* copy the information about the longest fixed
	5442	from the reg_scan_data over to the program. */
	5443	if (SvUTF8(data.longest_fixed)) {
	5444	r->anchored_utf8 = data.longest_fixed;
	5445	r->anchored_substr = NULL;
	5446	} else {
	5447	r->anchored_substr = data.longest_fixed;
	5448	r->anchored_utf8 = NULL;
	5449	}
	5450	/* fixed_end_shift is how many chars that must be matched that
	5451	follow this item. We calculate it ahead of time as once the
	5452	lookbehind offset is added in we lose the ability to correctly
	5453	calculate it.*/
	5454	ml = data.minlen_fixed ? *(data.minlen_fixed)
	5455	: (I32)longest_fixed_length;
	5456	r->anchored_end_shift = ml - data.offset_fixed
	5457	- longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
	5458	+ data.lookbehind_fixed;
	5459	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	5460
	5461	t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
	5462	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5463	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5464	fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
	5465	}
	5466	else {
	5467	r->anchored_substr = r->anchored_utf8 = NULL;
	5468	SvREFCNT_dec(data.longest_fixed);
	5469	longest_fixed_length = 0;
	5470	}
	5471	if (ri->regstclass
	5472	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	5473	ri->regstclass = NULL;
	5474
	5475	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	5476	&& stclass_flag
	5477	&& !(data.start_class->flags & ANYOF_EOS)
	5478	&& !cl_is_anything(data.start_class))
	5479	{
	5480	const U32 n = add_data(pRExC_state, 1, "f");
	5481	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5482
	5483	Newx(RExC_rxi->data->data[n], 1,
	5484	struct regnode_charclass_class);
	5485	StructCopy(data.start_class,
	5486	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5487	struct regnode_charclass_class);
	5488	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5489	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5490	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	5491	regprop(r, sv, (regnode*)data.start_class);
	5492	PerlIO_printf(Perl_debug_log,
	5493	"synthetic stclass \"%s\".\n",
	5494	SvPVX_const(sv));});
	5495	}
	5496
	5497	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
	5498	if (longest_fixed_length > longest_float_length) {
	5499	r->check_end_shift = r->anchored_end_shift;
	5500	r->check_substr = r->anchored_substr;
	5501	r->check_utf8 = r->anchored_utf8;
	5502	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	5503	if (r->extflags & RXf_ANCH_SINGLE)
	5504	r->extflags \|= RXf_NOSCAN;
	5505	}
	5506	else {
	5507	r->check_end_shift = r->float_end_shift;
	5508	r->check_substr = r->float_substr;
	5509	r->check_utf8 = r->float_utf8;
	5510	r->check_offset_min = r->float_min_offset;
	5511	r->check_offset_max = r->float_max_offset;
	5512	}
	5513	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
	5514	This should be changed ASAP! */
	5515	if ((r->check_substr \|\| r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
	5516	r->extflags \|= RXf_USE_INTUIT;
	5517	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	5518	r->extflags \|= RXf_INTUIT_TAIL;
	5519	}
	5520	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	5521	if ( (STRLEN)minlen < longest_float_length )
	5522	minlen= longest_float_length;
	5523	if ( (STRLEN)minlen < longest_fixed_length )
	5524	minlen= longest_fixed_length;
	5525	*/
	5526	}
	5527	else {
	5528	/* Several toplevels. Best we can is to set minlen. */
	5529	I32 fake;
	5530	struct regnode_charclass_class ch_class;
	5531	I32 last_close = 0;
	5532
	5533	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	5534
	5535	scan = ri->program + 1;
	5536	cl_init(pRExC_state, &ch_class);
	5537	data.start_class = &ch_class;
	5538	data.last_closep = &last_close;
	5539
	5540
	5541	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
	5542	&data, -1, NULL, NULL, SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS,0);
	5543
	5544	CHECK_RESTUDY_GOTO;
	5545
	5546	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	5547	= r->float_substr = r->float_utf8 = NULL;
	5548
	5549	if (!(data.start_class->flags & ANYOF_EOS)
	5550	&& !cl_is_anything(data.start_class))
	5551	{
	5552	const U32 n = add_data(pRExC_state, 1, "f");
	5553	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5554
	5555	Newx(RExC_rxi->data->data[n], 1,
	5556	struct regnode_charclass_class);
	5557	StructCopy(data.start_class,
	5558	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5559	struct regnode_charclass_class);
	5560	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5561	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5562	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	5563	regprop(r, sv, (regnode*)data.start_class);
	5564	PerlIO_printf(Perl_debug_log,
	5565	"synthetic stclass \"%s\".\n",
	5566	SvPVX_const(sv));});
	5567	}
	5568	}
	5569
	5570	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	5571	the "real" pattern. */
	5572	DEBUG_OPTIMISE_r({
	5573	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
	5574	(IV)minlen, (IV)r->minlen);
	5575	});
	5576	r->minlenret = minlen;
	5577	if (r->minlen < minlen)
	5578	r->minlen = minlen;
	5579
	5580	if (RExC_seen & REG_SEEN_GPOS)
	5581	r->extflags \|= RXf_GPOS_SEEN;
	5582	if (RExC_seen & REG_SEEN_LOOKBEHIND)
	5583	r->extflags \|= RXf_LOOKBEHIND_SEEN;
	5584	if (RExC_seen & REG_SEEN_EVAL)
	5585	r->extflags \|= RXf_EVAL_SEEN;
	5586	if (RExC_seen & REG_SEEN_CANY)
	5587	r->extflags \|= RXf_CANY_SEEN;
	5588	if (RExC_seen & REG_SEEN_VERBARG)
	5589	r->intflags \|= PREGf_VERBARG_SEEN;
	5590	if (RExC_seen & REG_SEEN_CUTGROUP)
	5591	r->intflags \|= PREGf_CUTGROUP_SEEN;
	5592	if (RExC_paren_names)
	5593	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	5594	else
	5595	RXp_PAREN_NAMES(r) = NULL;
	5596
	5597	#ifdef STUPID_PATTERN_CHECKS
	5598	if (RX_PRELEN(rx) == 0)
	5599	r->extflags \|= RXf_NULL;
	5600	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5601	/* XXX: this should happen BEFORE we compile */
	5602	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5603	else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
	5604	r->extflags \|= RXf_WHITE;
	5605	else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
	5606	r->extflags \|= RXf_START_ONLY;
	5607	#else
	5608	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5609	/* XXX: this should happen BEFORE we compile */
	5610	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5611	else {
	5612	regnode *first = ri->program + 1;
	5613	U8 fop = OP(first);
	5614
	5615	if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
	5616	r->extflags \|= RXf_NULL;
	5617	else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
	5618	r->extflags \|= RXf_START_ONLY;
	5619	else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
	5620	&& OP(regnext(first)) == END)
	5621	r->extflags \|= RXf_WHITE;
	5622	}
	5623	#endif
	5624	#ifdef DEBUGGING
	5625	if (RExC_paren_names) {
	5626	ri->name_list_idx = add_data( pRExC_state, 1, "a" );
	5627	ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
	5628	} else
	5629	#endif
	5630	ri->name_list_idx = 0;
	5631
	5632	if (RExC_recurse_count) {
	5633	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	5634	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	5635	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	5636	}
	5637	}
	5638	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	5639	/* assume we don't need to swap parens around before we match */
	5640
	5641	DEBUG_DUMP_r({
	5642	PerlIO_printf(Perl_debug_log,"Final program:\n");
	5643	regdump(r);
	5644	});
	5645	#ifdef RE_TRACK_PATTERN_OFFSETS
	5646	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	5647	const U32 len = ri->u.offsets[0];
	5648	U32 i;
	5649	GET_RE_DEBUG_FLAGS_DECL;
	5650	PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	5651	for (i = 1; i <= len; i++) {
	5652	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	5653	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	5654	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	5655	}
	5656	PerlIO_printf(Perl_debug_log, "\n");
	5657	});
	5658	#endif
	5659	return rx;
	5660	}
	5661
	5662	#undef RE_ENGINE_PTR
	5663
	5664
	5665	SV*
	5666	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	5667	const U32 flags)
	5668	{
	5669	PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	5670
	5671	PERL_UNUSED_ARG(value);
	5672
	5673	if (flags & RXapif_FETCH) {
	5674	return reg_named_buff_fetch(rx, key, flags);
	5675	} else if (flags & (RXapif_STORE \| RXapif_DELETE \| RXapif_CLEAR)) {
	5676	Perl_croak_no_modify(aTHX);
	5677	return NULL;
	5678	} else if (flags & RXapif_EXISTS) {
	5679	return reg_named_buff_exists(rx, key, flags)
	5680	? &PL_sv_yes
	5681	: &PL_sv_no;
	5682	} else if (flags & RXapif_REGNAMES) {
	5683	return reg_named_buff_all(rx, flags);
	5684	} else if (flags & (RXapif_SCALAR \| RXapif_REGNAMES_COUNT)) {
	5685	return reg_named_buff_scalar(rx, flags);
	5686	} else {
	5687	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	5688	return NULL;
	5689	}
	5690	}
	5691
	5692	SV*
	5693	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	5694	const U32 flags)
	5695	{
	5696	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	5697	PERL_UNUSED_ARG(lastkey);
	5698
	5699	if (flags & RXapif_FIRSTKEY)
	5700	return reg_named_buff_firstkey(rx, flags);
	5701	else if (flags & RXapif_NEXTKEY)
	5702	return reg_named_buff_nextkey(rx, flags);
	5703	else {
	5704	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
	5705	return NULL;
	5706	}
	5707	}
	5708
	5709	SV*
	5710	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	5711	const U32 flags)
	5712	{
	5713	AV *retarray = NULL;
	5714	SV *ret;
	5715	struct regexp const rx = (struct regexp )SvANY(r);
	5716
	5717	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	5718
	5719	if (flags & RXapif_ALL)
	5720	retarray=newAV();
	5721
	5722	if (rx && RXp_PAREN_NAMES(rx)) {
	5723	HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	5724	if (he_str) {
	5725	IV i;
	5726	SV* sv_dat=HeVAL(he_str);
	5727	I32 nums=(I32)SvPVX(sv_dat);
	5728	for ( i=0; i<SvIVX(sv_dat); i++ ) {
	5729	if ((I32)(rx->nparens) >= nums[i]
	5730	&& rx->offs[nums[i]].start != -1
	5731	&& rx->offs[nums[i]].end != -1)
	5732	{
	5733	ret = newSVpvs("");
	5734	CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	5735	if (!retarray)
	5736	return ret;
	5737	} else {
	5738	if (retarray)
	5739	ret = newSVsv(&PL_sv_undef);
	5740	}
	5741	if (retarray)
	5742	av_push(retarray, ret);
	5743	}
	5744	if (retarray)
	5745	return newRV_noinc(MUTABLE_SV(retarray));
	5746	}
	5747	}
	5748	return NULL;
	5749	}
	5750
	5751	bool
	5752	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	5753	const U32 flags)
	5754	{
	5755	struct regexp const rx = (struct regexp )SvANY(r);
	5756
	5757	PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	5758
	5759	if (rx && RXp_PAREN_NAMES(rx)) {
	5760	if (flags & RXapif_ALL) {
	5761	return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	5762	} else {
	5763	SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	5764	if (sv) {
	5765	SvREFCNT_dec(sv);
	5766	return TRUE;
	5767	} else {
	5768	return FALSE;
	5769	}
	5770	}
	5771	} else {
	5772	return FALSE;
	5773	}
	5774	}
	5775
	5776	SV*
	5777	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	5778	{
	5779	struct regexp const rx = (struct regexp )SvANY(r);
	5780
	5781	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	5782
	5783	if ( rx && RXp_PAREN_NAMES(rx) ) {
	5784	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
	5785
	5786	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	5787	} else {
	5788	return FALSE;
	5789	}
	5790	}
	5791
	5792	SV*
	5793	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	5794	{
	5795	struct regexp const rx = (struct regexp )SvANY(r);
	5796	GET_RE_DEBUG_FLAGS_DECL;
	5797
	5798	PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	5799
	5800	if (rx && RXp_PAREN_NAMES(rx)) {
	5801	HV *hv = RXp_PAREN_NAMES(rx);
	5802	HE *temphe;
	5803	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5804	IV i;
	5805	IV parno = 0;
	5806	SV* sv_dat = HeVAL(temphe);
	5807	I32 nums = (I32)SvPVX(sv_dat);
	5808	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5809	if ((I32)(rx->lastparen) >= nums[i] &&
	5810	rx->offs[nums[i]].start != -1 &&
	5811	rx->offs[nums[i]].end != -1)
	5812	{
	5813	parno = nums[i];
	5814	break;
	5815	}
	5816	}
	5817	if (parno \|\| flags & RXapif_ALL) {
	5818	return newSVhek(HeKEY_hek(temphe));
	5819	}
	5820	}
	5821	}
	5822	return NULL;
	5823	}
	5824
	5825	SV*
	5826	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	5827	{
	5828	SV *ret;
	5829	AV *av;
	5830	I32 length;
	5831	struct regexp const rx = (struct regexp )SvANY(r);
	5832
	5833	PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	5834
	5835	if (rx && RXp_PAREN_NAMES(rx)) {
	5836	if (flags & (RXapif_ALL \| RXapif_REGNAMES_COUNT)) {
	5837	return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	5838	} else if (flags & RXapif_ONE) {
	5839	ret = CALLREG_NAMED_BUFF_ALL(r, (flags \| RXapif_REGNAMES));
	5840	av = MUTABLE_AV(SvRV(ret));
	5841	length = av_len(av);
	5842	SvREFCNT_dec(ret);
	5843	return newSViv(length + 1);
	5844	} else {
	5845	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
	5846	return NULL;
	5847	}
	5848	}
	5849	return &PL_sv_undef;
	5850	}
	5851
	5852	SV*
	5853	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	5854	{
	5855	struct regexp const rx = (struct regexp )SvANY(r);
	5856	AV *av = newAV();
	5857
	5858	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	5859
	5860	if (rx && RXp_PAREN_NAMES(rx)) {
	5861	HV *hv= RXp_PAREN_NAMES(rx);
	5862	HE *temphe;
	5863	(void)hv_iterinit(hv);
	5864	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5865	IV i;
	5866	IV parno = 0;
	5867	SV* sv_dat = HeVAL(temphe);
	5868	I32 nums = (I32)SvPVX(sv_dat);
	5869	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5870	if ((I32)(rx->lastparen) >= nums[i] &&
	5871	rx->offs[nums[i]].start != -1 &&
	5872	rx->offs[nums[i]].end != -1)
	5873	{
	5874	parno = nums[i];
	5875	break;
	5876	}
	5877	}
	5878	if (parno \|\| flags & RXapif_ALL) {
	5879	av_push(av, newSVhek(HeKEY_hek(temphe)));
	5880	}
	5881	}
	5882	}
	5883
	5884	return newRV_noinc(MUTABLE_SV(av));
	5885	}
	5886
	5887	void
	5888	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	5889	SV * const sv)
	5890	{
	5891	struct regexp const rx = (struct regexp )SvANY(r);
	5892	char *s = NULL;
	5893	I32 i = 0;
	5894	I32 s1, t1;
	5895
	5896	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	5897
	5898	if (!rx->subbeg) {
	5899	sv_setsv(sv,&PL_sv_undef);
	5900	return;
	5901	}
	5902	else
	5903	if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
	5904	/* $` */
	5905	i = rx->offs[0].start;
	5906	s = rx->subbeg;
	5907	}
	5908	else
	5909	if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
	5910	/* $' */
	5911	s = rx->subbeg + rx->offs[0].end;
	5912	i = rx->sublen - rx->offs[0].end;
	5913	}
	5914	else
	5915	if ( 0 <= paren && paren <= (I32)rx->nparens &&
	5916	(s1 = rx->offs[paren].start) != -1 &&
	5917	(t1 = rx->offs[paren].end) != -1)
	5918	{
	5919	/* $& $1 ... */
	5920	i = t1 - s1;
	5921	s = rx->subbeg + s1;
	5922	} else {
	5923	sv_setsv(sv,&PL_sv_undef);
	5924	return;
	5925	}
	5926	assert(rx->sublen >= (s - rx->subbeg) + i );
	5927	if (i >= 0) {
	5928	const int oldtainted = PL_tainted;
	5929	TAINT_NOT;
	5930	sv_setpvn(sv, s, i);
	5931	PL_tainted = oldtainted;
	5932	if ( (rx->extflags & RXf_CANY_SEEN)
	5933	? (RXp_MATCH_UTF8(rx)
	5934	&& (!i \|\| is_utf8_string((U8*)s, i)))
	5935	: (RXp_MATCH_UTF8(rx)) )
	5936	{
	5937	SvUTF8_on(sv);
	5938	}
	5939	else
	5940	SvUTF8_off(sv);
	5941	if (PL_tainting) {
	5942	if (RXp_MATCH_TAINTED(rx)) {
	5943	if (SvTYPE(sv) >= SVt_PVMG) {
	5944	MAGIC* const mg = SvMAGIC(sv);
	5945	MAGIC* mgt;
	5946	PL_tainted = 1;
	5947	SvMAGIC_set(sv, mg->mg_moremagic);
	5948	SvTAINT(sv);
	5949	if ((mgt = SvMAGIC(sv))) {
	5950	mg->mg_moremagic = mgt;
	5951	SvMAGIC_set(sv, mg);
	5952	}
	5953	} else {
	5954	PL_tainted = 1;
	5955	SvTAINT(sv);
	5956	}
	5957	} else
	5958	SvTAINTED_off(sv);
	5959	}
	5960	} else {
	5961	sv_setsv(sv,&PL_sv_undef);
	5962	return;
	5963	}
	5964	}
	5965
	5966	void
	5967	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	5968	SV const * const value)
	5969	{
	5970	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	5971
	5972	PERL_UNUSED_ARG(rx);
	5973	PERL_UNUSED_ARG(paren);
	5974	PERL_UNUSED_ARG(value);
	5975
	5976	if (!PL_localizing)
	5977	Perl_croak_no_modify(aTHX);
	5978	}
	5979
	5980	I32
	5981	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	5982	const I32 paren)
	5983	{
	5984	struct regexp const rx = (struct regexp )SvANY(r);
	5985	I32 i;
	5986	I32 s1, t1;
	5987
	5988	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	5989
	5990	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	5991	switch (paren) {
	5992	/* $` / ${^PREMATCH} */
	5993	case RX_BUFF_IDX_PREMATCH:
	5994	if (rx->offs[0].start != -1) {
	5995	i = rx->offs[0].start;
	5996	if (i > 0) {
	5997	s1 = 0;
	5998	t1 = i;
	5999	goto getlen;
	6000	}
	6001	}
	6002	return 0;
	6003	/* $' / ${^POSTMATCH} */
	6004	case RX_BUFF_IDX_POSTMATCH:
	6005	if (rx->offs[0].end != -1) {
	6006	i = rx->sublen - rx->offs[0].end;
	6007	if (i > 0) {
	6008	s1 = rx->offs[0].end;
	6009	t1 = rx->sublen;
	6010	goto getlen;
	6011	}
	6012	}
	6013	return 0;
	6014	/* $& / ${^MATCH}, $1, $2, ... */
	6015	default:
	6016	if (paren <= (I32)rx->nparens &&
	6017	(s1 = rx->offs[paren].start) != -1 &&
	6018	(t1 = rx->offs[paren].end) != -1)
	6019	{
	6020	i = t1 - s1;
	6021	goto getlen;
	6022	} else {
	6023	if (ckWARN(WARN_UNINITIALIZED))
	6024	report_uninit((const SV *)sv);
	6025	return 0;
	6026	}
	6027	}
	6028	getlen:
	6029	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	6030	const char * const s = rx->subbeg + s1;
	6031	const U8 *ep;
	6032	STRLEN el;
	6033
	6034	i = t1 - s1;
	6035	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	6036	i = el;
	6037	}
	6038	return i;
	6039	}
	6040
	6041	SV*
	6042	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	6043	{
	6044	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	6045	PERL_UNUSED_ARG(rx);
	6046	if (0)
	6047	return NULL;
	6048	else
	6049	return newSVpvs("Regexp");
	6050	}
	6051
	6052	/* Scans the name of a named buffer from the pattern.
	6053	* If flags is REG_RSN_RETURN_NULL returns null.
	6054	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	6055	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	6056	* to the parsed name as looked up in the RExC_paren_names hash.
	6057	* If there is an error throws a vFAIL().. type exception.
	6058	*/
	6059
	6060	#define REG_RSN_RETURN_NULL 0
	6061	#define REG_RSN_RETURN_NAME 1
	6062	#define REG_RSN_RETURN_DATA 2
	6063
	6064	STATIC SV*
	6065	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	6066	{
	6067	char *name_start = RExC_parse;
	6068
	6069	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	6070
	6071	if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	6072	/* skip IDFIRST by using do...while */
	6073	if (UTF)
	6074	do {
	6075	RExC_parse += UTF8SKIP(RExC_parse);
	6076	} while (isALNUM_utf8((U8*)RExC_parse));
	6077	else
	6078	do {
	6079	RExC_parse++;
	6080	} while (isALNUM(*RExC_parse));
	6081	}
	6082
	6083	if ( flags ) {
	6084	SV* sv_name
	6085	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	6086	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	6087	if ( flags == REG_RSN_RETURN_NAME)
	6088	return sv_name;
	6089	else if (flags==REG_RSN_RETURN_DATA) {
	6090	HE *he_str = NULL;
	6091	SV *sv_dat = NULL;
	6092	if ( ! sv_name ) /* should not happen*/
	6093	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	6094	if (RExC_paren_names)
	6095	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	6096	if ( he_str )
	6097	sv_dat = HeVAL(he_str);
	6098	if ( ! sv_dat )
	6099	vFAIL("Reference to nonexistent named group");
	6100	return sv_dat;
	6101	}
	6102	else {
	6103	Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
	6104	(unsigned long) flags);
	6105	}
	6106	/* NOT REACHED */
	6107	}
	6108	return NULL;
	6109	}
	6110
	6111	#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
	6112	int rem=(int)(RExC_end - RExC_parse); \
	6113	int cut; \
	6114	int num; \
	6115	int iscut=0; \
	6116	if (rem>10) { \
	6117	rem=10; \
	6118	iscut=1; \
	6119	} \
	6120	cut=10-rem; \
	6121	if (RExC_lastparse!=RExC_parse) \
	6122	PerlIO_printf(Perl_debug_log," >%.s%-s", \
	6123	rem, RExC_parse, \
	6124	cut + 4, \
	6125	iscut ? "..." : "<" \
	6126	); \
	6127	else \
	6128	PerlIO_printf(Perl_debug_log,"%16s",""); \
	6129	\
	6130	if (SIZE_ONLY) \
	6131	num = RExC_size + 1; \
	6132	else \
	6133	num=REG_NODE_NUM(RExC_emit); \
	6134	if (RExC_lastnum!=num) \
	6135	PerlIO_printf(Perl_debug_log,"\|%4d",num); \
	6136	else \
	6137	PerlIO_printf(Perl_debug_log,"\|%4s",""); \
	6138	PerlIO_printf(Perl_debug_log,"\|%*s%-4s", \
	6139	(int)((depth*2)), "", \
	6140	(funcname) \
	6141	); \
	6142	RExC_lastnum=num; \
	6143	RExC_lastparse=RExC_parse; \
	6144	})
	6145
	6146
	6147
	6148	#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
	6149	DEBUG_PARSE_MSG((funcname)); \
	6150	PerlIO_printf(Perl_debug_log,"%4s","\n"); \
	6151	})
	6152	#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
	6153	DEBUG_PARSE_MSG((funcname)); \
	6154	PerlIO_printf(Perl_debug_log,fmt "\n",args); \
	6155	})
	6156
	6157	/* This section of code defines the inversion list object and its methods. The
	6158	* interfaces are highly subject to change, so as much as possible is static to
	6159	* this file. An inversion list is here implemented as a malloc'd C UV array
	6160	* with some added info that is placed as UVs at the beginning in a header
	6161	* portion. An inversion list for Unicode is an array of code points, sorted
	6162	* by ordinal number. The zeroth element is the first code point in the list.
	6163	* The 1th element is the first element beyond that not in the list. In other
	6164	* words, the first range is
	6165	* invlist[0]..(invlist[1]-1)
	6166	* The other ranges follow. Thus every element whose index is divisible by two
	6167	* marks the beginning of a range that is in the list, and every element not
	6168	* divisible by two marks the beginning of a range not in the list. A single
	6169	* element inversion list that contains the single code point N generally
	6170	* consists of two elements
	6171	* invlist[0] == N
	6172	* invlist[1] == N+1
	6173	* (The exception is when N is the highest representable value on the
	6174	* machine, in which case the list containing just it would be a single
	6175	* element, itself. By extension, if the last range in the list extends to
	6176	* infinity, then the first element of that range will be in the inversion list
	6177	* at a position that is divisible by two, and is the final element in the
	6178	* list.)
	6179	* Taking the complement (inverting) an inversion list is quite simple, if the
	6180	* first element is 0, remove it; otherwise add a 0 element at the beginning.
	6181	* This implementation reserves an element at the beginning of each inversion list
	6182	* to contain 0 when the list contains 0, and contains 1 otherwise. The actual
	6183	* beginning of the list is either that element if 0, or the next one if 1.
	6184	*
	6185	* More about inversion lists can be found in "Unicode Demystified"
	6186	* Chapter 13 by Richard Gillam, published by Addison-Wesley.
	6187	* More will be coming when functionality is added later.
	6188	*
	6189	* The inversion list data structure is currently implemented as an SV pointing
	6190	* to an array of UVs that the SV thinks are bytes. This allows us to have an
	6191	* array of UV whose memory management is automatically handled by the existing
	6192	* facilities for SV's.
	6193	*
	6194	* Some of the methods should always be private to the implementation, and some
	6195	* should eventually be made public */
	6196
	6197	#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
	6198	#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
	6199
	6200	/* This is a combination of a version and data structure type, so that one
	6201	* being passed in can be validated to be an inversion list of the correct
	6202	* vintage. When the structure of the header is changed, a new random number
	6203	* in the range 2**31-1 should be generated and the new() method changed to
	6204	* insert that at this location. Then, if an auxiliary program doesn't change
	6205	* correspondingly, it will be discovered immediately */
	6206	#define INVLIST_VERSION_ID_OFFSET 2
	6207	#define INVLIST_VERSION_ID 1064334010
	6208
	6209	/* For safety, when adding new elements, remember to #undef them at the end of
	6210	* the inversion list code section */
	6211
	6212	#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
	6213	/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
	6214	* contains the code point U+00000, and begins here. If 1, the inversion list
	6215	* doesn't contain U+0000, and it begins at the next UV in the array.
	6216	* Inverting an inversion list consists of adding or removing the 0 at the
	6217	* beginning of it. By reserving a space for that 0, inversion can be made
	6218	* very fast */
	6219
	6220	#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
	6221
	6222	/* Internally things are UVs */
	6223	#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
	6224	#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
	6225
	6226	#define INVLIST_INITIAL_LEN 10
	6227
	6228	PERL_STATIC_INLINE UV*
	6229	S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
	6230	{
	6231	/* Returns a pointer to the first element in the inversion list's array.
	6232	* This is called upon initialization of an inversion list. Where the
	6233	* array begins depends on whether the list has the code point U+0000
	6234	* in it or not. The other parameter tells it whether the code that
	6235	* follows this call is about to put a 0 in the inversion list or not.
	6236	* The first element is either the element with 0, if 0, or the next one,
	6237	* if 1 */
	6238
	6239	UV* zero = get_invlist_zero_addr(invlist);
	6240
	6241	PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
	6242
	6243	/* Must be empty */
	6244	assert(! *get_invlist_len_addr(invlist));
	6245
	6246	/* 1^1 = 0; 1^0 = 1 */
	6247	*zero = 1 ^ will_have_0;
	6248	return zero + *zero;
	6249	}
	6250
	6251	PERL_STATIC_INLINE UV*
	6252	S_invlist_array(pTHX_ SV* const invlist)
	6253	{
	6254	/* Returns the pointer to the inversion list's array. Every time the
	6255	* length changes, this needs to be called in case malloc or realloc moved
	6256	* it */
	6257
	6258	PERL_ARGS_ASSERT_INVLIST_ARRAY;
	6259
	6260	/* Must not be empty. If these fail, you probably didn't check for <len>
	6261	* being non-zero before trying to get the array */
	6262	assert(*get_invlist_len_addr(invlist));
	6263	assert(*get_invlist_zero_addr(invlist) == 0
	6264	\|\| *get_invlist_zero_addr(invlist) == 1);
	6265
	6266	/* The array begins either at the element reserved for zero if the
	6267	* list contains 0 (that element will be set to 0), or otherwise the next
	6268	* element (in which case the reserved element will be set to 1). */
	6269	return (UV *) (get_invlist_zero_addr(invlist)
	6270	+ *get_invlist_zero_addr(invlist));
	6271	}
	6272
	6273	PERL_STATIC_INLINE UV*
	6274	S_get_invlist_len_addr(pTHX_ SV* invlist)
	6275	{
	6276	/* Return the address of the UV that contains the current number
	6277	* of used elements in the inversion list */
	6278
	6279	PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
	6280
	6281	return (UV ) (SvPVX(invlist) + (INVLIST_LEN_OFFSET sizeof (UV)));
	6282	}
	6283
	6284	PERL_STATIC_INLINE UV
	6285	S_invlist_len(pTHX_ SV* const invlist)
	6286	{
	6287	/* Returns the current number of elements stored in the inversion list's
	6288	* array */
	6289
	6290	PERL_ARGS_ASSERT_INVLIST_LEN;
	6291
	6292	return *get_invlist_len_addr(invlist);
	6293	}
	6294
	6295	PERL_STATIC_INLINE void
	6296	S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
	6297	{
	6298	/* Sets the current number of elements stored in the inversion list */
	6299
	6300	PERL_ARGS_ASSERT_INVLIST_SET_LEN;
	6301
	6302	*get_invlist_len_addr(invlist) = len;
	6303
	6304	assert(len <= SvLEN(invlist));
	6305
	6306	SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
	6307	/* If the list contains U+0000, that element is part of the header,
	6308	* and should not be counted as part of the array. It will contain
	6309	* 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
	6310	* subtract:
	6311	* SvCUR_set(invlist,
	6312	* TO_INTERNAL_SIZE(len
	6313	* - (*get_invlist_zero_addr(inv_list) ^ 1)));
	6314	* But, this is only valid if len is not 0. The consequences of not doing
	6315	* this is that the memory allocation code may think that 1 more UV is
	6316	* being used than actually is, and so might do an unnecessary grow. That
	6317	* seems worth not bothering to make this the precise amount.
	6318	*
	6319	* Note that when inverting, SvCUR shouldn't change */
	6320	}
	6321
	6322	PERL_STATIC_INLINE UV
	6323	S_invlist_max(pTHX_ SV* const invlist)
	6324	{
	6325	/* Returns the maximum number of elements storable in the inversion list's
	6326	* array, without having to realloc() */
	6327
	6328	PERL_ARGS_ASSERT_INVLIST_MAX;
	6329
	6330	return FROM_INTERNAL_SIZE(SvLEN(invlist));
	6331	}
	6332
	6333	PERL_STATIC_INLINE UV*
	6334	S_get_invlist_zero_addr(pTHX_ SV* invlist)
	6335	{
	6336	/* Return the address of the UV that is reserved to hold 0 if the inversion
	6337	* list contains 0. This has to be the last element of the heading, as the
	6338	* list proper starts with either it if 0, or the next element if not.
	6339	* (But we force it to contain either 0 or 1) */
	6340
	6341	PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
	6342
	6343	return (UV ) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET sizeof (UV)));
	6344	}
	6345
	6346	#ifndef PERL_IN_XSUB_RE
	6347	SV*
	6348	Perl__new_invlist(pTHX_ IV initial_size)
	6349	{
	6350
	6351	/* Return a pointer to a newly constructed inversion list, with enough
	6352	* space to store 'initial_size' elements. If that number is negative, a
	6353	* system default is used instead */
	6354
	6355	SV* new_list;
	6356
	6357	if (initial_size < 0) {
	6358	initial_size = INVLIST_INITIAL_LEN;
	6359	}
	6360
	6361	/* Allocate the initial space */
	6362	new_list = newSV(TO_INTERNAL_SIZE(initial_size));
	6363	invlist_set_len(new_list, 0);
	6364
	6365	/* Force iterinit() to be used to get iteration to work */
	6366	*get_invlist_iter_addr(new_list) = UV_MAX;
	6367
	6368	/* This should force a segfault if a method doesn't initialize this
	6369	* properly */
	6370	*get_invlist_zero_addr(new_list) = UV_MAX;
	6371
	6372	*get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
	6373	#if HEADER_LENGTH != 4
	6374	# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
	6375	#endif
	6376
	6377	return new_list;
	6378	}
	6379	#endif
	6380
	6381	STATIC SV*
	6382	S__new_invlist_C_array(pTHX_ UV* list)
	6383	{
	6384	/* Return a pointer to a newly constructed inversion list, initialized to
	6385	* point to <list>, which has to be in the exact correct inversion list
	6386	* form, including internal fields. Thus this is a dangerous routine that
	6387	* should not be used in the wrong hands */
	6388
	6389	SV* invlist = newSV_type(SVt_PV);
	6390
	6391	PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
	6392
	6393	SvPV_set(invlist, (char *) list);
	6394	SvLEN_set(invlist, 0); /* Means we own the contents, and the system
	6395	shouldn't touch it */
	6396	SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
	6397
	6398	if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
	6399	Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
	6400	}
	6401
	6402	return invlist;
	6403	}
	6404
	6405	STATIC void
	6406	S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
	6407	{
	6408	/* Grow the maximum size of an inversion list */
	6409
	6410	PERL_ARGS_ASSERT_INVLIST_EXTEND;
	6411
	6412	SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
	6413	}
	6414
	6415	PERL_STATIC_INLINE void
	6416	S_invlist_trim(pTHX_ SV* const invlist)
	6417	{
	6418	PERL_ARGS_ASSERT_INVLIST_TRIM;
	6419
	6420	/* Change the length of the inversion list to how many entries it currently
	6421	* has */
	6422
	6423	SvPV_shrink_to_cur((SV *) invlist);
	6424	}
	6425
	6426	/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
	6427	* etc */
	6428	#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
	6429	#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
	6430
	6431	#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
	6432
	6433	#ifndef PERL_IN_XSUB_RE
	6434	void
	6435	Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
	6436	{
	6437	/* Subject to change or removal. Append the range from 'start' to 'end' at
	6438	* the end of the inversion list. The range must be above any existing
	6439	* ones. */
	6440
	6441	UV* array;
	6442	UV max = invlist_max(invlist);
	6443	UV len = invlist_len(invlist);
	6444
	6445	PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
	6446
	6447	if (len == 0) { /* Empty lists must be initialized */
	6448	array = _invlist_array_init(invlist, start == 0);
	6449	}
	6450	else {
	6451	/* Here, the existing list is non-empty. The current max entry in the
	6452	* list is generally the first value not in the set, except when the
	6453	* set extends to the end of permissible values, in which case it is
	6454	* the first entry in that final set, and so this call is an attempt to
	6455	* append out-of-order */
	6456
	6457	UV final_element = len - 1;
	6458	array = invlist_array(invlist);
	6459	if (array[final_element] > start
	6460	\|\| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
	6461	{
	6462	Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
	6463	array[final_element], start,
	6464	ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
	6465	}
	6466
	6467	/* Here, it is a legal append. If the new range begins with the first
	6468	* value not in the set, it is extending the set, so the new first
	6469	* value not in the set is one greater than the newly extended range.
	6470	* */
	6471	if (array[final_element] == start) {
	6472	if (end != UV_MAX) {
	6473	array[final_element] = end + 1;
	6474	}
	6475	else {
	6476	/* But if the end is the maximum representable on the machine,
	6477	* just let the range that this would extend to have no end */
	6478	invlist_set_len(invlist, len - 1);
	6479	}
	6480	return;
	6481	}
	6482	}
	6483
	6484	/* Here the new range doesn't extend any existing set. Add it */
	6485
	6486	len += 2; /* Includes an element each for the start and end of range */
	6487
	6488	/* If overflows the existing space, extend, which may cause the array to be
	6489	* moved */
	6490	if (max < len) {
	6491	invlist_extend(invlist, len);
	6492	invlist_set_len(invlist, len); /* Have to set len here to avoid assert
	6493	failure in invlist_array() */
	6494	array = invlist_array(invlist);
	6495	}
	6496	else {
	6497	invlist_set_len(invlist, len);
	6498	}
	6499
	6500	/* The next item on the list starts the range, the one after that is
	6501	* one past the new range. */
	6502	array[len - 2] = start;
	6503	if (end != UV_MAX) {
	6504	array[len - 1] = end + 1;
	6505	}
	6506	else {
	6507	/* But if the end is the maximum representable on the machine, just let
	6508	* the range have no end */
	6509	invlist_set_len(invlist, len - 1);
	6510	}
	6511	}
	6512
	6513	STATIC IV
	6514	S_invlist_search(pTHX_ SV* const invlist, const UV cp)
	6515	{
	6516	/* Searches the inversion list for the entry that contains the input code
	6517	* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
	6518	* return value is the index into the list's array of the range that
	6519	* contains <cp> */
	6520
	6521	IV low = 0;
	6522	IV high = invlist_len(invlist);
	6523	const UV * const array = invlist_array(invlist);
	6524
	6525	PERL_ARGS_ASSERT_INVLIST_SEARCH;
	6526
	6527	/* If list is empty or the code point is before the first element, return
	6528	* failure. */
	6529	if (high == 0 \|\| cp < array[0]) {
	6530	return -1;
	6531	}
	6532
	6533	/* Binary search. What we are looking for is <i> such that
	6534	* array[i] <= cp < array[i+1]
	6535	* The loop below converges on the i+1. */
	6536	while (low < high) {
	6537	IV mid = (low + high) / 2;
	6538	if (array[mid] <= cp) {
	6539	low = mid + 1;
	6540
	6541	/* We could do this extra test to exit the loop early.
	6542	if (cp < array[low]) {
	6543	return mid;
	6544	}
	6545	*/
	6546	}
	6547	else { /* cp < array[mid] */
	6548	high = mid;
	6549	}
	6550	}
	6551
	6552	return high - 1;
	6553	}
	6554
	6555	void
	6556	Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
	6557	{
	6558	/* populates a swatch of a swash the same way swatch_get() does in utf8.c,
	6559	* but is used when the swash has an inversion list. This makes this much
	6560	* faster, as it uses a binary search instead of a linear one. This is
	6561	* intimately tied to that function, and perhaps should be in utf8.c,
	6562	* except it is intimately tied to inversion lists as well. It assumes
	6563	* that <swatch> is all 0's on input */
	6564
	6565	UV current = start;
	6566	const IV len = invlist_len(invlist);
	6567	IV i;
	6568	const UV * array;
	6569
	6570	PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
	6571
	6572	if (len == 0) { /* Empty inversion list */
	6573	return;
	6574	}
	6575
	6576	array = invlist_array(invlist);
	6577
	6578	/* Find which element it is */
	6579	i = invlist_search(invlist, start);
	6580
	6581	/* We populate from <start> to <end> */
	6582	while (current < end) {
	6583	UV upper;
	6584
	6585	/* The inversion list gives the results for every possible code point
	6586	* after the first one in the list. Only those ranges whose index is
	6587	* even are ones that the inversion list matches. For the odd ones,
	6588	* and if the initial code point is not in the list, we have to skip
	6589	* forward to the next element */
	6590	if (i == -1 \|\| ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
	6591	i++;
	6592	if (i >= len) { /* Finished if beyond the end of the array */
	6593	return;
	6594	}
	6595	current = array[i];
	6596	if (current >= end) { /* Finished if beyond the end of what we
	6597	are populating */
	6598	return;
	6599	}
	6600	}
	6601	assert(current >= start);
	6602
	6603	/* The current range ends one below the next one, except don't go past
	6604	* <end> */
	6605	i++;
	6606	upper = (i < len && array[i] < end) ? array[i] : end;
	6607
	6608	/* Here we are in a range that matches. Populate a bit in the 3-bit U8
	6609	* for each code point in it */
	6610	for (; current < upper; current++) {
	6611	const STRLEN offset = (STRLEN)(current - start);
	6612	swatch[offset >> 3] \|= 1 << (offset & 7);
	6613	}
	6614
	6615	/* Quit if at the end of the list */
	6616	if (i >= len) {
	6617
	6618	/* But first, have to deal with the highest possible code point on
	6619	* the platform. The previous code assumes that <end> is one
	6620	* beyond where we want to populate, but that is impossible at the
	6621	* platform's infinity, so have to handle it specially */
	6622	if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
	6623	{
	6624	const STRLEN offset = (STRLEN)(end - start);
	6625	swatch[offset >> 3] \|= 1 << (offset & 7);
	6626	}
	6627	return;
	6628	}
	6629
	6630	/* Advance to the next range, which will be for code points not in the
	6631	* inversion list */
	6632	current = array[i];
	6633	}
	6634
	6635	return;
	6636	}
	6637
	6638
	6639	void
	6640	Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
	6641	{
	6642	/* Take the union of two inversion lists and point <output> to it. *output
	6643	* should be defined upon input, and if it points to one of the two lists,
	6644	* the reference count to that list will be decremented. The first list,
	6645	* <a>, may be NULL, in which case a copy of the second list is returned.
	6646	* If <complement_b> is TRUE, the union is taken of the complement
	6647	* (inversion) of <b> instead of b itself.
	6648	*
	6649	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	6650	* Richard Gillam, published by Addison-Wesley, and explained at some
	6651	* length there. The preface says to incorporate its examples into your
	6652	* code at your own risk.
	6653	*
	6654	* The algorithm is like a merge sort.
	6655	*
	6656	* XXX A potential performance improvement is to keep track as we go along
	6657	* if only one of the inputs contributes to the result, meaning the other
	6658	* is a subset of that one. In that case, we can skip the final copy and
	6659	* return the larger of the input lists, but then outside code might need
	6660	* to keep track of whether to free the input list or not */
	6661
	6662	UV* array_a; /* a's array */
	6663	UV* array_b;
	6664	UV len_a; /* length of a's array */
	6665	UV len_b;
	6666
	6667	SV* u; /* the resulting union */
	6668	UV* array_u;
	6669	UV len_u;
	6670
	6671	UV i_a = 0; /* current index into a's array */
	6672	UV i_b = 0;
	6673	UV i_u = 0;
	6674
	6675	/* running count, as explained in the algorithm source book; items are
	6676	* stopped accumulating and are output when the count changes to/from 0.
	6677	* The count is incremented when we start a range that's in the set, and
	6678	* decremented when we start a range that's not in the set. So its range
	6679	* is 0 to 2. Only when the count is zero is something not in the set.
	6680	*/
	6681	UV count = 0;
	6682
	6683	PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
	6684	assert(a != b);
	6685
	6686	/* If either one is empty, the union is the other one */
	6687	if (a == NULL \|\| ((len_a = invlist_len(a)) == 0)) {
	6688	if (*output == a) {
	6689	if (a != NULL) {
	6690	SvREFCNT_dec(a);
	6691	}
	6692	}
	6693	if (*output != b) {
	6694	*output = invlist_clone(b);
	6695	if (complement_b) {
	6696	_invlist_invert(*output);
	6697	}
	6698	} /* else output already = b; /
	6699	return;
	6700	}
	6701	else if ((len_b = invlist_len(b)) == 0) {
	6702	if (*output == b) {
	6703	SvREFCNT_dec(b);
	6704	}
	6705
	6706	/* The complement of an empty list is a list that has everything in it,
	6707	* so the union with <a> includes everything too */
	6708	if (complement_b) {
	6709	if (a == *output) {
	6710	SvREFCNT_dec(a);
	6711	}
	6712	*output = _new_invlist(1);
	6713	_append_range_to_invlist(*output, 0, UV_MAX);
	6714	}
	6715	else if (*output != a) {
	6716	*output = invlist_clone(a);
	6717	}
	6718	/* else output already = a; /
	6719	return;
	6720	}
	6721
	6722	/* Here both lists exist and are non-empty */
	6723	array_a = invlist_array(a);
	6724	array_b = invlist_array(b);
	6725
	6726	/* If are to take the union of 'a' with the complement of b, set it
	6727	* up so are looking at b's complement. */
	6728	if (complement_b) {
	6729
	6730	/* To complement, we invert: if the first element is 0, remove it. To
	6731	* do this, we just pretend the array starts one later, and clear the
	6732	* flag as we don't have to do anything else later */
	6733	if (array_b[0] == 0) {
	6734	array_b++;
	6735	len_b--;
	6736	complement_b = FALSE;
	6737	}
	6738	else {
	6739
	6740	/* But if the first element is not zero, we unshift a 0 before the
	6741	* array. The data structure reserves a space for that 0 (which
	6742	* should be a '1' right now), so physical shifting is unneeded,
	6743	* but temporarily change that element to 0. Before exiting the
	6744	* routine, we must restore the element to '1' */
	6745	array_b--;
	6746	len_b++;
	6747	array_b[0] = 0;
	6748	}
	6749	}
	6750
	6751	/* Size the union for the worst case: that the sets are completely
	6752	* disjoint */
	6753	u = _new_invlist(len_a + len_b);
	6754
	6755	/* Will contain U+0000 if either component does */
	6756	array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
	6757	\|\| (len_b > 0 && array_b[0] == 0));
	6758
	6759	/* Go through each list item by item, stopping when exhausted one of
	6760	* them */
	6761	while (i_a < len_a && i_b < len_b) {
	6762	UV cp; /* The element to potentially add to the union's array */
	6763	bool cp_in_set; /* is it in the the input list's set or not */
	6764
	6765	/* We need to take one or the other of the two inputs for the union.
	6766	* Since we are merging two sorted lists, we take the smaller of the
	6767	* next items. In case of a tie, we take the one that is in its set
	6768	* first. If we took one not in the set first, it would decrement the
	6769	* count, possibly to 0 which would cause it to be output as ending the
	6770	* range, and the next time through we would take the same number, and
	6771	* output it again as beginning the next range. By doing it the
	6772	* opposite way, there is no possibility that the count will be
	6773	* momentarily decremented to 0, and thus the two adjoining ranges will
	6774	* be seamlessly merged. (In a tie and both are in the set or both not
	6775	* in the set, it doesn't matter which we take first.) */
	6776	if (array_a[i_a] < array_b[i_b]
	6777	\|\| (array_a[i_a] == array_b[i_b]
	6778	&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	6779	{
	6780	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	6781	cp= array_a[i_a++];
	6782	}
	6783	else {
	6784	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	6785	cp= array_b[i_b++];
	6786	}
	6787
	6788	/* Here, have chosen which of the two inputs to look at. Only output
	6789	* if the running count changes to/from 0, which marks the
	6790	* beginning/end of a range in that's in the set */
	6791	if (cp_in_set) {
	6792	if (count == 0) {
	6793	array_u[i_u++] = cp;
	6794	}
	6795	count++;
	6796	}
	6797	else {
	6798	count--;
	6799	if (count == 0) {
	6800	array_u[i_u++] = cp;
	6801	}
	6802	}
	6803	}
	6804
	6805	/* Here, we are finished going through at least one of the lists, which
	6806	* means there is something remaining in at most one. We check if the list
	6807	* that hasn't been exhausted is positioned such that we are in the middle
	6808	* of a range in its set or not. (i_a and i_b point to the element beyond
	6809	* the one we care about.) If in the set, we decrement 'count'; if 0, there
	6810	* is potentially more to output.
	6811	* There are four cases:
	6812	* 1) Both weren't in their sets, count is 0, and remains 0. What's left
	6813	* in the union is entirely from the non-exhausted set.
	6814	* 2) Both were in their sets, count is 2. Nothing further should
	6815	* be output, as everything that remains will be in the exhausted
	6816	* list's set, hence in the union; decrementing to 1 but not 0 insures
	6817	* that
	6818	* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
	6819	* Nothing further should be output because the union includes
	6820	* everything from the exhausted set. Not decrementing ensures that.
	6821	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
	6822	* decrementing to 0 insures that we look at the remainder of the
	6823	* non-exhausted set */
	6824	if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	6825	\|\| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	6826	{
	6827	count--;
	6828	}
	6829
	6830	/* The final length is what we've output so far, plus what else is about to
	6831	* be output. (If 'count' is non-zero, then the input list we exhausted
	6832	* has everything remaining up to the machine's limit in its set, and hence
	6833	* in the union, so there will be no further output. */
	6834	len_u = i_u;
	6835	if (count == 0) {
	6836	/* At most one of the subexpressions will be non-zero */
	6837	len_u += (len_a - i_a) + (len_b - i_b);
	6838	}
	6839
	6840	/* Set result to final length, which can change the pointer to array_u, so
	6841	* re-find it */
	6842	if (len_u != invlist_len(u)) {
	6843	invlist_set_len(u, len_u);
	6844	invlist_trim(u);
	6845	array_u = invlist_array(u);
	6846	}
	6847
	6848	/* When 'count' is 0, the list that was exhausted (if one was shorter than
	6849	* the other) ended with everything above it not in its set. That means
	6850	* that the remaining part of the union is precisely the same as the
	6851	* non-exhausted list, so can just copy it unchanged. (If both list were
	6852	* exhausted at the same time, then the operations below will be both 0.)
	6853	*/
	6854	if (count == 0) {
	6855	IV copy_count; /* At most one will have a non-zero copy count */
	6856	if ((copy_count = len_a - i_a) > 0) {
	6857	Copy(array_a + i_a, array_u + i_u, copy_count, UV);
	6858	}
	6859	else if ((copy_count = len_b - i_b) > 0) {
	6860	Copy(array_b + i_b, array_u + i_u, copy_count, UV);
	6861	}
	6862	}
	6863
	6864	/* We may be removing a reference to one of the inputs */
	6865	if (a == output \|\| b == output) {
	6866	SvREFCNT_dec(*output);
	6867	}
	6868
	6869	/* If we've changed b, restore it */
	6870	if (complement_b) {
	6871	array_b[0] = 1;
	6872	}
	6873
	6874	*output = u;
	6875	return;
	6876	}
	6877
	6878	void
	6879	Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
	6880	{
	6881	/* Take the intersection of two inversion lists and point <i> to it. *i
	6882	* should be defined upon input, and if it points to one of the two lists,
	6883	* the reference count to that list will be decremented.
	6884	* If <complement_b> is TRUE, the result will be the intersection of <a>
	6885	* and the complement (or inversion) of <b> instead of <b> directly.
	6886	*
	6887	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	6888	* Richard Gillam, published by Addison-Wesley, and explained at some
	6889	* length there. The preface says to incorporate its examples into your
	6890	* code at your own risk. In fact, it had bugs
	6891	*
	6892	* The algorithm is like a merge sort, and is essentially the same as the
	6893	* union above
	6894	*/
	6895
	6896	UV* array_a; /* a's array */
	6897	UV* array_b;
	6898	UV len_a; /* length of a's array */
	6899	UV len_b;
	6900
	6901	SV* r; /* the resulting intersection */
	6902	UV* array_r;
	6903	UV len_r;
	6904
	6905	UV i_a = 0; /* current index into a's array */
	6906	UV i_b = 0;
	6907	UV i_r = 0;
	6908
	6909	/* running count, as explained in the algorithm source book; items are
	6910	* stopped accumulating and are output when the count changes to/from 2.
	6911	* The count is incremented when we start a range that's in the set, and
	6912	* decremented when we start a range that's not in the set. So its range
	6913	* is 0 to 2. Only when the count is 2 is something in the intersection.
	6914	*/
	6915	UV count = 0;
	6916
	6917	PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
	6918	assert(a != b);
	6919
	6920	/* Special case if either one is empty */
	6921	len_a = invlist_len(a);
	6922	if ((len_a == 0) \|\| ((len_b = invlist_len(b)) == 0)) {
	6923
	6924	if (len_a != 0 && complement_b) {
	6925
	6926	/* Here, 'a' is not empty, therefore from the above 'if', 'b' must
	6927	* be empty. Here, also we are using 'b's complement, which hence
	6928	* must be every possible code point. Thus the intersection is
	6929	* simply 'a'. */
	6930	if (*i != a) {
	6931	*i = invlist_clone(a);
	6932
	6933	if (*i == b) {
	6934	SvREFCNT_dec(b);
	6935	}
	6936	}
	6937	/* else i is already 'a' /
	6938	return;
	6939	}
	6940
	6941	/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
	6942	* intersection must be empty */
	6943	if (*i == a) {
	6944	SvREFCNT_dec(a);
	6945	}
	6946	else if (*i == b) {
	6947	SvREFCNT_dec(b);
	6948	}
	6949	*i = _new_invlist(0);
	6950	return;
	6951	}
	6952
	6953	/* Here both lists exist and are non-empty */
	6954	array_a = invlist_array(a);
	6955	array_b = invlist_array(b);
	6956
	6957	/* If are to take the intersection of 'a' with the complement of b, set it
	6958	* up so are looking at b's complement. */
	6959	if (complement_b) {
	6960
	6961	/* To complement, we invert: if the first element is 0, remove it. To
	6962	* do this, we just pretend the array starts one later, and clear the
	6963	* flag as we don't have to do anything else later */
	6964	if (array_b[0] == 0) {
	6965	array_b++;
	6966	len_b--;
	6967	complement_b = FALSE;
	6968	}
	6969	else {
	6970
	6971	/* But if the first element is not zero, we unshift a 0 before the
	6972	* array. The data structure reserves a space for that 0 (which
	6973	* should be a '1' right now), so physical shifting is unneeded,
	6974	* but temporarily change that element to 0. Before exiting the
	6975	* routine, we must restore the element to '1' */
	6976	array_b--;
	6977	len_b++;
	6978	array_b[0] = 0;
	6979	}
	6980	}
	6981
	6982	/* Size the intersection for the worst case: that the intersection ends up
	6983	* fragmenting everything to be completely disjoint */
	6984	r= _new_invlist(len_a + len_b);
	6985
	6986	/* Will contain U+0000 iff both components do */
	6987	array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
	6988	&& len_b > 0 && array_b[0] == 0);
	6989
	6990	/* Go through each list item by item, stopping when exhausted one of
	6991	* them */
	6992	while (i_a < len_a && i_b < len_b) {
	6993	UV cp; /* The element to potentially add to the intersection's
	6994	array */
	6995	bool cp_in_set; /* Is it in the input list's set or not */
	6996
	6997	/* We need to take one or the other of the two inputs for the
	6998	* intersection. Since we are merging two sorted lists, we take the
	6999	* smaller of the next items. In case of a tie, we take the one that
	7000	* is not in its set first (a difference from the union algorithm). If
	7001	* we took one in the set first, it would increment the count, possibly
	7002	* to 2 which would cause it to be output as starting a range in the
	7003	* intersection, and the next time through we would take that same
	7004	* number, and output it again as ending the set. By doing it the
	7005	* opposite of this, there is no possibility that the count will be
	7006	* momentarily incremented to 2. (In a tie and both are in the set or
	7007	* both not in the set, it doesn't matter which we take first.) */
	7008	if (array_a[i_a] < array_b[i_b]
	7009	\|\| (array_a[i_a] == array_b[i_b]
	7010	&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	7011	{
	7012	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	7013	cp= array_a[i_a++];
	7014	}
	7015	else {
	7016	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	7017	cp= array_b[i_b++];
	7018	}
	7019
	7020	/* Here, have chosen which of the two inputs to look at. Only output
	7021	* if the running count changes to/from 2, which marks the
	7022	* beginning/end of a range that's in the intersection */
	7023	if (cp_in_set) {
	7024	count++;
	7025	if (count == 2) {
	7026	array_r[i_r++] = cp;
	7027	}
	7028	}
	7029	else {
	7030	if (count == 2) {
	7031	array_r[i_r++] = cp;
	7032	}
	7033	count--;
	7034	}
	7035	}
	7036
	7037	/* Here, we are finished going through at least one of the lists, which
	7038	* means there is something remaining in at most one. We check if the list
	7039	* that has been exhausted is positioned such that we are in the middle
	7040	* of a range in its set or not. (i_a and i_b point to elements 1 beyond
	7041	* the ones we care about.) There are four cases:
	7042	* 1) Both weren't in their sets, count is 0, and remains 0. There's
	7043	* nothing left in the intersection.
	7044	* 2) Both were in their sets, count is 2 and perhaps is incremented to
	7045	* above 2. What should be output is exactly that which is in the
	7046	* non-exhausted set, as everything it has is also in the intersection
	7047	* set, and everything it doesn't have can't be in the intersection
	7048	* 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
	7049	* gets incremented to 2. Like the previous case, the intersection is
	7050	* everything that remains in the non-exhausted set.
	7051	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
	7052	* remains 1. And the intersection has nothing more. */
	7053	if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	7054	\|\| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	7055	{
	7056	count++;
	7057	}
	7058
	7059	/* The final length is what we've output so far plus what else is in the
	7060	* intersection. At most one of the subexpressions below will be non-zero */
	7061	len_r = i_r;
	7062	if (count >= 2) {
	7063	len_r += (len_a - i_a) + (len_b - i_b);
	7064	}
	7065
	7066	/* Set result to final length, which can change the pointer to array_r, so
	7067	* re-find it */
	7068	if (len_r != invlist_len(r)) {
	7069	invlist_set_len(r, len_r);
	7070	invlist_trim(r);
	7071	array_r = invlist_array(r);
	7072	}
	7073
	7074	/* Finish outputting any remaining */
	7075	if (count >= 2) { /* At most one will have a non-zero copy count */
	7076	IV copy_count;
	7077	if ((copy_count = len_a - i_a) > 0) {
	7078	Copy(array_a + i_a, array_r + i_r, copy_count, UV);
	7079	}
	7080	else if ((copy_count = len_b - i_b) > 0) {
	7081	Copy(array_b + i_b, array_r + i_r, copy_count, UV);
	7082	}
	7083	}
	7084
	7085	/* We may be removing a reference to one of the inputs */
	7086	if (a == i \|\| b == i) {
	7087	SvREFCNT_dec(*i);
	7088	}
	7089
	7090	/* If we've changed b, restore it */
	7091	if (complement_b) {
	7092	array_b[0] = 1;
	7093	}
	7094
	7095	*i = r;
	7096	return;
	7097	}
	7098
	7099	#endif
	7100
	7101	STATIC SV*
	7102	S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
	7103	{
	7104	/* Add the range from 'start' to 'end' inclusive to the inversion list's
	7105	* set. A pointer to the inversion list is returned. This may actually be
	7106	* a new list, in which case the passed in one has been destroyed. The
	7107	* passed in inversion list can be NULL, in which case a new one is created
	7108	* with just the one range in it */
	7109
	7110	SV* range_invlist;
	7111	UV len;
	7112
	7113	if (invlist == NULL) {
	7114	invlist = _new_invlist(2);
	7115	len = 0;
	7116	}
	7117	else {
	7118	len = invlist_len(invlist);
	7119	}
	7120
	7121	/* If comes after the final entry, can just append it to the end */
	7122	if (len == 0
	7123	\|\| start >= invlist_array(invlist)
	7124	[invlist_len(invlist) - 1])
	7125	{
	7126	_append_range_to_invlist(invlist, start, end);
	7127	return invlist;
	7128	}
	7129
	7130	/* Here, can't just append things, create and return a new inversion list
	7131	* which is the union of this range and the existing inversion list */
	7132	range_invlist = _new_invlist(2);
	7133	_append_range_to_invlist(range_invlist, start, end);
	7134
	7135	_invlist_union(invlist, range_invlist, &invlist);
	7136
	7137	/* The temporary can be freed */
	7138	SvREFCNT_dec(range_invlist);
	7139
	7140	return invlist;
	7141	}
	7142
	7143	PERL_STATIC_INLINE SV*
	7144	S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
	7145	return add_range_to_invlist(invlist, cp, cp);
	7146	}
	7147
	7148	#ifndef PERL_IN_XSUB_RE
	7149	void
	7150	Perl__invlist_invert(pTHX_ SV* const invlist)
	7151	{
	7152	/* Complement the input inversion list. This adds a 0 if the list didn't
	7153	* have a zero; removes it otherwise. As described above, the data
	7154	* structure is set up so that this is very efficient */
	7155
	7156	UV* len_pos = get_invlist_len_addr(invlist);
	7157
	7158	PERL_ARGS_ASSERT__INVLIST_INVERT;
	7159
	7160	/* The inverse of matching nothing is matching everything */
	7161	if (*len_pos == 0) {
	7162	_append_range_to_invlist(invlist, 0, UV_MAX);
	7163	return;
	7164	}
	7165
	7166	/* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
	7167	* zero element was a 0, so it is being removed, so the length decrements
	7168	* by 1; and vice-versa. SvCUR is unaffected */
	7169	if (*get_invlist_zero_addr(invlist) ^= 1) {
	7170	(*len_pos)--;
	7171	}
	7172	else {
	7173	(*len_pos)++;
	7174	}
	7175	}
	7176
	7177	void
	7178	Perl__invlist_invert_prop(pTHX_ SV* const invlist)
	7179	{
	7180	/* Complement the input inversion list (which must be a Unicode property,
	7181	* all of which don't match above the Unicode maximum code point.) And
	7182	* Perl has chosen to not have the inversion match above that either. This
	7183	* adds a 0x110000 if the list didn't end with it, and removes it if it did
	7184	*/
	7185
	7186	UV len;
	7187	UV* array;
	7188
	7189	PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
	7190
	7191	_invlist_invert(invlist);
	7192
	7193	len = invlist_len(invlist);
	7194
	7195	if (len != 0) { /* If empty do nothing */
	7196	array = invlist_array(invlist);
	7197	if (array[len - 1] != PERL_UNICODE_MAX + 1) {
	7198	/* Add 0x110000. First, grow if necessary */
	7199	len++;
	7200	if (invlist_max(invlist) < len) {
	7201	invlist_extend(invlist, len);
	7202	array = invlist_array(invlist);
	7203	}
	7204	invlist_set_len(invlist, len);
	7205	array[len - 1] = PERL_UNICODE_MAX + 1;
	7206	}
	7207	else { /* Remove the 0x110000 */
	7208	invlist_set_len(invlist, len - 1);
	7209	}
	7210	}
	7211
	7212	return;
	7213	}
	7214	#endif
	7215
	7216	PERL_STATIC_INLINE SV*
	7217	S_invlist_clone(pTHX_ SV* const invlist)
	7218	{
	7219
	7220	/* Return a new inversion list that is a copy of the input one, which is
	7221	* unchanged */
	7222
	7223	/* Need to allocate extra space to accommodate Perl's addition of a
	7224	* trailing NUL to SvPV's, since it thinks they are always strings */
	7225	SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
	7226	STRLEN length = SvCUR(invlist);
	7227
	7228	PERL_ARGS_ASSERT_INVLIST_CLONE;
	7229
	7230	SvCUR_set(new_invlist, length); /* This isn't done automatically */
	7231	Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
	7232
	7233	return new_invlist;
	7234	}
	7235
	7236	PERL_STATIC_INLINE UV*
	7237	S_get_invlist_iter_addr(pTHX_ SV* invlist)
	7238	{
	7239	/* Return the address of the UV that contains the current iteration
	7240	* position */
	7241
	7242	PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
	7243
	7244	return (UV ) (SvPVX(invlist) + (INVLIST_ITER_OFFSET sizeof (UV)));
	7245	}
	7246
	7247	PERL_STATIC_INLINE UV*
	7248	S_get_invlist_version_id_addr(pTHX_ SV* invlist)
	7249	{
	7250	/* Return the address of the UV that contains the version id. */
	7251
	7252	PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
	7253
	7254	return (UV ) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET sizeof (UV)));
	7255	}
	7256
	7257	PERL_STATIC_INLINE void
	7258	S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
	7259	{
	7260	PERL_ARGS_ASSERT_INVLIST_ITERINIT;
	7261
	7262	*get_invlist_iter_addr(invlist) = 0;
	7263	}
	7264
	7265	STATIC bool
	7266	S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
	7267	{
	7268	/* An C<invlist_iterinit> call on <invlist> must be used to set this up.
	7269	* This call sets in <start> and <end>, the next range in <invlist>.
	7270	* Returns <TRUE> if successful and the next call will return the next
	7271	* range; <FALSE> if was already at the end of the list. If the latter,
	7272	* <start> and <end> are unchanged, and the next call to this function
	7273	* will start over at the beginning of the list */
	7274
	7275	UV* pos = get_invlist_iter_addr(invlist);
	7276	UV len = invlist_len(invlist);
	7277	UV *array;
	7278
	7279	PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
	7280
	7281	if (*pos >= len) {
	7282	pos = UV_MAX; / Force iternit() to be required next time */
	7283	return FALSE;
	7284	}
	7285
	7286	array = invlist_array(invlist);
	7287
	7288	start = array[(pos)++];
	7289
	7290	if (*pos >= len) {
	7291	*end = UV_MAX;
	7292	}
	7293	else {
	7294	end = array[(pos)++] - 1;
	7295	}
	7296
	7297	return TRUE;
	7298	}
	7299
	7300	#ifndef PERL_IN_XSUB_RE
	7301	SV *
	7302	Perl__invlist_contents(pTHX_ SV* const invlist)
	7303	{
	7304	/* Get the contents of an inversion list into a string SV so that they can
	7305	* be printed out. It uses the format traditionally done for debug tracing
	7306	*/
	7307
	7308	UV start, end;
	7309	SV* output = newSVpvs("\n");
	7310
	7311	PERL_ARGS_ASSERT__INVLIST_CONTENTS;
	7312
	7313	invlist_iterinit(invlist);
	7314	while (invlist_iternext(invlist, &start, &end)) {
	7315	if (end == UV_MAX) {
	7316	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
	7317	}
	7318	else if (end != start) {
	7319	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
	7320	start, end);
	7321	}
	7322	else {
	7323	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
	7324	}
	7325	}
	7326
	7327	return output;
	7328	}
	7329	#endif
	7330
	7331	#if 0
	7332	void
	7333	S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
	7334	{
	7335	/* Dumps out the ranges in an inversion list. The string 'header'
	7336	* if present is output on a line before the first range */
	7337
	7338	UV start, end;
	7339
	7340	if (header && strlen(header)) {
	7341	PerlIO_printf(Perl_debug_log, "%s\n", header);
	7342	}
	7343	invlist_iterinit(invlist);
	7344	while (invlist_iternext(invlist, &start, &end)) {
	7345	if (end == UV_MAX) {
	7346	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
	7347	}
	7348	else {
	7349	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
	7350	}
	7351	}
	7352	}
	7353	#endif
	7354
	7355	#undef HEADER_LENGTH
	7356	#undef INVLIST_INITIAL_LENGTH
	7357	#undef TO_INTERNAL_SIZE
	7358	#undef FROM_INTERNAL_SIZE
	7359	#undef INVLIST_LEN_OFFSET
	7360	#undef INVLIST_ZERO_OFFSET
	7361	#undef INVLIST_ITER_OFFSET
	7362	#undef INVLIST_VERSION_ID
	7363
	7364	/* End of inversion list object */
	7365
	7366	/*
	7367	- reg - regular expression, i.e. main body or parenthesized thing
	7368	*
	7369	* Caller must absorb opening parenthesis.
	7370	*
	7371	* Combining parenthesis handling with the base level of regular expression
	7372	* is a trifle forced, but the need to tie the tails of the branches to what
	7373	* follows makes it hard to avoid.
	7374	*/
	7375	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	7376	#ifdef DEBUGGING
	7377	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	7378	#else
	7379	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	7380	#endif
	7381
	7382	STATIC regnode *
	7383	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp,U32 depth)
	7384	/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
	7385	{
	7386	dVAR;
	7387	register regnode ret; / Will be the head of the group. */
	7388	register regnode *br;
	7389	register regnode *lastbr;
	7390	register regnode *ender = NULL;
	7391	register I32 parno = 0;
	7392	I32 flags;
	7393	U32 oregflags = RExC_flags;
	7394	bool have_branch = 0;
	7395	bool is_open = 0;
	7396	I32 freeze_paren = 0;
	7397	I32 after_freeze = 0;
	7398
	7399	/* for (?g), (?gc), and (?o) warnings; warning
	7400	about (?c) will warn about (?g) -- japhy */
	7401
	7402	#define WASTED_O 0x01
	7403	#define WASTED_G 0x02
	7404	#define WASTED_C 0x04
	7405	#define WASTED_GC (0x02\|0x04)
	7406	I32 wastedflags = 0x00;
	7407
	7408	char * parse_start = RExC_parse; /* MJD */
	7409	char * const oregcomp_parse = RExC_parse;
	7410
	7411	GET_RE_DEBUG_FLAGS_DECL;
	7412
	7413	PERL_ARGS_ASSERT_REG;
	7414	DEBUG_PARSE("reg ");
	7415
	7416	flagp = 0; / Tentatively. */
	7417
	7418
	7419	/* Make an OPEN node, if parenthesized. */
	7420	if (paren) {
	7421	if ( RExC_parse == '') { /* (VERB:ARG) /
	7422	char *start_verb = RExC_parse;
	7423	STRLEN verb_len = 0;
	7424	char *start_arg = NULL;
	7425	unsigned char op = 0;
	7426	int argok = 1;
	7427	int internal_argval = 0; /* internal_argval is only useful if !argok */
	7428	while ( RExC_parse && RExC_parse != ')' ) {
	7429	if ( *RExC_parse == ':' ) {
	7430	start_arg = RExC_parse + 1;
	7431	break;
	7432	}
	7433	RExC_parse++;
	7434	}
	7435	++start_verb;
	7436	verb_len = RExC_parse - start_verb;
	7437	if ( start_arg ) {
	7438	RExC_parse++;
	7439	while ( RExC_parse && RExC_parse != ')' )
	7440	RExC_parse++;
	7441	if ( *RExC_parse != ')' )
	7442	vFAIL("Unterminated verb pattern argument");
	7443	if ( RExC_parse == start_arg )
	7444	start_arg = NULL;
	7445	} else {
	7446	if ( *RExC_parse != ')' )
	7447	vFAIL("Unterminated verb pattern");
	7448	}
	7449
	7450	switch ( *start_verb ) {
	7451	case 'A': /* (ACCEPT) /
	7452	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	7453	op = ACCEPT;
	7454	internal_argval = RExC_nestroot;
	7455	}
	7456	break;
	7457	case 'C': /* (COMMIT) /
	7458	if ( memEQs(start_verb,verb_len,"COMMIT") )
	7459	op = COMMIT;
	7460	break;
	7461	case 'F': /* (FAIL) /
	7462	if ( verb_len==1 \|\| memEQs(start_verb,verb_len,"FAIL") ) {
	7463	op = OPFAIL;
	7464	argok = 0;
	7465	}
	7466	break;
	7467	case ':': /* (:NAME) /
	7468	case 'M': /* (MARK:NAME) /
	7469	if ( verb_len==0 \|\| memEQs(start_verb,verb_len,"MARK") ) {
	7470	op = MARKPOINT;
	7471	argok = -1;
	7472	}
	7473	break;
	7474	case 'P': /* (PRUNE) /
	7475	if ( memEQs(start_verb,verb_len,"PRUNE") )
	7476	op = PRUNE;
	7477	break;
	7478	case 'S': /* (SKIP) /
	7479	if ( memEQs(start_verb,verb_len,"SKIP") )
	7480	op = SKIP;
	7481	break;
	7482	case 'T': /* (THEN) /
	7483	/* [19:06] <TimToady> :: is then */
	7484	if ( memEQs(start_verb,verb_len,"THEN") ) {
	7485	op = CUTGROUP;
	7486	RExC_seen \|= REG_SEEN_CUTGROUP;
	7487	}
	7488	break;
	7489	}
	7490	if ( ! op ) {
	7491	RExC_parse++;
	7492	vFAIL3("Unknown verb pattern '%.*s'",
	7493	verb_len, start_verb);
	7494	}
	7495	if ( argok ) {
	7496	if ( start_arg && internal_argval ) {
	7497	vFAIL3("Verb pattern '%.*s' may not have an argument",
	7498	verb_len, start_verb);
	7499	} else if ( argok < 0 && !start_arg ) {
	7500	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	7501	verb_len, start_verb);
	7502	} else {
	7503	ret = reganode(pRExC_state, op, internal_argval);
	7504	if ( ! internal_argval && ! SIZE_ONLY ) {
	7505	if (start_arg) {
	7506	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	7507	ARG(ret) = add_data( pRExC_state, 1, "S" );
	7508	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	7509	ret->flags = 0;
	7510	} else {
	7511	ret->flags = 1;
	7512	}
	7513	}
	7514	}
	7515	if (!internal_argval)
	7516	RExC_seen \|= REG_SEEN_VERBARG;
	7517	} else if ( start_arg ) {
	7518	vFAIL3("Verb pattern '%.*s' may not have an argument",
	7519	verb_len, start_verb);
	7520	} else {
	7521	ret = reg_node(pRExC_state, op);
	7522	}
	7523	nextchar(pRExC_state);
	7524	return ret;
	7525	} else
	7526	if (RExC_parse == '?') { / (?...) */
	7527	bool is_logical = 0;
	7528	const char * const seqstart = RExC_parse;
	7529	bool has_use_defaults = FALSE;
	7530
	7531	RExC_parse++;
	7532	paren = *RExC_parse++;
	7533	ret = NULL; /* For look-ahead/behind. */
	7534	switch (paren) {
	7535
	7536	case 'P': /* (?P...) variants for those used to PCRE/Python */
	7537	paren = *RExC_parse++;
	7538	if ( paren == '<') /* (?P<...>) named capture */
	7539	goto named_capture;
	7540	else if (paren == '>') { /* (?P>name) named recursion */
	7541	goto named_recursion;
	7542	}
	7543	else if (paren == '=') { /* (?P=...) named backref */
	7544	/* this pretty much dupes the code for \k<NAME> in regatom(), if
	7545	you change this make sure you change that */
	7546	char* name_start = RExC_parse;
	7547	U32 num = 0;
	7548	SV *sv_dat = reg_scan_name(pRExC_state,
	7549	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7550	if (RExC_parse == name_start \|\| *RExC_parse != ')')
	7551	vFAIL2("Sequence %.3s... not terminated",parse_start);
	7552
	7553	if (!SIZE_ONLY) {
	7554	num = add_data( pRExC_state, 1, "S" );
	7555	RExC_rxi->data->data[num]=(void*)sv_dat;
	7556	SvREFCNT_inc_simple_void(sv_dat);
	7557	}
	7558	RExC_sawback = 1;
	7559	ret = reganode(pRExC_state,
	7560	((! FOLD)
	7561	? NREF
	7562	: (MORE_ASCII_RESTRICTED)
	7563	? NREFFA
	7564	: (AT_LEAST_UNI_SEMANTICS)
	7565	? NREFFU
	7566	: (LOC)
	7567	? NREFFL
	7568	: NREFF),
	7569	num);
	7570	*flagp \|= HASWIDTH;
	7571
	7572	Set_Node_Offset(ret, parse_start+1);
	7573	Set_Node_Cur_Length(ret); /* MJD */
	7574
	7575	nextchar(pRExC_state);
	7576	return ret;
	7577	}
	7578	RExC_parse++;
	7579	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7580	/NOTREACHED/
	7581	case '<': /* (?<...) */
	7582	if (*RExC_parse == '!')
	7583	paren = ',';
	7584	else if (*RExC_parse != '=')
	7585	named_capture:
	7586	{ /* (?<...>) */
	7587	char *name_start;
	7588	SV *svname;
	7589	paren= '>';
	7590	case '\'': /* (?'...') */
	7591	name_start= RExC_parse;
	7592	svname = reg_scan_name(pRExC_state,
	7593	SIZE_ONLY ? /* reverse test from the others */
	7594	REG_RSN_RETURN_NAME :
	7595	REG_RSN_RETURN_NULL);
	7596	if (RExC_parse == name_start) {
	7597	RExC_parse++;
	7598	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7599	/NOTREACHED/
	7600	}
	7601	if (*RExC_parse != paren)
	7602	vFAIL2("Sequence (?%c... not terminated",
	7603	paren=='>' ? '<' : paren);
	7604	if (SIZE_ONLY) {
	7605	HE *he_str;
	7606	SV *sv_dat = NULL;
	7607	if (!svname) /* shouldn't happen */
	7608	Perl_croak(aTHX_
	7609	"panic: reg_scan_name returned NULL");
	7610	if (!RExC_paren_names) {
	7611	RExC_paren_names= newHV();
	7612	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	7613	#ifdef DEBUGGING
	7614	RExC_paren_name_list= newAV();
	7615	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	7616	#endif
	7617	}
	7618	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	7619	if ( he_str )
	7620	sv_dat = HeVAL(he_str);
	7621	if ( ! sv_dat ) {
	7622	/* croak baby croak */
	7623	Perl_croak(aTHX_
	7624	"panic: paren_name hash element allocation failed");
	7625	} else if ( SvPOK(sv_dat) ) {
	7626	/* (?\|...) can mean we have dupes so scan to check
	7627	its already been stored. Maybe a flag indicating
	7628	we are inside such a construct would be useful,
	7629	but the arrays are likely to be quite small, so
	7630	for now we punt -- dmq */
	7631	IV count = SvIV(sv_dat);
	7632	I32 pv = (I32)SvPVX(sv_dat);
	7633	IV i;
	7634	for ( i = 0 ; i < count ; i++ ) {
	7635	if ( pv[i] == RExC_npar ) {
	7636	count = 0;
	7637	break;
	7638	}
	7639	}
	7640	if ( count ) {
	7641	pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
	7642	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	7643	pv[count] = RExC_npar;
	7644	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	7645	}
	7646	} else {
	7647	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	7648	sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
	7649	SvIOK_on(sv_dat);
	7650	SvIV_set(sv_dat, 1);
	7651	}
	7652	#ifdef DEBUGGING
	7653	/* Yes this does cause a memory leak in debugging Perls */
	7654	if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
	7655	SvREFCNT_dec(svname);
	7656	#endif
	7657
	7658	/sv_dump(sv_dat);/
	7659	}
	7660	nextchar(pRExC_state);
	7661	paren = 1;
	7662	goto capturing_parens;
	7663	}
	7664	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	7665	RExC_in_lookbehind++;
	7666	RExC_parse++;
	7667	case '=': /* (?=...) */
	7668	RExC_seen_zerolen++;
	7669	break;
	7670	case '!': /* (?!...) */
	7671	RExC_seen_zerolen++;
	7672	if (*RExC_parse == ')') {
	7673	ret=reg_node(pRExC_state, OPFAIL);
	7674	nextchar(pRExC_state);
	7675	return ret;
	7676	}
	7677	break;
	7678	case '\|': /* (?\|...) */
	7679	/* branch reset, behave like a (?:...) except that
	7680	buffers in alternations share the same numbers */
	7681	paren = ':';
	7682	after_freeze = freeze_paren = RExC_npar;
	7683	break;
	7684	case ':': /* (?:...) */
	7685	case '>': /* (?>...) */
	7686	break;
	7687	case '$': /* (?$...) */
	7688	case '@': /* (?@...) */
	7689	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	7690	break;
	7691	case '#': /* (?#...) */
	7692	while (RExC_parse && RExC_parse != ')')
	7693	RExC_parse++;
	7694	if (*RExC_parse != ')')
	7695	FAIL("Sequence (?#... not terminated");
	7696	nextchar(pRExC_state);
	7697	*flagp = TRYAGAIN;
	7698	return NULL;
	7699	case '0' : /* (?0) */
	7700	case 'R' : /* (?R) */
	7701	if (*RExC_parse != ')')
	7702	FAIL("Sequence (?R) not terminated");
	7703	ret = reg_node(pRExC_state, GOSTART);
	7704	*flagp \|= POSTPONED;
	7705	nextchar(pRExC_state);
	7706	return ret;
	7707	/notreached/
	7708	{ /* named and numeric backreferences */
	7709	I32 num;
	7710	case '&': /* (?&NAME) */
	7711	parse_start = RExC_parse - 1;
	7712	named_recursion:
	7713	{
	7714	SV *sv_dat = reg_scan_name(pRExC_state,
	7715	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7716	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	7717	}
	7718	goto gen_recurse_regop;
	7719	/* NOT REACHED */
	7720	case '+':
	7721	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	7722	RExC_parse++;
	7723	vFAIL("Illegal pattern");
	7724	}
	7725	goto parse_recursion;
	7726	/* NOT REACHED*/
	7727	case '-': /* (?-1) */
	7728	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	7729	RExC_parse--; /* rewind to let it be handled later */
	7730	goto parse_flags;
	7731	}
	7732	/FALLTHROUGH /
	7733	case '1': case '2': case '3': case '4': /* (?1) */
	7734	case '5': case '6': case '7': case '8': case '9':
	7735	RExC_parse--;
	7736	parse_recursion:
	7737	num = atoi(RExC_parse);
	7738	parse_start = RExC_parse - 1; /* MJD */
	7739	if (*RExC_parse == '-')
	7740	RExC_parse++;
	7741	while (isDIGIT(*RExC_parse))
	7742	RExC_parse++;
	7743	if (*RExC_parse!=')')
	7744	vFAIL("Expecting close bracket");
	7745
	7746	gen_recurse_regop:
	7747	if ( paren == '-' ) {
	7748	/*
	7749	Diagram of capture buffer numbering.
	7750	Top line is the normal capture buffer numbers
	7751	Bottom line is the negative indexing as from
	7752	the X (the (?-2))
	7753
	7754	+ 1 2 3 4 5 X 6 7
	7755	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	7756	- 5 4 3 2 1 X x x
	7757
	7758	*/
	7759	num = RExC_npar + num;
	7760	if (num < 1) {
	7761	RExC_parse++;
	7762	vFAIL("Reference to nonexistent group");
	7763	}
	7764	} else if ( paren == '+' ) {
	7765	num = RExC_npar + num - 1;
	7766	}
	7767
	7768	ret = reganode(pRExC_state, GOSUB, num);
	7769	if (!SIZE_ONLY) {
	7770	if (num > (I32)RExC_rx->nparens) {
	7771	RExC_parse++;
	7772	vFAIL("Reference to nonexistent group");
	7773	}
	7774	ARG2L_SET( ret, RExC_recurse_count++);
	7775	RExC_emit++;
	7776	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	7777	"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
	7778	} else {
	7779	RExC_size++;
	7780	}
	7781	RExC_seen \|= REG_SEEN_RECURSE;
	7782	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	7783	Set_Node_Offset(ret, parse_start); /* MJD */
	7784
	7785	*flagp \|= POSTPONED;
	7786	nextchar(pRExC_state);
	7787	return ret;
	7788	} /* named and numeric backreferences */
	7789	/* NOT REACHED */
	7790
	7791	case '?': /* (??...) */
	7792	is_logical = 1;
	7793	if (*RExC_parse != '{') {
	7794	RExC_parse++;
	7795	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7796	/NOTREACHED/
	7797	}
	7798	*flagp \|= POSTPONED;
	7799	paren = *RExC_parse++;
	7800	/* FALL THROUGH */
	7801	case '{': /* (?{...}) */
	7802	{
	7803	I32 count = 1;
	7804	U32 n = 0;
	7805	char c;
	7806	char *s = RExC_parse;
	7807
	7808	RExC_seen_zerolen++;
	7809	RExC_seen \|= REG_SEEN_EVAL;
	7810	while (count && (c = *RExC_parse)) {
	7811	if (c == '\\') {
	7812	if (RExC_parse[1])
	7813	RExC_parse++;
	7814	}
	7815	else if (c == '{')
	7816	count++;
	7817	else if (c == '}')
	7818	count--;
	7819	RExC_parse++;
	7820	}
	7821	if (*RExC_parse != ')') {
	7822	RExC_parse = s;
	7823	vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
	7824	}
	7825	if (!SIZE_ONLY) {
	7826	PAD *pad;
	7827	OP_4tree sop, rop;
	7828	SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
	7829
	7830	ENTER;
	7831	Perl_save_re_context(aTHX);
	7832	rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
	7833	sop->op_private \|= OPpREFCOUNTED;
	7834	/* re_dup will OpREFCNT_inc */
	7835	OpREFCNT_set(sop, 1);
	7836	LEAVE;
	7837
	7838	n = add_data(pRExC_state, 3, "nop");
	7839	RExC_rxi->data->data[n] = (void*)rop;
	7840	RExC_rxi->data->data[n+1] = (void*)sop;
	7841	RExC_rxi->data->data[n+2] = (void*)pad;
	7842	SvREFCNT_dec(sv);
	7843	}
	7844	else { /* First pass */
	7845	if (PL_reginterp_cnt < ++RExC_seen_evals
	7846	&& IN_PERL_RUNTIME)
	7847	/* No compiled RE interpolated, has runtime
	7848	components ===> unsafe. */
	7849	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	7850	if (PL_tainting && PL_tainted)
	7851	FAIL("Eval-group in insecure regular expression");
	7852	#if PERL_VERSION > 8
	7853	if (IN_PERL_COMPILETIME)
	7854	PL_cv_has_eval = 1;
	7855	#endif
	7856	}
	7857
	7858	nextchar(pRExC_state);
	7859	if (is_logical) {
	7860	ret = reg_node(pRExC_state, LOGICAL);
	7861	if (!SIZE_ONLY)
	7862	ret->flags = 2;
	7863	REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
	7864	/* deal with the length of this later - MJD */
	7865	return ret;
	7866	}
	7867	ret = reganode(pRExC_state, EVAL, n);
	7868	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	7869	Set_Node_Offset(ret, parse_start);
	7870	return ret;
	7871	}
	7872	case '(': /* (?(?{...})...) and (?(?=...)...) */
	7873	{
	7874	int is_define= 0;
	7875	if (RExC_parse[0] == '?') { /* (?(?...)) */
	7876	if (RExC_parse[1] == '=' \|\| RExC_parse[1] == '!'
	7877	\|\| RExC_parse[1] == '<'
	7878	\|\| RExC_parse[1] == '{') { /* Lookahead or eval. */
	7879	I32 flag;
	7880
	7881	ret = reg_node(pRExC_state, LOGICAL);
	7882	if (!SIZE_ONLY)
	7883	ret->flags = 1;
	7884	REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
	7885	goto insert_if;
	7886	}
	7887	}
	7888	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	7889	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	7890	{
	7891	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	7892	char *name_start= RExC_parse++;
	7893	U32 num = 0;
	7894	SV *sv_dat=reg_scan_name(pRExC_state,
	7895	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7896	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	7897	vFAIL2("Sequence (?(%c... not terminated",
	7898	(ch == '>' ? '<' : ch));
	7899	RExC_parse++;
	7900	if (!SIZE_ONLY) {
	7901	num = add_data( pRExC_state, 1, "S" );
	7902	RExC_rxi->data->data[num]=(void*)sv_dat;
	7903	SvREFCNT_inc_simple_void(sv_dat);
	7904	}
	7905	ret = reganode(pRExC_state,NGROUPP,num);
	7906	goto insert_if_check_paren;
	7907	}
	7908	else if (RExC_parse[0] == 'D' &&
	7909	RExC_parse[1] == 'E' &&
	7910	RExC_parse[2] == 'F' &&
	7911	RExC_parse[3] == 'I' &&
	7912	RExC_parse[4] == 'N' &&
	7913	RExC_parse[5] == 'E')
	7914	{
	7915	ret = reganode(pRExC_state,DEFINEP,0);
	7916	RExC_parse +=6 ;
	7917	is_define = 1;
	7918	goto insert_if_check_paren;
	7919	}
	7920	else if (RExC_parse[0] == 'R') {
	7921	RExC_parse++;
	7922	parno = 0;
	7923	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	7924	parno = atoi(RExC_parse++);
	7925	while (isDIGIT(*RExC_parse))
	7926	RExC_parse++;
	7927	} else if (RExC_parse[0] == '&') {
	7928	SV *sv_dat;
	7929	RExC_parse++;
	7930	sv_dat = reg_scan_name(pRExC_state,
	7931	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7932	parno = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	7933	}
	7934	ret = reganode(pRExC_state,INSUBP,parno);
	7935	goto insert_if_check_paren;
	7936	}
	7937	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	7938	/* (?(1)...) */
	7939	char c;
	7940	parno = atoi(RExC_parse++);
	7941
	7942	while (isDIGIT(*RExC_parse))
	7943	RExC_parse++;
	7944	ret = reganode(pRExC_state, GROUPP, parno);
	7945
	7946	insert_if_check_paren:
	7947	if ((c = *nextchar(pRExC_state)) != ')')
	7948	vFAIL("Switch condition not recognized");
	7949	insert_if:
	7950	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	7951	br = regbranch(pRExC_state, &flags, 1,depth+1);
	7952	if (br == NULL)
	7953	br = reganode(pRExC_state, LONGJMP, 0);
	7954	else
	7955	REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
	7956	c = *nextchar(pRExC_state);
	7957	if (flags&HASWIDTH)
	7958	*flagp \|= HASWIDTH;
	7959	if (c == '\|') {
	7960	if (is_define)
	7961	vFAIL("(?(DEFINE)....) does not allow branches");
	7962	lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
	7963	regbranch(pRExC_state, &flags, 1,depth+1);
	7964	REGTAIL(pRExC_state, ret, lastbr);
	7965	if (flags&HASWIDTH)
	7966	*flagp \|= HASWIDTH;
	7967	c = *nextchar(pRExC_state);
	7968	}
	7969	else
	7970	lastbr = NULL;
	7971	if (c != ')')
	7972	vFAIL("Switch (?(condition)... contains too many branches");
	7973	ender = reg_node(pRExC_state, TAIL);
	7974	REGTAIL(pRExC_state, br, ender);
	7975	if (lastbr) {
	7976	REGTAIL(pRExC_state, lastbr, ender);
	7977	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	7978	}
	7979	else
	7980	REGTAIL(pRExC_state, ret, ender);
	7981	RExC_size++; /* XXX WHY do we need this?!!
	7982	For large programs it seems to be required
	7983	but I can't figure out why. -- dmq*/
	7984	return ret;
	7985	}
	7986	else {
	7987	vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
	7988	}
	7989	}
	7990	case 0:
	7991	RExC_parse--; /* for vFAIL to print correctly */
	7992	vFAIL("Sequence (? incomplete");
	7993	break;
	7994	case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
	7995	that follow */
	7996	has_use_defaults = TRUE;
	7997	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	7998	set_regex_charset(&RExC_flags, (RExC_utf8 \|\| RExC_uni_semantics)
	7999	? REGEX_UNICODE_CHARSET
	8000	: REGEX_DEPENDS_CHARSET);
	8001	goto parse_flags;
	8002	default:
	8003	--RExC_parse;
	8004	parse_flags: /* (?i) */
	8005	{
	8006	U32 posflags = 0, negflags = 0;
	8007	U32 *flagsp = &posflags;
	8008	char has_charset_modifier = '\0';
	8009	regex_charset cs = (RExC_utf8 \|\| RExC_uni_semantics)
	8010	? REGEX_UNICODE_CHARSET
	8011	: REGEX_DEPENDS_CHARSET;
	8012
	8013	while (*RExC_parse) {
	8014	/* && strchr("iogcmsx", RExC_parse) /
	8015	/* (?g), (?gc) and (?o) are useless here
	8016	and must be globally applied -- japhy */
	8017	switch (*RExC_parse) {
	8018	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	8019	case LOCALE_PAT_MOD:
	8020	if (has_charset_modifier) {
	8021	goto excess_modifier;
	8022	}
	8023	else if (flagsp == &negflags) {
	8024	goto neg_modifier;
	8025	}
	8026	cs = REGEX_LOCALE_CHARSET;
	8027	has_charset_modifier = LOCALE_PAT_MOD;
	8028	RExC_contains_locale = 1;
	8029	break;
	8030	case UNICODE_PAT_MOD:
	8031	if (has_charset_modifier) {
	8032	goto excess_modifier;
	8033	}
	8034	else if (flagsp == &negflags) {
	8035	goto neg_modifier;
	8036	}
	8037	cs = REGEX_UNICODE_CHARSET;
	8038	has_charset_modifier = UNICODE_PAT_MOD;
	8039	break;
	8040	case ASCII_RESTRICT_PAT_MOD:
	8041	if (flagsp == &negflags) {
	8042	goto neg_modifier;
	8043	}
	8044	if (has_charset_modifier) {
	8045	if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
	8046	goto excess_modifier;
	8047	}
	8048	/* Doubled modifier implies more restricted */
	8049	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	8050	}
	8051	else {
	8052	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	8053	}
	8054	has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
	8055	break;
	8056	case DEPENDS_PAT_MOD:
	8057	if (has_use_defaults) {
	8058	goto fail_modifiers;
	8059	}
	8060	else if (flagsp == &negflags) {
	8061	goto neg_modifier;
	8062	}
	8063	else if (has_charset_modifier) {
	8064	goto excess_modifier;
	8065	}
	8066
	8067	/* The dual charset means unicode semantics if the
	8068	* pattern (or target, not known until runtime) are
	8069	* utf8, or something in the pattern indicates unicode
	8070	* semantics */
	8071	cs = (RExC_utf8 \|\| RExC_uni_semantics)
	8072	? REGEX_UNICODE_CHARSET
	8073	: REGEX_DEPENDS_CHARSET;
	8074	has_charset_modifier = DEPENDS_PAT_MOD;
	8075	break;
	8076	excess_modifier:
	8077	RExC_parse++;
	8078	if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
	8079	vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
	8080	}
	8081	else if (has_charset_modifier == *(RExC_parse - 1)) {
	8082	vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
	8083	}
	8084	else {
	8085	vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
	8086	}
	8087	/NOTREACHED/
	8088	neg_modifier:
	8089	RExC_parse++;
	8090	vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
	8091	/NOTREACHED/
	8092	case ONCE_PAT_MOD: /* 'o' */
	8093	case GLOBAL_PAT_MOD: /* 'g' */
	8094	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	8095	const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
	8096	if (! (wastedflags & wflagbit) ) {
	8097	wastedflags \|= wflagbit;
	8098	vWARN5(
	8099	RExC_parse + 1,
	8100	"Useless (%s%c) - %suse /%c modifier",
	8101	flagsp == &negflags ? "?-" : "?",
	8102	*RExC_parse,
	8103	flagsp == &negflags ? "don't " : "",
	8104	*RExC_parse
	8105	);
	8106	}
	8107	}
	8108	break;
	8109
	8110	case CONTINUE_PAT_MOD: /* 'c' */
	8111	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	8112	if (! (wastedflags & WASTED_C) ) {
	8113	wastedflags \|= WASTED_GC;
	8114	vWARN3(
	8115	RExC_parse + 1,
	8116	"Useless (%sc) - %suse /gc modifier",
	8117	flagsp == &negflags ? "?-" : "?",
	8118	flagsp == &negflags ? "don't " : ""
	8119	);
	8120	}
	8121	}
	8122	break;
	8123	case KEEPCOPY_PAT_MOD: /* 'p' */
	8124	if (flagsp == &negflags) {
	8125	if (SIZE_ONLY)
	8126	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	8127	} else {
	8128	*flagsp \|= RXf_PMf_KEEPCOPY;
	8129	}
	8130	break;
	8131	case '-':
	8132	/* A flag is a default iff it is following a minus, so
	8133	* if there is a minus, it means will be trying to
	8134	* re-specify a default which is an error */
	8135	if (has_use_defaults \|\| flagsp == &negflags) {
	8136	fail_modifiers:
	8137	RExC_parse++;
	8138	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8139	/NOTREACHED/
	8140	}
	8141	flagsp = &negflags;
	8142	wastedflags = 0; /* reset so (?g-c) warns twice */
	8143	break;
	8144	case ':':
	8145	paren = ':';
	8146	/FALLTHROUGH/
	8147	case ')':
	8148	RExC_flags \|= posflags;
	8149	RExC_flags &= ~negflags;
	8150	set_regex_charset(&RExC_flags, cs);
	8151	if (paren != ':') {
	8152	oregflags \|= posflags;
	8153	oregflags &= ~negflags;
	8154	set_regex_charset(&oregflags, cs);
	8155	}
	8156	nextchar(pRExC_state);
	8157	if (paren != ':') {
	8158	*flagp = TRYAGAIN;
	8159	return NULL;
	8160	} else {
	8161	ret = NULL;
	8162	goto parse_rest;
	8163	}
	8164	/NOTREACHED/
	8165	default:
	8166	RExC_parse++;
	8167	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8168	/NOTREACHED/
	8169	}
	8170	++RExC_parse;
	8171	}
	8172	}} /* one for the default block, one for the switch */
	8173	}
	8174	else { /* (...) */
	8175	capturing_parens:
	8176	parno = RExC_npar;
	8177	RExC_npar++;
	8178
	8179	ret = reganode(pRExC_state, OPEN, parno);
	8180	if (!SIZE_ONLY ){
	8181	if (!RExC_nestroot)
	8182	RExC_nestroot = parno;
	8183	if (RExC_seen & REG_SEEN_RECURSE
	8184	&& !RExC_open_parens[parno-1])
	8185	{
	8186	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	8187	"Setting open paren #%"IVdf" to %d\n",
	8188	(IV)parno, REG_NODE_NUM(ret)));
	8189	RExC_open_parens[parno-1]= ret;
	8190	}
	8191	}
	8192	Set_Node_Length(ret, 1); /* MJD */
	8193	Set_Node_Offset(ret, RExC_parse); /* MJD */
	8194	is_open = 1;
	8195	}
	8196	}
	8197	else /* ! paren */
	8198	ret = NULL;
	8199
	8200	parse_rest:
	8201	/* Pick up the branches, linking them together. */
	8202	parse_start = RExC_parse; /* MJD */
	8203	br = regbranch(pRExC_state, &flags, 1,depth+1);
	8204
	8205	/* branch_len = (paren != 0); */
	8206
	8207	if (br == NULL)
	8208	return(NULL);
	8209	if (*RExC_parse == '\|') {
	8210	if (!SIZE_ONLY && RExC_extralen) {
	8211	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	8212	}
	8213	else { /* MJD */
	8214	reginsert(pRExC_state, BRANCH, br, depth+1);
	8215	Set_Node_Length(br, paren != 0);
	8216	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	8217	}
	8218	have_branch = 1;
	8219	if (SIZE_ONLY)
	8220	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	8221	}
	8222	else if (paren == ':') {
	8223	*flagp \|= flags&SIMPLE;
	8224	}
	8225	if (is_open) { /* Starts with OPEN. */
	8226	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	8227	}
	8228	else if (paren != '?') /* Not Conditional */
	8229	ret = br;
	8230	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	8231	lastbr = br;
	8232	while (*RExC_parse == '\|') {
	8233	if (!SIZE_ONLY && RExC_extralen) {
	8234	ender = reganode(pRExC_state, LONGJMP,0);
	8235	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
	8236	}
	8237	if (SIZE_ONLY)
	8238	RExC_extralen += 2; /* Account for LONGJMP. */
	8239	nextchar(pRExC_state);
	8240	if (freeze_paren) {
	8241	if (RExC_npar > after_freeze)
	8242	after_freeze = RExC_npar;
	8243	RExC_npar = freeze_paren;
	8244	}
	8245	br = regbranch(pRExC_state, &flags, 0, depth+1);
	8246
	8247	if (br == NULL)
	8248	return(NULL);
	8249	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	8250	lastbr = br;
	8251	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	8252	}
	8253
	8254	if (have_branch \|\| paren != ':') {
	8255	/* Make a closing node, and hook it on the end. */
	8256	switch (paren) {
	8257	case ':':
	8258	ender = reg_node(pRExC_state, TAIL);
	8259	break;
	8260	case 1:
	8261	ender = reganode(pRExC_state, CLOSE, parno);
	8262	if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
	8263	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	8264	"Setting close paren #%"IVdf" to %d\n",
	8265	(IV)parno, REG_NODE_NUM(ender)));
	8266	RExC_close_parens[parno-1]= ender;
	8267	if (RExC_nestroot == parno)
	8268	RExC_nestroot = 0;
	8269	}
	8270	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	8271	Set_Node_Length(ender,1); /* MJD */
	8272	break;
	8273	case '<':
	8274	case ',':
	8275	case '=':
	8276	case '!':
	8277	*flagp &= ~HASWIDTH;
	8278	/* FALL THROUGH */
	8279	case '>':
	8280	ender = reg_node(pRExC_state, SUCCEED);
	8281	break;
	8282	case 0:
	8283	ender = reg_node(pRExC_state, END);
	8284	if (!SIZE_ONLY) {
	8285	assert(!RExC_opend); /* there can only be one! */
	8286	RExC_opend = ender;
	8287	}
	8288	break;
	8289	}
	8290	REGTAIL(pRExC_state, lastbr, ender);
	8291
	8292	if (have_branch && !SIZE_ONLY) {
	8293	if (depth==1)
	8294	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	8295
	8296	/* Hook the tails of the branches to the closing node. */
	8297	for (br = ret; br; br = regnext(br)) {
	8298	const U8 op = PL_regkind[OP(br)];
	8299	if (op == BRANCH) {
	8300	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	8301	}
	8302	else if (op == BRANCHJ) {
	8303	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	8304	}
	8305	}
	8306	}
	8307	}
	8308
	8309	{
	8310	const char *p;
	8311	static const char parens[] = "=!<,>";
	8312
	8313	if (paren && (p = strchr(parens, paren))) {
	8314	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	8315	int flag = (p - parens) > 1;
	8316
	8317	if (paren == '>')
	8318	node = SUSPEND, flag = 0;
	8319	reginsert(pRExC_state, node,ret, depth+1);
	8320	Set_Node_Cur_Length(ret);
	8321	Set_Node_Offset(ret, parse_start + 1);
	8322	ret->flags = flag;
	8323	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	8324	}
	8325	}
	8326
	8327	/* Check for proper termination. */
	8328	if (paren) {
	8329	RExC_flags = oregflags;
	8330	if (RExC_parse >= RExC_end \|\| *nextchar(pRExC_state) != ')') {
	8331	RExC_parse = oregcomp_parse;
	8332	vFAIL("Unmatched (");
	8333	}
	8334	}
	8335	else if (!paren && RExC_parse < RExC_end) {
	8336	if (*RExC_parse == ')') {
	8337	RExC_parse++;
	8338	vFAIL("Unmatched )");
	8339	}
	8340	else
	8341	FAIL("Junk on end of regexp"); /* "Can't happen". */
	8342	/* NOTREACHED */
	8343	}
	8344
	8345	if (RExC_in_lookbehind) {
	8346	RExC_in_lookbehind--;
	8347	}
	8348	if (after_freeze > RExC_npar)
	8349	RExC_npar = after_freeze;
	8350	return(ret);
	8351	}
	8352
	8353	/*
	8354	- regbranch - one alternative of an \| operator
	8355	*
	8356	* Implements the concatenation operator.
	8357	*/
	8358	STATIC regnode *
	8359	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	8360	{
	8361	dVAR;
	8362	register regnode *ret;
	8363	register regnode *chain = NULL;
	8364	register regnode *latest;
	8365	I32 flags = 0, c = 0;
	8366	GET_RE_DEBUG_FLAGS_DECL;
	8367
	8368	PERL_ARGS_ASSERT_REGBRANCH;
	8369
	8370	DEBUG_PARSE("brnc");
	8371
	8372	if (first)
	8373	ret = NULL;
	8374	else {
	8375	if (!SIZE_ONLY && RExC_extralen)
	8376	ret = reganode(pRExC_state, BRANCHJ,0);
	8377	else {
	8378	ret = reg_node(pRExC_state, BRANCH);
	8379	Set_Node_Length(ret, 1);
	8380	}
	8381	}
	8382
	8383	if (!first && SIZE_ONLY)
	8384	RExC_extralen += 1; /* BRANCHJ */
	8385
	8386	flagp = WORST; / Tentatively. */
	8387
	8388	RExC_parse--;
	8389	nextchar(pRExC_state);
	8390	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	8391	flags &= ~TRYAGAIN;
	8392	latest = regpiece(pRExC_state, &flags,depth+1);
	8393	if (latest == NULL) {
	8394	if (flags & TRYAGAIN)
	8395	continue;
	8396	return(NULL);
	8397	}
	8398	else if (ret == NULL)
	8399	ret = latest;
	8400	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	8401	if (chain == NULL) /* First piece. */
	8402	*flagp \|= flags&SPSTART;
	8403	else {
	8404	RExC_naughty++;
	8405	REGTAIL(pRExC_state, chain, latest);
	8406	}
	8407	chain = latest;
	8408	c++;
	8409	}
	8410	if (chain == NULL) { /* Loop ran zero times. */
	8411	chain = reg_node(pRExC_state, NOTHING);
	8412	if (ret == NULL)
	8413	ret = chain;
	8414	}
	8415	if (c == 1) {
	8416	*flagp \|= flags&SIMPLE;
	8417	}
	8418
	8419	return ret;
	8420	}
	8421
	8422	/*
	8423	- regpiece - something followed by possible [*+?]
	8424	*
	8425	* Note that the branching code sequences used for ? and the general cases
	8426	* of * and + are somewhat optimized: they use the same NOTHING node as
	8427	* both the endmarker for their branch list and the body of the last branch.
	8428	* It might seem that this node could be dispensed with entirely, but the
	8429	* endmarker role is not redundant.
	8430	*/
	8431	STATIC regnode *
	8432	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	8433	{
	8434	dVAR;
	8435	register regnode *ret;
	8436	register char op;
	8437	register char *next;
	8438	I32 flags;
	8439	const char * const origparse = RExC_parse;
	8440	I32 min;
	8441	I32 max = REG_INFTY;
	8442	#ifdef RE_TRACK_PATTERN_OFFSETS
	8443	char *parse_start;
	8444	#endif
	8445	const char *maxpos = NULL;
	8446	GET_RE_DEBUG_FLAGS_DECL;
	8447
	8448	PERL_ARGS_ASSERT_REGPIECE;
	8449
	8450	DEBUG_PARSE("piec");
	8451
	8452	ret = regatom(pRExC_state, &flags,depth+1);
	8453	if (ret == NULL) {
	8454	if (flags & TRYAGAIN)
	8455	*flagp \|= TRYAGAIN;
	8456	return(NULL);
	8457	}
	8458
	8459	op = *RExC_parse;
	8460
	8461	if (op == '{' && regcurly(RExC_parse)) {
	8462	maxpos = NULL;
	8463	#ifdef RE_TRACK_PATTERN_OFFSETS
	8464	parse_start = RExC_parse; /* MJD */
	8465	#endif
	8466	next = RExC_parse + 1;
	8467	while (isDIGIT(next) \|\| next == ',') {
	8468	if (*next == ',') {
	8469	if (maxpos)
	8470	break;
	8471	else
	8472	maxpos = next;
	8473	}
	8474	next++;
	8475	}
	8476	if (next == '}') { / got one */
	8477	if (!maxpos)
	8478	maxpos = next;
	8479	RExC_parse++;
	8480	min = atoi(RExC_parse);
	8481	if (*maxpos == ',')
	8482	maxpos++;
	8483	else
	8484	maxpos = RExC_parse;
	8485	max = atoi(maxpos);
	8486	if (!max && *maxpos != '0')
	8487	max = REG_INFTY; /* meaning "infinity" */
	8488	else if (max >= REG_INFTY)
	8489	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	8490	RExC_parse = next;
	8491	nextchar(pRExC_state);
	8492
	8493	do_curly:
	8494	if ((flags&SIMPLE)) {
	8495	RExC_naughty += 2 + RExC_naughty / 2;
	8496	reginsert(pRExC_state, CURLY, ret, depth+1);
	8497	Set_Node_Offset(ret, parse_start+1); /* MJD */
	8498	Set_Node_Cur_Length(ret);
	8499	}
	8500	else {
	8501	regnode * const w = reg_node(pRExC_state, WHILEM);
	8502
	8503	w->flags = 0;
	8504	REGTAIL(pRExC_state, ret, w);
	8505	if (!SIZE_ONLY && RExC_extralen) {
	8506	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	8507	reginsert(pRExC_state, NOTHING,ret, depth+1);
	8508	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	8509	}
	8510	reginsert(pRExC_state, CURLYX,ret, depth+1);
	8511	/* MJD hk */
	8512	Set_Node_Offset(ret, parse_start+1);
	8513	Set_Node_Length(ret,
	8514	op == '{' ? (RExC_parse - parse_start) : 1);
	8515
	8516	if (!SIZE_ONLY && RExC_extralen)
	8517	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	8518	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	8519	if (SIZE_ONLY)
	8520	RExC_whilem_seen++, RExC_extralen += 3;
	8521	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	8522	}
	8523	ret->flags = 0;
	8524
	8525	if (min > 0)
	8526	*flagp = WORST;
	8527	if (max > 0)
	8528	*flagp \|= HASWIDTH;
	8529	if (max < min)
	8530	vFAIL("Can't do {n,m} with n > m");
	8531	if (!SIZE_ONLY) {
	8532	ARG1_SET(ret, (U16)min);
	8533	ARG2_SET(ret, (U16)max);
	8534	}
	8535
	8536	goto nest_check;
	8537	}
	8538	}
	8539
	8540	if (!ISMULT1(op)) {
	8541	*flagp = flags;
	8542	return(ret);
	8543	}
	8544
	8545	#if 0 /* Now runtime fix should be reliable. */
	8546
	8547	/* if this is reinstated, don't forget to put this back into perldiag:
	8548
	8549	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	8550
	8551	(F) The part of the regexp subject to either the * or + quantifier
	8552	could match an empty string. The {#} shows in the regular
	8553	expression about where the problem was discovered.
	8554
	8555	*/
	8556
	8557	if (!(flags&HASWIDTH) && op != '?')
	8558	vFAIL("Regexp *+ operand could be empty");
	8559	#endif
	8560
	8561	#ifdef RE_TRACK_PATTERN_OFFSETS
	8562	parse_start = RExC_parse;
	8563	#endif
	8564	nextchar(pRExC_state);
	8565
	8566	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	8567
	8568	if (op == '*' && (flags&SIMPLE)) {
	8569	reginsert(pRExC_state, STAR, ret, depth+1);
	8570	ret->flags = 0;
	8571	RExC_naughty += 4;
	8572	}
	8573	else if (op == '*') {
	8574	min = 0;
	8575	goto do_curly;
	8576	}
	8577	else if (op == '+' && (flags&SIMPLE)) {
	8578	reginsert(pRExC_state, PLUS, ret, depth+1);
	8579	ret->flags = 0;
	8580	RExC_naughty += 3;
	8581	}
	8582	else if (op == '+') {
	8583	min = 1;
	8584	goto do_curly;
	8585	}
	8586	else if (op == '?') {
	8587	min = 0; max = 1;
	8588	goto do_curly;
	8589	}
	8590	nest_check:
	8591	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	8592	ckWARN3reg(RExC_parse,
	8593	"%.*s matches null string many times",
	8594	(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
	8595	origparse);
	8596	}
	8597
	8598	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	8599	nextchar(pRExC_state);
	8600	reginsert(pRExC_state, MINMOD, ret, depth+1);
	8601	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	8602	}
	8603	#ifndef REG_ALLOW_MINMOD_SUSPEND
	8604	else
	8605	#endif
	8606	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	8607	regnode *ender;
	8608	nextchar(pRExC_state);
	8609	ender = reg_node(pRExC_state, SUCCEED);
	8610	REGTAIL(pRExC_state, ret, ender);
	8611	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	8612	ret->flags = 0;
	8613	ender = reg_node(pRExC_state, TAIL);
	8614	REGTAIL(pRExC_state, ret, ender);
	8615	/ret= ender;/
	8616	}
	8617
	8618	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	8619	RExC_parse++;
	8620	vFAIL("Nested quantifiers");
	8621	}
	8622
	8623	return(ret);
	8624	}
	8625
	8626
	8627	/* reg_namedseq(pRExC_state,UVp, UV depth)
	8628
	8629	This is expected to be called by a parser routine that has
	8630	recognized '\N' and needs to handle the rest. RExC_parse is
	8631	expected to point at the first char following the N at the time
	8632	of the call.
	8633
	8634	The \N may be inside (indicated by valuep not being NULL) or outside a
	8635	character class.
	8636
	8637	\N may begin either a named sequence, or if outside a character class, mean
	8638	to match a non-newline. For non single-quoted regexes, the tokenizer has
	8639	attempted to decide which, and in the case of a named sequence converted it
	8640	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	8641	where c1... are the characters in the sequence. For single-quoted regexes,
	8642	the tokenizer passes the \N sequence through unchanged; this code will not
	8643	attempt to determine this nor expand those. The net effect is that if the
	8644	beginning of the passed-in pattern isn't '{U+' or there is no '}', it
	8645	signals that this \N occurrence means to match a non-newline.
	8646
	8647	Only the \N{U+...} form should occur in a character class, for the same
	8648	reason that '.' inside a character class means to just match a period: it
	8649	just doesn't make sense.
	8650
	8651	If valuep is non-null then it is assumed that we are parsing inside
	8652	of a charclass definition and the first codepoint in the resolved
	8653	string is returned via *valuep and the routine will return NULL.
	8654	In this mode if a multichar string is returned from the charnames
	8655	handler, a warning will be issued, and only the first char in the
	8656	sequence will be examined. If the string returned is zero length
	8657	then the value of *valuep is undefined and NON-NULL will
	8658	be returned to indicate failure. (This will NOT be a valid pointer
	8659	to a regnode.)
	8660
	8661	If valuep is null then it is assumed that we are parsing normal text and a
	8662	new EXACT node is inserted into the program containing the resolved string,
	8663	and a pointer to the new node is returned. But if the string is zero length
	8664	a NOTHING node is emitted instead.
	8665
	8666	On success RExC_parse is set to the char following the endbrace.
	8667	Parsing failures will generate a fatal error via vFAIL(...)
	8668	*/
	8669	STATIC regnode *
	8670	S_reg_namedseq(pTHX_ RExC_state_t pRExC_state, UV valuep, I32 *flagp, U32 depth)
	8671	{
	8672	char * endbrace; /* '}' following the name */
	8673	regnode *ret = NULL;
	8674	char* p;
	8675
	8676	GET_RE_DEBUG_FLAGS_DECL;
	8677
	8678	PERL_ARGS_ASSERT_REG_NAMEDSEQ;
	8679
	8680	GET_RE_DEBUG_FLAGS;
	8681
	8682	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	8683	* modifier. The other meaning does not */
	8684	p = (RExC_flags & RXf_PMf_EXTENDED)
	8685	? regwhite( pRExC_state, RExC_parse )
	8686	: RExC_parse;
	8687
	8688	/* Disambiguate between \N meaning a named character versus \N meaning
	8689	* [^\n]. The former is assumed when it can't be the latter. */
	8690	if (*p != '{' \|\| regcurly(p)) {
	8691	RExC_parse = p;
	8692	if (valuep) {
	8693	/* no bare \N in a charclass */
	8694	vFAIL("\\N in a character class must be a named character: \\N{...}");
	8695	}
	8696	nextchar(pRExC_state);
	8697	ret = reg_node(pRExC_state, REG_ANY);
	8698	*flagp \|= HASWIDTH\|SIMPLE;
	8699	RExC_naughty++;
	8700	RExC_parse--;
	8701	Set_Node_Length(ret, 1); /* MJD */
	8702	return ret;
	8703	}
	8704
	8705	/* Here, we have decided it should be a named sequence */
	8706
	8707	/* The test above made sure that the next real character is a '{', but
	8708	* under the /x modifier, it could be separated by space (or a comment and
	8709	* \n) and this is not allowed (for consistency with \x{...} and the
	8710	* tokenizer handling of \N{NAME}). */
	8711	if (*RExC_parse != '{') {
	8712	vFAIL("Missing braces on \\N{}");
	8713	}
	8714
	8715	RExC_parse++; /* Skip past the '{' */
	8716
	8717	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	8718	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	8719	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
	8720	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
	8721	{
	8722	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	8723	vFAIL("\\N{NAME} must be resolved by the lexer");
	8724	}
	8725
	8726	if (endbrace == RExC_parse) { /* empty: \N{} */
	8727	if (! valuep) {
	8728	RExC_parse = endbrace + 1;
	8729	return reg_node(pRExC_state,NOTHING);
	8730	}
	8731
	8732	if (SIZE_ONLY) {
	8733	ckWARNreg(RExC_parse,
	8734	"Ignoring zero length \\N{} in character class"
	8735	);
	8736	RExC_parse = endbrace + 1;
	8737	}
	8738	*valuep = 0;
	8739	return (regnode ) &RExC_parse; / Invalid regnode pointer */
	8740	}
	8741
	8742	REQUIRE_UTF8; /* named sequences imply Unicode semantics */
	8743	RExC_parse += 2; /* Skip past the 'U+' */
	8744
	8745	if (valuep) { /* In a bracketed char class */
	8746	/* We only pay attention to the first char of
	8747	multichar strings being returned. I kinda wonder
	8748	if this makes sense as it does change the behaviour
	8749	from earlier versions, OTOH that behaviour was broken
	8750	as well. XXX Solution is to recharacterize as
	8751	[rest-of-class]\|multi1\|multi2... */
	8752
	8753	STRLEN length_of_hex;
	8754	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	8755	\| PERL_SCAN_DISALLOW_PREFIX
	8756	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	8757
	8758	char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
	8759	if (endchar < endbrace) {
	8760	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	8761	}
	8762
	8763	length_of_hex = (STRLEN)(endchar - RExC_parse);
	8764	*valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
	8765
	8766	/* The tokenizer should have guaranteed validity, but it's possible to
	8767	* bypass it by using single quoting, so check */
	8768	if (length_of_hex == 0
	8769	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	8770	{
	8771	RExC_parse += length_of_hex; /* Includes all the valid */
	8772	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	8773	? UTF8SKIP(RExC_parse)
	8774	: 1;
	8775	/* Guard against malformed utf8 */
	8776	if (RExC_parse >= endchar) RExC_parse = endchar;
	8777	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	8778	}
	8779
	8780	RExC_parse = endbrace + 1;
	8781	if (endchar == endbrace) return NULL;
	8782
	8783	ret = (regnode ) &RExC_parse; / Invalid regnode pointer */
	8784	}
	8785	else { /* Not a char class */
	8786
	8787	/* What is done here is to convert this to a sub-pattern of the form
	8788	* (?:\x{char1}\x{char2}...)
	8789	* and then call reg recursively. That way, it retains its atomicness,
	8790	* while not having to worry about special handling that some code
	8791	* points may have. toke.c has converted the original Unicode values
	8792	* to native, so that we can just pass on the hex values unchanged. We
	8793	* do have to set a flag to keep recoding from happening in the
	8794	* recursion */
	8795
	8796	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8\|SVs_TEMP);
	8797	STRLEN len;
	8798	char endchar; / Points to '.' or '}' ending cur char in the input
	8799	stream */
	8800	char *orig_end = RExC_end;
	8801
	8802	while (RExC_parse < endbrace) {
	8803
	8804	/* Code points are separated by dots. If none, there is only one
	8805	* code point, and is terminated by the brace */
	8806	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	8807
	8808	/* Convert to notation the rest of the code understands */
	8809	sv_catpv(substitute_parse, "\\x{");
	8810	sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
	8811	sv_catpv(substitute_parse, "}");
	8812
	8813	/* Point to the beginning of the next character in the sequence. */
	8814	RExC_parse = endchar + 1;
	8815	}
	8816	sv_catpv(substitute_parse, ")");
	8817
	8818	RExC_parse = SvPV(substitute_parse, len);
	8819
	8820	/* Don't allow empty number */
	8821	if (len < 8) {
	8822	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	8823	}
	8824	RExC_end = RExC_parse + len;
	8825
	8826	/* The values are Unicode, and therefore not subject to recoding */
	8827	RExC_override_recoding = 1;
	8828
	8829	ret = reg(pRExC_state, 1, flagp, depth+1);
	8830
	8831	RExC_parse = endbrace;
	8832	RExC_end = orig_end;
	8833	RExC_override_recoding = 0;
	8834
	8835	nextchar(pRExC_state);
	8836	}
	8837
	8838	return ret;
	8839	}
	8840
	8841
	8842	/*
	8843	* reg_recode
	8844	*
	8845	* It returns the code point in utf8 for the value in *encp.
	8846	* value: a code value in the source encoding
	8847	* encp: a pointer to an Encode object
	8848	*
	8849	* If the result from Encode is not a single character,
	8850	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	8851	*/
	8852	STATIC UV
	8853	S_reg_recode(pTHX_ const char value, SV **encp)
	8854	{
	8855	STRLEN numlen = 1;
	8856	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	8857	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	8858	const STRLEN newlen = SvCUR(sv);
	8859	UV uv = UNICODE_REPLACEMENT;
	8860
	8861	PERL_ARGS_ASSERT_REG_RECODE;
	8862
	8863	if (newlen)
	8864	uv = SvUTF8(sv)
	8865	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	8866	: (U8)s;
	8867
	8868	if (!newlen \|\| numlen != newlen) {
	8869	uv = UNICODE_REPLACEMENT;
	8870	*encp = NULL;
	8871	}
	8872	return uv;
	8873	}
	8874
	8875
	8876	/*
	8877	- regatom - the lowest level
	8878
	8879	Try to identify anything special at the start of the pattern. If there
	8880	is, then handle it as required. This may involve generating a single regop,
	8881	such as for an assertion; or it may involve recursing, such as to
	8882	handle a () structure.
	8883
	8884	If the string doesn't start with something special then we gobble up
	8885	as much literal text as we can.
	8886
	8887	Once we have been able to handle whatever type of thing started the
	8888	sequence, we return.
	8889
	8890	Note: we have to be careful with escapes, as they can be both literal
	8891	and special, and in the case of \10 and friends can either, depending
	8892	on context. Specifically there are two separate switches for handling
	8893	escape sequences, with the one for handling literal escapes requiring
	8894	a dummy entry for all of the special escapes that are actually handled
	8895	by the other.
	8896	*/
	8897
	8898	STATIC regnode *
	8899	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	8900	{
	8901	dVAR;
	8902	register regnode *ret = NULL;
	8903	I32 flags;
	8904	char *parse_start = RExC_parse;
	8905	U8 op;
	8906	GET_RE_DEBUG_FLAGS_DECL;
	8907	DEBUG_PARSE("atom");
	8908	flagp = WORST; / Tentatively. */
	8909
	8910	PERL_ARGS_ASSERT_REGATOM;
	8911
	8912	tryagain:
	8913	switch ((U8)*RExC_parse) {
	8914	case '^':
	8915	RExC_seen_zerolen++;
	8916	nextchar(pRExC_state);
	8917	if (RExC_flags & RXf_PMf_MULTILINE)
	8918	ret = reg_node(pRExC_state, MBOL);
	8919	else if (RExC_flags & RXf_PMf_SINGLELINE)
	8920	ret = reg_node(pRExC_state, SBOL);
	8921	else
	8922	ret = reg_node(pRExC_state, BOL);
	8923	Set_Node_Length(ret, 1); /* MJD */
	8924	break;
	8925	case '$':
	8926	nextchar(pRExC_state);
	8927	if (*RExC_parse)
	8928	RExC_seen_zerolen++;
	8929	if (RExC_flags & RXf_PMf_MULTILINE)
	8930	ret = reg_node(pRExC_state, MEOL);
	8931	else if (RExC_flags & RXf_PMf_SINGLELINE)
	8932	ret = reg_node(pRExC_state, SEOL);
	8933	else
	8934	ret = reg_node(pRExC_state, EOL);
	8935	Set_Node_Length(ret, 1); /* MJD */
	8936	break;
	8937	case '.':
	8938	nextchar(pRExC_state);
	8939	if (RExC_flags & RXf_PMf_SINGLELINE)
	8940	ret = reg_node(pRExC_state, SANY);
	8941	else
	8942	ret = reg_node(pRExC_state, REG_ANY);
	8943	*flagp \|= HASWIDTH\|SIMPLE;
	8944	RExC_naughty++;
	8945	Set_Node_Length(ret, 1); /* MJD */
	8946	break;
	8947	case '[':
	8948	{
	8949	char * const oregcomp_parse = ++RExC_parse;
	8950	ret = regclass(pRExC_state,depth+1);
	8951	if (*RExC_parse != ']') {
	8952	RExC_parse = oregcomp_parse;
	8953	vFAIL("Unmatched [");
	8954	}
	8955	nextchar(pRExC_state);
	8956	*flagp \|= HASWIDTH\|SIMPLE;
	8957	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	8958	break;
	8959	}
	8960	case '(':
	8961	nextchar(pRExC_state);
	8962	ret = reg(pRExC_state, 1, &flags,depth+1);
	8963	if (ret == NULL) {
	8964	if (flags & TRYAGAIN) {
	8965	if (RExC_parse == RExC_end) {
	8966	/* Make parent create an empty node if needed. */
	8967	*flagp \|= TRYAGAIN;
	8968	return(NULL);
	8969	}
	8970	goto tryagain;
	8971	}
	8972	return(NULL);
	8973	}
	8974	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	8975	break;
	8976	case '\|':
	8977	case ')':
	8978	if (flags & TRYAGAIN) {
	8979	*flagp \|= TRYAGAIN;
	8980	return NULL;
	8981	}
	8982	vFAIL("Internal urp");
	8983	/* Supposed to be caught earlier. */
	8984	break;
	8985	case '{':
	8986	if (!regcurly(RExC_parse)) {
	8987	RExC_parse++;
	8988	goto defchar;
	8989	}
	8990	/* FALL THROUGH */
	8991	case '?':
	8992	case '+':
	8993	case '*':
	8994	RExC_parse++;
	8995	vFAIL("Quantifier follows nothing");
	8996	break;
	8997	case '\\':
	8998	/* Special Escapes
	8999
	9000	This switch handles escape sequences that resolve to some kind
	9001	of special regop and not to literal text. Escape sequnces that
	9002	resolve to literal text are handled below in the switch marked
	9003	"Literal Escapes".
	9004
	9005	Every entry in this switch must have a corresponding entry
	9006	in the literal escape switch. However, the opposite is not
	9007	required, as the default for this switch is to jump to the
	9008	literal text handling code.
	9009	*/
	9010	switch ((U8)*++RExC_parse) {
	9011	/* Special Escapes */
	9012	case 'A':
	9013	RExC_seen_zerolen++;
	9014	ret = reg_node(pRExC_state, SBOL);
	9015	*flagp \|= SIMPLE;
	9016	goto finish_meta_pat;
	9017	case 'G':
	9018	ret = reg_node(pRExC_state, GPOS);
	9019	RExC_seen \|= REG_SEEN_GPOS;
	9020	*flagp \|= SIMPLE;
	9021	goto finish_meta_pat;
	9022	case 'K':
	9023	RExC_seen_zerolen++;
	9024	ret = reg_node(pRExC_state, KEEPS);
	9025	*flagp \|= SIMPLE;
	9026	/* XXX:dmq : disabling in-place substitution seems to
	9027	* be necessary here to avoid cases of memory corruption, as
	9028	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	9029	*/
	9030	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	9031	goto finish_meta_pat;
	9032	case 'Z':
	9033	ret = reg_node(pRExC_state, SEOL);
	9034	*flagp \|= SIMPLE;
	9035	RExC_seen_zerolen++; /* Do not optimize RE away */
	9036	goto finish_meta_pat;
	9037	case 'z':
	9038	ret = reg_node(pRExC_state, EOS);
	9039	*flagp \|= SIMPLE;
	9040	RExC_seen_zerolen++; /* Do not optimize RE away */
	9041	goto finish_meta_pat;
	9042	case 'C':
	9043	ret = reg_node(pRExC_state, CANY);
	9044	RExC_seen \|= REG_SEEN_CANY;
	9045	*flagp \|= HASWIDTH\|SIMPLE;
	9046	goto finish_meta_pat;
	9047	case 'X':
	9048	ret = reg_node(pRExC_state, CLUMP);
	9049	*flagp \|= HASWIDTH;
	9050	goto finish_meta_pat;
	9051	case 'w':
	9052	switch (get_regex_charset(RExC_flags)) {
	9053	case REGEX_LOCALE_CHARSET:
	9054	op = ALNUML;
	9055	break;
	9056	case REGEX_UNICODE_CHARSET:
	9057	op = ALNUMU;
	9058	break;
	9059	case REGEX_ASCII_RESTRICTED_CHARSET:
	9060	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9061	op = ALNUMA;
	9062	break;
	9063	case REGEX_DEPENDS_CHARSET:
	9064	op = ALNUM;
	9065	break;
	9066	default:
	9067	goto bad_charset;
	9068	}
	9069	ret = reg_node(pRExC_state, op);
	9070	*flagp \|= HASWIDTH\|SIMPLE;
	9071	goto finish_meta_pat;
	9072	case 'W':
	9073	switch (get_regex_charset(RExC_flags)) {
	9074	case REGEX_LOCALE_CHARSET:
	9075	op = NALNUML;
	9076	break;
	9077	case REGEX_UNICODE_CHARSET:
	9078	op = NALNUMU;
	9079	break;
	9080	case REGEX_ASCII_RESTRICTED_CHARSET:
	9081	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9082	op = NALNUMA;
	9083	break;
	9084	case REGEX_DEPENDS_CHARSET:
	9085	op = NALNUM;
	9086	break;
	9087	default:
	9088	goto bad_charset;
	9089	}
	9090	ret = reg_node(pRExC_state, op);
	9091	*flagp \|= HASWIDTH\|SIMPLE;
	9092	goto finish_meta_pat;
	9093	case 'b':
	9094	RExC_seen_zerolen++;
	9095	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	9096	switch (get_regex_charset(RExC_flags)) {
	9097	case REGEX_LOCALE_CHARSET:
	9098	op = BOUNDL;
	9099	break;
	9100	case REGEX_UNICODE_CHARSET:
	9101	op = BOUNDU;
	9102	break;
	9103	case REGEX_ASCII_RESTRICTED_CHARSET:
	9104	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9105	op = BOUNDA;
	9106	break;
	9107	case REGEX_DEPENDS_CHARSET:
	9108	op = BOUND;
	9109	break;
	9110	default:
	9111	goto bad_charset;
	9112	}
	9113	ret = reg_node(pRExC_state, op);
	9114	FLAGS(ret) = get_regex_charset(RExC_flags);
	9115	*flagp \|= SIMPLE;
	9116	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	9117	ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
	9118	}
	9119	goto finish_meta_pat;
	9120	case 'B':
	9121	RExC_seen_zerolen++;
	9122	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	9123	switch (get_regex_charset(RExC_flags)) {
	9124	case REGEX_LOCALE_CHARSET:
	9125	op = NBOUNDL;
	9126	break;
	9127	case REGEX_UNICODE_CHARSET:
	9128	op = NBOUNDU;
	9129	break;
	9130	case REGEX_ASCII_RESTRICTED_CHARSET:
	9131	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9132	op = NBOUNDA;
	9133	break;
	9134	case REGEX_DEPENDS_CHARSET:
	9135	op = NBOUND;
	9136	break;
	9137	default:
	9138	goto bad_charset;
	9139	}
	9140	ret = reg_node(pRExC_state, op);
	9141	FLAGS(ret) = get_regex_charset(RExC_flags);
	9142	*flagp \|= SIMPLE;
	9143	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	9144	ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
	9145	}
	9146	goto finish_meta_pat;
	9147	case 's':
	9148	switch (get_regex_charset(RExC_flags)) {
	9149	case REGEX_LOCALE_CHARSET:
	9150	op = SPACEL;
	9151	break;
	9152	case REGEX_UNICODE_CHARSET:
	9153	op = SPACEU;
	9154	break;
	9155	case REGEX_ASCII_RESTRICTED_CHARSET:
	9156	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9157	op = SPACEA;
	9158	break;
	9159	case REGEX_DEPENDS_CHARSET:
	9160	op = SPACE;
	9161	break;
	9162	default:
	9163	goto bad_charset;
	9164	}
	9165	ret = reg_node(pRExC_state, op);
	9166	*flagp \|= HASWIDTH\|SIMPLE;
	9167	goto finish_meta_pat;
	9168	case 'S':
	9169	switch (get_regex_charset(RExC_flags)) {
	9170	case REGEX_LOCALE_CHARSET:
	9171	op = NSPACEL;
	9172	break;
	9173	case REGEX_UNICODE_CHARSET:
	9174	op = NSPACEU;
	9175	break;
	9176	case REGEX_ASCII_RESTRICTED_CHARSET:
	9177	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9178	op = NSPACEA;
	9179	break;
	9180	case REGEX_DEPENDS_CHARSET:
	9181	op = NSPACE;
	9182	break;
	9183	default:
	9184	goto bad_charset;
	9185	}
	9186	ret = reg_node(pRExC_state, op);
	9187	*flagp \|= HASWIDTH\|SIMPLE;
	9188	goto finish_meta_pat;
	9189	case 'd':
	9190	switch (get_regex_charset(RExC_flags)) {
	9191	case REGEX_LOCALE_CHARSET:
	9192	op = DIGITL;
	9193	break;
	9194	case REGEX_ASCII_RESTRICTED_CHARSET:
	9195	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9196	op = DIGITA;
	9197	break;
	9198	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	9199	case REGEX_UNICODE_CHARSET:
	9200	op = DIGIT;
	9201	break;
	9202	default:
	9203	goto bad_charset;
	9204	}
	9205	ret = reg_node(pRExC_state, op);
	9206	*flagp \|= HASWIDTH\|SIMPLE;
	9207	goto finish_meta_pat;
	9208	case 'D':
	9209	switch (get_regex_charset(RExC_flags)) {
	9210	case REGEX_LOCALE_CHARSET:
	9211	op = NDIGITL;
	9212	break;
	9213	case REGEX_ASCII_RESTRICTED_CHARSET:
	9214	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9215	op = NDIGITA;
	9216	break;
	9217	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	9218	case REGEX_UNICODE_CHARSET:
	9219	op = NDIGIT;
	9220	break;
	9221	default:
	9222	goto bad_charset;
	9223	}
	9224	ret = reg_node(pRExC_state, op);
	9225	*flagp \|= HASWIDTH\|SIMPLE;
	9226	goto finish_meta_pat;
	9227	case 'R':
	9228	ret = reg_node(pRExC_state, LNBREAK);
	9229	*flagp \|= HASWIDTH\|SIMPLE;
	9230	goto finish_meta_pat;
	9231	case 'h':
	9232	ret = reg_node(pRExC_state, HORIZWS);
	9233	*flagp \|= HASWIDTH\|SIMPLE;
	9234	goto finish_meta_pat;
	9235	case 'H':
	9236	ret = reg_node(pRExC_state, NHORIZWS);
	9237	*flagp \|= HASWIDTH\|SIMPLE;
	9238	goto finish_meta_pat;
	9239	case 'v':
	9240	ret = reg_node(pRExC_state, VERTWS);
	9241	*flagp \|= HASWIDTH\|SIMPLE;
	9242	goto finish_meta_pat;
	9243	case 'V':
	9244	ret = reg_node(pRExC_state, NVERTWS);
	9245	*flagp \|= HASWIDTH\|SIMPLE;
	9246	finish_meta_pat:
	9247	nextchar(pRExC_state);
	9248	Set_Node_Length(ret, 2); /* MJD */
	9249	break;
	9250	case 'p':
	9251	case 'P':
	9252	{
	9253	char* const oldregxend = RExC_end;
	9254	#ifdef DEBUGGING
	9255	char* parse_start = RExC_parse - 2;
	9256	#endif
	9257
	9258	if (RExC_parse[1] == '{') {
	9259	/* a lovely hack--pretend we saw [\pX] instead */
	9260	RExC_end = strchr(RExC_parse, '}');
	9261	if (!RExC_end) {
	9262	const U8 c = (U8)*RExC_parse;
	9263	RExC_parse += 2;
	9264	RExC_end = oldregxend;
	9265	vFAIL2("Missing right brace on \\%c{}", c);
	9266	}
	9267	RExC_end++;
	9268	}
	9269	else {
	9270	RExC_end = RExC_parse + 2;
	9271	if (RExC_end > oldregxend)
	9272	RExC_end = oldregxend;
	9273	}
	9274	RExC_parse--;
	9275
	9276	ret = regclass(pRExC_state,depth+1);
	9277
	9278	RExC_end = oldregxend;
	9279	RExC_parse--;
	9280
	9281	Set_Node_Offset(ret, parse_start + 2);
	9282	Set_Node_Cur_Length(ret);
	9283	nextchar(pRExC_state);
	9284	*flagp \|= HASWIDTH\|SIMPLE;
	9285	}
	9286	break;
	9287	case 'N':
	9288	/* Handle \N and \N{NAME} here and not below because it can be
	9289	multicharacter. join_exact() will join them up later on.
	9290	Also this makes sure that things like /\N{BLAH}+/ and
	9291	\N{BLAH} being multi char Just Happen. dmq*/
	9292	++RExC_parse;
	9293	ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
	9294	break;
	9295	case 'k': /* Handle \k<NAME> and \k'NAME' */
	9296	parse_named_seq:
	9297	{
	9298	char ch= RExC_parse[1];
	9299	if (ch != '<' && ch != '\'' && ch != '{') {
	9300	RExC_parse++;
	9301	vFAIL2("Sequence %.2s... not terminated",parse_start);
	9302	} else {
	9303	/* this pretty much dupes the code for (?P=...) in reg(), if
	9304	you change this make sure you change that */
	9305	char* name_start = (RExC_parse += 2);
	9306	U32 num = 0;
	9307	SV *sv_dat = reg_scan_name(pRExC_state,
	9308	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	9309	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	9310	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	9311	vFAIL2("Sequence %.3s... not terminated",parse_start);
	9312
	9313	if (!SIZE_ONLY) {
	9314	num = add_data( pRExC_state, 1, "S" );
	9315	RExC_rxi->data->data[num]=(void*)sv_dat;
	9316	SvREFCNT_inc_simple_void(sv_dat);
	9317	}
	9318
	9319	RExC_sawback = 1;
	9320	ret = reganode(pRExC_state,
	9321	((! FOLD)
	9322	? NREF
	9323	: (MORE_ASCII_RESTRICTED)
	9324	? NREFFA
	9325	: (AT_LEAST_UNI_SEMANTICS)
	9326	? NREFFU
	9327	: (LOC)
	9328	? NREFFL
	9329	: NREFF),
	9330	num);
	9331	*flagp \|= HASWIDTH;
	9332
	9333	/* override incorrect value set in reganode MJD */
	9334	Set_Node_Offset(ret, parse_start+1);
	9335	Set_Node_Cur_Length(ret); /* MJD */
	9336	nextchar(pRExC_state);
	9337
	9338	}
	9339	break;
	9340	}
	9341	case 'g':
	9342	case '1': case '2': case '3': case '4':
	9343	case '5': case '6': case '7': case '8': case '9':
	9344	{
	9345	I32 num;
	9346	bool isg = *RExC_parse == 'g';
	9347	bool isrel = 0;
	9348	bool hasbrace = 0;
	9349	if (isg) {
	9350	RExC_parse++;
	9351	if (*RExC_parse == '{') {
	9352	RExC_parse++;
	9353	hasbrace = 1;
	9354	}
	9355	if (*RExC_parse == '-') {
	9356	RExC_parse++;
	9357	isrel = 1;
	9358	}
	9359	if (hasbrace && !isDIGIT(*RExC_parse)) {
	9360	if (isrel) RExC_parse--;
	9361	RExC_parse -= 2;
	9362	goto parse_named_seq;
	9363	} }
	9364	num = atoi(RExC_parse);
	9365	if (isg && num == 0)
	9366	vFAIL("Reference to invalid group 0");
	9367	if (isrel) {
	9368	num = RExC_npar - num;
	9369	if (num < 1)
	9370	vFAIL("Reference to nonexistent or unclosed group");
	9371	}
	9372	if (!isg && num > 9 && num >= RExC_npar)
	9373	goto defchar;
	9374	else {
	9375	char * const parse_start = RExC_parse - 1; /* MJD */
	9376	while (isDIGIT(*RExC_parse))
	9377	RExC_parse++;
	9378	if (parse_start == RExC_parse - 1)
	9379	vFAIL("Unterminated \\g... pattern");
	9380	if (hasbrace) {
	9381	if (*RExC_parse != '}')
	9382	vFAIL("Unterminated \\g{...} pattern");
	9383	RExC_parse++;
	9384	}
	9385	if (!SIZE_ONLY) {
	9386	if (num > (I32)RExC_rx->nparens)
	9387	vFAIL("Reference to nonexistent group");
	9388	}
	9389	RExC_sawback = 1;
	9390	ret = reganode(pRExC_state,
	9391	((! FOLD)
	9392	? REF
	9393	: (MORE_ASCII_RESTRICTED)
	9394	? REFFA
	9395	: (AT_LEAST_UNI_SEMANTICS)
	9396	? REFFU
	9397	: (LOC)
	9398	? REFFL
	9399	: REFF),
	9400	num);
	9401	*flagp \|= HASWIDTH;
	9402
	9403	/* override incorrect value set in reganode MJD */
	9404	Set_Node_Offset(ret, parse_start+1);
	9405	Set_Node_Cur_Length(ret); /* MJD */
	9406	RExC_parse--;
	9407	nextchar(pRExC_state);
	9408	}
	9409	}
	9410	break;
	9411	case '\0':
	9412	if (RExC_parse >= RExC_end)
	9413	FAIL("Trailing \\");
	9414	/* FALL THROUGH */
	9415	default:
	9416	/* Do not generate "unrecognized" warnings here, we fall
	9417	back into the quick-grab loop below */
	9418	parse_start--;
	9419	goto defchar;
	9420	}
	9421	break;
	9422
	9423	case '#':
	9424	if (RExC_flags & RXf_PMf_EXTENDED) {
	9425	if ( reg_skipcomment( pRExC_state ) )
	9426	goto tryagain;
	9427	}
	9428	/* FALL THROUGH */
	9429
	9430	default:
	9431
	9432	parse_start = RExC_parse - 1;
	9433
	9434	RExC_parse++;
	9435
	9436	defchar: {
	9437	register STRLEN len;
	9438	register UV ender;
	9439	register char *p;
	9440	char *s;
	9441	STRLEN foldlen;
	9442	U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
	9443	U8 node_type;
	9444
	9445	/* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
	9446	* it is folded to 'ss' even if not utf8 */
	9447	bool is_exactfu_sharp_s;
	9448
	9449	ender = 0;
	9450	node_type = ((! FOLD) ? EXACT
	9451	: (LOC)
	9452	? EXACTFL
	9453	: (MORE_ASCII_RESTRICTED)
	9454	? EXACTFA
	9455	: (AT_LEAST_UNI_SEMANTICS)
	9456	? EXACTFU
	9457	: EXACTF);
	9458	ret = reg_node(pRExC_state, node_type);
	9459	s = STRING(ret);
	9460
	9461	/* XXX The node can hold up to 255 bytes, yet this only goes to
	9462	* 127. I (khw) do not know why. Keeping it somewhat less than
	9463	* 255 allows us to not have to worry about overflow due to
	9464	* converting to utf8 and fold expansion, but that value is
	9465	* 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
	9466	* split up by this limit into a single one using the real max of
	9467	* 255. Even at 127, this breaks under rare circumstances. If
	9468	* folding, we do not want to split a node at a character that is a
	9469	* non-final in a multi-char fold, as an input string could just
	9470	* happen to want to match across the node boundary. The join
	9471	* would solve that problem if the join actually happens. But a
	9472	* series of more than two nodes in a row each of 127 would cause
	9473	* the first join to succeed to get to 254, but then there wouldn't
	9474	* be room for the next one, which could at be one of those split
	9475	* multi-char folds. I don't know of any fool-proof solution. One
	9476	* could back off to end with only a code point that isn't such a
	9477	* non-final, but it is possible for there not to be any in the
	9478	* entire node. */
	9479	for (len = 0, p = RExC_parse - 1;
	9480	len < 127 && p < RExC_end;
	9481	len++)
	9482	{
	9483	char * const oldp = p;
	9484
	9485	if (RExC_flags & RXf_PMf_EXTENDED)
	9486	p = regwhite( pRExC_state, p );
	9487	switch ((U8)*p) {
	9488	case '^':
	9489	case '$':
	9490	case '.':
	9491	case '[':
	9492	case '(':
	9493	case ')':
	9494	case '\|':
	9495	goto loopdone;
	9496	case '\\':
	9497	/* Literal Escapes Switch
	9498
	9499	This switch is meant to handle escape sequences that
	9500	resolve to a literal character.
	9501
	9502	Every escape sequence that represents something
	9503	else, like an assertion or a char class, is handled
	9504	in the switch marked 'Special Escapes' above in this
	9505	routine, but also has an entry here as anything that
	9506	isn't explicitly mentioned here will be treated as
	9507	an unescaped equivalent literal.
	9508	*/
	9509
	9510	switch ((U8)*++p) {
	9511	/* These are all the special escapes. */
	9512	case 'A': /* Start assertion */
	9513	case 'b': case 'B': /* Word-boundary assertion*/
	9514	case 'C': /* Single char !DANGEROUS! */
	9515	case 'd': case 'D': /* digit class */
	9516	case 'g': case 'G': /* generic-backref, pos assertion */
	9517	case 'h': case 'H': /* HORIZWS */
	9518	case 'k': case 'K': /* named backref, keep marker */
	9519	case 'N': /* named char sequence */
	9520	case 'p': case 'P': /* Unicode property */
	9521	case 'R': /* LNBREAK */
	9522	case 's': case 'S': /* space class */
	9523	case 'v': case 'V': /* VERTWS */
	9524	case 'w': case 'W': /* word class */
	9525	case 'X': /* eXtended Unicode "combining character sequence" */
	9526	case 'z': case 'Z': /* End of line/string assertion */
	9527	--p;
	9528	goto loopdone;
	9529
	9530	/* Anything after here is an escape that resolves to a
	9531	literal. (Except digits, which may or may not)
	9532	*/
	9533	case 'n':
	9534	ender = '\n';
	9535	p++;
	9536	break;
	9537	case 'r':
	9538	ender = '\r';
	9539	p++;
	9540	break;
	9541	case 't':
	9542	ender = '\t';
	9543	p++;
	9544	break;
	9545	case 'f':
	9546	ender = '\f';
	9547	p++;
	9548	break;
	9549	case 'e':
	9550	ender = ASCII_TO_NATIVE('\033');
	9551	p++;
	9552	break;
	9553	case 'a':
	9554	ender = ASCII_TO_NATIVE('\007');
	9555	p++;
	9556	break;
	9557	case 'o':
	9558	{
	9559	STRLEN brace_len = len;
	9560	UV result;
	9561	const char* error_msg;
	9562
	9563	bool valid = grok_bslash_o(p,
	9564	&result,
	9565	&brace_len,
	9566	&error_msg,
	9567	1);
	9568	p += brace_len;
	9569	if (! valid) {
	9570	RExC_parse = p; /* going to die anyway; point
	9571	to exact spot of failure */
	9572	vFAIL(error_msg);
	9573	}
	9574	else
	9575	{
	9576	ender = result;
	9577	}
	9578	if (PL_encoding && ender < 0x100) {
	9579	goto recode_encoding;
	9580	}
	9581	if (ender > 0xff) {
	9582	REQUIRE_UTF8;
	9583	}
	9584	break;
	9585	}
	9586	case 'x':
	9587	if (*++p == '{') {
	9588	char* const e = strchr(p, '}');
	9589
	9590	if (!e) {
	9591	RExC_parse = p + 1;
	9592	vFAIL("Missing right brace on \\x{}");
	9593	}
	9594	else {
	9595	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	9596	\| PERL_SCAN_DISALLOW_PREFIX;
	9597	STRLEN numlen = e - p - 1;
	9598	ender = grok_hex(p + 1, &numlen, &flags, NULL);
	9599	if (ender > 0xff)
	9600	REQUIRE_UTF8;
	9601	p = e + 1;
	9602	}
	9603	}
	9604	else {
	9605	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	9606	STRLEN numlen = 2;
	9607	ender = grok_hex(p, &numlen, &flags, NULL);
	9608	p += numlen;
	9609	}
	9610	if (PL_encoding && ender < 0x100)
	9611	goto recode_encoding;
	9612	break;
	9613	case 'c':
	9614	p++;
	9615	ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
	9616	break;
	9617	case '0': case '1': case '2': case '3':case '4':
	9618	case '5': case '6': case '7': case '8':case '9':
	9619	if (*p == '0' \|\|
	9620	(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
	9621	{
	9622	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	9623	STRLEN numlen = 3;
	9624	ender = grok_oct(p, &numlen, &flags, NULL);
	9625	if (ender > 0xff) {
	9626	REQUIRE_UTF8;
	9627	}
	9628	p += numlen;
	9629	}
	9630	else {
	9631	--p;
	9632	goto loopdone;
	9633	}
	9634	if (PL_encoding && ender < 0x100)
	9635	goto recode_encoding;
	9636	break;
	9637	recode_encoding:
	9638	if (! RExC_override_recoding) {
	9639	SV* enc = PL_encoding;
	9640	ender = reg_recode((const char)(U8)ender, &enc);
	9641	if (!enc && SIZE_ONLY)
	9642	ckWARNreg(p, "Invalid escape in the specified encoding");
	9643	REQUIRE_UTF8;
	9644	}
	9645	break;
	9646	case '\0':
	9647	if (p >= RExC_end)
	9648	FAIL("Trailing \\");
	9649	/* FALL THROUGH */
	9650	default:
	9651	if (!SIZE_ONLY&& isALPHA(*p)) {
	9652	/* Include any { following the alpha to emphasize
	9653	* that it could be part of an escape at some point
	9654	* in the future */
	9655	int len = (*(p + 1) == '{') ? 2 : 1;
	9656	ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
	9657	}
	9658	goto normal_default;
	9659	}
	9660	break;
	9661	default:
	9662	normal_default:
	9663	if (UTF8_IS_START(*p) && UTF) {
	9664	STRLEN numlen;
	9665	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	9666	&numlen, UTF8_ALLOW_DEFAULT);
	9667	p += numlen;
	9668	}
	9669	else
	9670	ender = (U8) *p++;
	9671	break;
	9672	} /* End of switch on the literal */
	9673
	9674	is_exactfu_sharp_s = (node_type == EXACTFU
	9675	&& ender == LATIN_SMALL_LETTER_SHARP_S);
	9676	if ( RExC_flags & RXf_PMf_EXTENDED)
	9677	p = regwhite( pRExC_state, p );
	9678	if ((UTF && FOLD) \|\| is_exactfu_sharp_s) {
	9679	/* Prime the casefolded buffer. Locale rules, which apply
	9680	* only to code points < 256, aren't known until execution,
	9681	* so for them, just output the original character using
	9682	* utf8. If we start to fold non-UTF patterns, be sure to
	9683	* update join_exact() */
	9684	if (LOC && ender < 256) {
	9685	if (UNI_IS_INVARIANT(ender)) {
	9686	*tmpbuf = (U8) ender;
	9687	foldlen = 1;
	9688	} else {
	9689	*tmpbuf = UTF8_TWO_BYTE_HI(ender);
	9690	*(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
	9691	foldlen = 2;
	9692	}
	9693	}
	9694	else if (isASCII(ender)) { /* Note: Here can't also be LOC
	9695	*/
	9696	ender = toLOWER(ender);
	9697	*tmpbuf = (U8) ender;
	9698	foldlen = 1;
	9699	}
	9700	else if (! MORE_ASCII_RESTRICTED && ! LOC) {
	9701
	9702	/* Locale and /aa require more selectivity about the
	9703	* fold, so are handled below. Otherwise, here, just
	9704	* use the fold */
	9705	ender = toFOLD_uni(ender, tmpbuf, &foldlen);
	9706	}
	9707	else {
	9708	/* Under locale rules or /aa we are not to mix,
	9709	* respectively, ords < 256 or ASCII with non-. So
	9710	* reject folds that mix them, using only the
	9711	* non-folded code point. So do the fold to a
	9712	* temporary, and inspect each character in it. */
	9713	U8 trialbuf[UTF8_MAXBYTES_CASE+1];
	9714	U8* s = trialbuf;
	9715	UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
	9716	U8* e = s + foldlen;
	9717	bool fold_ok = TRUE;
	9718
	9719	while (s < e) {
	9720	if (isASCII(*s)
	9721	\|\| (LOC && (UTF8_IS_INVARIANT(*s)
	9722	\|\| UTF8_IS_DOWNGRADEABLE_START(*s))))
	9723	{
	9724	fold_ok = FALSE;
	9725	break;
	9726	}
	9727	s += UTF8SKIP(s);
	9728	}
	9729	if (fold_ok) {
	9730	Copy(trialbuf, tmpbuf, foldlen, U8);
	9731	ender = tmpender;
	9732	}
	9733	else {
	9734	uvuni_to_utf8(tmpbuf, ender);
	9735	foldlen = UNISKIP(ender);
	9736	}
	9737	}
	9738	}
	9739	if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+. /
	9740	if (len)
	9741	p = oldp;
	9742	else if (UTF \|\| is_exactfu_sharp_s) {
	9743	if (FOLD) {
	9744	/* Emit all the Unicode characters. */
	9745	STRLEN numlen;
	9746	for (foldbuf = tmpbuf;
	9747	foldlen;
	9748	foldlen -= numlen) {
	9749	ender = utf8_to_uvchr(foldbuf, &numlen);
	9750	if (numlen > 0) {
	9751	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9752	s += unilen;
	9753	len += unilen;
	9754	/* In EBCDIC the numlen
	9755	* and unilen can differ. */
	9756	foldbuf += numlen;
	9757	if (numlen >= foldlen)
	9758	break;
	9759	}
	9760	else
	9761	break; /* "Can't happen." */
	9762	}
	9763	}
	9764	else {
	9765	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9766	if (unilen > 0) {
	9767	s += unilen;
	9768	len += unilen;
	9769	}
	9770	}
	9771	}
	9772	else {
	9773	len++;
	9774	REGC((char)ender, s++);
	9775	}
	9776	break;
	9777	}
	9778	if (UTF \|\| is_exactfu_sharp_s) {
	9779	if (FOLD) {
	9780	/* Emit all the Unicode characters. */
	9781	STRLEN numlen;
	9782	for (foldbuf = tmpbuf;
	9783	foldlen;
	9784	foldlen -= numlen) {
	9785	ender = utf8_to_uvchr(foldbuf, &numlen);
	9786	if (numlen > 0) {
	9787	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9788	len += unilen;
	9789	s += unilen;
	9790	/* In EBCDIC the numlen
	9791	* and unilen can differ. */
	9792	foldbuf += numlen;
	9793	if (numlen >= foldlen)
	9794	break;
	9795	}
	9796	else
	9797	break;
	9798	}
	9799	}
	9800	else {
	9801	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9802	if (unilen > 0) {
	9803	s += unilen;
	9804	len += unilen;
	9805	}
	9806	}
	9807	len--;
	9808	}
	9809	else {
	9810	REGC((char)ender, s++);
	9811	}
	9812	}
	9813	loopdone: /* Jumped to when encounters something that shouldn't be in
	9814	the node */
	9815	RExC_parse = p - 1;
	9816	Set_Node_Cur_Length(ret); /* MJD */
	9817	nextchar(pRExC_state);
	9818	{
	9819	/* len is STRLEN which is unsigned, need to copy to signed */
	9820	IV iv = len;
	9821	if (iv < 0)
	9822	vFAIL("Internal disaster");
	9823	}
	9824	if (len > 0)
	9825	*flagp \|= HASWIDTH;
	9826	if (len == 1 && UNI_IS_INVARIANT(ender))
	9827	*flagp \|= SIMPLE;
	9828
	9829	if (SIZE_ONLY)
	9830	RExC_size += STR_SZ(len);
	9831	else {
	9832	STR_LEN(ret) = len;
	9833	RExC_emit += STR_SZ(len);
	9834	}
	9835	}
	9836	break;
	9837	}
	9838
	9839	return(ret);
	9840
	9841	/* Jumped to when an unrecognized character set is encountered */
	9842	bad_charset:
	9843	Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
	9844	return(NULL);
	9845	}
	9846
	9847	STATIC char *
	9848	S_regwhite( RExC_state_t pRExC_state, char p )
	9849	{
	9850	const char *e = RExC_end;
	9851
	9852	PERL_ARGS_ASSERT_REGWHITE;
	9853
	9854	while (p < e) {
	9855	if (isSPACE(*p))
	9856	++p;
	9857	else if (*p == '#') {
	9858	bool ended = 0;
	9859	do {
	9860	if (*p++ == '\n') {
	9861	ended = 1;
	9862	break;
	9863	}
	9864	} while (p < e);
	9865	if (!ended)
	9866	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	9867	}
	9868	else
	9869	break;
	9870	}
	9871	return p;
	9872	}
	9873
	9874	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	9875	Character classes ([:foo:]) can also be negated ([:^foo:]).
	9876	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	9877	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	9878	but trigger failures because they are currently unimplemented. */
	9879
	9880	#define POSIXCC_DONE(c) ((c) == ':')
	9881	#define POSIXCC_NOTYET(c) ((c) == '=' \|\| (c) == '.')
	9882	#define POSIXCC(c) (POSIXCC_DONE(c) \|\| POSIXCC_NOTYET(c))
	9883
	9884	STATIC I32
	9885	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
	9886	{
	9887	dVAR;
	9888	I32 namedclass = OOB_NAMEDCLASS;
	9889
	9890	PERL_ARGS_ASSERT_REGPPOSIXCC;
	9891
	9892	if (value == '[' && RExC_parse + 1 < RExC_end &&
	9893	/* I smell either [: or [= or [. -- POSIX has been here, right? */
	9894	POSIXCC(UCHARAT(RExC_parse))) {
	9895	const char c = UCHARAT(RExC_parse);
	9896	char* const s = RExC_parse++;
	9897
	9898	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	9899	RExC_parse++;
	9900	if (RExC_parse == RExC_end)
	9901	/* Grandfather lone [:, [=, [. */
	9902	RExC_parse = s;
	9903	else {
	9904	const char* const t = RExC_parse++; /* skip over the c */
	9905	assert(*t == c);
	9906
	9907	if (UCHARAT(RExC_parse) == ']') {
	9908	const char *posixcc = s + 1;
	9909	RExC_parse++; /* skip over the ending ] */
	9910
	9911	if (*s == ':') {
	9912	const I32 complement = posixcc == '^' ? posixcc++ : 0;
	9913	const I32 skip = t - posixcc;
	9914
	9915	/* Initially switch on the length of the name. */
	9916	switch (skip) {
	9917	case 4:
	9918	if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
	9919	namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
	9920	break;
	9921	case 5:
	9922	/* Names all of length 5. */
	9923	/* alnum alpha ascii blank cntrl digit graph lower
	9924	print punct space upper */
	9925	/* Offset 4 gives the best switch position. */
	9926	switch (posixcc[4]) {
	9927	case 'a':
	9928	if (memEQ(posixcc, "alph", 4)) /* alpha */
	9929	namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
	9930	break;
	9931	case 'e':
	9932	if (memEQ(posixcc, "spac", 4)) /* space */
	9933	namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
	9934	break;
	9935	case 'h':
	9936	if (memEQ(posixcc, "grap", 4)) /* graph */
	9937	namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
	9938	break;
	9939	case 'i':
	9940	if (memEQ(posixcc, "asci", 4)) /* ascii */
	9941	namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
	9942	break;
	9943	case 'k':
	9944	if (memEQ(posixcc, "blan", 4)) /* blank */
	9945	namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
	9946	break;
	9947	case 'l':
	9948	if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	9949	namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
	9950	break;
	9951	case 'm':
	9952	if (memEQ(posixcc, "alnu", 4)) /* alnum */
	9953	namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
	9954	break;
	9955	case 'r':
	9956	if (memEQ(posixcc, "lowe", 4)) /* lower */
	9957	namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
	9958	else if (memEQ(posixcc, "uppe", 4)) /* upper */
	9959	namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
	9960	break;
	9961	case 't':
	9962	if (memEQ(posixcc, "digi", 4)) /* digit */
	9963	namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
	9964	else if (memEQ(posixcc, "prin", 4)) /* print */
	9965	namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
	9966	else if (memEQ(posixcc, "punc", 4)) /* punct */
	9967	namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
	9968	break;
	9969	}
	9970	break;
	9971	case 6:
	9972	if (memEQ(posixcc, "xdigit", 6))
	9973	namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
	9974	break;
	9975	}
	9976
	9977	if (namedclass == OOB_NAMEDCLASS)
	9978	Simple_vFAIL3("POSIX class [:%.*s:] unknown",
	9979	t - s - 1, s + 1);
	9980	assert (posixcc[skip] == ':');
	9981	assert (posixcc[skip+1] == ']');
	9982	} else if (!SIZE_ONLY) {
	9983	/* [[=foo=]] and [[.foo.]] are still future. */
	9984
	9985	/* adjust RExC_parse so the warning shows after
	9986	the class closes */
	9987	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	9988	RExC_parse++;
	9989	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	9990	}
	9991	} else {
	9992	/* Maternal grandfather:
	9993	* "[:" ending in ":" but not in ":]" */
	9994	RExC_parse = s;
	9995	}
	9996	}
	9997	}
	9998
	9999	return namedclass;
	10000	}
	10001
	10002	STATIC void
	10003	S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
	10004	{
	10005	dVAR;
	10006
	10007	PERL_ARGS_ASSERT_CHECKPOSIXCC;
	10008
	10009	if (POSIXCC(UCHARAT(RExC_parse))) {
	10010	const char *s = RExC_parse;
	10011	const char c = *s++;
	10012
	10013	while (isALNUM(*s))
	10014	s++;
	10015	if (s && c == s && s[1] == ']') {
	10016	ckWARN3reg(s+2,
	10017	"POSIX syntax [%c %c] belongs inside character classes",
	10018	c, c);
	10019
	10020	/* [[=foo=]] and [[.foo.]] are still future. */
	10021	if (POSIXCC_NOTYET(c)) {
	10022	/* adjust RExC_parse so the error shows after
	10023	the class closes */
	10024	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
	10025	NOOP;
	10026	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	10027	}
	10028	}
	10029	}
	10030	}
	10031
	10032	/* Generate the code to add a full posix character <class> to the bracketed
	10033	* character class given by <node>. (<node> is needed only under locale rules)
	10034	* destlist is the inversion list for non-locale rules that this class is
	10035	* to be added to
	10036	* sourcelist is the ASCII-range inversion list to add under /a rules
	10037	* Xsourcelist is the full Unicode range list to use otherwise. */
	10038	#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
	10039	if (LOC) { \
	10040	SV* scratch_list = NULL; \
	10041	\
	10042	/* Set this class in the node for runtime matching */ \
	10043	ANYOF_CLASS_SET(node, class); \
	10044	\
	10045	/* For above Latin1 code points, we use the full Unicode range */ \
	10046	_invlist_intersection(PL_AboveLatin1, \
	10047	Xsourcelist, \
	10048	&scratch_list); \
	10049	/* And set the output to it, adding instead if there already is an \
	10050	* output. Checking if <destlist> is NULL first saves an extra \
	10051	* clone. Its reference count will be decremented at the next \
	10052	* union, etc, or if this is the only instance, at the end of the \
	10053	* routine */ \
	10054	if (! destlist) { \
	10055	destlist = scratch_list; \
	10056	} \
	10057	else { \
	10058	_invlist_union(destlist, scratch_list, &destlist); \
	10059	SvREFCNT_dec(scratch_list); \
	10060	} \
	10061	} \
	10062	else { \
	10063	/* For non-locale, just add it to any existing list */ \
	10064	_invlist_union(destlist, \
	10065	(AT_LEAST_ASCII_RESTRICTED) \
	10066	? sourcelist \
	10067	: Xsourcelist, \
	10068	&destlist); \
	10069	}
	10070
	10071	/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
	10072	*/
	10073	#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
	10074	if (LOC) { \
	10075	SV* scratch_list = NULL; \
	10076	ANYOF_CLASS_SET(node, class); \
	10077	_invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
	10078	if (! destlist) { \
	10079	destlist = scratch_list; \
	10080	} \
	10081	else { \
	10082	_invlist_union(destlist, scratch_list, &destlist); \
	10083	SvREFCNT_dec(scratch_list); \
	10084	} \
	10085	} \
	10086	else { \
	10087	_invlist_union_complement_2nd(destlist, \
	10088	(AT_LEAST_ASCII_RESTRICTED) \
	10089	? sourcelist \
	10090	: Xsourcelist, \
	10091	&destlist); \
	10092	/* Under /d, everything in the upper half of the Latin1 range \
	10093	* matches this complement */ \
	10094	if (DEPENDS_SEMANTICS) { \
	10095	ANYOF_FLAGS(node) \|= ANYOF_NON_UTF8_LATIN1_ALL; \
	10096	} \
	10097	}
	10098
	10099	/* Generate the code to add a posix character <class> to the bracketed
	10100	* character class given by <node>. (<node> is needed only under locale rules)
	10101	* destlist is the inversion list for non-locale rules that this class is
	10102	* to be added to
	10103	* sourcelist is the ASCII-range inversion list to add under /a rules
	10104	* l1_sourcelist is the Latin1 range list to use otherwise.
	10105	* Xpropertyname is the name to add to <run_time_list> of the property to
	10106	* specify the code points above Latin1 that will have to be
	10107	* determined at run-time
	10108	* run_time_list is a SV* that contains text names of properties that are to
	10109	* be computed at run time. This concatenates <Xpropertyname>
	10110	* to it, apppropriately
	10111	* This is essentially DO_POSIX, but we know only the Latin1 values at compile
	10112	* time */
	10113	#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
	10114	l1_sourcelist, Xpropertyname, run_time_list) \
	10115	/* If not /a matching, there are going to be code points we will have \
	10116	* to defer to runtime to look-up */ \
	10117	if (! AT_LEAST_ASCII_RESTRICTED) { \
	10118	Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
	10119	} \
	10120	if (LOC) { \
	10121	ANYOF_CLASS_SET(node, class); \
	10122	} \
	10123	else { \
	10124	_invlist_union(destlist, \
	10125	(AT_LEAST_ASCII_RESTRICTED) \
	10126	? sourcelist \
	10127	: l1_sourcelist, \
	10128	&destlist); \
	10129	}
	10130
	10131	/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
	10132	* this and DO_N_POSIX */
	10133	#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
	10134	l1_sourcelist, Xpropertyname, run_time_list) \
	10135	if (AT_LEAST_ASCII_RESTRICTED) { \
	10136	_invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
	10137	} \
	10138	else { \
	10139	Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
	10140	if (LOC) { \
	10141	ANYOF_CLASS_SET(node, namedclass); \
	10142	} \
	10143	else { \
	10144	SV* scratch_list = NULL; \
	10145	_invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
	10146	if (! destlist) { \
	10147	destlist = scratch_list; \
	10148	} \
	10149	else { \
	10150	_invlist_union(destlist, scratch_list, &destlist); \
	10151	SvREFCNT_dec(scratch_list); \
	10152	} \
	10153	if (DEPENDS_SEMANTICS) { \
	10154	ANYOF_FLAGS(node) \|= ANYOF_NON_UTF8_LATIN1_ALL; \
	10155	} \
	10156	} \
	10157	}
	10158
	10159	STATIC U8
	10160	S_set_regclass_bit_fold(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value, SV invlist_ptr, AV alternate_ptr)
	10161	{
	10162
	10163	/* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
	10164	* Locale folding is done at run-time, so this function should not be
	10165	* called for nodes that are for locales.
	10166	*
	10167	* This function sets the bit corresponding to the fold of the input
	10168	* 'value', if not already set. The fold of 'f' is 'F', and the fold of
	10169	* 'F' is 'f'.
	10170	*
	10171	* It also knows about the characters that are in the bitmap that have
	10172	* folds that are matchable only outside it, and sets the appropriate lists
	10173	* and flags.
	10174	*
	10175	* It returns the number of bits that actually changed from 0 to 1 */
	10176
	10177	U8 stored = 0;
	10178	U8 fold;
	10179
	10180	PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
	10181
	10182	fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
	10183	: PL_fold[value];
	10184
	10185	/* It assumes the bit for 'value' has already been set */
	10186	if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
	10187	ANYOF_BITMAP_SET(node, fold);
	10188	stored++;
	10189	}
	10190	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) \|\| ! MORE_ASCII_RESTRICTED)) {
	10191	/* Certain Latin1 characters have matches outside the bitmap. To get
	10192	* here, 'value' is one of those characters. None of these matches is
	10193	* valid for ASCII characters under /aa, which have been excluded by
	10194	* the 'if' above. The matches fall into three categories:
	10195	* 1) They are singly folded-to or -from an above 255 character, as
	10196	* LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
	10197	* WITH DIAERESIS;
	10198	* 2) They are part of a multi-char fold with another character in the
	10199	* bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
	10200	* 3) They are part of a multi-char fold with a character not in the
	10201	* bitmap, such as various ligatures.
	10202	* We aren't dealing fully with multi-char folds, except we do deal
	10203	* with the pattern containing a character that has a multi-char fold
	10204	* (not so much the inverse).
	10205	* For types 1) and 3), the matches only happen when the target string
	10206	* is utf8; that's not true for 2), and we set a flag for it.
	10207	*
	10208	* The code below adds to the passed in inversion list the single fold
	10209	* closures for 'value'. The values are hard-coded here so that an
	10210	* innocent-looking character class, like /[ks]/i won't have to go out
	10211	* to disk to find the possible matches. XXX It would be better to
	10212	* generate these via regen, in case a new version of the Unicode
	10213	* standard adds new mappings, though that is not really likely. */
	10214	switch (value) {
	10215	case 'k':
	10216	case 'K':
	10217	/* KELVIN SIGN */
	10218	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x212A);
	10219	break;
	10220	case 's':
	10221	case 'S':
	10222	/* LATIN SMALL LETTER LONG S */
	10223	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x017F);
	10224	break;
	10225	case MICRO_SIGN:
	10226	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10227	GREEK_SMALL_LETTER_MU);
	10228	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10229	GREEK_CAPITAL_LETTER_MU);
	10230	break;
	10231	case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	10232	case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	10233	/* ANGSTROM SIGN */
	10234	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x212B);
	10235	if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
	10236	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10237	PL_fold_latin1[value]);
	10238	}
	10239	break;
	10240	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	10241	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10242	LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	10243	break;
	10244	case LATIN_SMALL_LETTER_SHARP_S:
	10245	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10246	LATIN_CAPITAL_LETTER_SHARP_S);
	10247
	10248	/* Under /a, /d, and /u, this can match the two chars "ss" */
	10249	if (! MORE_ASCII_RESTRICTED) {
	10250	add_alternate(alternate_ptr, (U8 *) "ss", 2);
	10251
	10252	/* And under /u or /a, it can match even if the target is
	10253	* not utf8 */
	10254	if (AT_LEAST_UNI_SEMANTICS) {
	10255	ANYOF_FLAGS(node) \|= ANYOF_NONBITMAP_NON_UTF8;
	10256	}
	10257	}
	10258	break;
	10259	case 'F': case 'f':
	10260	case 'I': case 'i':
	10261	case 'L': case 'l':
	10262	case 'T': case 't':
	10263	case 'A': case 'a':
	10264	case 'H': case 'h':
	10265	case 'J': case 'j':
	10266	case 'N': case 'n':
	10267	case 'W': case 'w':
	10268	case 'Y': case 'y':
	10269	/* These all are targets of multi-character folds from code
	10270	* points that require UTF8 to express, so they can't match
	10271	* unless the target string is in UTF-8, so no action here is
	10272	* necessary, as regexec.c properly handles the general case
	10273	* for UTF-8 matching */
	10274	break;
	10275	default:
	10276	/* Use deprecated warning to increase the chances of this
	10277	* being output */
	10278	ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
	10279	break;
	10280	}
	10281	}
	10282	else if (DEPENDS_SEMANTICS
	10283	&& ! isASCII(value)
	10284	&& PL_fold_latin1[value] != value)
	10285	{
	10286	/* Under DEPENDS rules, non-ASCII Latin1 characters match their
	10287	* folds only when the target string is in UTF-8. We add the fold
	10288	* here to the list of things to match outside the bitmap, which
	10289	* won't be looked at unless it is UTF8 (or else if something else
	10290	* says to look even if not utf8, but those things better not happen
	10291	* under DEPENDS semantics. */
	10292	invlist_ptr = add_cp_to_invlist(invlist_ptr, PL_fold_latin1[value]);
	10293	}
	10294
	10295	return stored;
	10296	}
	10297
	10298
	10299	PERL_STATIC_INLINE U8
	10300	S_set_regclass_bit(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value, SV invlist_ptr, AV alternate_ptr)
	10301	{
	10302	/* This inline function sets a bit in the bitmap if not already set, and if
	10303	* appropriate, its fold, returning the number of bits that actually
	10304	* changed from 0 to 1 */
	10305
	10306	U8 stored;
	10307
	10308	PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
	10309
	10310	if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
	10311	return 0;
	10312	}
	10313
	10314	ANYOF_BITMAP_SET(node, value);
	10315	stored = 1;
	10316
	10317	if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
	10318	stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
	10319	}
	10320
	10321	return stored;
	10322	}
	10323
	10324	STATIC void
	10325	S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
	10326	{
	10327	/* Adds input 'string' with length 'len' to the ANYOF node's unicode
	10328	* alternate list, pointed to by 'alternate_ptr'. This is an array of
	10329	* the multi-character folds of characters in the node */
	10330	SV *sv;
	10331
	10332	PERL_ARGS_ASSERT_ADD_ALTERNATE;
	10333
	10334	if (! *alternate_ptr) {
	10335	*alternate_ptr = newAV();
	10336	}
	10337	sv = newSVpvn_utf8((char*)string, len, TRUE);
	10338	av_push(*alternate_ptr, sv);
	10339	return;
	10340	}
	10341
	10342	/*
	10343	parse a class specification and produce either an ANYOF node that
	10344	matches the pattern or perhaps will be optimized into an EXACTish node
	10345	instead. The node contains a bit map for the first 256 characters, with the
	10346	corresponding bit set if that character is in the list. For characters
	10347	above 255, a range list is used */
	10348
	10349	STATIC regnode *
	10350	S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
	10351	{
	10352	dVAR;
	10353	register UV nextvalue;
	10354	register IV prevvalue = OOB_UNICODE;
	10355	register IV range = 0;
	10356	UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
	10357	register regnode *ret;
	10358	STRLEN numlen;
	10359	IV namedclass;
	10360	char *rangebegin = NULL;
	10361	bool need_class = 0;
	10362	bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
	10363	SV *listsv = NULL;
	10364	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	10365	than just initialized. */
	10366	SV* properties = NULL; /* Code points that match \p{} \P{} */
	10367	UV element_count = 0; /* Number of distinct elements in the class.
	10368	Optimizations may be possible if this is tiny */
	10369	UV n;
	10370
	10371	/* Unicode properties are stored in a swash; this holds the current one
	10372	* being parsed. If this swash is the only above-latin1 component of the
	10373	* character class, an optimization is to pass it directly on to the
	10374	* execution engine. Otherwise, it is set to NULL to indicate that there
	10375	* are other things in the class that have to be dealt with at execution
	10376	* time */
	10377	SV* swash = NULL; /* Code points that match \p{} \P{} */
	10378
	10379	/* Set if a component of this character class is user-defined; just passed
	10380	* on to the engine */
	10381	UV has_user_defined_property = 0;
	10382
	10383	/* code points this node matches that can't be stored in the bitmap */
	10384	SV* nonbitmap = NULL;
	10385
	10386	/* The items that are to match that aren't stored in the bitmap, but are a
	10387	* result of things that are stored there. This is the fold closure of
	10388	* such a character, either because it has DEPENDS semantics and shouldn't
	10389	* be matched unless the target string is utf8, or is a code point that is
	10390	* too large for the bit map, as for example, the fold of the MICRO SIGN is
	10391	* above 255. This all is solely for performance reasons. By having this
	10392	* code know the outside-the-bitmap folds that the bitmapped characters are
	10393	* involved with, we don't have to go out to disk to find the list of
	10394	* matches, unless the character class includes code points that aren't
	10395	* storable in the bit map. That means that a character class with an 's'
	10396	* in it, for example, doesn't need to go out to disk to find everything
	10397	* that matches. A 2nd list is used so that the 'nonbitmap' list is kept
	10398	* empty unless there is something whose fold we don't know about, and will
	10399	* have to go out to the disk to find. */
	10400	SV* l1_fold_invlist = NULL;
	10401
	10402	/* List of multi-character folds that are matched by this node */
	10403	AV* unicode_alternate = NULL;
	10404	#ifdef EBCDIC
	10405	UV literal_endpoint = 0;
	10406	#endif
	10407	UV stored = 0; /* how many chars stored in the bitmap */
	10408
	10409	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	10410	case we need to change the emitted regop to an EXACT. */
	10411	const char * orig_parse = RExC_parse;
	10412	GET_RE_DEBUG_FLAGS_DECL;
	10413
	10414	PERL_ARGS_ASSERT_REGCLASS;
	10415	#ifndef DEBUGGING
	10416	PERL_UNUSED_ARG(depth);
	10417	#endif
	10418
	10419	DEBUG_PARSE("clas");
	10420
	10421	/* Assume we are going to generate an ANYOF node. */
	10422	ret = reganode(pRExC_state, ANYOF, 0);
	10423
	10424
	10425	if (!SIZE_ONLY) {
	10426	ANYOF_FLAGS(ret) = 0;
	10427	}
	10428
	10429	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	10430	RExC_naughty++;
	10431	RExC_parse++;
	10432	if (!SIZE_ONLY)
	10433	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	10434
	10435	/* We have decided to not allow multi-char folds in inverted character
	10436	* classes, due to the confusion that can happen, especially with
	10437	* classes that are designed for a non-Unicode world: You have the
	10438	* peculiar case that:
	10439	"s s" =~ /^[^\xDF]+$/i => Y
	10440	"ss" =~ /^[^\xDF]+$/i => N
	10441	*
	10442	* See [perl #89750] */
	10443	allow_full_fold = FALSE;
	10444	}
	10445
	10446	if (SIZE_ONLY) {
	10447	RExC_size += ANYOF_SKIP;
	10448	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	10449	}
	10450	else {
	10451	RExC_emit += ANYOF_SKIP;
	10452	if (LOC) {
	10453	ANYOF_FLAGS(ret) \|= ANYOF_LOCALE;
	10454	}
	10455	ANYOF_BITMAP_ZERO(ret);
	10456	listsv = newSVpvs("# comment\n");
	10457	initial_listsv_len = SvCUR(listsv);
	10458	}
	10459
	10460	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	10461
	10462	if (!SIZE_ONLY && POSIXCC(nextvalue))
	10463	checkposixcc(pRExC_state);
	10464
	10465	/* allow 1st char to be ] (allowing it to be - is dealt with later) */
	10466	if (UCHARAT(RExC_parse) == ']')
	10467	goto charclassloop;
	10468
	10469	parseit:
	10470	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	10471
	10472	charclassloop:
	10473
	10474	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	10475
	10476	if (!range) {
	10477	rangebegin = RExC_parse;
	10478	element_count++;
	10479	}
	10480	if (UTF) {
	10481	value = utf8n_to_uvchr((U8*)RExC_parse,
	10482	RExC_end - RExC_parse,
	10483	&numlen, UTF8_ALLOW_DEFAULT);
	10484	RExC_parse += numlen;
	10485	}
	10486	else
	10487	value = UCHARAT(RExC_parse++);
	10488
	10489	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	10490	if (value == '[' && POSIXCC(nextvalue))
	10491	namedclass = regpposixcc(pRExC_state, value);
	10492	else if (value == '\\') {
	10493	if (UTF) {
	10494	value = utf8n_to_uvchr((U8*)RExC_parse,
	10495	RExC_end - RExC_parse,
	10496	&numlen, UTF8_ALLOW_DEFAULT);
	10497	RExC_parse += numlen;
	10498	}
	10499	else
	10500	value = UCHARAT(RExC_parse++);
	10501	/* Some compilers cannot handle switching on 64-bit integer
	10502	* values, therefore value cannot be an UV. Yes, this will
	10503	* be a problem later if we want switch on Unicode.
	10504	* A similar issue a little bit later when switching on
	10505	* namedclass. --jhi */
	10506	switch ((I32)value) {
	10507	case 'w': namedclass = ANYOF_ALNUM; break;
	10508	case 'W': namedclass = ANYOF_NALNUM; break;
	10509	case 's': namedclass = ANYOF_SPACE; break;
	10510	case 'S': namedclass = ANYOF_NSPACE; break;
	10511	case 'd': namedclass = ANYOF_DIGIT; break;
	10512	case 'D': namedclass = ANYOF_NDIGIT; break;
	10513	case 'v': namedclass = ANYOF_VERTWS; break;
	10514	case 'V': namedclass = ANYOF_NVERTWS; break;
	10515	case 'h': namedclass = ANYOF_HORIZWS; break;
	10516	case 'H': namedclass = ANYOF_NHORIZWS; break;
	10517	case 'N': /* Handle \N{NAME} in class */
	10518	{
	10519	/* We only pay attention to the first char of
	10520	multichar strings being returned. I kinda wonder
	10521	if this makes sense as it does change the behaviour
	10522	from earlier versions, OTOH that behaviour was broken
	10523	as well. */
	10524	UV v; /* value is register so we cant & it /grrr */
	10525	if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
	10526	goto parseit;
	10527	}
	10528	value= v;
	10529	}
	10530	break;
	10531	case 'p':
	10532	case 'P':
	10533	{
	10534	char *e;
	10535	if (RExC_parse >= RExC_end)
	10536	vFAIL2("Empty \\%c{}", (U8)value);
	10537	if (*RExC_parse == '{') {
	10538	const U8 c = (U8)value;
	10539	e = strchr(RExC_parse++, '}');
	10540	if (!e)
	10541	vFAIL2("Missing right brace on \\%c{}", c);
	10542	while (isSPACE(UCHARAT(RExC_parse)))
	10543	RExC_parse++;
	10544	if (e == RExC_parse)
	10545	vFAIL2("Empty \\%c{}", c);
	10546	n = e - RExC_parse;
	10547	while (isSPACE(UCHARAT(RExC_parse + n - 1)))
	10548	n--;
	10549	}
	10550	else {
	10551	e = RExC_parse;
	10552	n = 1;
	10553	}
	10554	if (!SIZE_ONLY) {
	10555	SV** invlistsvp;
	10556	SV* invlist;
	10557	char* name;
	10558	if (UCHARAT(RExC_parse) == '^') {
	10559	RExC_parse++;
	10560	n--;
	10561	value = value == 'p' ? 'P' : 'p'; /* toggle */
	10562	while (isSPACE(UCHARAT(RExC_parse))) {
	10563	RExC_parse++;
	10564	n--;
	10565	}
	10566	}
	10567	/* Try to get the definition of the property into
	10568	* <invlist>. If /i is in effect, the effective property
	10569	* will have its name be <__NAME_i>. The design is
	10570	* discussed in commit
	10571	* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
	10572	Newx(name, n + sizeof("_i__\n"), char);
	10573
	10574	sprintf(name, "%s%.*s%s\n",
	10575	(FOLD) ? "__" : "",
	10576	(int)n,
	10577	RExC_parse,
	10578	(FOLD) ? "_i" : ""
	10579	);
	10580
	10581	/* Look up the property name, and get its swash and
	10582	* inversion list, if the property is found */
	10583	if (swash) {
	10584	SvREFCNT_dec(swash);
	10585	}
	10586	swash = _core_swash_init("utf8", name, &PL_sv_undef,
	10587	1, /* binary */
	10588	0, /* not tr/// */
	10589	TRUE, /* this routine will handle
	10590	undefined properties */
	10591	NULL, FALSE /* No inversion list */
	10592	);
	10593	if ( ! swash
	10594	\|\| ! SvROK(swash)
	10595	\|\| ! SvTYPE(SvRV(swash)) == SVt_PVHV
	10596	\|\| ! (invlistsvp =
	10597	hv_fetchs(MUTABLE_HV(SvRV(swash)),
	10598	"INVLIST", FALSE))
	10599	\|\| ! (invlist = *invlistsvp))
	10600	{
	10601	if (swash) {
	10602	SvREFCNT_dec(swash);
	10603	swash = NULL;
	10604	}
	10605
	10606	/* Here didn't find it. It could be a user-defined
	10607	* property that will be available at run-time. Add it
	10608	* to the list to look up then */
	10609	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
	10610	(value == 'p' ? '+' : '!'),
	10611	name);
	10612	has_user_defined_property = 1;
	10613
	10614	/* We don't know yet, so have to assume that the
	10615	* property could match something in the Latin1 range,
	10616	* hence something that isn't utf8 */
	10617	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	10618	}
	10619	else {
	10620
	10621	/* Here, did get the swash and its inversion list. If
	10622	* the swash is from a user-defined property, then this
	10623	* whole character class should be regarded as such */
	10624	SV** user_defined_svp =
	10625	hv_fetchs(MUTABLE_HV(SvRV(swash)),
	10626	"USER_DEFINED", FALSE);
	10627	if (user_defined_svp) {
	10628	has_user_defined_property
	10629	\|= SvUV(*user_defined_svp);
	10630	}
	10631
	10632	/* Invert if asking for the complement */
	10633	if (value == 'P') {
	10634	_invlist_union_complement_2nd(properties, invlist, &properties);
	10635
	10636	/* The swash can't be used as-is, because we've
	10637	* inverted things; delay removing it to here after
	10638	* have copied its invlist above */
	10639	SvREFCNT_dec(swash);
	10640	swash = NULL;
	10641	}
	10642	else {
	10643	_invlist_union(properties, invlist, &properties);
	10644	}
	10645	}
	10646	Safefree(name);
	10647	}
	10648	RExC_parse = e + 1;
	10649	namedclass = ANYOF_MAX; /* no official name, but it's named */
	10650
	10651	/* \p means they want Unicode semantics */
	10652	RExC_uni_semantics = 1;
	10653	}
	10654	break;
	10655	case 'n': value = '\n'; break;
	10656	case 'r': value = '\r'; break;
	10657	case 't': value = '\t'; break;
	10658	case 'f': value = '\f'; break;
	10659	case 'b': value = '\b'; break;
	10660	case 'e': value = ASCII_TO_NATIVE('\033');break;
	10661	case 'a': value = ASCII_TO_NATIVE('\007');break;
	10662	case 'o':
	10663	RExC_parse--; /* function expects to be pointed at the 'o' */
	10664	{
	10665	const char* error_msg;
	10666	bool valid = grok_bslash_o(RExC_parse,
	10667	&value,
	10668	&numlen,
	10669	&error_msg,
	10670	SIZE_ONLY);
	10671	RExC_parse += numlen;
	10672	if (! valid) {
	10673	vFAIL(error_msg);
	10674	}
	10675	}
	10676	if (PL_encoding && value < 0x100) {
	10677	goto recode_encoding;
	10678	}
	10679	break;
	10680	case 'x':
	10681	if (*RExC_parse == '{') {
	10682	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	10683	\| PERL_SCAN_DISALLOW_PREFIX;
	10684	char * const e = strchr(RExC_parse++, '}');
	10685	if (!e)
	10686	vFAIL("Missing right brace on \\x{}");
	10687
	10688	numlen = e - RExC_parse;
	10689	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	10690	RExC_parse = e + 1;
	10691	}
	10692	else {
	10693	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	10694	numlen = 2;
	10695	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	10696	RExC_parse += numlen;
	10697	}
	10698	if (PL_encoding && value < 0x100)
	10699	goto recode_encoding;
	10700	break;
	10701	case 'c':
	10702	value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
	10703	break;
	10704	case '0': case '1': case '2': case '3': case '4':
	10705	case '5': case '6': case '7':
	10706	{
	10707	/* Take 1-3 octal digits */
	10708	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	10709	numlen = 3;
	10710	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	10711	RExC_parse += numlen;
	10712	if (PL_encoding && value < 0x100)
	10713	goto recode_encoding;
	10714	break;
	10715	}
	10716	recode_encoding:
	10717	if (! RExC_override_recoding) {
	10718	SV* enc = PL_encoding;
	10719	value = reg_recode((const char)(U8)value, &enc);
	10720	if (!enc && SIZE_ONLY)
	10721	ckWARNreg(RExC_parse,
	10722	"Invalid escape in the specified encoding");
	10723	break;
	10724	}
	10725	default:
	10726	/* Allow \_ to not give an error */
	10727	if (!SIZE_ONLY && isALNUM(value) && value != '_') {
	10728	ckWARN2reg(RExC_parse,
	10729	"Unrecognized escape \\%c in character class passed through",
	10730	(int)value);
	10731	}
	10732	break;
	10733	}
	10734	} /* end of \blah */
	10735	#ifdef EBCDIC
	10736	else
	10737	literal_endpoint++;
	10738	#endif
	10739
	10740	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	10741
	10742	/* What matches in a locale is not known until runtime, so need to
	10743	* (one time per class) allocate extra space to pass to regexec.
	10744	* The space will contain a bit for each named class that is to be
	10745	* matched against. This isn't needed for \p{} and pseudo-classes,
	10746	* as they are not affected by locale, and hence are dealt with
	10747	* separately */
	10748	if (LOC && namedclass < ANYOF_MAX && ! need_class) {
	10749	need_class = 1;
	10750	if (SIZE_ONLY) {
	10751	RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	10752	}
	10753	else {
	10754	RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	10755	ANYOF_CLASS_ZERO(ret);
	10756	}
	10757	ANYOF_FLAGS(ret) \|= ANYOF_CLASS;
	10758	}
	10759
	10760	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	10761	* literal, as is the character that began the false range, i.e.
	10762	* the 'a' in the examples */
	10763	if (range) {
	10764	if (!SIZE_ONLY) {
	10765	const int w =
	10766	RExC_parse >= rangebegin ?
	10767	RExC_parse - rangebegin : 0;
	10768	ckWARN4reg(RExC_parse,
	10769	"False [] range \"%.s\"",
	10770	w, w, rangebegin);
	10771
	10772	stored +=
	10773	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	10774	if (prevvalue < 256) {
	10775	stored +=
	10776	set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
	10777	}
	10778	else {
	10779	nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
	10780	}
	10781	}
	10782
	10783	range = 0; /* this was not a true range */
	10784	}
	10785
	10786	if (!SIZE_ONLY) {
	10787
	10788	/* Possible truncation here but in some 64-bit environments
	10789	* the compiler gets heartburn about switch on 64-bit values.
	10790	* A similar issue a little earlier when switching on value.
	10791	* --jhi */
	10792	switch ((I32)namedclass) {
	10793
	10794	case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
	10795	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10796	PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
	10797	break;
	10798	case ANYOF_NALNUMC:
	10799	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10800	PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
	10801	break;
	10802	case ANYOF_ALPHA:
	10803	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10804	PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
	10805	break;
	10806	case ANYOF_NALPHA:
	10807	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10808	PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
	10809	break;
	10810	case ANYOF_ASCII:
	10811	if (LOC) {
	10812	ANYOF_CLASS_SET(ret, namedclass);
	10813	}
	10814	else {
	10815	_invlist_union(properties, PL_ASCII, &properties);
	10816	}
	10817	break;
	10818	case ANYOF_NASCII:
	10819	if (LOC) {
	10820	ANYOF_CLASS_SET(ret, namedclass);
	10821	}
	10822	else {
	10823	_invlist_union_complement_2nd(properties,
	10824	PL_ASCII, &properties);
	10825	if (DEPENDS_SEMANTICS) {
	10826	ANYOF_FLAGS(ret) \|= ANYOF_NON_UTF8_LATIN1_ALL;
	10827	}
	10828	}
	10829	break;
	10830	case ANYOF_BLANK:
	10831	DO_POSIX(ret, namedclass, properties,
	10832	PL_PosixBlank, PL_XPosixBlank);
	10833	break;
	10834	case ANYOF_NBLANK:
	10835	DO_N_POSIX(ret, namedclass, properties,
	10836	PL_PosixBlank, PL_XPosixBlank);
	10837	break;
	10838	case ANYOF_CNTRL:
	10839	DO_POSIX(ret, namedclass, properties,
	10840	PL_PosixCntrl, PL_XPosixCntrl);
	10841	break;
	10842	case ANYOF_NCNTRL:
	10843	DO_N_POSIX(ret, namedclass, properties,
	10844	PL_PosixCntrl, PL_XPosixCntrl);
	10845	break;
	10846	case ANYOF_DIGIT:
	10847	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10848	PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
	10849	break;
	10850	case ANYOF_NDIGIT:
	10851	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10852	PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
	10853	break;
	10854	case ANYOF_GRAPH:
	10855	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10856	PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
	10857	break;
	10858	case ANYOF_NGRAPH:
	10859	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10860	PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
	10861	break;
	10862	case ANYOF_HORIZWS:
	10863	/* For these, we use the nonbitmap, as /d doesn't make a
	10864	* difference in what these match. There would be problems
	10865	* if these characters had folds other than themselves, as
	10866	* nonbitmap is subject to folding. It turns out that \h
	10867	* is just a synonym for XPosixBlank */
	10868	_invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
	10869	break;
	10870	case ANYOF_NHORIZWS:
	10871	_invlist_union_complement_2nd(nonbitmap,
	10872	PL_XPosixBlank, &nonbitmap);
	10873	break;
	10874	case ANYOF_LOWER:
	10875	case ANYOF_NLOWER:
	10876	{ /* These require special handling, as they differ under
	10877	folding, matching Cased there (which in the ASCII range
	10878	is the same as Alpha */
	10879
	10880	SV* ascii_source;
	10881	SV* l1_source;
	10882	const char *Xname;
	10883
	10884	if (FOLD && ! LOC) {
	10885	ascii_source = PL_PosixAlpha;
	10886	l1_source = PL_L1Cased;
	10887	Xname = "Cased";
	10888	}
	10889	else {
	10890	ascii_source = PL_PosixLower;
	10891	l1_source = PL_L1PosixLower;
	10892	Xname = "XPosixLower";
	10893	}
	10894	if (namedclass == ANYOF_LOWER) {
	10895	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10896	ascii_source, l1_source, Xname, listsv);
	10897	}
	10898	else {
	10899	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
	10900	properties, ascii_source, l1_source, Xname, listsv);
	10901	}
	10902	break;
	10903	}
	10904	case ANYOF_PRINT:
	10905	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10906	PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
	10907	break;
	10908	case ANYOF_NPRINT:
	10909	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10910	PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
	10911	break;
	10912	case ANYOF_PUNCT:
	10913	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10914	PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
	10915	break;
	10916	case ANYOF_NPUNCT:
	10917	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10918	PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
	10919	break;
	10920	case ANYOF_PSXSPC:
	10921	DO_POSIX(ret, namedclass, properties,
	10922	PL_PosixSpace, PL_XPosixSpace);
	10923	break;
	10924	case ANYOF_NPSXSPC:
	10925	DO_N_POSIX(ret, namedclass, properties,
	10926	PL_PosixSpace, PL_XPosixSpace);
	10927	break;
	10928	case ANYOF_SPACE:
	10929	DO_POSIX(ret, namedclass, properties,
	10930	PL_PerlSpace, PL_XPerlSpace);
	10931	break;
	10932	case ANYOF_NSPACE:
	10933	DO_N_POSIX(ret, namedclass, properties,
	10934	PL_PerlSpace, PL_XPerlSpace);
	10935	break;
	10936	case ANYOF_UPPER: /* Same as LOWER, above */
	10937	case ANYOF_NUPPER:
	10938	{
	10939	SV* ascii_source;
	10940	SV* l1_source;
	10941	const char *Xname;
	10942
	10943	if (FOLD && ! LOC) {
	10944	ascii_source = PL_PosixAlpha;
	10945	l1_source = PL_L1Cased;
	10946	Xname = "Cased";
	10947	}
	10948	else {
	10949	ascii_source = PL_PosixUpper;
	10950	l1_source = PL_L1PosixUpper;
	10951	Xname = "XPosixUpper";
	10952	}
	10953	if (namedclass == ANYOF_UPPER) {
	10954	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10955	ascii_source, l1_source, Xname, listsv);
	10956	}
	10957	else {
	10958	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
	10959	properties, ascii_source, l1_source, Xname, listsv);
	10960	}
	10961	break;
	10962	}
	10963	case ANYOF_ALNUM: /* Really is 'Word' */
	10964	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10965	PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
	10966	break;
	10967	case ANYOF_NALNUM:
	10968	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	10969	PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
	10970	break;
	10971	case ANYOF_VERTWS:
	10972	/* For these, we use the nonbitmap, as /d doesn't make a
	10973	* difference in what these match. There would be problems
	10974	* if these characters had folds other than themselves, as
	10975	* nonbitmap is subject to folding */
	10976	_invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
	10977	break;
	10978	case ANYOF_NVERTWS:
	10979	_invlist_union_complement_2nd(nonbitmap,
	10980	PL_VertSpace, &nonbitmap);
	10981	break;
	10982	case ANYOF_XDIGIT:
	10983	DO_POSIX(ret, namedclass, properties,
	10984	PL_PosixXDigit, PL_XPosixXDigit);
	10985	break;
	10986	case ANYOF_NXDIGIT:
	10987	DO_N_POSIX(ret, namedclass, properties,
	10988	PL_PosixXDigit, PL_XPosixXDigit);
	10989	break;
	10990	case ANYOF_MAX:
	10991	/* this is to handle \p and \P */
	10992	break;
	10993	default:
	10994	vFAIL("Invalid [::] class");
	10995	break;
	10996	}
	10997
	10998	continue;
	10999	}
	11000	} /* end of namedclass \blah */
	11001
	11002	if (range) {
	11003	if (prevvalue > (IV)value) /* b-a */ {
	11004	const int w = RExC_parse - rangebegin;
	11005	Simple_vFAIL4("Invalid [] range \"%.s\"", w, w, rangebegin);
	11006	range = 0; /* not a valid range */
	11007	}
	11008	}
	11009	else {
	11010	prevvalue = value; /* save the beginning of the range */
	11011	if (RExC_parse+1 < RExC_end
	11012	&& *RExC_parse == '-'
	11013	&& RExC_parse[1] != ']')
	11014	{
	11015	RExC_parse++;
	11016
	11017	/* a bad range like \w-, [:word:]- ? */
	11018	if (namedclass > OOB_NAMEDCLASS) {
	11019	if (ckWARN(WARN_REGEXP)) {
	11020	const int w =
	11021	RExC_parse >= rangebegin ?
	11022	RExC_parse - rangebegin : 0;
	11023	vWARN4(RExC_parse,
	11024	"False [] range \"%.s\"",
	11025	w, w, rangebegin);
	11026	}
	11027	if (!SIZE_ONLY)
	11028	stored +=
	11029	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	11030	} else
	11031	range = 1; /* yeah, it's a range! */
	11032	continue; /* but do it the next time */
	11033	}
	11034	}
	11035
	11036	/* non-Latin1 code point implies unicode semantics. Must be set in
	11037	* pass1 so is there for the whole of pass 2 */
	11038	if (value > 255) {
	11039	RExC_uni_semantics = 1;
	11040	}
	11041
	11042	/* now is the next time */
	11043	if (!SIZE_ONLY) {
	11044	if (prevvalue < 256) {
	11045	const IV ceilvalue = value < 256 ? value : 255;
	11046	IV i;
	11047	#ifdef EBCDIC
	11048	/* In EBCDIC [\x89-\x91] should include
	11049	* the \x8e but [i-j] should not. */
	11050	if (literal_endpoint == 2 &&
	11051	((isLOWER(prevvalue) && isLOWER(ceilvalue)) \|\|
	11052	(isUPPER(prevvalue) && isUPPER(ceilvalue))))
	11053	{
	11054	if (isLOWER(prevvalue)) {
	11055	for (i = prevvalue; i <= ceilvalue; i++)
	11056	if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	11057	stored +=
	11058	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	11059	}
	11060	} else {
	11061	for (i = prevvalue; i <= ceilvalue; i++)
	11062	if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	11063	stored +=
	11064	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	11065	}
	11066	}
	11067	}
	11068	else
	11069	#endif
	11070	for (i = prevvalue; i <= ceilvalue; i++) {
	11071	stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	11072	}
	11073	}
	11074	if (value > 255) {
	11075	const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
	11076	const UV natvalue = NATIVE_TO_UNI(value);
	11077	nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
	11078	}
	11079	#ifdef EBCDIC
	11080	literal_endpoint = 0;
	11081	#endif
	11082	}
	11083
	11084	range = 0; /* this range (if it was one) is done now */
	11085	}
	11086
	11087
	11088
	11089	if (SIZE_ONLY)
	11090	return ret;
	11091	/**** !SIZE_ONLY AFTER HERE *******/
	11092
	11093	/* If folding and there are code points above 255, we calculate all
	11094	* characters that could fold to or from the ones already on the list */
	11095	if (FOLD && nonbitmap) {
	11096	UV start, end; /* End points of code point ranges */
	11097
	11098	SV* fold_intersection = NULL;
	11099
	11100	/* This is a list of all the characters that participate in folds
	11101	* (except marks, etc in multi-char folds */
	11102	if (! PL_utf8_foldable) {
	11103	SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
	11104	PL_utf8_foldable = _swash_to_invlist(swash);
	11105	SvREFCNT_dec(swash);
	11106	}
	11107
	11108	/* This is a hash that for a particular fold gives all characters
	11109	* that are involved in it */
	11110	if (! PL_utf8_foldclosures) {
	11111
	11112	/* If we were unable to find any folds, then we likely won't be
	11113	* able to find the closures. So just create an empty list.
	11114	* Folding will effectively be restricted to the non-Unicode rules
	11115	* hard-coded into Perl. (This case happens legitimately during
	11116	* compilation of Perl itself before the Unicode tables are
	11117	* generated) */
	11118	if (invlist_len(PL_utf8_foldable) == 0) {
	11119	PL_utf8_foldclosures = newHV();
	11120	} else {
	11121	/* If the folds haven't been read in, call a fold function
	11122	* to force that */
	11123	if (! PL_utf8_tofold) {
	11124	U8 dummy[UTF8_MAXBYTES+1];
	11125	STRLEN dummy_len;
	11126
	11127	/* This particular string is above \xff in both UTF-8 and
	11128	* UTFEBCDIC */
	11129	to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
	11130	assert(PL_utf8_tofold); /* Verify that worked */
	11131	}
	11132	PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
	11133	}
	11134	}
	11135
	11136	/* Only the characters in this class that participate in folds need be
	11137	* checked. Get the intersection of this class and all the possible
	11138	* characters that are foldable. This can quickly narrow down a large
	11139	* class */
	11140	_invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
	11141
	11142	/* Now look at the foldable characters in this class individually */
	11143	invlist_iterinit(fold_intersection);
	11144	while (invlist_iternext(fold_intersection, &start, &end)) {
	11145	UV j;
	11146
	11147	/* Look at every character in the range */
	11148	for (j = start; j <= end; j++) {
	11149
	11150	/* Get its fold */
	11151	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	11152	STRLEN foldlen;
	11153	const UV f =
	11154	_to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
	11155
	11156	if (foldlen > (STRLEN)UNISKIP(f)) {
	11157
	11158	/* Any multicharacter foldings (disallowed in lookbehind
	11159	* patterns) require the following transform: [ABCDEF] ->
	11160	* (?:[ABCabcDEFd]\|pq\|rst) where E folds into "pq" and F
	11161	* folds into "rst", all other characters fold to single
	11162	* characters. We save away these multicharacter foldings,
	11163	* to be later saved as part of the additional "s" data. */
	11164	if (! RExC_in_lookbehind) {
	11165	U8* loc = foldbuf;
	11166	U8* e = foldbuf + foldlen;
	11167
	11168	/* If any of the folded characters of this are in the
	11169	* Latin1 range, tell the regex engine that this can
	11170	* match a non-utf8 target string. The only multi-byte
	11171	* fold whose source is in the Latin1 range (U+00DF)
	11172	* applies only when the target string is utf8, or
	11173	* under unicode rules */
	11174	if (j > 255 \|\| AT_LEAST_UNI_SEMANTICS) {
	11175	while (loc < e) {
	11176
	11177	/* Can't mix ascii with non- under /aa */
	11178	if (MORE_ASCII_RESTRICTED
	11179	&& (isASCII(*loc) != isASCII(j)))
	11180	{
	11181	goto end_multi_fold;
	11182	}
	11183	if (UTF8_IS_INVARIANT(*loc)
	11184	\|\| UTF8_IS_DOWNGRADEABLE_START(*loc))
	11185	{
	11186	/* Can't mix above and below 256 under LOC
	11187	*/
	11188	if (LOC) {
	11189	goto end_multi_fold;
	11190	}
	11191	ANYOF_FLAGS(ret)
	11192	\|= ANYOF_NONBITMAP_NON_UTF8;
	11193	break;
	11194	}
	11195	loc += UTF8SKIP(loc);
	11196	}
	11197	}
	11198
	11199	add_alternate(&unicode_alternate, foldbuf, foldlen);
	11200	end_multi_fold: ;
	11201	}
	11202
	11203	/* This is special-cased, as it is the only letter which
	11204	* has both a multi-fold and single-fold in Latin1. All
	11205	* the other chars that have single and multi-folds are
	11206	* always in utf8, and the utf8 folding algorithm catches
	11207	* them */
	11208	if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
	11209	stored += set_regclass_bit(pRExC_state,
	11210	ret,
	11211	LATIN_SMALL_LETTER_SHARP_S,
	11212	&l1_fold_invlist, &unicode_alternate);
	11213	}
	11214	}
	11215	else {
	11216	/* Single character fold. Add everything in its fold
	11217	* closure to the list that this node should match */
	11218	SV** listp;
	11219
	11220	/* The fold closures data structure is a hash with the keys
	11221	* being every character that is folded to, like 'k', and
	11222	* the values each an array of everything that folds to its
	11223	* key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
	11224	if ((listp = hv_fetch(PL_utf8_foldclosures,
	11225	(char *) foldbuf, foldlen, FALSE)))
	11226	{
	11227	AV* list = (AV) listp;
	11228	IV k;
	11229	for (k = 0; k <= av_len(list); k++) {
	11230	SV** c_p = av_fetch(list, k, FALSE);
	11231	UV c;
	11232	if (c_p == NULL) {
	11233	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	11234	}
	11235	c = SvUV(*c_p);
	11236
	11237	/* /aa doesn't allow folds between ASCII and non-;
	11238	* /l doesn't allow them between above and below
	11239	* 256 */
	11240	if ((MORE_ASCII_RESTRICTED
	11241	&& (isASCII(c) != isASCII(j)))
	11242	\|\| (LOC && ((c < 256) != (j < 256))))
	11243	{
	11244	continue;
	11245	}
	11246
	11247	if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
	11248	stored += set_regclass_bit(pRExC_state,
	11249	ret,
	11250	(U8) c,
	11251	&l1_fold_invlist, &unicode_alternate);
	11252	}
	11253	/* It may be that the code point is already in
	11254	* this range or already in the bitmap, in
	11255	* which case we need do nothing */
	11256	else if ((c < start \|\| c > end)
	11257	&& (c > 255
	11258	\|\| ! ANYOF_BITMAP_TEST(ret, c)))
	11259	{
	11260	nonbitmap = add_cp_to_invlist(nonbitmap, c);
	11261	}
	11262	}
	11263	}
	11264	}
	11265	}
	11266	}
	11267	SvREFCNT_dec(fold_intersection);
	11268	}
	11269
	11270	/* Combine the two lists into one. */
	11271	if (l1_fold_invlist) {
	11272	if (nonbitmap) {
	11273	_invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
	11274	SvREFCNT_dec(l1_fold_invlist);
	11275	}
	11276	else {
	11277	nonbitmap = l1_fold_invlist;
	11278	}
	11279	}
	11280
	11281	/* And combine the result (if any) with any inversion list from properties.
	11282	* The lists are kept separate up to now because we don't want to fold the
	11283	* properties */
	11284	if (properties) {
	11285	if (nonbitmap) {
	11286	_invlist_union(nonbitmap, properties, &nonbitmap);
	11287	SvREFCNT_dec(properties);
	11288	}
	11289	else {
	11290	nonbitmap = properties;
	11291	}
	11292	}
	11293
	11294	/* Here, <nonbitmap> contains all the code points we can determine at
	11295	* compile time that we haven't put into the bitmap. Go through it, and
	11296	* for things that belong in the bitmap, put them there, and delete from
	11297	* <nonbitmap> */
	11298	if (nonbitmap) {
	11299
	11300	/* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
	11301	* possibly only should match when the target string is UTF-8 */
	11302	UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
	11303
	11304	/* This gets set if we actually need to modify things */
	11305	bool change_invlist = FALSE;
	11306
	11307	UV start, end;
	11308
	11309	/* Start looking through <nonbitmap> */
	11310	invlist_iterinit(nonbitmap);
	11311	while (invlist_iternext(nonbitmap, &start, &end)) {
	11312	UV high;
	11313	int i;
	11314
	11315	/* Quit if are above what we should change */
	11316	if (start > max_cp_to_set) {
	11317	break;
	11318	}
	11319
	11320	change_invlist = TRUE;
	11321
	11322	/* Set all the bits in the range, up to the max that we are doing */
	11323	high = (end < max_cp_to_set) ? end : max_cp_to_set;
	11324	for (i = start; i <= (int) high; i++) {
	11325	if (! ANYOF_BITMAP_TEST(ret, i)) {
	11326	ANYOF_BITMAP_SET(ret, i);
	11327	stored++;
	11328	prevvalue = value;
	11329	value = i;
	11330	}
	11331	}
	11332	}
	11333
	11334	/* Done with loop; remove any code points that are in the bitmap from
	11335	* <nonbitmap> */
	11336	if (change_invlist) {
	11337	_invlist_subtract(nonbitmap,
	11338	(DEPENDS_SEMANTICS)
	11339	? PL_ASCII
	11340	: PL_Latin1,
	11341	&nonbitmap);
	11342	}
	11343
	11344	/* If have completely emptied it, remove it completely */
	11345	if (invlist_len(nonbitmap) == 0) {
	11346	SvREFCNT_dec(nonbitmap);
	11347	nonbitmap = NULL;
	11348	}
	11349	}
	11350
	11351	/* Here, we have calculated what code points should be in the character
	11352	* class. <nonbitmap> does not overlap the bitmap except possibly in the
	11353	* case of DEPENDS rules.
	11354	*
	11355	* Now we can see about various optimizations. Fold calculation (which we
	11356	* did above) needs to take place before inversion. Otherwise /[^k]/i
	11357	* would invert to include K, which under /i would match k, which it
	11358	* shouldn't. */
	11359
	11360	/* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
	11361	* set the FOLD flag yet, so this does optimize those. It doesn't
	11362	* optimize locale. Doing so perhaps could be done as long as there is
	11363	* nothing like \w in it; some thought also would have to be given to the
	11364	* interaction with above 0x100 chars */
	11365	if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
	11366	&& ! LOC
	11367	&& ! unicode_alternate
	11368	/* In case of /d, there are some things that should match only when in
	11369	* not in the bitmap, i.e., they require UTF8 to match. These are
	11370	* listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
	11371	* case, they don't require UTF8, so can invert here */
	11372	&& (! nonbitmap
	11373	\|\| ! DEPENDS_SEMANTICS
	11374	\|\| (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
	11375	&& SvCUR(listsv) == initial_listsv_len)
	11376	{
	11377	int i;
	11378	if (! nonbitmap) {
	11379	for (i = 0; i < 256; ++i) {
	11380	if (ANYOF_BITMAP_TEST(ret, i)) {
	11381	ANYOF_BITMAP_CLEAR(ret, i);
	11382	}
	11383	else {
	11384	ANYOF_BITMAP_SET(ret, i);
	11385	prevvalue = value;
	11386	value = i;
	11387	}
	11388	}
	11389	/* The inversion means that everything above 255 is matched */
	11390	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	11391	}
	11392	else {
	11393	/* Here, also has things outside the bitmap that may overlap with
	11394	* the bitmap. We have to sync them up, so that they get inverted
	11395	* in both places. Earlier, we removed all overlaps except in the
	11396	* case of /d rules, so no syncing is needed except for this case
	11397	*/
	11398	SV *remove_list = NULL;
	11399
	11400	if (DEPENDS_SEMANTICS) {
	11401	UV start, end;
	11402
	11403	/* Set the bits that correspond to the ones that aren't in the
	11404	* bitmap. Otherwise, when we invert, we'll miss these.
	11405	* Earlier, we removed from the nonbitmap all code points
	11406	* < 128, so there is no extra work here */
	11407	invlist_iterinit(nonbitmap);
	11408	while (invlist_iternext(nonbitmap, &start, &end)) {
	11409	if (start > 255) { /* The bit map goes to 255 */
	11410	break;
	11411	}
	11412	if (end > 255) {
	11413	end = 255;
	11414	}
	11415	for (i = start; i <= (int) end; ++i) {
	11416	ANYOF_BITMAP_SET(ret, i);
	11417	prevvalue = value;
	11418	value = i;
	11419	}
	11420	}
	11421	}
	11422
	11423	/* Now invert both the bitmap and the nonbitmap. Anything in the
	11424	* bitmap has to also be removed from the non-bitmap, but again,
	11425	* there should not be overlap unless is /d rules. */
	11426	_invlist_invert(nonbitmap);
	11427
	11428	/* Any swash can't be used as-is, because we've inverted things */
	11429	if (swash) {
	11430	SvREFCNT_dec(swash);
	11431	swash = NULL;
	11432	}
	11433
	11434	for (i = 0; i < 256; ++i) {
	11435	if (ANYOF_BITMAP_TEST(ret, i)) {
	11436	ANYOF_BITMAP_CLEAR(ret, i);
	11437	if (DEPENDS_SEMANTICS) {
	11438	if (! remove_list) {
	11439	remove_list = _new_invlist(2);
	11440	}
	11441	remove_list = add_cp_to_invlist(remove_list, i);
	11442	}
	11443	}
	11444	else {
	11445	ANYOF_BITMAP_SET(ret, i);
	11446	prevvalue = value;
	11447	value = i;
	11448	}
	11449	}
	11450
	11451	/* And do the removal */
	11452	if (DEPENDS_SEMANTICS) {
	11453	if (remove_list) {
	11454	_invlist_subtract(nonbitmap, remove_list, &nonbitmap);
	11455	SvREFCNT_dec(remove_list);
	11456	}
	11457	}
	11458	else {
	11459	/* There is no overlap for non-/d, so just delete anything
	11460	* below 256 */
	11461	_invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
	11462	}
	11463	}
	11464
	11465	stored = 256 - stored;
	11466
	11467	/* Clear the invert flag since have just done it here */
	11468	ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
	11469	}
	11470
	11471	/* Folding in the bitmap is taken care of above, but not for locale (for
	11472	* which we have to wait to see what folding is in effect at runtime), and
	11473	* for some things not in the bitmap (only the upper latin folds in this
	11474	* case, as all other single-char folding has been set above). Set
	11475	* run-time fold flag for these */
	11476	if (FOLD && (LOC
	11477	\|\| (DEPENDS_SEMANTICS
	11478	&& nonbitmap
	11479	&& ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
	11480	\|\| unicode_alternate))
	11481	{
	11482	ANYOF_FLAGS(ret) \|= ANYOF_LOC_NONBITMAP_FOLD;
	11483	}
	11484
	11485	/* A single character class can be "optimized" into an EXACTish node.
	11486	* Note that since we don't currently count how many characters there are
	11487	* outside the bitmap, we are XXX missing optimization possibilities for
	11488	* them. This optimization can't happen unless this is a truly single
	11489	* character class, which means that it can't be an inversion into a
	11490	* many-character class, and there must be no possibility of there being
	11491	* things outside the bitmap. 'stored' (only) for locales doesn't include
	11492	* \w, etc, so have to make a special test that they aren't present
	11493	*
	11494	* Similarly A 2-character class of the very special form like [bB] can be
	11495	* optimized into an EXACTFish node, but only for non-locales, and for
	11496	* characters which only have the two folds; so things like 'fF' and 'Ii'
	11497	* wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
	11498	* FI'. */
	11499	if (! nonbitmap
	11500	&& ! unicode_alternate
	11501	&& SvCUR(listsv) == initial_listsv_len
	11502	&& ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT\|ANYOF_UNICODE_ALL))
	11503	&& (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	11504	\|\| (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
	11505	\|\| (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	11506	&& (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
	11507	/* If the latest code point has a fold whose
	11508	* bit is set, it must be the only other one */
	11509	&& ((prevvalue = PL_fold_latin1[value]) != (IV)value)
	11510	&& ANYOF_BITMAP_TEST(ret, prevvalue)))))
	11511	{
	11512	/* Note that the information needed to decide to do this optimization
	11513	* is not currently available until the 2nd pass, and that the actually
	11514	* used EXACTish node takes less space than the calculated ANYOF node,
	11515	* and hence the amount of space calculated in the first pass is larger
	11516	* than actually used, so this optimization doesn't gain us any space.
	11517	* But an EXACT node is faster than an ANYOF node, and can be combined
	11518	* with any adjacent EXACT nodes later by the optimizer for further
	11519	* gains. The speed of executing an EXACTF is similar to an ANYOF
	11520	* node, so the optimization advantage comes from the ability to join
	11521	* it to adjacent EXACT nodes */
	11522
	11523	const char * cur_parse= RExC_parse;
	11524	U8 op;
	11525	RExC_emit = (regnode *)orig_emit;
	11526	RExC_parse = (char *)orig_parse;
	11527
	11528	if (stored == 1) {
	11529
	11530	/* A locale node with one point can be folded; all the other cases
	11531	* with folding will have two points, since we calculate them above
	11532	*/
	11533	if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
	11534	op = EXACTFL;
	11535	}
	11536	else {
	11537	op = EXACT;
	11538	}
	11539	}
	11540	else { /* else 2 chars in the bit map: the folds of each other */
	11541
	11542	/* Use the folded value, which for the cases where we get here,
	11543	* is just the lower case of the current one (which may resolve to
	11544	* itself, or to the other one */
	11545	value = toLOWER_LATIN1(value);
	11546
	11547	/* To join adjacent nodes, they must be the exact EXACTish type.
	11548	* Try to use the most likely type, by using EXACTFA if possible,
	11549	* then EXACTFU if the regex calls for it, or is required because
	11550	* the character is non-ASCII. (If <value> is ASCII, its fold is
	11551	* also ASCII for the cases where we get here.) */
	11552	if (MORE_ASCII_RESTRICTED && isASCII(value)) {
	11553	op = EXACTFA;
	11554	}
	11555	else if (AT_LEAST_UNI_SEMANTICS \|\| !isASCII(value)) {
	11556	op = EXACTFU;
	11557	}
	11558	else { /* Otherwise, more likely to be EXACTF type */
	11559	op = EXACTF;
	11560	}
	11561	}
	11562
	11563	ret = reg_node(pRExC_state, op);
	11564	RExC_parse = (char *)cur_parse;
	11565	if (UTF && ! NATIVE_IS_INVARIANT(value)) {
	11566	*STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
	11567	*(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
	11568	STR_LEN(ret)= 2;
	11569	RExC_emit += STR_SZ(2);
	11570	}
	11571	else {
	11572	*STRING(ret)= (char)value;
	11573	STR_LEN(ret)= 1;
	11574	RExC_emit += STR_SZ(1);
	11575	}
	11576	SvREFCNT_dec(listsv);
	11577	return ret;
	11578	}
	11579
	11580	/* If there is a swash and more than one element, we can't use the swash in
	11581	* the optimization below. */
	11582	if (swash && element_count > 1) {
	11583	SvREFCNT_dec(swash);
	11584	swash = NULL;
	11585	}
	11586	if (! nonbitmap
	11587	&& SvCUR(listsv) == initial_listsv_len
	11588	&& ! unicode_alternate)
	11589	{
	11590	ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
	11591	SvREFCNT_dec(listsv);
	11592	SvREFCNT_dec(unicode_alternate);
	11593	}
	11594	else {
	11595	/* av[0] stores the character class description in its textual form:
	11596	* used later (regexec.c:Perl_regclass_swash()) to initialize the
	11597	* appropriate swash, and is also useful for dumping the regnode.
	11598	* av[1] if NULL, is a placeholder to later contain the swash computed
	11599	* from av[0]. But if no further computation need be done, the
	11600	* swash is stored there now.
	11601	* av[2] stores the multicharacter foldings, used later in
	11602	* regexec.c:S_reginclass().
	11603	* av[3] stores the nonbitmap inversion list for use in addition or
	11604	* instead of av[0]; not used if av[1] isn't NULL
	11605	* av[4] is set if any component of the class is from a user-defined
	11606	* property; not used if av[1] isn't NULL */
	11607	AV * const av = newAV();
	11608	SV *rv;
	11609
	11610	av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
	11611	? &PL_sv_undef
	11612	: listsv);
	11613	if (swash) {
	11614	av_store(av, 1, swash);
	11615	SvREFCNT_dec(nonbitmap);
	11616	}
	11617	else {
	11618	av_store(av, 1, NULL);
	11619	if (nonbitmap) {
	11620	av_store(av, 3, nonbitmap);
	11621	av_store(av, 4, newSVuv(has_user_defined_property));
	11622	}
	11623	}
	11624
	11625	/* Store any computed multi-char folds only if we are allowing
	11626	* them */
	11627	if (allow_full_fold) {
	11628	av_store(av, 2, MUTABLE_SV(unicode_alternate));
	11629	if (unicode_alternate) { /* This node is variable length */
	11630	OP(ret) = ANYOFV;
	11631	}
	11632	}
	11633	else {
	11634	av_store(av, 2, NULL);
	11635	}
	11636	rv = newRV_noinc(MUTABLE_SV(av));
	11637	n = add_data(pRExC_state, 1, "s");
	11638	RExC_rxi->data->data[n] = (void*)rv;
	11639	ARG_SET(ret, n);
	11640	}
	11641	return ret;
	11642	}
	11643
	11644
	11645	/* reg_skipcomment()
	11646
	11647	Absorbs an /x style # comments from the input stream.
	11648	Returns true if there is more text remaining in the stream.
	11649	Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
	11650	terminates the pattern without including a newline.
	11651
	11652	Note its the callers responsibility to ensure that we are
	11653	actually in /x mode
	11654
	11655	*/
	11656
	11657	STATIC bool
	11658	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
	11659	{
	11660	bool ended = 0;
	11661
	11662	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	11663
	11664	while (RExC_parse < RExC_end)
	11665	if (*RExC_parse++ == '\n') {
	11666	ended = 1;
	11667	break;
	11668	}
	11669	if (!ended) {
	11670	/* we ran off the end of the pattern without ending
	11671	the comment, so we have to add an \n when wrapping */
	11672	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	11673	return 0;
	11674	} else
	11675	return 1;
	11676	}
	11677
	11678	/* nextchar()
	11679
	11680	Advances the parse position, and optionally absorbs
	11681	"whitespace" from the inputstream.
	11682
	11683	Without /x "whitespace" means (?#...) style comments only,
	11684	with /x this means (?#...) and # comments and whitespace proper.
	11685
	11686	Returns the RExC_parse point from BEFORE the scan occurs.
	11687
	11688	This is the /x friendly way of saying RExC_parse++.
	11689	*/
	11690
	11691	STATIC char*
	11692	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	11693	{
	11694	char* const retval = RExC_parse++;
	11695
	11696	PERL_ARGS_ASSERT_NEXTCHAR;
	11697
	11698	for (;;) {
	11699	if (RExC_end - RExC_parse >= 3
	11700	&& *RExC_parse == '('
	11701	&& RExC_parse[1] == '?'
	11702	&& RExC_parse[2] == '#')
	11703	{
	11704	while (*RExC_parse != ')') {
	11705	if (RExC_parse == RExC_end)
	11706	FAIL("Sequence (?#... not terminated");
	11707	RExC_parse++;
	11708	}
	11709	RExC_parse++;
	11710	continue;
	11711	}
	11712	if (RExC_flags & RXf_PMf_EXTENDED) {
	11713	if (isSPACE(*RExC_parse)) {
	11714	RExC_parse++;
	11715	continue;
	11716	}
	11717	else if (*RExC_parse == '#') {
	11718	if ( reg_skipcomment( pRExC_state ) )
	11719	continue;
	11720	}
	11721	}
	11722	return retval;
	11723	}
	11724	}
	11725
	11726	/*
	11727	- reg_node - emit a node
	11728	*/
	11729	STATIC regnode * /* Location. */
	11730	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	11731	{
	11732	dVAR;
	11733	register regnode *ptr;
	11734	regnode * const ret = RExC_emit;
	11735	GET_RE_DEBUG_FLAGS_DECL;
	11736
	11737	PERL_ARGS_ASSERT_REG_NODE;
	11738
	11739	if (SIZE_ONLY) {
	11740	SIZE_ALIGN(RExC_size);
	11741	RExC_size += 1;
	11742	return(ret);
	11743	}
	11744	if (RExC_emit >= RExC_emit_bound)
	11745	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	11746	op, RExC_emit, RExC_emit_bound);
	11747
	11748	NODE_ALIGN_FILL(ret);
	11749	ptr = ret;
	11750	FILL_ADVANCE_NODE(ptr, op);
	11751	#ifdef RE_TRACK_PATTERN_OFFSETS
	11752	if (RExC_offsets) { /* MJD */
	11753	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	11754	"reg_node", __LINE__,
	11755	PL_reg_name[op],
	11756	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	11757	? "Overwriting end of array!\n" : "OK",
	11758	(UV)(RExC_emit - RExC_emit_start),
	11759	(UV)(RExC_parse - RExC_start),
	11760	(UV)RExC_offsets[0]));
	11761	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
	11762	}
	11763	#endif
	11764	RExC_emit = ptr;
	11765	return(ret);
	11766	}
	11767
	11768	/*
	11769	- reganode - emit a node with an argument
	11770	*/
	11771	STATIC regnode * /* Location. */
	11772	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	11773	{
	11774	dVAR;
	11775	register regnode *ptr;
	11776	regnode * const ret = RExC_emit;
	11777	GET_RE_DEBUG_FLAGS_DECL;
	11778
	11779	PERL_ARGS_ASSERT_REGANODE;
	11780
	11781	if (SIZE_ONLY) {
	11782	SIZE_ALIGN(RExC_size);
	11783	RExC_size += 2;
	11784	/*
	11785	We can't do this:
	11786
	11787	assert(2==regarglen[op]+1);
	11788
	11789	Anything larger than this has to allocate the extra amount.
	11790	If we changed this to be:
	11791
	11792	RExC_size += (1 + regarglen[op]);
	11793
	11794	then it wouldn't matter. Its not clear what side effect
	11795	might come from that so its not done so far.
	11796	-- dmq
	11797	*/
	11798	return(ret);
	11799	}
	11800	if (RExC_emit >= RExC_emit_bound)
	11801	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	11802	op, RExC_emit, RExC_emit_bound);
	11803
	11804	NODE_ALIGN_FILL(ret);
	11805	ptr = ret;
	11806	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	11807	#ifdef RE_TRACK_PATTERN_OFFSETS
	11808	if (RExC_offsets) { /* MJD */
	11809	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	11810	"reganode",
	11811	__LINE__,
	11812	PL_reg_name[op],
	11813	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	11814	"Overwriting end of array!\n" : "OK",
	11815	(UV)(RExC_emit - RExC_emit_start),
	11816	(UV)(RExC_parse - RExC_start),
	11817	(UV)RExC_offsets[0]));
	11818	Set_Cur_Node_Offset;
	11819	}
	11820	#endif
	11821	RExC_emit = ptr;
	11822	return(ret);
	11823	}
	11824
	11825	/*
	11826	- reguni - emit (if appropriate) a Unicode character
	11827	*/
	11828	STATIC STRLEN
	11829	S_reguni(pTHX_ const RExC_state_t pRExC_state, UV uv, char s)
	11830	{
	11831	dVAR;
	11832
	11833	PERL_ARGS_ASSERT_REGUNI;
	11834
	11835	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8)s, uv) - (U8)s);
	11836	}
	11837
	11838	/*
	11839	- reginsert - insert an operator in front of already-emitted operand
	11840	*
	11841	* Means relocating the operand.
	11842	*/
	11843	STATIC void
	11844	S_reginsert(pTHX_ RExC_state_t pRExC_state, U8 op, regnode opnd, U32 depth)
	11845	{
	11846	dVAR;
	11847	register regnode *src;
	11848	register regnode *dst;
	11849	register regnode *place;
	11850	const int offset = regarglen[(U8)op];
	11851	const int size = NODE_STEP_REGNODE + offset;
	11852	GET_RE_DEBUG_FLAGS_DECL;
	11853
	11854	PERL_ARGS_ASSERT_REGINSERT;
	11855	PERL_UNUSED_ARG(depth);
	11856	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	11857	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	11858	if (SIZE_ONLY) {
	11859	RExC_size += size;
	11860	return;
	11861	}
	11862
	11863	src = RExC_emit;
	11864	RExC_emit += size;
	11865	dst = RExC_emit;
	11866	if (RExC_open_parens) {
	11867	int paren;
	11868	/DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);/
	11869	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	11870	if ( RExC_open_parens[paren] >= opnd ) {
	11871	/DEBUG_PARSE_FMT("open"," - %d",size);/
	11872	RExC_open_parens[paren] += size;
	11873	} else {
	11874	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	11875	}
	11876	if ( RExC_close_parens[paren] >= opnd ) {
	11877	/DEBUG_PARSE_FMT("close"," - %d",size);/
	11878	RExC_close_parens[paren] += size;
	11879	} else {
	11880	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	11881	}
	11882	}
	11883	}
	11884
	11885	while (src > opnd) {
	11886	StructCopy(--src, --dst, regnode);
	11887	#ifdef RE_TRACK_PATTERN_OFFSETS
	11888	if (RExC_offsets) { /* MJD 20010112 */
	11889	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	11890	"reg_insert",
	11891	__LINE__,
	11892	PL_reg_name[op],
	11893	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	11894	? "Overwriting end of array!\n" : "OK",
	11895	(UV)(src - RExC_emit_start),
	11896	(UV)(dst - RExC_emit_start),
	11897	(UV)RExC_offsets[0]));
	11898	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	11899	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	11900	}
	11901	#endif
	11902	}
	11903
	11904
	11905	place = opnd; /* Op node, where operand used to be. */
	11906	#ifdef RE_TRACK_PATTERN_OFFSETS
	11907	if (RExC_offsets) { /* MJD */
	11908	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	11909	"reginsert",
	11910	__LINE__,
	11911	PL_reg_name[op],
	11912	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	11913	? "Overwriting end of array!\n" : "OK",
	11914	(UV)(place - RExC_emit_start),
	11915	(UV)(RExC_parse - RExC_start),
	11916	(UV)RExC_offsets[0]));
	11917	Set_Node_Offset(place, RExC_parse);
	11918	Set_Node_Length(place, 1);
	11919	}
	11920	#endif
	11921	src = NEXTOPER(place);
	11922	FILL_ADVANCE_NODE(place, op);
	11923	Zero(src, offset, regnode);
	11924	}
	11925
	11926	/*
	11927	- regtail - set the next-pointer at the end of a node chain of p to val.
	11928	- SEE ALSO: regtail_study
	11929	*/
	11930	/* TODO: All three parms should be const */
	11931	STATIC void
	11932	S_regtail(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	11933	{
	11934	dVAR;
	11935	register regnode *scan;
	11936	GET_RE_DEBUG_FLAGS_DECL;
	11937
	11938	PERL_ARGS_ASSERT_REGTAIL;
	11939	#ifndef DEBUGGING
	11940	PERL_UNUSED_ARG(depth);
	11941	#endif
	11942
	11943	if (SIZE_ONLY)
	11944	return;
	11945
	11946	/* Find last node. */
	11947	scan = p;
	11948	for (;;) {
	11949	regnode * const temp = regnext(scan);
	11950	DEBUG_PARSE_r({
	11951	SV * const mysv=sv_newmortal();
	11952	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	11953	regprop(RExC_rx, mysv, scan);
	11954	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	11955	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	11956	(temp == NULL ? "->" : ""),
	11957	(temp == NULL ? PL_reg_name[OP(val)] : "")
	11958	);
	11959	});
	11960	if (temp == NULL)
	11961	break;
	11962	scan = temp;
	11963	}
	11964
	11965	if (reg_off_by_arg[OP(scan)]) {
	11966	ARG_SET(scan, val - scan);
	11967	}
	11968	else {
	11969	NEXT_OFF(scan) = val - scan;
	11970	}
	11971	}
	11972
	11973	#ifdef DEBUGGING
	11974	/*
	11975	- regtail_study - set the next-pointer at the end of a node chain of p to val.
	11976	- Look for optimizable sequences at the same time.
	11977	- currently only looks for EXACT chains.
	11978
	11979	This is experimental code. The idea is to use this routine to perform
	11980	in place optimizations on branches and groups as they are constructed,
	11981	with the long term intention of removing optimization from study_chunk so
	11982	that it is purely analytical.
	11983
	11984	Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	11985	to control which is which.
	11986
	11987	*/
	11988	/* TODO: All four parms should be const */
	11989
	11990	STATIC U8
	11991	S_regtail_study(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	11992	{
	11993	dVAR;
	11994	register regnode *scan;
	11995	U8 exact = PSEUDO;
	11996	#ifdef EXPERIMENTAL_INPLACESCAN
	11997	I32 min = 0;
	11998	#endif
	11999	GET_RE_DEBUG_FLAGS_DECL;
	12000
	12001	PERL_ARGS_ASSERT_REGTAIL_STUDY;
	12002
	12003
	12004	if (SIZE_ONLY)
	12005	return exact;
	12006
	12007	/* Find last node. */
	12008
	12009	scan = p;
	12010	for (;;) {
	12011	regnode * const temp = regnext(scan);
	12012	#ifdef EXPERIMENTAL_INPLACESCAN
	12013	if (PL_regkind[OP(scan)] == EXACT) {
	12014	bool has_exactf_sharp_s; /* Unexamined in this routine */
	12015	if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
	12016	return EXACT;
	12017	}
	12018	#endif
	12019	if ( exact ) {
	12020	switch (OP(scan)) {
	12021	case EXACT:
	12022	case EXACTF:
	12023	case EXACTFA:
	12024	case EXACTFU:
	12025	case EXACTFU_SS:
	12026	case EXACTFU_NO_TRIE:
	12027	case EXACTFL:
	12028	if( exact == PSEUDO )
	12029	exact= OP(scan);
	12030	else if ( exact != OP(scan) )
	12031	exact= 0;
	12032	case NOTHING:
	12033	break;
	12034	default:
	12035	exact= 0;
	12036	}
	12037	}
	12038	DEBUG_PARSE_r({
	12039	SV * const mysv=sv_newmortal();
	12040	DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	12041	regprop(RExC_rx, mysv, scan);
	12042	PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	12043	SvPV_nolen_const(mysv),
	12044	REG_NODE_NUM(scan),
	12045	PL_reg_name[exact]);
	12046	});
	12047	if (temp == NULL)
	12048	break;
	12049	scan = temp;
	12050	}
	12051	DEBUG_PARSE_r({
	12052	SV * const mysv_val=sv_newmortal();
	12053	DEBUG_PARSE_MSG("");
	12054	regprop(RExC_rx, mysv_val, val);
	12055	PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	12056	SvPV_nolen_const(mysv_val),
	12057	(IV)REG_NODE_NUM(val),
	12058	(IV)(val - scan)
	12059	);
	12060	});
	12061	if (reg_off_by_arg[OP(scan)]) {
	12062	ARG_SET(scan, val - scan);
	12063	}
	12064	else {
	12065	NEXT_OFF(scan) = val - scan;
	12066	}
	12067
	12068	return exact;
	12069	}
	12070	#endif
	12071
	12072	/*
	12073	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	12074	*/
	12075	#ifdef DEBUGGING
	12076	static void
	12077	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	12078	{
	12079	int bit;
	12080	int set=0;
	12081	regex_charset cs;
	12082
	12083	for (bit=0; bit<32; bit++) {
	12084	if (flags & (1<<bit)) {
	12085	if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
	12086	continue;
	12087	}
	12088	if (!set++ && lead)
	12089	PerlIO_printf(Perl_debug_log, "%s",lead);
	12090	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	12091	}
	12092	}
	12093	if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	12094	if (!set++ && lead) {
	12095	PerlIO_printf(Perl_debug_log, "%s",lead);
	12096	}
	12097	switch (cs) {
	12098	case REGEX_UNICODE_CHARSET:
	12099	PerlIO_printf(Perl_debug_log, "UNICODE");
	12100	break;
	12101	case REGEX_LOCALE_CHARSET:
	12102	PerlIO_printf(Perl_debug_log, "LOCALE");
	12103	break;
	12104	case REGEX_ASCII_RESTRICTED_CHARSET:
	12105	PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	12106	break;
	12107	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	12108	PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
	12109	break;
	12110	default:
	12111	PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	12112	break;
	12113	}
	12114	}
	12115	if (lead) {
	12116	if (set)
	12117	PerlIO_printf(Perl_debug_log, "\n");
	12118	else
	12119	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	12120	}
	12121	}
	12122	#endif
	12123
	12124	void
	12125	Perl_regdump(pTHX_ const regexp *r)
	12126	{
	12127	#ifdef DEBUGGING
	12128	dVAR;
	12129	SV * const sv = sv_newmortal();
	12130	SV *dsv= sv_newmortal();
	12131	RXi_GET_DECL(r,ri);
	12132	GET_RE_DEBUG_FLAGS_DECL;
	12133
	12134	PERL_ARGS_ASSERT_REGDUMP;
	12135
	12136	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	12137
	12138	/* Header fields of interest. */
	12139	if (r->anchored_substr) {
	12140	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	12141	RE_SV_DUMPLEN(r->anchored_substr), 30);
	12142	PerlIO_printf(Perl_debug_log,
	12143	"anchored %s%s at %"IVdf" ",
	12144	s, RE_SV_TAIL(r->anchored_substr),
	12145	(IV)r->anchored_offset);
	12146	} else if (r->anchored_utf8) {
	12147	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	12148	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	12149	PerlIO_printf(Perl_debug_log,
	12150	"anchored utf8 %s%s at %"IVdf" ",
	12151	s, RE_SV_TAIL(r->anchored_utf8),
	12152	(IV)r->anchored_offset);
	12153	}
	12154	if (r->float_substr) {
	12155	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	12156	RE_SV_DUMPLEN(r->float_substr), 30);
	12157	PerlIO_printf(Perl_debug_log,
	12158	"floating %s%s at %"IVdf"..%"UVuf" ",
	12159	s, RE_SV_TAIL(r->float_substr),
	12160	(IV)r->float_min_offset, (UV)r->float_max_offset);
	12161	} else if (r->float_utf8) {
	12162	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	12163	RE_SV_DUMPLEN(r->float_utf8), 30);
	12164	PerlIO_printf(Perl_debug_log,
	12165	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	12166	s, RE_SV_TAIL(r->float_utf8),
	12167	(IV)r->float_min_offset, (UV)r->float_max_offset);
	12168	}
	12169	if (r->check_substr \|\| r->check_utf8)
	12170	PerlIO_printf(Perl_debug_log,
	12171	(const char *)
	12172	(r->check_substr == r->float_substr
	12173	&& r->check_utf8 == r->float_utf8
	12174	? "(checking floating" : "(checking anchored"));
	12175	if (r->extflags & RXf_NOSCAN)
	12176	PerlIO_printf(Perl_debug_log, " noscan");
	12177	if (r->extflags & RXf_CHECK_ALL)
	12178	PerlIO_printf(Perl_debug_log, " isall");
	12179	if (r->check_substr \|\| r->check_utf8)
	12180	PerlIO_printf(Perl_debug_log, ") ");
	12181
	12182	if (ri->regstclass) {
	12183	regprop(r, sv, ri->regstclass);
	12184	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	12185	}
	12186	if (r->extflags & RXf_ANCH) {
	12187	PerlIO_printf(Perl_debug_log, "anchored");
	12188	if (r->extflags & RXf_ANCH_BOL)
	12189	PerlIO_printf(Perl_debug_log, "(BOL)");
	12190	if (r->extflags & RXf_ANCH_MBOL)
	12191	PerlIO_printf(Perl_debug_log, "(MBOL)");
	12192	if (r->extflags & RXf_ANCH_SBOL)
	12193	PerlIO_printf(Perl_debug_log, "(SBOL)");
	12194	if (r->extflags & RXf_ANCH_GPOS)
	12195	PerlIO_printf(Perl_debug_log, "(GPOS)");
	12196	PerlIO_putc(Perl_debug_log, ' ');
	12197	}
	12198	if (r->extflags & RXf_GPOS_SEEN)
	12199	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	12200	if (r->intflags & PREGf_SKIP)
	12201	PerlIO_printf(Perl_debug_log, "plus ");
	12202	if (r->intflags & PREGf_IMPLICIT)
	12203	PerlIO_printf(Perl_debug_log, "implicit ");
	12204	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	12205	if (r->extflags & RXf_EVAL_SEEN)
	12206	PerlIO_printf(Perl_debug_log, "with eval ");
	12207	PerlIO_printf(Perl_debug_log, "\n");
	12208	DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
	12209	#else
	12210	PERL_ARGS_ASSERT_REGDUMP;
	12211	PERL_UNUSED_CONTEXT;
	12212	PERL_UNUSED_ARG(r);
	12213	#endif /* DEBUGGING */
	12214	}
	12215
	12216	/*
	12217	- regprop - printable representation of opcode
	12218	*/
	12219	#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
	12220	STMT_START { \
	12221	if (do_sep) { \
	12222	Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
	12223	if (flags & ANYOF_INVERT) \
	12224	/make sure the invert info is in each / \
	12225	sv_catpvs(sv, "^"); \
	12226	do_sep = 0; \
	12227	} \
	12228	} STMT_END
	12229
	12230	void
	12231	Perl_regprop(pTHX_ const regexp prog, SV sv, const regnode *o)
	12232	{
	12233	#ifdef DEBUGGING
	12234	dVAR;
	12235	register int k;
	12236	RXi_GET_DECL(prog,progi);
	12237	GET_RE_DEBUG_FLAGS_DECL;
	12238
	12239	PERL_ARGS_ASSERT_REGPROP;
	12240
	12241	sv_setpvs(sv, "");
	12242
	12243	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	12244	/* It would be nice to FAIL() here, but this may be called from
	12245	regexec.c, and it would be hard to supply pRExC_state. */
	12246	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
	12247	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	12248
	12249	k = PL_regkind[OP(o)];
	12250
	12251	if (k == EXACT) {
	12252	sv_catpvs(sv, " ");
	12253	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	12254	* is a crude hack but it may be the best for now since
	12255	* we have no flag "this EXACTish node was UTF-8"
	12256	* --jhi */
	12257	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	12258	PERL_PV_ESCAPE_UNI_DETECT \|
	12259	PERL_PV_ESCAPE_NONASCII \|
	12260	PERL_PV_PRETTY_ELLIPSES \|
	12261	PERL_PV_PRETTY_LTGT \|
	12262	PERL_PV_PRETTY_NOCLEAR
	12263	);
	12264	} else if (k == TRIE) {
	12265	/* print the details of the trie in dumpuntil instead, as
	12266	* progi->data isn't available here */
	12267	const char op = OP(o);
	12268	const U32 n = ARG(o);
	12269	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	12270	(reg_ac_data *)progi->data->data[n] :
	12271	NULL;
	12272	const reg_trie_data * const trie
	12273	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	12274
	12275	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	12276	DEBUG_TRIE_COMPILE_r(
	12277	Perl_sv_catpvf(aTHX_ sv,
	12278	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	12279	(UV)trie->startstate,
	12280	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	12281	(UV)trie->wordcount,
	12282	(UV)trie->minlen,
	12283	(UV)trie->maxlen,
	12284	(UV)TRIE_CHARCOUNT(trie),
	12285	(UV)trie->uniquecharcount
	12286	)
	12287	);
	12288	if ( IS_ANYOF_TRIE(op) \|\| trie->bitmap ) {
	12289	int i;
	12290	int rangestart = -1;
	12291	U8* bitmap = IS_ANYOF_TRIE(op) ? (U8)ANYOF_BITMAP(o) : (U8)TRIE_BITMAP(trie);
	12292	sv_catpvs(sv, "[");
	12293	for (i = 0; i <= 256; i++) {
	12294	if (i < 256 && BITMAP_TEST(bitmap,i)) {
	12295	if (rangestart == -1)
	12296	rangestart = i;
	12297	} else if (rangestart != -1) {
	12298	if (i <= rangestart + 3)
	12299	for (; rangestart < i; rangestart++)
	12300	put_byte(sv, rangestart);
	12301	else {
	12302	put_byte(sv, rangestart);
	12303	sv_catpvs(sv, "-");
	12304	put_byte(sv, i - 1);
	12305	}
	12306	rangestart = -1;
	12307	}
	12308	}
	12309	sv_catpvs(sv, "]");
	12310	}
	12311
	12312	} else if (k == CURLY) {
	12313	if (OP(o) == CURLYM \|\| OP(o) == CURLYN \|\| OP(o) == CURLYX)
	12314	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	12315	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	12316	}
	12317	else if (k == WHILEM && o->flags) /* Ordinal/of */
	12318	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	12319	else if (k == REF \|\| k == OPEN \|\| k == CLOSE \|\| k == GROUPP \|\| OP(o)==ACCEPT) {
	12320	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	12321	if ( RXp_PAREN_NAMES(prog) ) {
	12322	if ( k != REF \|\| (OP(o) < NREF)) {
	12323	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	12324	SV **name= av_fetch(list, ARG(o), 0 );
	12325	if (name)
	12326	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	12327	}
	12328	else {
	12329	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	12330	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	12331	I32 nums=(I32)SvPVX(sv_dat);
	12332	SV **name= av_fetch(list, nums[0], 0 );
	12333	I32 n;
	12334	if (name) {
	12335	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	12336	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	12337	(n ? "," : ""), (IV)nums[n]);
	12338	}
	12339	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	12340	}
	12341	}
	12342	}
	12343	} else if (k == GOSUB)
	12344	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
	12345	else if (k == VERB) {
	12346	if (!o->flags)
	12347	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	12348	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	12349	} else if (k == LOGICAL)
	12350	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
	12351	else if (k == ANYOF) {
	12352	int i, rangestart = -1;
	12353	const U8 flags = ANYOF_FLAGS(o);
	12354	int do_sep = 0;
	12355
	12356	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	12357	static const char * const anyofs[] = {
	12358	"\\w",
	12359	"\\W",
	12360	"\\s",
	12361	"\\S",
	12362	"\\d",
	12363	"\\D",
	12364	"[:alnum:]",
	12365	"[:^alnum:]",
	12366	"[:alpha:]",
	12367	"[:^alpha:]",
	12368	"[:ascii:]",
	12369	"[:^ascii:]",
	12370	"[:cntrl:]",
	12371	"[:^cntrl:]",
	12372	"[:graph:]",
	12373	"[:^graph:]",
	12374	"[:lower:]",
	12375	"[:^lower:]",
	12376	"[:print:]",
	12377	"[:^print:]",
	12378	"[:punct:]",
	12379	"[:^punct:]",
	12380	"[:upper:]",
	12381	"[:^upper:]",
	12382	"[:xdigit:]",
	12383	"[:^xdigit:]",
	12384	"[:space:]",
	12385	"[:^space:]",
	12386	"[:blank:]",
	12387	"[:^blank:]"
	12388	};
	12389
	12390	if (flags & ANYOF_LOCALE)
	12391	sv_catpvs(sv, "{loc}");
	12392	if (flags & ANYOF_LOC_NONBITMAP_FOLD)
	12393	sv_catpvs(sv, "{i}");
	12394	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	12395	if (flags & ANYOF_INVERT)
	12396	sv_catpvs(sv, "^");
	12397
	12398	/* output what the standard cp 0-255 bitmap matches */
	12399	for (i = 0; i <= 256; i++) {
	12400	if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
	12401	if (rangestart == -1)
	12402	rangestart = i;
	12403	} else if (rangestart != -1) {
	12404	if (i <= rangestart + 3)
	12405	for (; rangestart < i; rangestart++)
	12406	put_byte(sv, rangestart);
	12407	else {
	12408	put_byte(sv, rangestart);
	12409	sv_catpvs(sv, "-");
	12410	put_byte(sv, i - 1);
	12411	}
	12412	do_sep = 1;
	12413	rangestart = -1;
	12414	}
	12415	}
	12416
	12417	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	12418	/* output any special charclass tests (used entirely under use locale) */
	12419	if (ANYOF_CLASS_TEST_ANY_SET(o))
	12420	for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
	12421	if (ANYOF_CLASS_TEST(o,i)) {
	12422	sv_catpv(sv, anyofs[i]);
	12423	do_sep = 1;
	12424	}
	12425
	12426	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	12427
	12428	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	12429	sv_catpvs(sv, "{non-utf8-latin1-all}");
	12430	}
	12431
	12432	/* output information about the unicode matching */
	12433	if (flags & ANYOF_UNICODE_ALL)
	12434	sv_catpvs(sv, "{unicode_all}");
	12435	else if (ANYOF_NONBITMAP(o))
	12436	sv_catpvs(sv, "{unicode}");
	12437	if (flags & ANYOF_NONBITMAP_NON_UTF8)
	12438	sv_catpvs(sv, "{outside bitmap}");
	12439
	12440	if (ANYOF_NONBITMAP(o)) {
	12441	SV lv; / Set if there is something outside the bit map */
	12442	SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
	12443	bool byte_output = FALSE; /* If something in the bitmap has been
	12444	output */
	12445
	12446	if (lv && lv != &PL_sv_undef) {
	12447	if (sw) {
	12448	U8 s[UTF8_MAXBYTES_CASE+1];
	12449
	12450	for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
	12451	uvchr_to_utf8(s, i);
	12452
	12453	if (i < 256
	12454	&& ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
	12455	things already
	12456	output as part
	12457	of the bitmap */
	12458	&& swash_fetch(sw, s, TRUE))
	12459	{
	12460	if (rangestart == -1)
	12461	rangestart = i;
	12462	} else if (rangestart != -1) {
	12463	byte_output = TRUE;
	12464	if (i <= rangestart + 3)
	12465	for (; rangestart < i; rangestart++) {
	12466	put_byte(sv, rangestart);
	12467	}
	12468	else {
	12469	put_byte(sv, rangestart);
	12470	sv_catpvs(sv, "-");
	12471	put_byte(sv, i-1);
	12472	}
	12473	rangestart = -1;
	12474	}
	12475	}
	12476	}
	12477
	12478	{
	12479	char *s = savesvpv(lv);
	12480	char * const origs = s;
	12481
	12482	while (s && s != '\n')
	12483	s++;
	12484
	12485	if (*s == '\n') {
	12486	const char * const t = ++s;
	12487
	12488	if (byte_output) {
	12489	sv_catpvs(sv, " ");
	12490	}
	12491
	12492	while (*s) {
	12493	if (*s == '\n') {
	12494
	12495	/* Truncate very long output */
	12496	if (s - origs > 256) {
	12497	Perl_sv_catpvf(aTHX_ sv,
	12498	"%.*s...",
	12499	(int) (s - origs - 1),
	12500	t);
	12501	goto out_dump;
	12502	}
	12503	*s = ' ';
	12504	}
	12505	else if (*s == '\t') {
	12506	*s = '-';
	12507	}
	12508	s++;
	12509	}
	12510	if (s[-1] == ' ')
	12511	s[-1] = 0;
	12512
	12513	sv_catpv(sv, t);
	12514	}
	12515
	12516	out_dump:
	12517
	12518	Safefree(origs);
	12519	}
	12520	SvREFCNT_dec(lv);
	12521	}
	12522	}
	12523
	12524	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	12525	}
	12526	else if (k == BRANCHJ && (OP(o) == UNLESSM \|\| OP(o) == IFMATCH))
	12527	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	12528	#else
	12529	PERL_UNUSED_CONTEXT;
	12530	PERL_UNUSED_ARG(sv);
	12531	PERL_UNUSED_ARG(o);
	12532	PERL_UNUSED_ARG(prog);
	12533	#endif /* DEBUGGING */
	12534	}
	12535
	12536	SV *
	12537	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	12538	{ /* Assume that RE_INTUIT is set */
	12539	dVAR;
	12540	struct regexp const prog = (struct regexp )SvANY(r);
	12541	GET_RE_DEBUG_FLAGS_DECL;
	12542
	12543	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	12544	PERL_UNUSED_CONTEXT;
	12545
	12546	DEBUG_COMPILE_r(
	12547	{
	12548	const char * const s = SvPV_nolen_const(prog->check_substr
	12549	? prog->check_substr : prog->check_utf8);
	12550
	12551	if (!PL_colorset) reginitcolors();
	12552	PerlIO_printf(Perl_debug_log,
	12553	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	12554	PL_colors[4],
	12555	prog->check_substr ? "" : "utf8 ",
	12556	PL_colors[5],PL_colors[0],
	12557	s,
	12558	PL_colors[1],
	12559	(strlen(s) > 60 ? "..." : ""));
	12560	} );
	12561
	12562	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	12563	}
	12564
	12565	/*
	12566	pregfree()
	12567
	12568	handles refcounting and freeing the perl core regexp structure. When
	12569	it is necessary to actually free the structure the first thing it
	12570	does is call the 'free' method of the regexp_engine associated to
	12571	the regexp, allowing the handling of the void *pprivate; member
	12572	first. (This routine is not overridable by extensions, which is why
	12573	the extensions free is called first.)
	12574
	12575	See regdupe and regdupe_internal if you change anything here.
	12576	*/
	12577	#ifndef PERL_IN_XSUB_RE
	12578	void
	12579	Perl_pregfree(pTHX_ REGEXP *r)
	12580	{
	12581	SvREFCNT_dec(r);
	12582	}
	12583
	12584	void
	12585	Perl_pregfree2(pTHX_ REGEXP *rx)
	12586	{
	12587	dVAR;
	12588	struct regexp const r = (struct regexp )SvANY(rx);
	12589	GET_RE_DEBUG_FLAGS_DECL;
	12590
	12591	PERL_ARGS_ASSERT_PREGFREE2;
	12592
	12593	if (r->mother_re) {
	12594	ReREFCNT_dec(r->mother_re);
	12595	} else {
	12596	CALLREGFREE_PVT(rx); /* free the private data */
	12597	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	12598	}
	12599	if (r->substrs) {
	12600	SvREFCNT_dec(r->anchored_substr);
	12601	SvREFCNT_dec(r->anchored_utf8);
	12602	SvREFCNT_dec(r->float_substr);
	12603	SvREFCNT_dec(r->float_utf8);
	12604	Safefree(r->substrs);
	12605	}
	12606	RX_MATCH_COPY_FREE(rx);
	12607	#ifdef PERL_OLD_COPY_ON_WRITE
	12608	SvREFCNT_dec(r->saved_copy);
	12609	#endif
	12610	Safefree(r->offs);
	12611	}
	12612
	12613	/* reg_temp_copy()
	12614
	12615	This is a hacky workaround to the structural issue of match results
	12616	being stored in the regexp structure which is in turn stored in
	12617	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	12618	could be PL_curpm in multiple contexts, and could require multiple
	12619	result sets being associated with the pattern simultaneously, such
	12620	as when doing a recursive match with (??{$qr})
	12621
	12622	The solution is to make a lightweight copy of the regexp structure
	12623	when a qr// is returned from the code executed by (??{$qr}) this
	12624	lightweight copy doesn't actually own any of its data except for
	12625	the starp/end and the actual regexp structure itself.
	12626
	12627	*/
	12628
	12629
	12630	REGEXP *
	12631	Perl_reg_temp_copy (pTHX_ REGEXP ret_x, REGEXP rx)
	12632	{
	12633	struct regexp *ret;
	12634	struct regexp const r = (struct regexp )SvANY(rx);
	12635	register const I32 npar = r->nparens+1;
	12636
	12637	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	12638
	12639	if (!ret_x)
	12640	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	12641	ret = (struct regexp *)SvANY(ret_x);
	12642
	12643	(void)ReREFCNT_inc(rx);
	12644	/* We can take advantage of the existing "copied buffer" mechanism in SVs
	12645	by pointing directly at the buffer, but flagging that the allocated
	12646	space in the copy is zero. As we've just done a struct copy, it's now
	12647	a case of zero-ing that, rather than copying the current length. */
	12648	SvPV_set(ret_x, RX_WRAPPED(rx));
	12649	SvFLAGS(ret_x) \|= SvFLAGS(rx) & (SVf_POK\|SVp_POK\|SVf_UTF8);
	12650	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	12651	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	12652	SvLEN_set(ret_x, 0);
	12653	SvSTASH_set(ret_x, NULL);
	12654	SvMAGIC_set(ret_x, NULL);
	12655	Newx(ret->offs, npar, regexp_paren_pair);
	12656	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	12657	if (r->substrs) {
	12658	Newx(ret->substrs, 1, struct reg_substr_data);
	12659	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	12660
	12661	SvREFCNT_inc_void(ret->anchored_substr);
	12662	SvREFCNT_inc_void(ret->anchored_utf8);
	12663	SvREFCNT_inc_void(ret->float_substr);
	12664	SvREFCNT_inc_void(ret->float_utf8);
	12665
	12666	/* check_substr and check_utf8, if non-NULL, point to either their
	12667	anchored or float namesakes, and don't hold a second reference. */
	12668	}
	12669	RX_MATCH_COPIED_off(ret_x);
	12670	#ifdef PERL_OLD_COPY_ON_WRITE
	12671	ret->saved_copy = NULL;
	12672	#endif
	12673	ret->mother_re = rx;
	12674
	12675	return ret_x;
	12676	}
	12677	#endif
	12678
	12679	/* regfree_internal()
	12680
	12681	Free the private data in a regexp. This is overloadable by
	12682	extensions. Perl takes care of the regexp structure in pregfree(),
	12683	this covers the *pprivate pointer which technically perl doesn't
	12684	know about, however of course we have to handle the
	12685	regexp_internal structure when no extension is in use.
	12686
	12687	Note this is called before freeing anything in the regexp
	12688	structure.
	12689	*/
	12690
	12691	void
	12692	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	12693	{
	12694	dVAR;
	12695	struct regexp const r = (struct regexp )SvANY(rx);
	12696	RXi_GET_DECL(r,ri);
	12697	GET_RE_DEBUG_FLAGS_DECL;
	12698
	12699	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	12700
	12701	DEBUG_COMPILE_r({
	12702	if (!PL_colorset)
	12703	reginitcolors();
	12704	{
	12705	SV *dsv= sv_newmortal();
	12706	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	12707	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	12708	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	12709	PL_colors[4],PL_colors[5],s);
	12710	}
	12711	});
	12712	#ifdef RE_TRACK_PATTERN_OFFSETS
	12713	if (ri->u.offsets)
	12714	Safefree(ri->u.offsets); /* 20010421 MJD */
	12715	#endif
	12716	if (ri->data) {
	12717	int n = ri->data->count;
	12718	PAD* new_comppad = NULL;
	12719	PAD* old_comppad;
	12720	PADOFFSET refcnt;
	12721
	12722	while (--n >= 0) {
	12723	/* If you add a ->what type here, update the comment in regcomp.h */
	12724	switch (ri->data->what[n]) {
	12725	case 'a':
	12726	case 's':
	12727	case 'S':
	12728	case 'u':
	12729	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	12730	break;
	12731	case 'f':
	12732	Safefree(ri->data->data[n]);
	12733	break;
	12734	case 'p':
	12735	new_comppad = MUTABLE_AV(ri->data->data[n]);
	12736	break;
	12737	case 'o':
	12738	if (new_comppad == NULL)
	12739	Perl_croak(aTHX_ "panic: pregfree comppad");
	12740	PAD_SAVE_LOCAL(old_comppad,
	12741	/* Watch out for global destruction's random ordering. */
	12742	(SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
	12743	);
	12744	OP_REFCNT_LOCK;
	12745	refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
	12746	OP_REFCNT_UNLOCK;
	12747	if (!refcnt)
	12748	op_free((OP_4tree*)ri->data->data[n]);
	12749
	12750	PAD_RESTORE_LOCAL(old_comppad);
	12751	SvREFCNT_dec(MUTABLE_SV(new_comppad));
	12752	new_comppad = NULL;
	12753	break;
	12754	case 'n':
	12755	break;
	12756	case 'T':
	12757	{ /* Aho Corasick add-on structure for a trie node.
	12758	Used in stclass optimization only */
	12759	U32 refcount;
	12760	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	12761	OP_REFCNT_LOCK;
	12762	refcount = --aho->refcount;
	12763	OP_REFCNT_UNLOCK;
	12764	if ( !refcount ) {
	12765	PerlMemShared_free(aho->states);
	12766	PerlMemShared_free(aho->fail);
	12767	/* do this last!!!! */
	12768	PerlMemShared_free(ri->data->data[n]);
	12769	PerlMemShared_free(ri->regstclass);
	12770	}
	12771	}
	12772	break;
	12773	case 't':
	12774	{
	12775	/* trie structure. */
	12776	U32 refcount;
	12777	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	12778	OP_REFCNT_LOCK;
	12779	refcount = --trie->refcount;
	12780	OP_REFCNT_UNLOCK;
	12781	if ( !refcount ) {
	12782	PerlMemShared_free(trie->charmap);
	12783	PerlMemShared_free(trie->states);
	12784	PerlMemShared_free(trie->trans);
	12785	if (trie->bitmap)
	12786	PerlMemShared_free(trie->bitmap);
	12787	if (trie->jump)
	12788	PerlMemShared_free(trie->jump);
	12789	PerlMemShared_free(trie->wordinfo);
	12790	/* do this last!!!! */
	12791	PerlMemShared_free(ri->data->data[n]);
	12792	}
	12793	}
	12794	break;
	12795	default:
	12796	Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
	12797	}
	12798	}
	12799	Safefree(ri->data->what);
	12800	Safefree(ri->data);
	12801	}
	12802
	12803	Safefree(ri);
	12804	}
	12805
	12806	#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
	12807	#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
	12808	#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
	12809
	12810	/*
	12811	re_dup - duplicate a regexp.
	12812
	12813	This routine is expected to clone a given regexp structure. It is only
	12814	compiled under USE_ITHREADS.
	12815
	12816	After all of the core data stored in struct regexp is duplicated
	12817	the regexp_engine.dupe method is used to copy any private data
	12818	stored in the *pprivate pointer. This allows extensions to handle
	12819	any duplication it needs to do.
	12820
	12821	See pregfree() and regfree_internal() if you change anything here.
	12822	*/
	12823	#if defined(USE_ITHREADS)
	12824	#ifndef PERL_IN_XSUB_RE
	12825	void
	12826	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	12827	{
	12828	dVAR;
	12829	I32 npar;
	12830	const struct regexp r = (const struct regexp )SvANY(sstr);
	12831	struct regexp ret = (struct regexp )SvANY(dstr);
	12832
	12833	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	12834
	12835	npar = r->nparens+1;
	12836	Newx(ret->offs, npar, regexp_paren_pair);
	12837	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	12838	if(ret->swap) {
	12839	/* no need to copy these */
	12840	Newx(ret->swap, npar, regexp_paren_pair);
	12841	}
	12842
	12843	if (ret->substrs) {
	12844	/* Do it this way to avoid reading from *r after the StructCopy().
	12845	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	12846	cache, it doesn't matter. */
	12847	const bool anchored = r->check_substr
	12848	? r->check_substr == r->anchored_substr
	12849	: r->check_utf8 == r->anchored_utf8;
	12850	Newx(ret->substrs, 1, struct reg_substr_data);
	12851	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	12852
	12853	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	12854	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	12855	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	12856	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	12857
	12858	/* check_substr and check_utf8, if non-NULL, point to either their
	12859	anchored or float namesakes, and don't hold a second reference. */
	12860
	12861	if (ret->check_substr) {
	12862	if (anchored) {
	12863	assert(r->check_utf8 == r->anchored_utf8);
	12864	ret->check_substr = ret->anchored_substr;
	12865	ret->check_utf8 = ret->anchored_utf8;
	12866	} else {
	12867	assert(r->check_substr == r->float_substr);
	12868	assert(r->check_utf8 == r->float_utf8);
	12869	ret->check_substr = ret->float_substr;
	12870	ret->check_utf8 = ret->float_utf8;
	12871	}
	12872	} else if (ret->check_utf8) {
	12873	if (anchored) {
	12874	ret->check_utf8 = ret->anchored_utf8;
	12875	} else {
	12876	ret->check_utf8 = ret->float_utf8;
	12877	}
	12878	}
	12879	}
	12880
	12881	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	12882
	12883	if (ret->pprivate)
	12884	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	12885
	12886	if (RX_MATCH_COPIED(dstr))
	12887	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	12888	else
	12889	ret->subbeg = NULL;
	12890	#ifdef PERL_OLD_COPY_ON_WRITE
	12891	ret->saved_copy = NULL;
	12892	#endif
	12893
	12894	if (ret->mother_re) {
	12895	if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
	12896	/* Our storage points directly to our mother regexp, but that's
	12897	1: a buffer in a different thread
	12898	2: something we no longer hold a reference on
	12899	so we need to copy it locally. */
	12900	/* Note we need to use SvCUR(), rather than
	12901	SvLEN(), on our mother_re, because it, in
	12902	turn, may well be pointing to its own mother_re. */
	12903	SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
	12904	SvCUR(ret->mother_re)+1));
	12905	SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
	12906	}
	12907	ret->mother_re = NULL;
	12908	}
	12909	ret->gofs = 0;
	12910	}
	12911	#endif /* PERL_IN_XSUB_RE */
	12912
	12913	/*
	12914	regdupe_internal()
	12915
	12916	This is the internal complement to regdupe() which is used to copy
	12917	the structure pointed to by the *pprivate pointer in the regexp.
	12918	This is the core version of the extension overridable cloning hook.
	12919	The regexp structure being duplicated will be copied by perl prior
	12920	to this and will be provided as the regexp *r argument, however
	12921	with the /old/ structures pprivate pointer value. Thus this routine
	12922	may override any copying normally done by perl.
	12923
	12924	It returns a pointer to the new regexp_internal structure.
	12925	*/
	12926
	12927	void *
	12928	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	12929	{
	12930	dVAR;
	12931	struct regexp const r = (struct regexp )SvANY(rx);
	12932	regexp_internal *reti;
	12933	int len;
	12934	RXi_GET_DECL(r,ri);
	12935
	12936	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	12937
	12938	len = ProgLen(ri);
	12939
	12940	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
	12941	Copy(ri->program, reti->program, len+1, regnode);
	12942
	12943
	12944	reti->regstclass = NULL;
	12945
	12946	if (ri->data) {
	12947	struct reg_data *d;
	12948	const int count = ri->data->count;
	12949	int i;
	12950
	12951	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	12952	char, struct reg_data);
	12953	Newx(d->what, count, U8);
	12954
	12955	d->count = count;
	12956	for (i = 0; i < count; i++) {
	12957	d->what[i] = ri->data->what[i];
	12958	switch (d->what[i]) {
	12959	/* legal options are one of: sSfpontTua
	12960	see also regcomp.h and pregfree() */
	12961	case 'a': /* actually an AV, but the dup function is identical. */
	12962	case 's':
	12963	case 'S':
	12964	case 'p': /* actually an AV, but the dup function is identical. */
	12965	case 'u': /* actually an HV, but the dup function is identical. */
	12966	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	12967	break;
	12968	case 'f':
	12969	/* This is cheating. */
	12970	Newx(d->data[i], 1, struct regnode_charclass_class);
	12971	StructCopy(ri->data->data[i], d->data[i],
	12972	struct regnode_charclass_class);
	12973	reti->regstclass = (regnode*)d->data[i];
	12974	break;
	12975	case 'o':
	12976	/* Compiled op trees are readonly and in shared memory,
	12977	and can thus be shared without duplication. */
	12978	OP_REFCNT_LOCK;
	12979	d->data[i] = (void)OpREFCNT_inc((OP)ri->data->data[i]);
	12980	OP_REFCNT_UNLOCK;
	12981	break;
	12982	case 'T':
	12983	/* Trie stclasses are readonly and can thus be shared
	12984	* without duplication. We free the stclass in pregfree
	12985	* when the corresponding reg_ac_data struct is freed.
	12986	*/
	12987	reti->regstclass= ri->regstclass;
	12988	/* Fall through */
	12989	case 't':
	12990	OP_REFCNT_LOCK;
	12991	((reg_trie_data*)ri->data->data[i])->refcount++;
	12992	OP_REFCNT_UNLOCK;
	12993	/* Fall through */
	12994	case 'n':
	12995	d->data[i] = ri->data->data[i];
	12996	break;
	12997	default:
	12998	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
	12999	}
	13000	}
	13001
	13002	reti->data = d;
	13003	}
	13004	else
	13005	reti->data = NULL;
	13006
	13007	reti->name_list_idx = ri->name_list_idx;
	13008
	13009	#ifdef RE_TRACK_PATTERN_OFFSETS
	13010	if (ri->u.offsets) {
	13011	Newx(reti->u.offsets, 2*len+1, U32);
	13012	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	13013	}
	13014	#else
	13015	SetProgLen(reti,len);
	13016	#endif
	13017
	13018	return (void*)reti;
	13019	}
	13020
	13021	#endif /* USE_ITHREADS */
	13022
	13023	#ifndef PERL_IN_XSUB_RE
	13024
	13025	/*
	13026	- regnext - dig the "next" pointer out of a node
	13027	*/
	13028	regnode *
	13029	Perl_regnext(pTHX_ register regnode *p)
	13030	{
	13031	dVAR;
	13032	register I32 offset;
	13033
	13034	if (!p)
	13035	return(NULL);
	13036
	13037	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	13038	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
	13039	}
	13040
	13041	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	13042	if (offset == 0)
	13043	return(NULL);
	13044
	13045	return(p+offset);
	13046	}
	13047	#endif
	13048
	13049	STATIC void
	13050	S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
	13051	{
	13052	va_list args;
	13053	STRLEN l1 = strlen(pat1);
	13054	STRLEN l2 = strlen(pat2);
	13055	char buf[512];
	13056	SV *msv;
	13057	const char *message;
	13058
	13059	PERL_ARGS_ASSERT_RE_CROAK2;
	13060
	13061	if (l1 > 510)
	13062	l1 = 510;
	13063	if (l1 + l2 > 510)
	13064	l2 = 510 - l1;
	13065	Copy(pat1, buf, l1 , char);
	13066	Copy(pat2, buf + l1, l2 , char);
	13067	buf[l1 + l2] = '\n';
	13068	buf[l1 + l2 + 1] = '\0';
	13069	#ifdef I_STDARG
	13070	/* ANSI variant takes additional second argument */
	13071	va_start(args, pat2);
	13072	#else
	13073	va_start(args);
	13074	#endif
	13075	msv = vmess(buf, &args);
	13076	va_end(args);
	13077	message = SvPV_const(msv,l1);
	13078	if (l1 > 512)
	13079	l1 = 512;
	13080	Copy(message, buf, l1 , char);
	13081	buf[l1-1] = '\0'; /* Overwrite \n */
	13082	Perl_croak(aTHX_ "%s", buf);
	13083	}
	13084
	13085	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	13086
	13087	#ifndef PERL_IN_XSUB_RE
	13088	void
	13089	Perl_save_re_context(pTHX)
	13090	{
	13091	dVAR;
	13092
	13093	struct re_save_state *state;
	13094
	13095	SAVEVPTR(PL_curcop);
	13096	SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
	13097
	13098	state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
	13099	PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
	13100	SSPUSHUV(SAVEt_RE_STATE);
	13101
	13102	Copy(&PL_reg_state, state, 1, struct re_save_state);
	13103
	13104	PL_reg_start_tmp = 0;
	13105	PL_reg_start_tmpl = 0;
	13106	PL_reg_oldsaved = NULL;
	13107	PL_reg_oldsavedlen = 0;
	13108	PL_reg_maxiter = 0;
	13109	PL_reg_leftiter = 0;
	13110	PL_reg_poscache = NULL;
	13111	PL_reg_poscache_size = 0;
	13112	#ifdef PERL_OLD_COPY_ON_WRITE
	13113	PL_nrs = NULL;
	13114	#endif
	13115
	13116	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	13117	if (PL_curpm) {
	13118	const REGEXP * const rx = PM_GETRE(PL_curpm);
	13119	if (rx) {
	13120	U32 i;
	13121	for (i = 1; i <= RX_NPARENS(rx); i++) {
	13122	char digits[TYPE_CHARS(long)];
	13123	const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
	13124	GV const const gvp
	13125	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	13126
	13127	if (gvp) {
	13128	GV * const gv = *gvp;
	13129	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	13130	save_scalar(gv);
	13131	}
	13132	}
	13133	}
	13134	}
	13135	}
	13136	#endif
	13137
	13138	static void
	13139	clear_re(pTHX_ void *r)
	13140	{
	13141	dVAR;
	13142	ReREFCNT_dec((REGEXP *)r);
	13143	}
	13144
	13145	#ifdef DEBUGGING
	13146
	13147	STATIC void
	13148	S_put_byte(pTHX_ SV *sv, int c)
	13149	{
	13150	PERL_ARGS_ASSERT_PUT_BYTE;
	13151
	13152	/* Our definition of isPRINT() ignores locales, so only bytes that are
	13153	not part of UTF-8 are considered printable. I assume that the same
	13154	holds for UTF-EBCDIC.
	13155	Also, code point 255 is not printable in either (it's E0 in EBCDIC,
	13156	which Wikipedia says:
	13157
	13158	EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
	13159	ones (binary 1111 1111, hexadecimal FF). It is similar, but not
	13160	identical, to the ASCII delete (DEL) or rubout control character.
	13161	) So the old condition can be simplified to !isPRINT(c) */
	13162	if (!isPRINT(c)) {
	13163	if (c < 256) {
	13164	Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
	13165	}
	13166	else {
	13167	Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
	13168	}
	13169	}
	13170	else {
	13171	const char string = c;
	13172	if (c == '-' \|\| c == ']' \|\| c == '\\' \|\| c == '^')
	13173	sv_catpvs(sv, "\\");
	13174	sv_catpvn(sv, &string, 1);
	13175	}
	13176	}
	13177
	13178
	13179	#define CLEAR_OPTSTART \
	13180	if (optstart) STMT_START { \
	13181	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	13182	optstart=NULL; \
	13183	} STMT_END
	13184
	13185	#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
	13186
	13187	STATIC const regnode *
	13188	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	13189	const regnode last, const regnode plast,
	13190	SV* sv, I32 indent, U32 depth)
	13191	{
	13192	dVAR;
	13193	register U8 op = PSEUDO; /* Arbitrary non-END op. */
	13194	register const regnode *next;
	13195	const regnode *optstart= NULL;
	13196
	13197	RXi_GET_DECL(r,ri);
	13198	GET_RE_DEBUG_FLAGS_DECL;
	13199
	13200	PERL_ARGS_ASSERT_DUMPUNTIL;
	13201
	13202	#ifdef DEBUG_DUMPUNTIL
	13203	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	13204	last ? last-start : 0,plast ? plast-start : 0);
	13205	#endif
	13206
	13207	if (plast && plast < last)
	13208	last= plast;
	13209
	13210	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	13211	/* While that wasn't END last time... */
	13212	NODE_ALIGN(node);
	13213	op = OP(node);
	13214	if (op == CLOSE \|\| op == WHILEM)
	13215	indent--;
	13216	next = regnext((regnode *)node);
	13217
	13218	/* Where, what. */
	13219	if (OP(node) == OPTIMIZED) {
	13220	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	13221	optstart = node;
	13222	else
	13223	goto after_print;
	13224	} else
	13225	CLEAR_OPTSTART;
	13226
	13227	regprop(r, sv, node);
	13228	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	13229	(int)(2*indent + 1), "", SvPVX_const(sv));
	13230
	13231	if (OP(node) != OPTIMIZED) {
	13232	if (next == NULL) /* Next ptr. */
	13233	PerlIO_printf(Perl_debug_log, " (0)");
	13234	else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
	13235	PerlIO_printf(Perl_debug_log, " (FAIL)");
	13236	else
	13237	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	13238	(void)PerlIO_putc(Perl_debug_log, '\n');
	13239	}
	13240
	13241	after_print:
	13242	if (PL_regkind[(U8)op] == BRANCHJ) {
	13243	assert(next);
	13244	{
	13245	register const regnode *nnode = (OP(next) == LONGJMP
	13246	? regnext((regnode *)next)
	13247	: next);
	13248	if (last && nnode > last)
	13249	nnode = last;
	13250	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	13251	}
	13252	}
	13253	else if (PL_regkind[(U8)op] == BRANCH) {
	13254	assert(next);
	13255	DUMPUNTIL(NEXTOPER(node), next);
	13256	}
	13257	else if ( PL_regkind[(U8)op] == TRIE ) {
	13258	const regnode *this_trie = node;
	13259	const char op = OP(node);
	13260	const U32 n = ARG(node);
	13261	const reg_ac_data * const ac = op>=AHOCORASICK ?
	13262	(reg_ac_data *)ri->data->data[n] :
	13263	NULL;
	13264	const reg_trie_data * const trie =
	13265	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	13266	#ifdef DEBUGGING
	13267	AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	13268	#endif
	13269	const regnode *nextbranch= NULL;
	13270	I32 word_idx;
	13271	sv_setpvs(sv, "");
	13272	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	13273	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	13274
	13275	PerlIO_printf(Perl_debug_log, "%*s%s ",
	13276	(int)(2*(indent+3)), "",
	13277	elem_ptr ? pv_pretty(sv, SvPV_nolen_const(elem_ptr), SvCUR(elem_ptr), 60,
	13278	PL_colors[0], PL_colors[1],
	13279	(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) \|
	13280	PERL_PV_PRETTY_ELLIPSES \|
	13281	PERL_PV_PRETTY_LTGT
	13282	)
	13283	: "???"
	13284	);
	13285	if (trie->jump) {
	13286	U16 dist= trie->jump[word_idx+1];
	13287	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	13288	(UV)((dist ? this_trie + dist : next) - start));
	13289	if (dist) {
	13290	if (!nextbranch)
	13291	nextbranch= this_trie + trie->jump[0];
	13292	DUMPUNTIL(this_trie + dist, nextbranch);
	13293	}
	13294	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	13295	nextbranch= regnext((regnode *)nextbranch);
	13296	} else {
	13297	PerlIO_printf(Perl_debug_log, "\n");
	13298	}
	13299	}
	13300	if (last && next > last)
	13301	node= last;
	13302	else
	13303	node= next;
	13304	}
	13305	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	13306	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	13307	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	13308	}
	13309	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	13310	assert(next);
	13311	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	13312	}
	13313	else if ( op == PLUS \|\| op == STAR) {
	13314	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	13315	}
	13316	else if (PL_regkind[(U8)op] == ANYOF) {
	13317	/* arglen 1 + class block */
	13318	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
	13319	? ANYOF_CLASS_SKIP : ANYOF_SKIP);
	13320	node = NEXTOPER(node);
	13321	}
	13322	else if (PL_regkind[(U8)op] == EXACT) {
	13323	/* Literal string, where present. */
	13324	node += NODE_SZ_STR(node) - 1;
	13325	node = NEXTOPER(node);
	13326	}
	13327	else {
	13328	node = NEXTOPER(node);
	13329	node += regarglen[(U8)op];
	13330	}
	13331	if (op == CURLYX \|\| op == OPEN)
	13332	indent++;
	13333	}
	13334	CLEAR_OPTSTART;
	13335	#ifdef DEBUG_DUMPUNTIL
	13336	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	13337	#endif
	13338	return node;
	13339	}
	13340
	13341	#endif /* DEBUGGING */
	13342
	13343	/*
	13344	* Local variables:
	13345	* c-indentation-style: bsd
	13346	* c-basic-offset: 4
	13347	* indent-tabs-mode: t
	13348	* End:
	13349	*
	13350	* ex: set ts=8 sts=4 sw=4 noet:
	13351	*/