perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* be confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	#else
	85	# include "regcomp.h"
	86	#endif
	87
	88	#include "dquote_static.c"
	89
	/* Drop any macro named `op' that is defined at this point. */
	#ifdef op
	#undef op
	#endif /* op */

	#ifdef MSDOS
	#  if defined(BUGGY_MSC6)
	     /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
	#    pragma optimize("a",off)
	     /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
	#    pragma optimize("w",on )
	#  endif /* BUGGY_MSC6 */
	#endif /* MSDOS */

	/* Storage class used for the file-local functions below; a definition
	 * supplied earlier takes precedence over this default. */
	#ifndef STATIC
	#define STATIC static
	#endif
	106
	107	typedef struct RExC_state_t {
	108	U32 flags; /* are we folding, multilining? */
	109	char precomp; / uncompiled string. */
	110	REGEXP rx_sv; / The SV that is the regexp. */
	111	regexp rx; / perl core regexp structure */
	112	regexp_internal rxi; / internal data for regexp object pprivate field */
	113	char start; / Start of input for compile */
	114	char end; / End of input for compile */
	115	char parse; / Input-scan pointer. */
	116	I32 whilem_seen; /* number of WHILEM in this expr */
	117	regnode emit_start; / Start of emitted-code area */
	118	regnode emit_bound; / First regnode outside of the allocated space */
	119	regnode emit; / Code-emit pointer; &regdummy = don't = compiling */
	120	I32 naughty; /* How bad is this pattern? */
	121	I32 sawback; /* Did we see \1, ...? */
	122	U32 seen;
	123	I32 size; /* Code size. */
	124	I32 npar; /* Capture buffer count, (OPEN). */
	125	I32 cpar; /* Capture buffer count, (CLOSE). */
	126	I32 nestroot; /* root parens we are in - used by accept */
	127	I32 extralen;
	128	I32 seen_zerolen;
	129	I32 seen_evals;
	130	regnode *open_parens; / pointers to open parens */
	131	regnode *close_parens; / pointers to close parens */
	132	regnode opend; / END node in program */
	133	I32 utf8; /* whether the pattern is utf8 or not */
	134	I32 orig_utf8; /* whether the pattern was originally in utf8 */
	135	/* XXX use this for future optimisation of case
	136	* where pattern must be upgraded to utf8. */
	137	I32 uni_semantics; /* If a d charset modifier should use unicode
	138	rules, even if the pattern is not in
	139	utf8 */
	140	HV paren_names; / Paren names */
	141
	142	regnode *recurse; / Recurse regops */
	143	I32 recurse_count; /* Number of recurse regops */
	144	I32 in_lookbehind;
	145	I32 contains_locale;
	146	I32 override_recoding;
	147	#if ADD_TO_REGEXEC
	148	char starttry; / -Dr: where regtry was called. */
	149	#define RExC_starttry (pRExC_state->starttry)
	150	#endif
	151	#ifdef DEBUGGING
	152	const char *lastparse;
	153	I32 lastnum;
	154	AV paren_name_list; / idx -> name */
	155	#define RExC_lastparse (pRExC_state->lastparse)
	156	#define RExC_lastnum (pRExC_state->lastnum)
	157	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	158	#endif
	159	} RExC_state_t;
	160
	/* Field accessors.  Each one expands to a member of the object pointed to
	 * by a variable named pRExC_state, which must be in scope at the point of
	 * use.  All of them are lvalues. */
	#define RExC_flags (pRExC_state->flags)
	#define RExC_precomp (pRExC_state->precomp)
	#define RExC_rx_sv (pRExC_state->rx_sv)
	#define RExC_rx (pRExC_state->rx)
	#define RExC_rxi (pRExC_state->rxi)
	#define RExC_start (pRExC_state->start)
	#define RExC_end (pRExC_state->end)
	#define RExC_parse (pRExC_state->parse)
	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	#ifdef RE_TRACK_PATTERN_OFFSETS
	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
	#endif
	#define RExC_emit (pRExC_state->emit)
	#define RExC_emit_start (pRExC_state->emit_start)
	#define RExC_emit_bound (pRExC_state->emit_bound)
	#define RExC_naughty (pRExC_state->naughty)
	#define RExC_sawback (pRExC_state->sawback)
	#define RExC_seen (pRExC_state->seen)
	#define RExC_size (pRExC_state->size)
	#define RExC_npar (pRExC_state->npar)
	#define RExC_nestroot (pRExC_state->nestroot)
	#define RExC_extralen (pRExC_state->extralen)
	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	#define RExC_seen_evals (pRExC_state->seen_evals)
	#define RExC_utf8 (pRExC_state->utf8)
	#define RExC_uni_semantics (pRExC_state->uni_semantics)
	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	#define RExC_open_parens (pRExC_state->open_parens)
	#define RExC_close_parens (pRExC_state->close_parens)
	#define RExC_opend (pRExC_state->opend)
	#define RExC_paren_names (pRExC_state->paren_names)
	#define RExC_recurse (pRExC_state->recurse)
	#define RExC_recurse_count (pRExC_state->recurse_count)
	#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
	#define RExC_contains_locale (pRExC_state->contains_locale)
	#define RExC_override_recoding (pRExC_state->override_recoding)
	197
	198
	/* ISMULT1(c): true if the character c is one of the simple quantifiers
	 * '*', '+' or '?'.
	 * ISMULT2(s): true if the string s starts with one of those, or with a
	 * '{' that regcurly() accepts.  s is evaluated more than once. */
	#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
	#define ISMULT2(s) (*(s) == '*' || *(s) == '+' || *(s) == '?' || \
	    (*(s) == '{' && regcurly(s)))
	202
	#ifdef SPSTART
	#undef SPSTART /* dratted cpp namespace... */
	#endif
	/*
	 * Flags to be passed up and down.  Apart from WORST (no bits set) each
	 * value is a distinct single bit, so they can be or'ed together.
	 */
	#define WORST 0 /* Worst case. */
	#define HASWIDTH 0x01 /* Known to match non-null strings. */

	/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
	 * character, and if utf8, must be invariant. Note that this is not the same
	 * thing as REGNODE_SIMPLE */
	#define SIMPLE 0x02
	#define SPSTART 0x04 /* Starts with * or +. */
	#define TRYAGAIN 0x08 /* Weeded out a declaration. */
	#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
	218
	/* Index of node x counted from RExC_emit_start, or -1 when x is NULL. */
	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)

	/* whether trie related optimizations are enabled */
	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	#define TRIE_STUDY_OPT
	#define FULL_TRIE_STUDY
	#define TRIE_STCLASS
	#endif
	227
	228
	229
	/* Bit-vector helpers: u8str is treated as an array of U8 in which bit
	 * number `paren' lives in byte (paren >> 3) at bit position (paren & 7).
	 * PAREN_TEST yields non-zero when the bit is set; PAREN_SET and
	 * PAREN_UNSET turn it on and off.  `paren' is evaluated twice. */
	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
	#define PBITVAL(paren) (1 << ((paren) & 7))
	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
	#define PAREN_SET(u8str,paren) (PBYTE(u8str,paren) |= PBITVAL(paren))
	#define PAREN_UNSET(u8str,paren) (PBYTE(u8str,paren) &= (~PBITVAL(paren)))
	235
	/* If not already in utf8, do a longjmp back to the beginning.
	 * UTF8_LONGJMP is the value handed to JMPENV_JUMP so the landing site can
	 * tell this jump apart from others. */
	#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
	#define REQUIRE_UTF8 STMT_START { \
	    if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
	} STMT_END
	241
	242	/* About scan_data_t.
	243
	244	During optimisation we recurse through the regexp program performing
	245	various inplace (keyhole style) optimisations. In addition study_chunk
	246	and scan_commit populate this data structure with information about
	247	what strings MUST appear in the pattern. We look for the longest
	248	string that must appear at a fixed location, and we look for the
	249	longest string that may appear at a floating location. So for instance
	250	in the pattern:
	251
	252	/FOO[xX]A.*B[xX]BAR/
	253
	254	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	255	strings (because they follow a .* construct). study_chunk will identify
	256	both FOO and BAR as being the longest fixed and floating strings respectively.
	257
	258	The strings can be composites, for instance
	259
	260	/(f)(o)(o)/
	261
	262	will result in a composite fixed substring 'foo'.
	263
	264	For each string some basic information is maintained:
	265
	266	- offset or min_offset
	267	This is the position the string must appear at, or not before.
	268	It also implicitly (when combined with minlenp) tells us how many
	269	characters must match before the string we are searching for.
	270	Likewise when combined with minlenp and the length of the string it
	271	tells us how many characters must appear after the string we have
	272	found.
	273
	274	- max_offset
	275	Only used for floating strings. This is the rightmost point that
	276	the string can appear at. If set to I32 max it indicates that the
	277	string can occur infinitely far to the right.
	278
	279	- minlenp
	280	A pointer to the minimum length of the pattern that the string
	281	was found inside. This is important as in the case of positive
	282	lookahead or positive lookbehind we can have multiple patterns
	283	involved. Consider
	284
	285	/(?=FOO).*F/
	286
	287	The minimum length of the pattern overall is 3, the minimum length
	288	of the lookahead part is 3, but the minimum length of the part that
	289	will actually match is 1. So 'FOO's minimum length is 3, but the
	290	minimum length for the F is 1. This is important as the minimum length
	291	is used to determine offsets in front of and behind the string being
	292	looked for. Since strings can be composites this is the length of the
	293	pattern at the time it was committed with a scan_commit. Note that
	294	the length is calculated by study_chunk, so that the minimum lengths
	295	are not known until the full pattern has been compiled, thus the
	296	pointer to the value.
	297
	298	- lookbehind
	299
	300	In the case of lookbehind the string being searched for can be
	301	offset past the start point of the final matching string.
	302	If this value was just blithely removed from the min_offset it would
	303	invalidate some of the calculations for how many chars must match
	304	before or after (as they are derived from min_offset and minlen and
	305	the length of the string being searched for).
	306	When the final pattern is compiled and the data is moved from the
	307	scan_data_t structure into the regexp structure the information
	308	about lookbehind is factored in, with the information that would
	309	have been lost precalculated in the end_shift field for the
	310	associated string.
	311
	312	The fields pos_min and pos_delta are used to store the minimum offset
	313	and the delta to the maximum offset at the current point in the pattern.
	314
	315	*/
	316
	317	typedef struct scan_data_t {
	318	/I32 len_min; unused /
	319	/I32 len_delta; unused /
	320	I32 pos_min;
	321	I32 pos_delta;
	322	SV *last_found;
	323	I32 last_end; /* min value, <0 unless valid. */
	324	I32 last_start_min;
	325	I32 last_start_max;
	326	SV *longest; / Either &l_fixed, or &l_float. */
	327	SV longest_fixed; / longest fixed string found in pattern */
	328	I32 offset_fixed; /* offset where it starts */
	329	I32 minlen_fixed; / pointer to the minlen relevant to the string */
	330	I32 lookbehind_fixed; /* is the position of the string modfied by LB */
	331	SV longest_float; / longest floating string found in pattern */
	332	I32 offset_float_min; /* earliest point in string it can appear */
	333	I32 offset_float_max; /* latest point in string it can appear */
	334	I32 minlen_float; / pointer to the minlen relevant to the string */
	335	I32 lookbehind_float; /* is the position of the string modified by LB */
	336	I32 flags;
	337	I32 whilem_c;
	338	I32 *last_closep;
	339	struct regnode_charclass_class *start_class;
	340	} scan_data_t;
	341
	342	/*
	343	* Forward declarations for pregcomp()'s friends.
	344	*/
	345
	346	static const scan_data_t zero_scan_data =
	347	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	348
	/* Bits kept in scan_data_t.flags.  The two SF_BEFORE_*EOL bits describe
	 * the string currently being collected; shifting them left by
	 * SF_FIX_SHIFT_EOL or SF_FL_SHIFT_EOL gives the matching bits for the
	 * longest fixed and longest floating string respectively. */
	#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
	#define SF_BEFORE_SEOL 0x0001
	#define SF_BEFORE_MEOL 0x0002
	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)

	#ifdef NO_UNARY_PLUS
	#  define SF_FIX_SHIFT_EOL (0+2)
	#  define SF_FL_SHIFT_EOL (0+4)
	#else
	#  define SF_FIX_SHIFT_EOL (+2)
	#  define SF_FL_SHIFT_EOL (+4)
	#endif

	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)

	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	#define SF_IS_INF 0x0040
	#define SF_HAS_PAR 0x0080
	#define SF_IN_PAR 0x0100
	#define SF_HAS_EVAL 0x0200
	#define SCF_DO_SUBSTR 0x0400
	#define SCF_DO_STCLASS_AND 0x0800
	#define SCF_DO_STCLASS_OR 0x1000
	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
	#define SCF_WHILEM_VISITED_POS 0x2000

	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	#define SCF_SEEN_ACCEPT 0x8000
	380
	/* Shorthand tests on the state reached through pRExC_state: UTF is the
	 * utf8 field as a boolean, the next group compares the charset encoded in
	 * RExC_flags against one of the REGEX_*_CHARSET values, and FOLD tests the
	 * RXf_PMf_FOLD bit. */
	#define UTF cBOOL(RExC_utf8)
	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
	#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)

	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)

	#define OOB_UNICODE 12345678
	#define OOB_NAMEDCLASS (-1)

	/* Length of sv / distance from b to a, using the utf8-aware routine when
	 * UTF is true and the plain value or pointer difference otherwise.  The
	 * operands of the subtraction are parenthesized so that expression
	 * arguments such as `p + 1' are subtracted as a whole. */
	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : (a) - (b))
	397
	398
	/* length of regex to show in messages that don't mark a position within */
	#define RegexLengthToShowInErrorMessages 127

	/*
	 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
	 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	 * op/pragma/warn/regcomp.
	 */
	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */

	/* printf-style tail appended to positioned messages.  It consumes three
	 * arguments: an int precision and a string for "%.*s" (the pattern text up
	 * to the marked position), then a string for "%s" (the rest). */
	#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
	411
	/*
	 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	 * arg. Show regex, up to a maximum length. If it's too long, chop and add
	 * "...".
	 *
	 * `code' is expanded inside a block in which `len' (the number of pattern
	 * bytes to print) and `ellipses' ("" or "...") are declared, so it may
	 * refer to both.
	 */
	#define _FAIL(code) STMT_START { \
	    const char *ellipses = ""; \
	    IV len = RExC_end - RExC_precomp; \
	    \
	    if (!SIZE_ONLY) \
	        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	    if (len > RegexLengthToShowInErrorMessages) { \
	        /* chop 10 shorter than the max, to ensure meaning of "..." */ \
	        len = RegexLengthToShowInErrorMessages - 10; \
	        ellipses = "..."; \
	    } \
	    code; \
	} STMT_END

	/* Croak with the message string msg followed by the (possibly chopped)
	 * pattern. */
	#define FAIL(msg) _FAIL( \
	    Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
	        msg, (int)len, RExC_precomp, ellipses))

	/* Like FAIL, but msg must be a string literal (it is pasted onto the
	 * format) containing one conversion, which is filled from arg. */
	#define FAIL2(msg,arg) _FAIL( \
	    Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
	        arg, (int)len, RExC_precomp, ellipses))
	438
	/*
	 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
	 *
	 * The pattern text before RExC_parse is printed ahead of the marker and
	 * the text from RExC_parse on is printed after it.
	 */
	#define Simple_vFAIL(m) STMT_START { \
	    const IV offset = RExC_parse - RExC_precomp; \
	    Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	        m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	/*
	 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	 */
	#define vFAIL(m) STMT_START { \
	    if (!SIZE_ONLY) \
	        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	    Simple_vFAIL(m); \
	} STMT_END

	/*
	 * Like Simple_vFAIL(), but accepts two arguments.
	 */
	#define Simple_vFAIL2(m,a1) STMT_START { \
	    const IV offset = RExC_parse - RExC_precomp; \
	    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	/*
	 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	 */
	#define vFAIL2(m,a1) STMT_START { \
	    if (!SIZE_ONLY) \
	        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	    Simple_vFAIL2(m, a1); \
	} STMT_END


	/*
	 * Like Simple_vFAIL(), but accepts three arguments.
	 */
	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	    const IV offset = RExC_parse - RExC_precomp; \
	    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	/*
	 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	 */
	#define vFAIL3(m,a1,a2) STMT_START { \
	    if (!SIZE_ONLY) \
	        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	    Simple_vFAIL3(m, a1, a2); \
	} STMT_END

	/*
	 * Like Simple_vFAIL(), but accepts four arguments.
	 */
	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	    const IV offset = RExC_parse - RExC_precomp; \
	    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END
	502
	/* Warning helpers.  Each takes a position `loc' inside the pattern and a
	 * string-literal format `m' (it is pasted onto REPORT_LOCATION), plus the
	 * arguments the format consumes, and reports with the pattern split at
	 * `loc'.  The ck* forms call Perl_ck_warner / Perl_ck_warner_d, the v*
	 * forms call Perl_warner directly.  `loc' is parenthesized so that any
	 * pointer expression can be passed. */
	#define ckWARNreg(loc,m) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARNregdep(loc,m) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	        m REPORT_LOCATION, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARN2regdep(loc,m, a1) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	        m REPORT_LOCATION, \
	        a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARN2reg(loc, m, a1) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define vWARN3(loc, m, a1, a2) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END
	558
	559
	/* Store c through the pointer s unless SIZE_ONLY is true.  In the
	 * SIZE_ONLY case s is still evaluated (and discarded), so side effects
	 * in s, such as a post-increment, happen either way. */
	#define REGC(c,s) STMT_START { \
	    if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	} STMT_END
	564
	/* Macros for recording node offsets. 20001227 mjd@plover.com
	 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	 * element 2*n-1 of the array. Element #2n holds the byte length of node #n.
	 * Element 0 holds the number n.
	 * Position is 1 indexed.
	 *
	 * Without RE_TRACK_PATTERN_OFFSETS the recording macros expand to nothing
	 * and the program length is kept in ri->u.proglen instead of element 0.
	 */
	#ifndef RE_TRACK_PATTERN_OFFSETS
	#define Set_Node_Offset_To_R(node,byte)
	#define Set_Node_Offset(node,byte)
	#define Set_Cur_Node_Offset
	#define Set_Node_Length_To_R(node,len)
	#define Set_Node_Length(node,len)
	#define Set_Node_Cur_Length(node)
	#define Node_Offset(n)
	#define Node_Length(n)
	#define Set_Node_Offset_Length(node,offset,len)
	#define ProgLen(ri) (ri)->u.proglen
	#define SetProgLen(ri,x) (ri)->u.proglen = (x)
	#else
	#define ProgLen(ri) (ri)->u.offsets[0]
	#define SetProgLen(ri,x) (ri)->u.offsets[0] = (x)
	/* `node' here is a node index, not a pointer; a negative index croaks. */
	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	    if (! SIZE_ONLY) { \
	        MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	            __LINE__, (int)(node), (int)(byte))); \
	        if((node) < 0) { \
	            Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
	        } else { \
	            RExC_offsets[2*(node)-1] = (byte); \
	        } \
	    } \
	} STMT_END

	/* Pointer flavours: convert the node pointer to an index relative to
	 * RExC_emit_start and the pattern pointer to an offset from RExC_start. */
	#define Set_Node_Offset(node,byte) \
	    Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)

	#define Set_Node_Length_To_R(node,len) STMT_START { \
	    if (! SIZE_ONLY) { \
	        MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	            __LINE__, (int)(node), (int)(len))); \
	        if((node) < 0) { \
	            Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
	        } else { \
	            RExC_offsets[2*(node)] = (len); \
	        } \
	    } \
	} STMT_END

	#define Set_Node_Length(node,len) \
	    Set_Node_Length_To_R((node)-RExC_emit_start, len)
	#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
	/* Relies on a variable named parse_start being in scope at the call site. */
	#define Set_Node_Cur_Length(node) \
	    Set_Node_Length(node, RExC_parse - parse_start)

	/* Get offsets and lengths */
	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])

	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	    Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	    Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	} STMT_END
	#endif
	629
	/* Opt-in switch: EXPERIMENTAL_INPLACESCAN is defined only when the build
	 * sets PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS to a non-zero value. */
	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	#define EXPERIMENTAL_INPLACESCAN
	#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
	633
	/* Debug dump of a scan_data_t, wrapped in DEBUG_OPTIMISE_MORE_r.
	 *   str    string literal printed as a label (pasted into the format)
	 *   data   scan_data_t pointer; nothing is printed when it is NULL
	 *   depth  nesting depth; the output is indented by 2*depth columns
	 * Also reads a variable named is_inf from the enclosing scope.  Note
	 * that the expansion already ends in a semicolon. */
	#define DEBUG_STUDYDATA(str,data,depth) \
	DEBUG_OPTIMISE_MORE_r(if(data){ \
	    PerlIO_printf(Perl_debug_log, \
	        "%*s" str "Pos:%"IVdf"/%"IVdf \
	        " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	        (int)(depth)*2, "", \
	        (IV)((data)->pos_min), \
	        (IV)((data)->pos_delta), \
	        (UV)((data)->flags), \
	        (IV)((data)->whilem_c), \
	        (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	        is_inf ? "INF " : "" \
	    ); \
	    if ((data)->last_found) \
	        PerlIO_printf(Perl_debug_log, \
	            "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	            " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	            SvPVX_const((data)->last_found), \
	            (IV)((data)->last_end), \
	            (IV)((data)->last_start_min), \
	            (IV)((data)->last_start_max), \
	            ((data)->longest && \
	             (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	            SvPVX_const((data)->longest_fixed), \
	            (IV)((data)->offset_fixed), \
	            ((data)->longest && \
	             (data)->longest==&((data)->longest_float)) ? "*" : "", \
	            SvPVX_const((data)->longest_float), \
	            (IV)((data)->offset_float_min), \
	            (IV)((data)->offset_float_max) \
	        ); \
	    PerlIO_printf(Perl_debug_log,"\n"); \
	});
	667
	668	static void clear_re(pTHX_ void *r);
	669
	670	/* Mark that we cannot extend a found fixed substring at this point.
	671	Update the longest found anchored substring and the longest found
	672	floating substrings if needed. */
	673
	674	STATIC void
	675	S_scan_commit(pTHX_ const RExC_state_t pRExC_state, scan_data_t data, I32 *minlenp, int is_inf)
	676	{
	677	const STRLEN l = CHR_SVLEN(data->last_found);
	678	const STRLEN old_l = CHR_SVLEN(*data->longest);
	679	GET_RE_DEBUG_FLAGS_DECL;
	680
	681	PERL_ARGS_ASSERT_SCAN_COMMIT;
	682
	683	if ((l >= old_l) && ((l > old_l) \|\| (data->flags & SF_BEFORE_EOL))) {
	684	SvSetMagicSV(*data->longest, data->last_found);
	685	if (*data->longest == data->longest_fixed) {
	686	data->offset_fixed = l ? data->last_start_min : data->pos_min;
	687	if (data->flags & SF_BEFORE_EOL)
	688	data->flags
	689	\|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
	690	else
	691	data->flags &= ~SF_FIX_BEFORE_EOL;
	692	data->minlen_fixed=minlenp;
	693	data->lookbehind_fixed=0;
	694	}
	695	else { /* data->longest == data->longest_float /
	696	data->offset_float_min = l ? data->last_start_min : data->pos_min;
	697	data->offset_float_max = (l
	698	? data->last_start_max
	699	: data->pos_min + data->pos_delta);
	700	if (is_inf \|\| (U32)data->offset_float_max > (U32)I32_MAX)
	701	data->offset_float_max = I32_MAX;
	702	if (data->flags & SF_BEFORE_EOL)
	703	data->flags
	704	\|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
	705	else
	706	data->flags &= ~SF_FL_BEFORE_EOL;
	707	data->minlen_float=minlenp;
	708	data->lookbehind_float=0;
	709	}
	710	}
	711	SvCUR_set(data->last_found, 0);
	712	{
	713	SV * const sv = data->last_found;
	714	if (SvUTF8(sv) && SvMAGICAL(sv)) {
	715	MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	716	if (mg)
	717	mg->mg_len = 0;
	718	}
	719	}
	720	data->last_end = -1;
	721	data->flags &= ~SF_BEFORE_EOL;
	722	DEBUG_STUDYDATA("commit: ",data,0);
	723	}
	724
	725	/* Can match anything (initialization) */
	726	STATIC void
	727	S_cl_anything(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	728	{
	729	PERL_ARGS_ASSERT_CL_ANYTHING;
	730
	731	ANYOF_BITMAP_SETALL(cl);
	732	cl->flags = ANYOF_CLASS\|ANYOF_EOS\|ANYOF_UNICODE_ALL
	733	\|ANYOF_LOC_NONBITMAP_FOLD\|ANYOF_NON_UTF8_LATIN1_ALL;
	734
	735	/* If any portion of the regex is to operate under locale rules,
	736	* initialization includes it. The reason this isn't done for all regexes
	737	* is that the optimizer was written under the assumption that locale was
	738	* all-or-nothing. Given the complexity and lack of documentation in the
	739	* optimizer, and that there are inadequate test cases for locale, so many
	740	* parts of it may not work properly, it is safest to avoid locale unless
	741	* necessary. */
	742	if (RExC_contains_locale) {
	743	ANYOF_CLASS_SETALL(cl); /* /l uses class */
	744	cl->flags \|= ANYOF_LOCALE;
	745	}
	746	else {
	747	ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
	748	}
	749	}
	750
	751	/* Can match anything (initialization) */
	752	STATIC int
	753	S_cl_is_anything(const struct regnode_charclass_class *cl)
	754	{
	755	int value;
	756
	757	PERL_ARGS_ASSERT_CL_IS_ANYTHING;
	758
	759	for (value = 0; value <= ANYOF_MAX; value += 2)
	760	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
	761	return 1;
	762	if (!(cl->flags & ANYOF_UNICODE_ALL))
	763	return 0;
	764	if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
	765	return 0;
	766	return 1;
	767	}
	768
	769	/* Can match anything (initialization) */
	770	STATIC void
	771	S_cl_init(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	772	{
	773	PERL_ARGS_ASSERT_CL_INIT;
	774
	775	Zero(cl, 1, struct regnode_charclass_class);
	776	cl->type = ANYOF;
	777	cl_anything(pRExC_state, cl);
	778	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	779	}
	780
	/* These two functions currently do the exact same thing, so cl_init_zero
	 * is simply another name for S_cl_init. */
	#define cl_init_zero S_cl_init
	783
	784	/* 'AND' a given class with another one. Can create false positives. 'cl'
	785	* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
	786	* 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
	787	STATIC void
	788	S_cl_and(struct regnode_charclass_class *cl,
	789	const struct regnode_charclass_class *and_with)
	790	{
	791	PERL_ARGS_ASSERT_CL_AND;
	792
	793	assert(and_with->type == ANYOF);
	794
	795	/* I (khw) am not sure all these restrictions are necessary XXX */
	796	if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
	797	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
	798	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	799	&& !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	800	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
	801	int i;
	802
	803	if (and_with->flags & ANYOF_INVERT)
	804	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	805	cl->bitmap[i] &= ~and_with->bitmap[i];
	806	else
	807	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	808	cl->bitmap[i] &= and_with->bitmap[i];
	809	} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
	810
	811	if (and_with->flags & ANYOF_INVERT) {
	812
	813	/* Here, the and'ed node is inverted. Get the AND of the flags that
	814	* aren't affected by the inversion. Those that are affected are
	815	* handled individually below */
	816	U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
	817	cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
	818	cl->flags \|= affected_flags;
	819
	820	/* We currently don't know how to deal with things that aren't in the
	821	* bitmap, but we know that the intersection is no greater than what
	822	* is already in cl, so let there be false positives that get sorted
	823	* out after the synthetic start class succeeds, and the node is
	824	* matched for real. */
	825
	826	/* The inversion of these two flags indicate that the resulting
	827	* intersection doesn't have them */
	828	if (and_with->flags & ANYOF_UNICODE_ALL) {
	829	cl->flags &= ~ANYOF_UNICODE_ALL;
	830	}
	831	if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	832	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
	833	}
	834	}
	835	else { /* and'd node is not inverted */
	836	U8 outside_bitmap_but_not_utf8; /* Temp variable */
	837
	838	if (! ANYOF_NONBITMAP(and_with)) {
	839
	840	/* Here 'and_with' doesn't match anything outside the bitmap
	841	* (except possibly ANYOF_UNICODE_ALL), which means the
	842	* intersection can't either, except for ANYOF_UNICODE_ALL, in
	843	* which case we don't know what the intersection is, but it's no
	844	* greater than what cl already has, so can just leave it alone,
	845	* with possible false positives */
	846	if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
	847	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	848	cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
	849	}
	850	}
	851	else if (! ANYOF_NONBITMAP(cl)) {
	852
	853	/* Here, 'and_with' does match something outside the bitmap, and cl
	854	* doesn't have a list of things to match outside the bitmap. If
	855	* cl can match all code points above 255, the intersection will
	856	* be those above-255 code points that 'and_with' matches. If cl
	857	* can't match all Unicode code points, it means that it can't
	858	* match anything outside the bitmap (since the 'if' that got us
	859	* into this block tested for that), so we leave the bitmap empty.
	860	*/
	861	if (cl->flags & ANYOF_UNICODE_ALL) {
	862	ARG_SET(cl, ARG(and_with));
	863
	864	/* and_with's ARG may match things that don't require UTF8.
	865	* And now cl's will too, in spite of this being an 'and'. See
	866	* the comments below about the kludge */
	867	cl->flags \|= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
	868	}
	869	}
	870	else {
	871	/* Here, both 'and_with' and cl match something outside the
	872	* bitmap. Currently we do not do the intersection, so just match
	873	* whatever cl had at the beginning. */
	874	}
	875
	876
	877	/* Take the intersection of the two sets of flags. However, the
	878	* ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
	879	* kludge around the fact that this flag is not treated like the others
	880	* which are initialized in cl_anything(). The way the optimizer works
	881	* is that the synthetic start class (SSC) is initialized to match
	882	* anything, and then the first time a real node is encountered, its
	883	* values are AND'd with the SSC's with the result being the values of
	884	* the real node. However, there are paths through the optimizer where
	885	* the AND never gets called, so those initialized bits are set
	886	* inappropriately, which is not usually a big deal, as they just cause
	887	* false positives in the SSC, which will just mean a probably
	888	* imperceptible slow down in execution. However this bit has a
	889	* higher false positive consequence in that it can cause utf8.pm,
	890	* utf8_heavy.pl ... to be loaded when not necessary, which is a much
	891	* bigger slowdown and also causes significant extra memory to be used.
	892	* In order to prevent this, the code now takes a different tack. The
	893	* bit isn't set unless some part of the regular expression needs it,
	894	* but once set it won't get cleared. This means that these extra
	895	* modules won't get loaded unless there was some path through the
	896	* pattern that would have required them anyway, and so any false
	897	* positives that occur by not ANDing them out when they could be
	898	* aren't as severe as they would be if we treated this bit like all
	899	* the others */
	900	outside_bitmap_but_not_utf8 = (cl->flags \| and_with->flags)
	901	& ANYOF_NONBITMAP_NON_UTF8;
	902	cl->flags &= and_with->flags;
	903	cl->flags \|= outside_bitmap_but_not_utf8;
	904	}
	905	}
	906
	907	/* 'OR' a given class with another one. Can create false positives. 'cl'
	908	* should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
	909	* 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
	910	STATIC void
	911	S_cl_or(const RExC_state_t pRExC_state, struct regnode_charclass_class cl, const struct regnode_charclass_class *or_with)
	912	{
	913	PERL_ARGS_ASSERT_CL_OR;
	914
	915	if (or_with->flags & ANYOF_INVERT) {
	916
	917	/* Here, the or'd node is to be inverted. This means we take the
	918	* complement of everything not in the bitmap, but currently we don't
	919	* know what that is, so give up and match anything */
	920	if (ANYOF_NONBITMAP(or_with)) {
	921	cl_anything(pRExC_state, cl);
	922	}
	923	/* We do not use
	924	* (B1 \| CL1) \| (!B2 & !CL2) = (B1 \| !B2 & !CL2) \| (CL1 \| (!B2 & !CL2))
	925	* <= (B1 \| !B2) \| (CL1 \| !CL2)
	926	* which is wasteful if CL2 is small, but we ignore CL2:
	927	* (B1 \| CL1) \| (!B2 & !CL2) <= (B1 \| CL1) \| !B2 = (B1 \| !B2) \| CL1
	928	* XXXX Can we handle case-fold? Unclear:
	929	* (OK1(i) \| OK1(i')) \| !(OK1(i) \| OK1(i')) =
	930	* (OK1(i) \| OK1(i')) \| (!OK1(i) & !OK1(i'))
	931	*/
	932	else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	933	&& !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	934	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
	935	int i;
	936
	937	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	938	cl->bitmap[i] \|= ~or_with->bitmap[i];
	939	} /* XXXX: logic is complicated otherwise */
	940	else {
	941	cl_anything(pRExC_state, cl);
	942	}
	943
	944	/* And, we can just take the union of the flags that aren't affected
	945	* by the inversion */
	946	cl->flags \|= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
	947
	948	/* For the remaining flags:
	949	ANYOF_UNICODE_ALL and inverted means to not match anything above
	950	255, which means that the union with cl should just be
	951	what cl has in it, so can ignore this flag
	952	ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
	953	is 127-255 to match them, but then invert that, so the
	954	union with cl should just be what cl has in it, so can
	955	ignore this flag
	956	*/
	957	} else { /* 'or_with' is not inverted */
	958	/* (B1 \| CL1) \| (B2 \| CL2) = (B1 \| B2) \| (CL1 \| CL2)) */
	959	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	960	&& (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	961	\|\| (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
	962	int i;
	963
	964	/* OR char bitmap and class bitmap separately */
	965	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	966	cl->bitmap[i] \|= or_with->bitmap[i];
	967	if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
	968	for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
	969	cl->classflags[i] \|= or_with->classflags[i];
	970	cl->flags \|= ANYOF_CLASS;
	971	}
	972	}
	973	else { /* XXXX: logic is complicated, leave it along for a moment. */
	974	cl_anything(pRExC_state, cl);
	975	}
	976
	977	if (ANYOF_NONBITMAP(or_with)) {
	978
	979	/* Use the added node's outside-the-bit-map match if there isn't a
	980	* conflict. If there is a conflict (both nodes match something
	981	* outside the bitmap, but what they match outside is not the same
	982	* pointer, and hence not easily compared until XXX we extend
	983	* inversion lists this far), give up and allow the start class to
	984	* match everything outside the bitmap. If that stuff is all above
	985	* 255, can just set UNICODE_ALL, otherwise caould be anything. */
	986	if (! ANYOF_NONBITMAP(cl)) {
	987	ARG_SET(cl, ARG(or_with));
	988	}
	989	else if (ARG(cl) != ARG(or_with)) {
	990
	991	if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
	992	cl_anything(pRExC_state, cl);
	993	}
	994	else {
	995	cl->flags \|= ANYOF_UNICODE_ALL;
	996	}
	997	}
	998	}
	999
	1000	/* Take the union */
	1001	cl->flags \|= or_with->flags;
	1002	}
	1003	}
	1004
	1005	#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
	1006	#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
	1007	#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
	1008	#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
	1009
	1010
	1011	#ifdef DEBUGGING
	1012	/*
	1013	dump_trie(trie,widecharmap,revcharmap)
	1014	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	1015	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	1016
	1017	These routines dump out a trie in a somewhat readable format.
	1018	The _interim_ variants are used for debugging the interim
	1019	tables that are used to generate the final compressed
	1020	representation which is what dump_trie expects.
	1021
	1022	Part of the reason for their existence is to provide a form
	1023	of documentation as to how the different representations function.
	1024
	1025	*/
	1026
	1027	/*
	1028	Dumps the final compressed table form of the trie to Perl_debug_log.
	1029	Used for debugging make_trie().
	1030	*/
	1031
	1032	STATIC void
	1033	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	1034	AV *revcharmap, U32 depth)
	1035	{
	1036	U32 state;
	1037	SV *sv=sv_newmortal();
	1038	int colwidth= widecharmap ? 6 : 4;
	1039	U16 word;
	1040	GET_RE_DEBUG_FLAGS_DECL;
	1041
	1042	PERL_ARGS_ASSERT_DUMP_TRIE;
	1043
	1044	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	1045	(int)depth * 2 + 2,"",
	1046	"Match","Base","Ofs" );
	1047
	1048	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	1049	SV ** const tmp = av_fetch( revcharmap, state, 0);
	1050	if ( tmp ) {
	1051	PerlIO_printf( Perl_debug_log, "%*s",
	1052	colwidth,
	1053	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1054	PL_colors[0], PL_colors[1],
	1055	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1056	PERL_PV_ESCAPE_FIRSTCHAR
	1057	)
	1058	);
	1059	}
	1060	}
	1061	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	1062	(int)depth * 2 + 2,"");
	1063
	1064	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	1065	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	1066	PerlIO_printf( Perl_debug_log, "\n");
	1067
	1068	for( state = 1 ; state < trie->statecount ; state++ ) {
	1069	const U32 base = trie->states[ state ].trans.base;
	1070
	1071	PerlIO_printf( Perl_debug_log, "%s#%4"UVXf"\|", (int)depth 2 + 2,"", (UV)state);
	1072
	1073	if ( trie->states[ state ].wordnum ) {
	1074	PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
	1075	} else {
	1076	PerlIO_printf( Perl_debug_log, "%6s", "" );
	1077	}
	1078
	1079	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	1080
	1081	if ( base ) {
	1082	U32 ofs = 0;
	1083
	1084	while( ( base + ofs < trie->uniquecharcount ) \|\|
	1085	( base + ofs - trie->uniquecharcount < trie->lasttrans
	1086	&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
	1087	ofs++;
	1088
	1089	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	1090
	1091	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1092	if ( ( base + ofs >= trie->uniquecharcount ) &&
	1093	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	1094	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	1095	{
	1096	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	1097	colwidth,
	1098	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
	1099	} else {
	1100	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	1101	}
	1102	}
	1103
	1104	PerlIO_printf( Perl_debug_log, "]");
	1105
	1106	}
	1107	PerlIO_printf( Perl_debug_log, "\n" );
	1108	}
	1109	PerlIO_printf(Perl_debug_log, "%sword_info N:(prev,len)=", (int)depth2, "");
	1110	for (word=1; word <= trie->wordcount; word++) {
	1111	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	1112	(int)word, (int)(trie->wordinfo[word].prev),
	1113	(int)(trie->wordinfo[word].len));
	1114	}
	1115	PerlIO_printf(Perl_debug_log, "\n" );
	1116	}
	1117	/*
	1118	Dumps a fully constructed but uncompressed trie in list form.
	1119	List tries normally only are used for construction when the number of
	1120	possible chars (trie->uniquecharcount) is very high.
	1121	Used for debugging make_trie().
	1122	*/
	1123	STATIC void
	1124	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	1125	HV widecharmap, AV revcharmap, U32 next_alloc,
	1126	U32 depth)
	1127	{
	1128	U32 state;
	1129	SV *sv=sv_newmortal();
	1130	int colwidth= widecharmap ? 6 : 4;
	1131	GET_RE_DEBUG_FLAGS_DECL;
	1132
	1133	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1134
	1135	/* print out the table precompression. */
	1136	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1137	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1138	"------:-----+-----------------\n" );
	1139
	1140	for( state=1 ; state < next_alloc ; state ++ ) {
	1141	U16 charid;
	1142
	1143	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1144	(int)depth * 2 + 2,"", (UV)state );
	1145	if ( ! trie->states[ state ].wordnum ) {
	1146	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1147	} else {
	1148	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1149	trie->states[ state ].wordnum
	1150	);
	1151	}
	1152	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1153	SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
	1154	if ( tmp ) {
	1155	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1156	colwidth,
	1157	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1158	PL_colors[0], PL_colors[1],
	1159	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1160	PERL_PV_ESCAPE_FIRSTCHAR
	1161	) ,
	1162	TRIE_LIST_ITEM(state,charid).forid,
	1163	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1164	);
	1165	if (!(charid % 10))
	1166	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1167	(int)((depth * 2) + 14), "");
	1168	}
	1169	}
	1170	PerlIO_printf( Perl_debug_log, "\n");
	1171	}
	1172	}
	1173
	1174	/*
	1175	Dumps a fully constructed but uncompressed trie in table form.
	1176	This is the normal DFA style state transition table, with a few
	1177	twists to facilitate compression later.
	1178	Used for debugging make_trie().
	1179	*/
	1180	STATIC void
	1181	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1182	HV widecharmap, AV revcharmap, U32 next_alloc,
	1183	U32 depth)
	1184	{
	1185	U32 state;
	1186	U16 charid;
	1187	SV *sv=sv_newmortal();
	1188	int colwidth= widecharmap ? 6 : 4;
	1189	GET_RE_DEBUG_FLAGS_DECL;
	1190
	1191	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1192
	1193	/*
	1194	print out the table precompression so that we can do a visual check
	1195	that they are identical.
	1196	*/
	1197
	1198	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1199
	1200	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1201	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1202	if ( tmp ) {
	1203	PerlIO_printf( Perl_debug_log, "%*s",
	1204	colwidth,
	1205	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1206	PL_colors[0], PL_colors[1],
	1207	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1208	PERL_PV_ESCAPE_FIRSTCHAR
	1209	)
	1210	);
	1211	}
	1212	}
	1213
	1214	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1215
	1216	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1217	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1218	}
	1219
	1220	PerlIO_printf( Perl_debug_log, "\n" );
	1221
	1222	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1223
	1224	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1225	(int)depth * 2 + 2,"",
	1226	(UV)TRIE_NODENUM( state ) );
	1227
	1228	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1229	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1230	if (v)
	1231	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1232	else
	1233	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1234	}
	1235	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1236	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
	1237	} else {
	1238	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
	1239	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1240	}
	1241	}
	1242	}
	1243
	1244	#endif
	1245
	1246
	1247	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1248	startbranch: the first branch in the whole branch sequence
	1249	first : start branch of sequence of branch-exact nodes.
	1250	May be the same as startbranch
	1251	last : Thing following the last branch.
	1252	May be the same as tail.
	1253	tail : item following the branch sequence
	1254	count : words in the sequence
	1255	flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
	1256	depth : indent depth
	1257
	1258	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1259
	1260	A trie is an N'ary tree where the branches are determined by digital
	1261	decomposition of the key. IE, at the root node you look up the 1st character and
	1262	follow that branch repeat until you find the end of the branches. Nodes can be
	1263	marked as "accepting" meaning they represent a complete word. Eg:
	1264
	1265	/he|she|his|hers/
	1266
	1267	would convert into the following structure. Numbers represent states, letters
	1268	following numbers represent valid transitions on the letter from that state, if
	1269	the number is in square brackets it represents an accepting state, otherwise it
	1270	will be in parenthesis.
	1271
	1272	      +-h->+-e->[3]-+-r->(8)-+-s->[9]
	1273	      |    |
	1274	      |   (2)
	1275	      |    |
	1276	     (1)   +-i->(6)-+-s->[7]
	1277	      |
	1278	      +-s->(3)-+-h->(4)-+-e->[5]
	1279
	1280	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1281
	1282	This shows that when matching against the string 'hers' we will begin at state 1
	1283	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1284	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1285	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1286	single traverse. We store a mapping from accepting to state to which word was
	1287	matched, and then when we have multiple possibilities we try to complete the
	1288	rest of the regex in the order in which they occurred in the alternation.
	1289
	1290	The only prior NFA like behaviour that would be changed by the TRIE support is
	1291	the silent ignoring of duplicate alternations which are of the form:
	1292
	1293	/ (DUPE|DUPE) X? (?{ ... }) Y /x
	1294
	1295	Thus EVAL blocks following a trie may be called a different number of times with
	1296	and without the optimisation. With the optimisations dupes will be silently
	1297	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1298	the following demonstrates:
	1299
	1300	'words'=~/(word|word|word)(?{ print $1 })[xyz]/
	1301
	1302	which prints out 'word' three times, but
	1303
	1304	'words'=~/(word|word|word)(?{ print $1 })S/
	1305
	1306	which doesn't print it out at all. This is due to other optimisations kicking in.
	1307
	1308	Example of what happens on a structural level:
	1309
	1310	The regexp /(ac|ad|ab)+/ will produce the following debug output:
	1311
	1312	1: CURLYM[1] {1,32767}(18)
	1313	5: BRANCH(8)
	1314	6: EXACT <ac>(16)
	1315	8: BRANCH(11)
	1316	9: EXACT <ad>(16)
	1317	11: BRANCH(14)
	1318	12: EXACT <ab>(16)
	1319	16: SUCCEED(0)
	1320	17: NOTHING(18)
	1321	18: END(0)
	1322
	1323	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1324	and should turn into:
	1325
	1326	1: CURLYM[1] {1,32767}(18)
	1327	5: TRIE(16)
	1328	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1329	<ac>
	1330	<ad>
	1331	<ab>
	1332	16: SUCCEED(0)
	1333	17: NOTHING(18)
	1334	18: END(0)
	1335
	1336	Cases where tail != last would be like /(?:foo|bar)baz/:
	1337
	1338	1: BRANCH(4)
	1339	2: EXACT <foo>(8)
	1340	4: BRANCH(7)
	1341	5: EXACT <bar>(8)
	1342	7: TAIL(8)
	1343	8: EXACT <baz>(10)
	1344	10: END(0)
	1345
	1346	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1347	and would end up looking like:
	1348
	1349	1: TRIE(8)
	1350	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1351	<foo>
	1352	<bar>
	1353	7: TAIL(8)
	1354	8: EXACT <baz>(10)
	1355	10: END(0)
	1356
	1357	d = uvuni_to_utf8_flags(d, uv, 0);
	1358
	1359	is the recommended Unicode-aware way of saying
	1360
	1361	*(d++) = uv;
	1362	*/
	1363
	1364	#define TRIE_STORE_REVCHAR \
	1365	STMT_START { \
	1366	if (UTF) { \
	1367	SV *zlopp = newSV(2); \
	1368	unsigned char flrbbbbb = (unsigned char ) SvPVX(zlopp); \
	1369	unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
	1370	SvCUR_set(zlopp, kapow - flrbbbbb); \
	1371	SvPOK_on(zlopp); \
	1372	SvUTF8_on(zlopp); \
	1373	av_push(revcharmap, zlopp); \
	1374	} else { \
	1375	char ooooff = (char)uvc; \
	1376	av_push(revcharmap, newSVpvn(&ooooff, 1)); \
	1377	} \
	1378	} STMT_END
	1379
	1380	#define TRIE_READ_CHAR STMT_START { \
	1381	wordlen++; \
	1382	if ( UTF ) { \
	1383	if ( folder ) { \
	1384	if ( foldlen > 0 ) { \
	1385	uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
	1386	foldlen -= len; \
	1387	scan += len; \
	1388	len = 0; \
	1389	} else { \
	1390	len = UTF8SKIP(uc);\
	1391	uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
	1392	foldlen -= UNISKIP( uvc ); \
	1393	scan = foldbuf + UNISKIP( uvc ); \
	1394	} \
	1395	} else { \
	1396	uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
	1397	} \
	1398	} else { \
	1399	uvc = (U32)*uc; \
	1400	len = 1; \
	1401	} \
	1402	} STMT_END
	1403
	1404
	1405
	1406	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
	1407	if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
	1408	U32 ging = TRIE_LIST_LEN( state ) *= 2; \
	1409	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
	1410	} \
	1411	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
	1412	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
	1413	TRIE_LIST_CUR( state )++; \
	1414	} STMT_END
	1415
	1416	#define TRIE_LIST_NEW(state) STMT_START { \
	1417	Newxz( trie->states[ state ].trans.list, \
	1418	4, reg_trie_trans_le ); \
	1419	TRIE_LIST_CUR( state ) = 1; \
	1420	TRIE_LIST_LEN( state ) = 4; \
	1421	} STMT_END
	1422
	1423	#define TRIE_HANDLE_WORD(state) STMT_START { \
	1424	U16 dupe= trie->states[ state ].wordnum; \
	1425	regnode * const noper_next = regnext( noper ); \
	1426	\
	1427	DEBUG_r({ \
	1428	/* store the word for dumping */ \
	1429	SV* tmp; \
	1430	if (OP(noper) != NOTHING) \
	1431	tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
	1432	else \
	1433	tmp = newSVpvn_utf8( "", 0, UTF ); \
	1434	av_push( trie_words, tmp ); \
	1435	}); \
	1436	\
	1437	curword++; \
	1438	trie->wordinfo[curword].prev = 0; \
	1439	trie->wordinfo[curword].len = wordlen; \
	1440	trie->wordinfo[curword].accept = state; \
	1441	\
	1442	if ( noper_next < tail ) { \
	1443	if (!trie->jump) \
	1444	trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
	1445	trie->jump[curword] = (U16)(noper_next - convert); \
	1446	if (!jumper) \
	1447	jumper = noper_next; \
	1448	if (!nextbranch) \
	1449	nextbranch= regnext(cur); \
	1450	} \
	1451	\
	1452	if ( dupe ) { \
	1453	/* It's a dupe. Pre-insert into the wordinfo[].prev */\
	1454	/* chain, so that when the bits of chain are later */\
	1455	/* linked together, the dups appear in the chain */\
	1456	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
	1457	trie->wordinfo[dupe].prev = curword; \
	1458	} else { \
	1459	/* we haven't inserted this word yet. */ \
	1460	trie->states[ state ].wordnum = curword; \
	1461	} \
	1462	} STMT_END
	1463
	1464
	1465	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
	1466	( ( base + charid >= ucharcount \
	1467	&& base + charid < ubound \
	1468	&& state == trie->trans[ base - ucharcount + charid ].check \
	1469	&& trie->trans[ base - ucharcount + charid ].next ) \
	1470	? trie->trans[ base - ucharcount + charid ].next \
	1471	: ( state==1 ? special : 0 ) \
	1472	)
	1473
	1474	#define MADE_TRIE 1
	1475	#define MADE_JUMP_TRIE 2
	1476	#define MADE_EXACT_TRIE 4
	1477
	1478	STATIC I32
	1479	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch, regnode first, regnode last, regnode *tail, U32 word_count, U32 flags, U32 depth)
	1480	{
	1481	dVAR;
	1482	/* first pass, loop through and scan words */
	1483	reg_trie_data *trie;
	1484	HV *widecharmap = NULL;
	1485	AV *revcharmap = newAV();
	1486	regnode *cur;
	1487	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1488	STRLEN len = 0;
	1489	UV uvc = 0;
	1490	U16 curword = 0;
	1491	U32 next_alloc = 0;
	1492	regnode *jumper = NULL;
	1493	regnode *nextbranch = NULL;
	1494	regnode *convert = NULL;
	1495	U32 prev_states; / temp array mapping each state to previous one */
	1496	/* we just use folder as a flag in utf8 */
	1497	const U8 * folder = NULL;
	1498
	1499	#ifdef DEBUGGING
	1500	const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
	1501	AV *trie_words = NULL;
	1502	/* along with revcharmap, this only used during construction but both are
	1503	* useful during debugging so we store them in the struct when debugging.
	1504	*/
	1505	#else
	1506	const U32 data_slot = add_data( pRExC_state, 2, "tu" );
	1507	STRLEN trie_charcount=0;
	1508	#endif
	1509	SV *re_trie_maxbuff;
	1510	GET_RE_DEBUG_FLAGS_DECL;
	1511
	1512	PERL_ARGS_ASSERT_MAKE_TRIE;
	1513	#ifndef DEBUGGING
	1514	PERL_UNUSED_ARG(depth);
	1515	#endif
	1516
	1517	switch (flags) {
	1518	case EXACT: break;
	1519	case EXACTFA:
	1520	case EXACTFU: folder = PL_fold_latin1; break;
	1521	case EXACTF: folder = PL_fold; break;
	1522	case EXACTFL: folder = PL_fold_locale; break;
	1523	default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
	1524	}
	1525
	1526	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1527	trie->refcount = 1;
	1528	trie->startstate = 1;
	1529	trie->wordcount = word_count;
	1530	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1531	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1532	if (!(UTF && folder))
	1533	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1534	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1535	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1536
	1537	DEBUG_r({
	1538	trie_words = newAV();
	1539	});
	1540
	1541	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1542	if (!SvIOK(re_trie_maxbuff)) {
	1543	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1544	}
	1545	DEBUG_OPTIMISE_r({
	1546	PerlIO_printf( Perl_debug_log,
	1547	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	1548	(int)depth * 2 + 2, "",
	1549	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	1550	REG_NODE_NUM(last), REG_NODE_NUM(tail),
	1551	(int)depth);
	1552	});
	1553
	1554	/* Find the node we are going to overwrite */
	1555	if ( first == startbranch && OP( last ) != BRANCH ) {
	1556	/* whole branch chain */
	1557	convert = first;
	1558	} else {
	1559	/* branch sub-chain */
	1560	convert = NEXTOPER( first );
	1561	}
	1562
	1563	/* -- First loop and Setup --
	1564
	1565	We first traverse the branches and scan each word to determine if it
	1566	contains widechars, and how many unique chars there are, this is
	1567	important as we have to build a table with at least as many columns as we
	1568	have unique chars.
	1569
	1570	We use an array of integers to represent the character codes 0..255
	1571	(trie->charmap) and we use a an HV* to store Unicode characters. We use the
	1572	native representation of the character value as the key and IV's for the
	1573	coded index.
	1574
	1575	TODO If we keep track of how many times each character is used we can
	1576	remap the columns so that the table compression later on is more
	1577	efficient in terms of memory by ensuring the most common value is in the
	1578	middle and the least common are on the outside. IMO this would be better
	1579	than a most to least common mapping as theres a decent chance the most
	1580	common letter will share a node with the least common, meaning the node
	1581	will not be compressible. With a middle is most common approach the worst
	1582	case is when we have the least common nodes twice.
	1583
	1584	*/
	1585
	1586	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1587	regnode * const noper = NEXTOPER( cur );
	1588	const U8 uc = (U8)STRING( noper );
	1589	const U8 * const e = uc + STR_LEN( noper );
	1590	STRLEN foldlen = 0;
	1591	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1592	const U8 scan = (U8)NULL;
	1593	U32 wordlen = 0; /* required init */
	1594	STRLEN chars = 0;
	1595	bool set_bit = trie->bitmap ? 1 : 0; /store the first char in the bitmap?/
	1596
	1597	if (OP(noper) == NOTHING) {
	1598	trie->minlen= 0;
	1599	continue;
	1600	}
	1601	if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
	1602	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	1603	regardless of encoding */
	1604
	1605	for ( ; uc < e ; uc += len ) {
	1606	TRIE_CHARCOUNT(trie)++;
	1607	TRIE_READ_CHAR;
	1608	chars++;
	1609	if ( uvc < 256 ) {
	1610	if ( !trie->charmap[ uvc ] ) {
	1611	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	1612	if ( folder )
	1613	trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
	1614	TRIE_STORE_REVCHAR;
	1615	}
	1616	if ( set_bit ) {
	1617	/* store the codepoint in the bitmap, and its folded
	1618	* equivalent. */
	1619	TRIE_BITMAP_SET(trie,uvc);
	1620
	1621	/* store the folded codepoint */
	1622	if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
	1623
	1624	if ( !UTF ) {
	1625	/* store first byte of utf8 representation of
	1626	variant codepoints */
	1627	if (! UNI_IS_INVARIANT(uvc)) {
	1628	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	1629	}
	1630	}
	1631	set_bit = 0; /* We've done our bit :-) */
	1632	}
	1633	} else {
	1634	SV** svpp;
	1635	if ( !widecharmap )
	1636	widecharmap = newHV();
	1637
	1638	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	1639
	1640	if ( !svpp )
	1641	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	1642
	1643	if ( !SvTRUE( *svpp ) ) {
	1644	sv_setiv( *svpp, ++trie->uniquecharcount );
	1645	TRIE_STORE_REVCHAR;
	1646	}
	1647	}
	1648	}
	1649	if( cur == first ) {
	1650	trie->minlen=chars;
	1651	trie->maxlen=chars;
	1652	} else if (chars < trie->minlen) {
	1653	trie->minlen=chars;
	1654	} else if (chars > trie->maxlen) {
	1655	trie->maxlen=chars;
	1656	}
	1657
	1658	} /* end first pass */
	1659	DEBUG_TRIE_COMPILE_r(
	1660	PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	1661	(int)depth * 2 + 2,"",
	1662	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	1663	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	1664	(int)trie->minlen, (int)trie->maxlen )
	1665	);
	1666
	1667	/*
	1668	We now know what we are dealing with in terms of unique chars and
	1669	string sizes so we can calculate how much memory a naive
	1670	representation using a flat table will take. If it's over a reasonable
	1671	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	1672	conservative but potentially much slower representation using an array
	1673	of lists.
	1674
	1675	At the end we convert both representations into the same compressed
	1676	form that will be used in regexec.c for matching with. The latter
	1677	is a form that cannot be used to construct with but has memory
	1678	properties similar to the list form and access properties similar
	1679	to the table form making it both suitable for fast searches and
	1680	small enough that its feasable to store for the duration of a program.
	1681
	1682	See the comment in the code where the compressed table is produced
	1683	inplace from the flat tabe representation for an explanation of how
	1684	the compression works.
	1685
	1686	*/
	1687
	1688
	1689	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	1690	prev_states[1] = 0;
	1691
	1692	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
	1693	/*
	1694	Second Pass -- Array Of Lists Representation
	1695
	1696	Each state will be represented by a list of charid:state records
	1697	(reg_trie_trans_le) the first such element holds the CUR and LEN
	1698	points of the allocated array. (See defines above).
	1699
	1700	We build the initial structure using the lists, and then convert
	1701	it into the compressed table form which allows faster lookups
	1702	(but cant be modified once converted).
	1703	*/
	1704
	1705	STRLEN transcount = 1;
	1706
	1707	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1708	"%*sCompiling trie using list compiler\n",
	1709	(int)depth * 2 + 2, ""));
	1710
	1711	trie->states = (reg_trie_state *)
	1712	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1713	sizeof(reg_trie_state) );
	1714	TRIE_LIST_NEW(1);
	1715	next_alloc = 2;
	1716
	1717	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1718
	1719	regnode * const noper = NEXTOPER( cur );
	1720	U8 uc = (U8)STRING( noper );
	1721	const U8 * const e = uc + STR_LEN( noper );
	1722	U32 state = 1; /* required init */
	1723	U16 charid = 0; /* sanity init */
	1724	U8 scan = (U8)NULL; /* sanity init */
	1725	STRLEN foldlen = 0; /* required init */
	1726	U32 wordlen = 0; /* required init */
	1727	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1728
	1729	if (OP(noper) != NOTHING) {
	1730	for ( ; uc < e ; uc += len ) {
	1731
	1732	TRIE_READ_CHAR;
	1733
	1734	if ( uvc < 256 ) {
	1735	charid = trie->charmap[ uvc ];
	1736	} else {
	1737	SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1738	if ( !svpp ) {
	1739	charid = 0;
	1740	} else {
	1741	charid=(U16)SvIV( *svpp );
	1742	}
	1743	}
	1744	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1745	if ( charid ) {
	1746
	1747	U16 check;
	1748	U32 newstate = 0;
	1749
	1750	charid--;
	1751	if ( !trie->states[ state ].trans.list ) {
	1752	TRIE_LIST_NEW( state );
	1753	}
	1754	for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
	1755	if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
	1756	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	1757	break;
	1758	}
	1759	}
	1760	if ( ! newstate ) {
	1761	newstate = next_alloc++;
	1762	prev_states[newstate] = state;
	1763	TRIE_LIST_PUSH( state, charid, newstate );
	1764	transcount++;
	1765	}
	1766	state = newstate;
	1767	} else {
	1768	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1769	}
	1770	}
	1771	}
	1772	TRIE_HANDLE_WORD(state);
	1773
	1774	} /* end second pass */
	1775
	1776	/* next alloc is the NEXT state to be allocated */
	1777	trie->statecount = next_alloc;
	1778	trie->states = (reg_trie_state *)
	1779	PerlMemShared_realloc( trie->states,
	1780	next_alloc
	1781	* sizeof(reg_trie_state) );
	1782
	1783	/* and now dump it out before we compress it */
	1784	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	1785	revcharmap, next_alloc,
	1786	depth+1)
	1787	);
	1788
	1789	trie->trans = (reg_trie_trans *)
	1790	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	1791	{
	1792	U32 state;
	1793	U32 tp = 0;
	1794	U32 zp = 0;
	1795
	1796
	1797	for( state=1 ; state < next_alloc ; state ++ ) {
	1798	U32 base=0;
	1799
	1800	/*
	1801	DEBUG_TRIE_COMPILE_MORE_r(
	1802	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	1803	);
	1804	*/
	1805
	1806	if (trie->states[state].trans.list) {
	1807	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1808	U16 maxid=minid;
	1809	U16 idx;
	1810
	1811	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1812	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1813	if ( forid < minid ) {
	1814	minid=forid;
	1815	} else if ( forid > maxid ) {
	1816	maxid=forid;
	1817	}
	1818	}
	1819	if ( transcount < tp + maxid - minid + 1) {
	1820	transcount *= 2;
	1821	trie->trans = (reg_trie_trans *)
	1822	PerlMemShared_realloc( trie->trans,
	1823	transcount
	1824	* sizeof(reg_trie_trans) );
	1825	Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
	1826	}
	1827	base = trie->uniquecharcount + tp - minid;
	1828	if ( maxid == minid ) {
	1829	U32 set = 0;
	1830	for ( ; zp < tp ; zp++ ) {
	1831	if ( ! trie->trans[ zp ].next ) {
	1832	base = trie->uniquecharcount + zp - minid;
	1833	trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1834	trie->trans[ zp ].check = state;
	1835	set = 1;
	1836	break;
	1837	}
	1838	}
	1839	if ( !set ) {
	1840	trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1841	trie->trans[ tp ].check = state;
	1842	tp++;
	1843	zp = tp;
	1844	}
	1845	} else {
	1846	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1847	const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
	1848	trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
	1849	trie->trans[ tid ].check = state;
	1850	}
	1851	tp += ( maxid - minid + 1 );
	1852	}
	1853	Safefree(trie->states[ state ].trans.list);
	1854	}
	1855	/*
	1856	DEBUG_TRIE_COMPILE_MORE_r(
	1857	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	1858	);
	1859	*/
	1860	trie->states[ state ].trans.base=base;
	1861	}
	1862	trie->lasttrans = tp + 1;
	1863	}
	1864	} else {
	1865	/*
	1866	Second Pass -- Flat Table Representation.
	1867
	1868	we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
	1869	We know that we will need Charcount+1 trans at most to store the data
	1870	(one row per char at worst case) So we preallocate both structures
	1871	assuming worst case.
	1872
	1873	We then construct the trie using only the .next slots of the entry
	1874	structs.
	1875
	1876	We use the .check field of the first entry of the node temporarily to
	1877	make compression both faster and easier by keeping track of how many non
	1878	zero fields are in the node.
	1879
	1880	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	1881	transition.
	1882
	1883	There are two terms at use here: state as a TRIE_NODEIDX() which is a
	1884	number representing the first entry of the node, and state as a
	1885	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
	1886	TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
	1887	are 2 entrys per node. eg:
	1888
	1889	A B A B
	1890	1. 2 4 1. 3 7
	1891	2. 0 3 3. 0 5
	1892	3. 0 0 5. 0 0
	1893	4. 0 0 7. 0 0
	1894
	1895	The table is internally in the right hand, idx form. However as we also
	1896	have to deal with the states array which is indexed by nodenum we have to
	1897	use TRIE_NODENUM() to convert.
	1898
	1899	*/
	1900	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1901	"%*sCompiling trie using table compiler\n",
	1902	(int)depth * 2 + 2, ""));
	1903
	1904	trie->trans = (reg_trie_trans *)
	1905	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	1906	* trie->uniquecharcount + 1,
	1907	sizeof(reg_trie_trans) );
	1908	trie->states = (reg_trie_state *)
	1909	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1910	sizeof(reg_trie_state) );
	1911	next_alloc = trie->uniquecharcount + 1;
	1912
	1913
	1914	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1915
	1916	regnode * const noper = NEXTOPER( cur );
	1917	const U8 uc = (U8)STRING( noper );
	1918	const U8 * const e = uc + STR_LEN( noper );
	1919
	1920	U32 state = 1; /* required init */
	1921
	1922	U16 charid = 0; /* sanity init */
	1923	U32 accept_state = 0; /* sanity init */
	1924	U8 scan = (U8)NULL; /* sanity init */
	1925
	1926	STRLEN foldlen = 0; /* required init */
	1927	U32 wordlen = 0; /* required init */
	1928	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1929
	1930	if ( OP(noper) != NOTHING ) {
	1931	for ( ; uc < e ; uc += len ) {
	1932
	1933	TRIE_READ_CHAR;
	1934
	1935	if ( uvc < 256 ) {
	1936	charid = trie->charmap[ uvc ];
	1937	} else {
	1938	SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1939	charid = svpp ? (U16)SvIV(*svpp) : 0;
	1940	}
	1941	if ( charid ) {
	1942	charid--;
	1943	if ( !trie->trans[ state + charid ].next ) {
	1944	trie->trans[ state + charid ].next = next_alloc;
	1945	trie->trans[ state ].check++;
	1946	prev_states[TRIE_NODENUM(next_alloc)]
	1947	= TRIE_NODENUM(state);
	1948	next_alloc += trie->uniquecharcount;
	1949	}
	1950	state = trie->trans[ state + charid ].next;
	1951	} else {
	1952	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1953	}
	1954	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1955	}
	1956	}
	1957	accept_state = TRIE_NODENUM( state );
	1958	TRIE_HANDLE_WORD(accept_state);
	1959
	1960	} /* end second pass */
	1961
	1962	/* and now dump it out before we compress it */
	1963	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	1964	revcharmap,
	1965	next_alloc, depth+1));
	1966
	1967	{
	1968	/*
	1969	* Inplace compress the table.*
	1970
	1971	For sparse data sets the table constructed by the trie algorithm will
	1972	be mostly 0/FAIL transitions or to put it another way mostly empty.
	1973	(Note that leaf nodes will not contain any transitions.)
	1974
	1975	This algorithm compresses the tables by eliminating most such
	1976	transitions, at the cost of a modest bit of extra work during lookup:
	1977
	1978	- Each states[] entry contains a .base field which indicates the
	1979	index in the state[] array wheres its transition data is stored.
	1980
	1981	- If .base is 0 there are no valid transitions from that node.
	1982
	1983	- If .base is nonzero then charid is added to it to find an entry in
	1984	the trans array.
	1985
	1986	-If trans[states[state].base+charid].check!=state then the
	1987	transition is taken to be a 0/Fail transition. Thus if there are fail
	1988	transitions at the front of the node then the .base offset will point
	1989	somewhere inside the previous nodes data (or maybe even into a node
	1990	even earlier), but the .check field determines if the transition is
	1991	valid.
	1992
	1993	XXX - wrong maybe?
	1994	The following process inplace converts the table to the compressed
	1995	table: We first do not compress the root node 1,and mark all its
	1996	.check pointers as 1 and set its .base pointer as 1 as well. This
	1997	allows us to do a DFA construction from the compressed table later,
	1998	and ensures that any .base pointers we calculate later are greater
	1999	than 0.
	2000
	2001	- We set 'pos' to indicate the first entry of the second node.
	2002
	2003	- We then iterate over the columns of the node, finding the first and
	2004	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	2005	and set the .check pointers accordingly, and advance pos
	2006	appropriately and repreat for the next node. Note that when we copy
	2007	the next pointers we have to convert them from the original
	2008	NODEIDX form to NODENUM form as the former is not valid post
	2009	compression.
	2010
	2011	- If a node has no transitions used we mark its base as 0 and do not
	2012	advance the pos pointer.
	2013
	2014	- If a node only has one transition we use a second pointer into the
	2015	structure to fill in allocated fail transitions from other states.
	2016	This pointer is independent of the main pointer and scans forward
	2017	looking for null transitions that are allocated to a state. When it
	2018	finds one it writes the single transition into the "hole". If the
	2019	pointer doesnt find one the single transition is appended as normal.
	2020
	2021	- Once compressed we can Renew/realloc the structures to release the
	2022	excess space.
	2023
	2024	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	2025	specifically Fig 3.47 and the associated pseudocode.
	2026
	2027	demq
	2028	*/
	2029	const U32 laststate = TRIE_NODENUM( next_alloc );
	2030	U32 state, charid;
	2031	U32 pos = 0, zp=0;
	2032	trie->statecount = laststate;
	2033
	2034	for ( state = 1 ; state < laststate ; state++ ) {
	2035	U8 flag = 0;
	2036	const U32 stateidx = TRIE_NODEIDX( state );
	2037	const U32 o_used = trie->trans[ stateidx ].check;
	2038	U32 used = trie->trans[ stateidx ].check;
	2039	trie->trans[ stateidx ].check = 0;
	2040
	2041	for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
	2042	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	2043	if ( trie->trans[ stateidx + charid ].next ) {
	2044	if (o_used == 1) {
	2045	for ( ; zp < pos ; zp++ ) {
	2046	if ( ! trie->trans[ zp ].next ) {
	2047	break;
	2048	}
	2049	}
	2050	trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
	2051	trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2052	trie->trans[ zp ].check = state;
	2053	if ( ++zp > pos ) pos = zp;
	2054	break;
	2055	}
	2056	used--;
	2057	}
	2058	if ( !flag ) {
	2059	flag = 1;
	2060	trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
	2061	}
	2062	trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2063	trie->trans[ pos ].check = state;
	2064	pos++;
	2065	}
	2066	}
	2067	}
	2068	trie->lasttrans = pos + 1;
	2069	trie->states = (reg_trie_state *)
	2070	PerlMemShared_realloc( trie->states, laststate
	2071	* sizeof(reg_trie_state) );
	2072	DEBUG_TRIE_COMPILE_MORE_r(
	2073	PerlIO_printf( Perl_debug_log,
	2074	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	2075	(int)depth * 2 + 2,"",
	2076	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
	2077	(IV)next_alloc,
	2078	(IV)pos,
	2079	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	2080	);
	2081
	2082	} /* end table compress */
	2083	}
	2084	DEBUG_TRIE_COMPILE_MORE_r(
	2085	PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	2086	(int)depth * 2 + 2, "",
	2087	(UV)trie->statecount,
	2088	(UV)trie->lasttrans)
	2089	);
	2090	/* resize the trans array to remove unused space */
	2091	trie->trans = (reg_trie_trans *)
	2092	PerlMemShared_realloc( trie->trans, trie->lasttrans
	2093	* sizeof(reg_trie_trans) );
	2094
	2095	{ /* Modify the program and insert the new TRIE node */
	2096	U8 nodetype =(U8)(flags & 0xFF);
	2097	char *str=NULL;
	2098
	2099	#ifdef DEBUGGING
	2100	regnode *optimize = NULL;
	2101	#ifdef RE_TRACK_PATTERN_OFFSETS
	2102
	2103	U32 mjd_offset = 0;
	2104	U32 mjd_nodelen = 0;
	2105	#endif /* RE_TRACK_PATTERN_OFFSETS */
	2106	#endif /* DEBUGGING */
	2107	/*
	2108	This means we convert either the first branch or the first Exact,
	2109	depending on whether the thing following (in 'last') is a branch
	2110	or not and whther first is the startbranch (ie is it a sub part of
	2111	the alternation or is it the whole thing.)
	2112	Assuming its a sub part we convert the EXACT otherwise we convert
	2113	the whole branch sequence, including the first.
	2114	*/
	2115	/* Find the node we are going to overwrite */
	2116	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	2117	/* branch sub-chain */
	2118	NEXT_OFF( first ) = (U16)(last - first);
	2119	#ifdef RE_TRACK_PATTERN_OFFSETS
	2120	DEBUG_r({
	2121	mjd_offset= Node_Offset((convert));
	2122	mjd_nodelen= Node_Length((convert));
	2123	});
	2124	#endif
	2125	/* whole branch chain */
	2126	}
	2127	#ifdef RE_TRACK_PATTERN_OFFSETS
	2128	else {
	2129	DEBUG_r({
	2130	const regnode *nop = NEXTOPER( convert );
	2131	mjd_offset= Node_Offset((nop));
	2132	mjd_nodelen= Node_Length((nop));
	2133	});
	2134	}
	2135	DEBUG_OPTIMISE_r(
	2136	PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2137	(int)depth * 2 + 2, "",
	2138	(UV)mjd_offset, (UV)mjd_nodelen)
	2139	);
	2140	#endif
	2141	/* But first we check to see if there is a common prefix we can
	2142	split out as an EXACT and put in front of the TRIE node. */
	2143	trie->startstate= 1;
	2144	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2145	U32 state;
	2146	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2147	U32 ofs = 0;
	2148	I32 idx = -1;
	2149	U32 count = 0;
	2150	const U32 base = trie->states[ state ].trans.base;
	2151
	2152	if ( trie->states[state].wordnum )
	2153	count = 1;
	2154
	2155	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2156	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2157	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2158	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2159	{
	2160	if ( ++count > 1 ) {
	2161	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2162	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2163	if ( state == 1 ) break;
	2164	if ( count == 2 ) {
	2165	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2166	DEBUG_OPTIMISE_r(
	2167	PerlIO_printf(Perl_debug_log,
	2168	"%*sNew Start State=%"UVuf" Class: [",
	2169	(int)depth * 2 + 2, "",
	2170	(UV)state));
	2171	if (idx >= 0) {
	2172	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2173	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2174
	2175	TRIE_BITMAP_SET(trie,*ch);
	2176	if ( folder )
	2177	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2178	DEBUG_OPTIMISE_r(
	2179	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2180	);
	2181	}
	2182	}
	2183	TRIE_BITMAP_SET(trie,*ch);
	2184	if ( folder )
	2185	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2186	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2187	}
	2188	idx = ofs;
	2189	}
	2190	}
	2191	if ( count == 1 ) {
	2192	SV **tmp = av_fetch( revcharmap, idx, 0);
	2193	STRLEN len;
	2194	char ch = SvPV( tmp, len );
	2195	DEBUG_OPTIMISE_r({
	2196	SV *sv=sv_newmortal();
	2197	PerlIO_printf( Perl_debug_log,
	2198	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2199	(int)depth * 2 + 2, "",
	2200	(UV)state, (UV)idx,
	2201	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2202	PL_colors[0], PL_colors[1],
	2203	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2204	PERL_PV_ESCAPE_FIRSTCHAR
	2205	)
	2206	);
	2207	});
	2208	if ( state==1 ) {
	2209	OP( convert ) = nodetype;
	2210	str=STRING(convert);
	2211	STR_LEN(convert)=0;
	2212	}
	2213	STR_LEN(convert) += len;
	2214	while (len--)
	2215	str++ = ch++;
	2216	} else {
	2217	#ifdef DEBUGGING
	2218	if (state>1)
	2219	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2220	#endif
	2221	break;
	2222	}
	2223	}
	2224	trie->prefixlen = (state-1);
	2225	if (str) {
	2226	regnode *n = convert+NODE_SZ_STR(convert);
	2227	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2228	trie->startstate = state;
	2229	trie->minlen -= (state - 1);
	2230	trie->maxlen -= (state - 1);
	2231	#ifdef DEBUGGING
	2232	/* At least the UNICOS C compiler choked on this
	2233	* being argument to DEBUG_r(), so let's just have
	2234	* it right here. */
	2235	if (
	2236	#ifdef PERL_EXT_RE_BUILD
	2237	1
	2238	#else
	2239	DEBUG_r_TEST
	2240	#endif
	2241	) {
	2242	regnode *fix = convert;
	2243	U32 word = trie->wordcount;
	2244	mjd_nodelen++;
	2245	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2246	while( ++fix < n ) {
	2247	Set_Node_Offset_Length(fix, 0, 0);
	2248	}
	2249	while (word--) {
	2250	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2251	if (tmp) {
	2252	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2253	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2254	else
	2255	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2256	}
	2257	}
	2258	}
	2259	#endif
	2260	if (trie->maxlen) {
	2261	convert = n;
	2262	} else {
	2263	NEXT_OFF(convert) = (U16)(tail - convert);
	2264	DEBUG_r(optimize= n);
	2265	}
	2266	}
	2267	}
	2268	if (!jumper)
	2269	jumper = last;
	2270	if ( trie->maxlen ) {
	2271	NEXT_OFF( convert ) = (U16)(tail - convert);
	2272	ARG_SET( convert, data_slot );
	2273	/* Store the offset to the first unabsorbed branch in
	2274	jump[0], which is otherwise unused by the jump logic.
	2275	We use this when dumping a trie and during optimisation. */
	2276	if (trie->jump)
	2277	trie->jump[0] = (U16)(nextbranch - convert);
	2278
	2279	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2280	* and there is a bitmap
	2281	* and the first "jump target" node we found leaves enough room
	2282	* then convert the TRIE node into a TRIEC node, with the bitmap
	2283	* embedded inline in the opcode - this is hypothetically faster.
	2284	*/
	2285	if ( !trie->states[trie->startstate].wordnum
	2286	&& trie->bitmap
	2287	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2288	{
	2289	OP( convert ) = TRIEC;
	2290	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2291	PerlMemShared_free(trie->bitmap);
	2292	trie->bitmap= NULL;
	2293	} else
	2294	OP( convert ) = TRIE;
	2295
	2296	/* store the type in the flags */
	2297	convert->flags = nodetype;
	2298	DEBUG_r({
	2299	optimize = convert
	2300	+ NODE_STEP_REGNODE
	2301	+ regarglen[ OP( convert ) ];
	2302	});
	2303	/* XXX We really should free up the resource in trie now,
	2304	as we won't use them - (which resources?) dmq */
	2305	}
	2306	/* needed for dumping*/
	2307	DEBUG_r(if (optimize) {
	2308	regnode *opt = convert;
	2309
	2310	while ( ++opt < optimize) {
	2311	Set_Node_Offset_Length(opt,0,0);
	2312	}
	2313	/*
	2314	Try to clean up some of the debris left after the
	2315	optimisation.
	2316	*/
	2317	while( optimize < jumper ) {
	2318	mjd_nodelen += Node_Length((optimize));
	2319	OP( optimize ) = OPTIMIZED;
	2320	Set_Node_Offset_Length(optimize,0,0);
	2321	optimize++;
	2322	}
	2323	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2324	});
	2325	} /* end node insert */
	2326
	2327	/* Finish populating the prev field of the wordinfo array. Walk back
	2328	* from each accept state until we find another accept state, and if
	2329	* so, point the first word's .prev field at the second word. If the
	2330	* second already has a .prev field set, stop now. This will be the
	2331	* case either if we've already processed that word's accept state,
	2332	* or that state had multiple words, and the overspill words were
	2333	* already linked up earlier.
	2334	*/
	2335	{
	2336	U16 word;
	2337	U32 state;
	2338	U16 prev;
	2339
	2340	for (word=1; word <= trie->wordcount; word++) {
	2341	prev = 0;
	2342	if (trie->wordinfo[word].prev)
	2343	continue;
	2344	state = trie->wordinfo[word].accept;
	2345	while (state) {
	2346	state = prev_states[state];
	2347	if (!state)
	2348	break;
	2349	prev = trie->states[state].wordnum;
	2350	if (prev)
	2351	break;
	2352	}
	2353	trie->wordinfo[word].prev = prev;
	2354	}
	2355	Safefree(prev_states);
	2356	}
	2357
	2358
	2359	/* and now dump out the compressed format */
	2360	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2361
	2362	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2363	#ifdef DEBUGGING
	2364	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2365	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2366	#else
	2367	SvREFCNT_dec(revcharmap);
	2368	#endif
	2369	return trie->jump
	2370	? MADE_JUMP_TRIE
	2371	: trie->startstate>1
	2372	? MADE_EXACT_TRIE
	2373	: MADE_TRIE;
	2374	}
	2375
	2376	STATIC void
	2377	S_make_trie_failtable(pTHX_ RExC_state_t pRExC_state, regnode source, regnode *stclass, U32 depth)
	2378	{
	2379	/* The Trie is constructed and compressed now so we can build a fail array if it's needed
	2380
	2381	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
	2382	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
	2383	ISBN 0-201-10088-6
	2384
	2385	We find the fail state for each state in the trie, this state is the longest proper
	2386	suffix of the current state's 'word' that is also a proper prefix of another word in our
	2387	trie. State 1 represents the word '' and is thus the default fail state. This allows
	2388	the DFA not to have to restart after its tried and failed a word at a given point, it
	2389	simply continues as though it had been matching the other word in the first place.
	2390	Consider
	2391	'abcdgu'=~/abcdefg\|cdgu/
	2392	When we get to 'd' we are still matching the first word, we would encounter 'g' which would
	2393	fail, which would bring us to the state representing 'd' in the second word where we would
	2394	try 'g' and succeed, proceeding to match 'cdgu'.
	2395	*/
	2396	/* add a fail transition */
	2397	const U32 trie_offset = ARG(source);
	2398	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	2399	U32 *q;
	2400	const U32 ucharcount = trie->uniquecharcount;
	2401	const U32 numstates = trie->statecount;
	2402	const U32 ubound = trie->lasttrans + ucharcount;
	2403	U32 q_read = 0;
	2404	U32 q_write = 0;
	2405	U32 charid;
	2406	U32 base = trie->states[ 1 ].trans.base;
	2407	U32 *fail;
	2408	reg_ac_data *aho;
	2409	const U32 data_slot = add_data( pRExC_state, 1, "T" );
	2410	GET_RE_DEBUG_FLAGS_DECL;
	2411
	2412	PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
	2413	#ifndef DEBUGGING
	2414	PERL_UNUSED_ARG(depth);
	2415	#endif
	2416
	2417
	2418	ARG_SET( stclass, data_slot );
	2419	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	2420	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	2421	aho->trie=trie_offset;
	2422	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	2423	Copy( trie->states, aho->states, numstates, reg_trie_state );
	2424	Newxz( q, numstates, U32);
	2425	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	2426	aho->refcount = 1;
	2427	fail = aho->fail;
	2428	/* initialize fail[0..1] to be 1 so that we always have
	2429	a valid final fail state */
	2430	fail[ 0 ] = fail[ 1 ] = 1;
	2431
	2432	for ( charid = 0; charid < ucharcount ; charid++ ) {
	2433	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	2434	if ( newstate ) {
	2435	q[ q_write ] = newstate;
	2436	/* set to point at the root */
	2437	fail[ q[ q_write++ ] ]=1;
	2438	}
	2439	}
	2440	while ( q_read < q_write) {
	2441	const U32 cur = q[ q_read++ % numstates ];
	2442	base = trie->states[ cur ].trans.base;
	2443
	2444	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	2445	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	2446	if (ch_state) {
	2447	U32 fail_state = cur;
	2448	U32 fail_base;
	2449	do {
	2450	fail_state = fail[ fail_state ];
	2451	fail_base = aho->states[ fail_state ].trans.base;
	2452	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	2453
	2454	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	2455	fail[ ch_state ] = fail_state;
	2456	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	2457	{
	2458	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	2459	}
	2460	q[ q_write++ % numstates] = ch_state;
	2461	}
	2462	}
	2463	}
	2464	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	2465	when we fail in state 1, this allows us to use the
	2466	charclass scan to find a valid start char. This is based on the principle
	2467	that theres a good chance the string being searched contains lots of stuff
	2468	that cant be a start char.
	2469	*/
	2470	fail[ 0 ] = fail[ 1 ] = 0;
	2471	DEBUG_TRIE_COMPILE_r({
	2472	PerlIO_printf(Perl_debug_log,
	2473	"%*sStclass Failtable (%"UVuf" states): 0",
	2474	(int)(depth * 2), "", (UV)numstates
	2475	);
	2476	for( q_read=1; q_read<numstates; q_read++ ) {
	2477	PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	2478	}
	2479	PerlIO_printf(Perl_debug_log, "\n");
	2480	});
	2481	Safefree(q);
	2482	/RExC_seen \|= REG_SEEN_TRIEDFA;/
	2483	}
	2484
	2485
	2486	/*
	2487	* There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
	2488	* These need to be revisited when a newer toolchain becomes available.
	2489	*/
	2490	#if defined(__sparc64__) && defined(__GNUC__)
	2491	# if __GNUC__ < 2 \|\| (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
	2492	# undef SPARC64_GCC_WORKAROUND
	2493	# define SPARC64_GCC_WORKAROUND 1
	2494	# endif
	2495	#endif
	2496
	2497	#define DEBUG_PEEP(str,scan,depth) \
	2498	DEBUG_OPTIMISE_r({if (scan){ \
	2499	SV * const mysv=sv_newmortal(); \
	2500	regnode *Next = regnext(scan); \
	2501	regprop(RExC_rx, mysv, scan); \
	2502	PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	2503	(int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	2504	Next ? (REG_NODE_NUM(Next)) : 0 ); \
	2505	}});
	2506
	2507
	2508	/* The below joins as many adjacent EXACTish nodes as possible into a single
	2509	* one, and looks for problematic sequences of characters whose folds vs.
	2510	* non-folds have sufficiently different lengths, that the optimizer would be
	2511	* fooled into rejecting legitimate matches of them, and the trie construction
	2512	* code can't cope with them. The joining is only done if:
	2513	* 1) there is room in the current conglomerated node to entirely contain the
	2514	* next one.
	2515	* 2) they are the exact same node type
	2516	*
	2517	* The adjacent nodes actually may be separated by NOTHING kind nodes, and
	2518	* these get optimized out
	2519	*
	2520	* If there are problematic code sequences, *min_subtract is set to the delta
	2521	* that the minimum size of the node can be less than its actual size. And,
	2522	* the node type of the result is changed to reflect that it contains these
	2523	* sequences.
	2524	*
	2525	* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
	2526	* and contains LATIN SMALL LETTER SHARP S
	2527	*
	2528	* This is as good a place as any to discuss the design of handling these
	2529	* problematic sequences. It's been wrong in Perl for a very long time. There
	2530	* are three code points in Unicode whose folded lengths differ so much from
	2531	* the un-folded lengths that it causes problems for the optimizer and trie
	2532	* construction. Why only these are problematic, and not others where lengths
	2533	* also differ is something I (khw) do not understand. New versions of Unicode
	2534	* might add more such code points. Hopefully the logic in fold_grind.t that
	2535	* figures out what to test (in part by verifying that each size-combination
	2536	* gets tested) will catch any that do come along, so they can be added to the
	2537	* special handling below. The chances of new ones are actually rather small,
	2538	* as most, if not all, of the world's scripts that have casefolding have
	2539	* already been encoded by Unicode. Also, a number of Unicode's decisions were
	2540	* made to allow compatibility with pre-existing standards, and almost all of
	2541	* those have already been dealt with. These would otherwise be the most
	2542	* likely candidates for generating further tricky sequences. In other words,
	2543	* Unicode by itself is unlikely to add new ones unless it is for compatibility
	2544	* with pre-existing standards, and there aren't many of those left.
	2545	*
	2546	* The previous designs for dealing with these involved assigning a special
	2547	* node for them. This approach doesn't work, as evidenced by this example:
	2548	* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
	2549	* Both these fold to "sss", but if the pattern is parsed to create a node
	2550	* that would match just the \xDF, it won't be able to handle the case where a
	2551	* successful match would have to cross the node's boundary. The new approach
	2552	* that hopefully generally solves the problem generates an EXACTFU_SS node
	2553	* that is "sss".
	2554	*
	2555	* There are a number of components to the approach (a lot of work for just
	2556	* three code points!):
	2557	* 1) This routine examines each EXACTFish node that could contain the
	2558	* problematic sequences. It returns in *min_subtract how much to
	2559	* subtract from the actual length of the string to get a real minimum
	2560	* for one that could match it. This number is usually 0 except for the
	2561	* problematic sequences. This delta is used by the caller to adjust the
	2562	* min length of the match, and the delta between min and max, so that the
	2563	* optimizer doesn't reject these possibilities based on size constraints.
	2564	* 2) These sequences are not currently correctly handled by the trie code
	2565	* either, so it changes the joined node type to ops that are not handled
	2566	* by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
	2567	* 3) This is sufficient for the two Greek sequences (described below), but
	2568	* the one involving the Sharp s (\xDF) needs more. The node type
	2569	* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
	2570	* sequence in it. For non-UTF-8 patterns and strings, this is the only
	2571	* case where there is a possible fold length change. That means that a
	2572	* regular EXACTFU node without UTF-8 involvement doesn't have to concern
	2573	* itself with length changes, and so can be processed faster. regexec.c
	2574	* takes advantage of this. Generally, an EXACTFish node that is in UTF-8
	2575	* is pre-folded by regcomp.c. This saves effort in regex matching.
	2576	* However, probably mostly for historical reasons, the pre-folding isn't
	2577	* done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
	2578	* nodes, as what they fold to isn't known until runtime.) The fold
	2579	* possibilities for the non-UTF8 patterns are quite simple, except for
	2580	* the sharp s. All the ones that don't involve a UTF-8 target string
	2581	* are members of a fold-pair, and arrays are set up for all of them
	2582	* that quickly find the other member of the pair. It might actually
	2583	* be faster to pre-fold these, but it isn't currently done, except for
	2584	* the sharp s. Code elsewhere in this file makes sure that it gets
	2585	* folded to 'ss', even if the pattern isn't UTF-8. This avoids the
	2586	* issues described in the next item.
	2587	* 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
	2588	* 'ss' or not is not knowable at compile time. It will match iff the
	2589	* target string is in UTF-8, unlike the EXACTFU nodes, where it always
	2590	* matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
	2591	* it can't be folded to "ss" at compile time, unlike EXACTFU does as
	2592	* described in item 3). An assumption that the optimizer part of
	2593	* regexec.c (probably unwittingly) makes is that a character in the
	2594	* pattern corresponds to at most a single character in the target string.
	2595	* (And I do mean character, and not byte here, unlike other parts of the
	2596	* documentation that have never been updated to account for multibyte
	2597	* Unicode.) This assumption is wrong only in this case, as all other
	2598	* cases are either 1-1 folds when no UTF-8 is involved; or is true by
	2599	* virtue of having this file pre-fold UTF-8 patterns. I'm
	2600	* reluctant to try to change this assumption, so instead the code punts.
	2601	* This routine examines EXACTF nodes for the sharp s, and returns a
	2602	* boolean indicating whether or not the node is an EXACTF node that
	2603	* contains a sharp s. When it is true, the caller sets a flag that later
	2604	* causes the optimizer in this file to not set values for the floating
	2605	* and fixed string lengths, and thus avoids the optimizer code in
	2606	* regexec.c that makes the invalid assumption. Thus, there is no
	2607	* optimization based on string lengths for EXACTF nodes that contain the
	2608	* sharp s. This only happens for /id rules (which means the pattern
	2609	* isn't in UTF-8).
	2610	*/
	2611
	2612	#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
	2613	if (PL_regkind[OP(scan)] == EXACT) \
	2614	join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
	2615
	2616	STATIC U32
	2617	S_join_exact(pTHX_ RExC_state_t pRExC_state, regnode scan, UV min_subtract, bool has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
	2618	/* Merge several consecutive EXACTish nodes into one. */
	2619	regnode *n = regnext(scan);
	2620	U32 stringok = 1;
	2621	regnode *next = scan + NODE_SZ_STR(scan);
	2622	U32 merged = 0;
	2623	U32 stopnow = 0;
	2624	#ifdef DEBUGGING
	2625	regnode *stop = scan;
	2626	GET_RE_DEBUG_FLAGS_DECL;
	2627	#else
	2628	PERL_UNUSED_ARG(depth);
	2629	#endif
	2630
	2631	PERL_ARGS_ASSERT_JOIN_EXACT;
	2632	#ifndef EXPERIMENTAL_INPLACESCAN
	2633	PERL_UNUSED_ARG(flags);
	2634	PERL_UNUSED_ARG(val);
	2635	#endif
	2636	DEBUG_PEEP("join",scan,depth);
	2637
	2638	/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
	2639	* EXACT ones that are mergeable to the current one. */
	2640	while (n
	2641	&& (PL_regkind[OP(n)] == NOTHING
	2642	\|\| (stringok && OP(n) == OP(scan)))
	2643	&& NEXT_OFF(n)
	2644	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
	2645	{
	2646
	2647	if (OP(n) == TAIL \|\| n > next)
	2648	stringok = 0;
	2649	if (PL_regkind[OP(n)] == NOTHING) {
	2650	DEBUG_PEEP("skip:",n,depth);
	2651	NEXT_OFF(scan) += NEXT_OFF(n);
	2652	next = n + NODE_STEP_REGNODE;
	2653	#ifdef DEBUGGING
	2654	if (stringok)
	2655	stop = n;
	2656	#endif
	2657	n = regnext(n);
	2658	}
	2659	else if (stringok) {
	2660	const unsigned int oldl = STR_LEN(scan);
	2661	regnode * const nnext = regnext(n);
	2662
	2663	if (oldl + STR_LEN(n) > U8_MAX)
	2664	break;
	2665
	2666	DEBUG_PEEP("merg",n,depth);
	2667	merged++;
	2668
	2669	NEXT_OFF(scan) += NEXT_OFF(n);
	2670	STR_LEN(scan) += STR_LEN(n);
	2671	next = n + NODE_SZ_STR(n);
	2672	/* Now we can overwrite n : /
	2673	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	2674	#ifdef DEBUGGING
	2675	stop = next - 1;
	2676	#endif
	2677	n = nnext;
	2678	if (stopnow) break;
	2679	}
	2680
	2681	#ifdef EXPERIMENTAL_INPLACESCAN
	2682	if (flags && !NEXT_OFF(n)) {
	2683	DEBUG_PEEP("atch", val, depth);
	2684	if (reg_off_by_arg[OP(n)]) {
	2685	ARG_SET(n, val - n);
	2686	}
	2687	else {
	2688	NEXT_OFF(n) = val - n;
	2689	}
	2690	stopnow = 1;
	2691	}
	2692	#endif
	2693	}
	2694
	2695	*min_subtract = 0;
	2696	*has_exactf_sharp_s = FALSE;
	2697
	2698	/* Here, all the adjacent mergeable EXACTish nodes have been merged. We
	2699	* can now analyze for sequences of problematic code points. (Prior to
	2700	* this final joining, sequences could have been split over boundaries, and
	2701	* hence missed). The sequences only happen in folding, hence for any
	2702	* non-EXACT EXACTish node */
	2703	if (OP(scan) != EXACT) {
	2704	U8 *s;
	2705	U8 * s0 = (U8*) STRING(scan);
	2706	U8 * const s_end = s0 + STR_LEN(scan);
	2707
	2708	/* The below is perhaps overboard, but this allows us to save a test
	2709	* each time through the loop at the expense of a mask. This is
	2710	* because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
	2711	* single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
	2712	* This uses an exclusive 'or' to find that bit and then inverts it to
	2713	* form a mask, with just a single 0, in the bit position where 'S' and
	2714	* 's' differ. */
	2715	const U8 S_or_s_mask = ~ ('S' ^ 's');
	2716	const U8 s_masked = 's' & S_or_s_mask;
	2717
	2718	/* One pass is made over the node's string looking for all the
	2719	* possibilities. to avoid some tests in the loop, there are two main
	2720	* cases, for UTF-8 patterns (which can't have EXACTF nodes) and
	2721	* non-UTF-8 */
	2722	if (UTF) {
	2723
	2724	/* There are two problematic Greek code points in Unicode
	2725	* casefolding
	2726	*
	2727	* U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	2728	* U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
	2729	*
	2730	* which casefold to
	2731	*
	2732	* Unicode UTF-8
	2733	*
	2734	* U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
	2735	* U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
	2736	*
	2737	* This means that in case-insensitive matching (or "loose
	2738	* matching", as Unicode calls it), an EXACTF of length six (the
	2739	* UTF-8 encoded byte length of the above casefolded versions) can
	2740	* match a target string of length two (the byte length of UTF-8
	2741	* encoded U+0390 or U+03B0). This would rather mess up the
	2742	* minimum length computation. (there are other code points that
	2743	* also fold to these two sequences, but the delta is smaller)
	2744	*
	2745	* If these sequences are found, the minimum length is decreased by
	2746	* four (six minus two).
	2747	*
	2748	* Similarly, 'ss' may match the single char and byte LATIN SMALL
	2749	* LETTER SHARP S. We decrease the min length by 1 for each
	2750	* occurrence of 'ss' found */
	2751
	2752	#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
	2753	# define U390_first_byte 0xb4
	2754	const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
	2755	# define U3B0_first_byte 0xb5
	2756	const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
	2757	#else
	2758	# define U390_first_byte 0xce
	2759	const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
	2760	# define U3B0_first_byte 0xcf
	2761	const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
	2762	#endif
	2763	const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
	2764	yields a net of 0 */
	2765	/* Examine the string for one of the problematic sequences */
	2766	for (s = s0;
	2767	s < s_end - 1; /* Can stop 1 before the end, as minimum length
	2768	* sequence we are looking for is 2 */
	2769	s += UTF8SKIP(s))
	2770	{
	2771
	2772	/* Look for the first byte in each problematic sequence */
	2773	switch (*s) {
	2774	/* We don't have to worry about other things that fold to
	2775	* 's' (such as the long s, U+017F), as all above-latin1
	2776	* code points have been pre-folded */
	2777	case 's':
	2778	case 'S':
	2779
	2780	/* Current character is an 's' or 'S'. If next one is
	2781	* as well, we have the dreaded sequence */
	2782	if (((*(s+1) & S_or_s_mask) == s_masked)
	2783	/* These two node types don't have special handling
	2784	* for 'ss' */
	2785	&& OP(scan) != EXACTFL && OP(scan) != EXACTFA)
	2786	{
	2787	*min_subtract += 1;
	2788	OP(scan) = EXACTFU_SS;
	2789	s++; /* No need to look at this character again */
	2790	}
	2791	break;
	2792
	2793	case U390_first_byte:
	2794	if (s_end - s >= len
	2795
	2796	/* The 1's are because are skipping comparing the
	2797	* first byte */
	2798	&& memEQ(s + 1, U390_tail, len - 1))
	2799	{
	2800	goto greek_sequence;
	2801	}
	2802	break;
	2803
	2804	case U3B0_first_byte:
	2805	if (! (s_end - s >= len
	2806	&& memEQ(s + 1, U3B0_tail, len - 1)))
	2807	{
	2808	break;
	2809	}
	2810	greek_sequence:
	2811	*min_subtract += 4;
	2812
	2813	/* This can't currently be handled by trie's, so change
	2814	* the node type to indicate this. If EXACTFA and
	2815	* EXACTFL were ever to be handled by trie's, this
	2816	* would have to be changed. If this node has already
	2817	* been changed to EXACTFU_SS in this loop, leave it as
	2818	* is. (I (khw) think it doesn't matter in regexec.c
	2819	* for UTF patterns, but no need to change it */
	2820	if (OP(scan) == EXACTFU) {
	2821	OP(scan) = EXACTFU_NO_TRIE;
	2822	}
	2823	s += 6; /* We already know what this sequence is. Skip
	2824	the rest of it */
	2825	break;
	2826	}
	2827	}
	2828	}
	2829	else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
	2830
	2831	/* Here, the pattern is not UTF-8. We need to look only for the
	2832	* 'ss' sequence, and in the EXACTF case, the sharp s, which can be
	2833	* in the final position. Otherwise we can stop looking 1 byte
	2834	* earlier because have to find both the first and second 's' */
	2835	const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
	2836
	2837	for (s = s0; s < upper; s++) {
	2838	switch (*s) {
	2839	case 'S':
	2840	case 's':
	2841	if (s_end - s > 1
	2842	&& ((*(s+1) & S_or_s_mask) == s_masked))
	2843	{
	2844	*min_subtract += 1;
	2845
	2846	/* EXACTF nodes need to know that the minimum
	2847	* length changed so that a sharp s in the string
	2848	* can match this ss in the pattern, but they
	2849	* remain EXACTF nodes, as they are not trie'able,
	2850	* so don't have to invent a new node type to
	2851	* exclude them from the trie code */
	2852	if (OP(scan) != EXACTF) {
	2853	OP(scan) = EXACTFU_SS;
	2854	}
	2855	s++;
	2856	}
	2857	break;
	2858	case LATIN_SMALL_LETTER_SHARP_S:
	2859	if (OP(scan) == EXACTF) {
	2860	*has_exactf_sharp_s = TRUE;
	2861	}
	2862	break;
	2863	}
	2864	}
	2865	}
	2866	}
	2867
	2868	#ifdef DEBUGGING
	2869	/* Allow dumping but overwriting the collection of skipped
	2870	* ops and/or strings with fake optimized ops */
	2871	n = scan + NODE_SZ_STR(scan);
	2872	while (n <= stop) {
	2873	OP(n) = OPTIMIZED;
	2874	FLAGS(n) = 0;
	2875	NEXT_OFF(n) = 0;
	2876	n++;
	2877	}
	2878	#endif
	2879	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	2880	return stopnow;
	2881	}
	2882
	2883	/* REx optimizer. Converts nodes into quicker variants "in place".
	2884	Finds fixed substrings. */
	2885
	2886	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	2887	to the position after last scanned or to NULL. */
	2888
	2889	#define INIT_AND_WITHP \
	2890	assert(!and_withp); \
	2891	Newx(and_withp,1,struct regnode_charclass_class); \
	2892	SAVEFREEPV(and_withp)
	2893
	2894	/* this is a chain of data about sub patterns we are processing that
	2895	need to be handled separately/specially in study_chunk. Its so
	2896	we can simulate recursion without losing state. */
	2897	struct scan_frame;
	2898	typedef struct scan_frame {
	2899	regnode last; / last node to process in this frame */
	2900	regnode next; / next node to process when last is reached */
	2901	struct scan_frame prev; /previous frame*/
	2902	I32 stop; /* what stopparen do we use */
	2903	} scan_frame;
	2904
	2905
	2906	#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
	2907
	2908	#define CASE_SYNST_FNC(nAmE) \
	2909	case nAmE: \
	2910	if (flags & SCF_DO_STCLASS_AND) { \
	2911	for (value = 0; value < 256; value++) \
	2912	if (!is_ ## nAmE ## _cp(value)) \
	2913	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2914	} \
	2915	else { \
	2916	for (value = 0; value < 256; value++) \
	2917	if (is_ ## nAmE ## _cp(value)) \
	2918	ANYOF_BITMAP_SET(data->start_class, value); \
	2919	} \
	2920	break; \
	2921	case N ## nAmE: \
	2922	if (flags & SCF_DO_STCLASS_AND) { \
	2923	for (value = 0; value < 256; value++) \
	2924	if (is_ ## nAmE ## _cp(value)) \
	2925	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2926	} \
	2927	else { \
	2928	for (value = 0; value < 256; value++) \
	2929	if (!is_ ## nAmE ## _cp(value)) \
	2930	ANYOF_BITMAP_SET(data->start_class, value); \
	2931	} \
	2932	break
	2933
	2934
	2935
	2936	STATIC I32
	2937	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	2938	I32 minlenp, I32 deltap,
	2939	regnode *last,
	2940	scan_data_t *data,
	2941	I32 stopparen,
	2942	U8* recursed,
	2943	struct regnode_charclass_class *and_withp,
	2944	U32 flags, U32 depth)
	2945	/* scanp: Start here (read-write). */
	2946	/* deltap: Write maxlen-minlen here. */
	2947	/* last: Stop before this one. */
	2948	/* data: string data about the pattern */
	2949	/* stopparen: treat close N as END */
	2950	/* recursed: which subroutines have we recursed into */
	2951	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	2952	{
	2953	dVAR;
	2954	I32 min = 0, pars = 0, code;
	2955	regnode scan = scanp, *next;
	2956	I32 delta = 0;
	2957	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	2958	int is_inf_internal = 0; /* The studied chunk is infinite */
	2959	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	2960	scan_data_t data_fake;
	2961	SV *re_trie_maxbuff = NULL;
	2962	regnode *first_non_open = scan;
	2963	I32 stopmin = I32_MAX;
	2964	scan_frame *frame = NULL;
	2965	GET_RE_DEBUG_FLAGS_DECL;
	2966
	2967	PERL_ARGS_ASSERT_STUDY_CHUNK;
	2968
	2969	#ifdef DEBUGGING
	2970	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	2971	#endif
	2972
	2973	if ( depth == 0 ) {
	2974	while (first_non_open && OP(first_non_open) == OPEN)
	2975	first_non_open=regnext(first_non_open);
	2976	}
	2977
	2978
	2979	fake_study_recurse:
	2980	while ( scan && OP(scan) != END && scan < last ){
	2981	UV min_subtract = 0; /* How much to subtract from the minimum node
	2982	length to get a real minimum (because the
	2983	folded version may be shorter) */
	2984	bool has_exactf_sharp_s = FALSE;
	2985	/* Peephole optimizer: */
	2986	DEBUG_STUDYDATA("Peep:", data,depth);
	2987	DEBUG_PEEP("Peep",scan,depth);
	2988
	2989	/* Its not clear to khw or hv why this is done here, and not in the
	2990	* clauses that deal with EXACT nodes. khw's guess is that it's
	2991	* because of a previous design */
	2992	JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
	2993
	2994	/* Follow the next-chain of the current node and optimize
	2995	away all the NOTHINGs from it. */
	2996	if (OP(scan) != CURLYX) {
	2997	const int max = (reg_off_by_arg[OP(scan)]
	2998	? I32_MAX
	2999	/* I32 may be smaller than U16 on CRAYs! */
	3000	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	3001	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	3002	int noff;
	3003	regnode *n = scan;
	3004
	3005	/* Skip NOTHING and LONGJMP. */
	3006	while ((n = regnext(n))
	3007	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	3008	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	3009	&& off + noff < max)
	3010	off += noff;
	3011	if (reg_off_by_arg[OP(scan)])
	3012	ARG(scan) = off;
	3013	else
	3014	NEXT_OFF(scan) = off;
	3015	}
	3016
	3017
	3018
	3019	/* The principal pseudo-switch. Cannot be a switch, since we
	3020	look into several different things. */
	3021	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	3022	\|\| OP(scan) == IFTHEN) {
	3023	next = regnext(scan);
	3024	code = OP(scan);
	3025	/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
	3026
	3027	if (OP(next) == code \|\| code == IFTHEN) {
	3028	/* NOTE - There is similar code to this block below for handling
	3029	TRIE nodes on a re-study. If you change stuff here check there
	3030	too. */
	3031	I32 max1 = 0, min1 = I32_MAX, num = 0;
	3032	struct regnode_charclass_class accum;
	3033	regnode * const startbranch=scan;
	3034
	3035	if (flags & SCF_DO_SUBSTR)
	3036	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
	3037	if (flags & SCF_DO_STCLASS)
	3038	cl_init_zero(pRExC_state, &accum);
	3039
	3040	while (OP(scan) == code) {
	3041	I32 deltanext, minnext, f = 0, fake;
	3042	struct regnode_charclass_class this_class;
	3043
	3044	num++;
	3045	data_fake.flags = 0;
	3046	if (data) {
	3047	data_fake.whilem_c = data->whilem_c;
	3048	data_fake.last_closep = data->last_closep;
	3049	}
	3050	else
	3051	data_fake.last_closep = &fake;
	3052
	3053	data_fake.pos_delta = delta;
	3054	next = regnext(scan);
	3055	scan = NEXTOPER(scan);
	3056	if (code != BRANCH)
	3057	scan = NEXTOPER(scan);
	3058	if (flags & SCF_DO_STCLASS) {
	3059	cl_init(pRExC_state, &this_class);
	3060	data_fake.start_class = &this_class;
	3061	f = SCF_DO_STCLASS_AND;
	3062	}
	3063	if (flags & SCF_WHILEM_VISITED_POS)
	3064	f \|= SCF_WHILEM_VISITED_POS;
	3065
	3066	/* we suppose the run is continuous, last=next...*/
	3067	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3068	next, &data_fake,
	3069	stopparen, recursed, NULL, f,depth+1);
	3070	if (min1 > minnext)
	3071	min1 = minnext;
	3072	if (max1 < minnext + deltanext)
	3073	max1 = minnext + deltanext;
	3074	if (deltanext == I32_MAX)
	3075	is_inf = is_inf_internal = 1;
	3076	scan = next;
	3077	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3078	pars++;
	3079	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	3080	if ( stopmin > minnext)
	3081	stopmin = min + min1;
	3082	flags &= ~SCF_DO_SUBSTR;
	3083	if (data)
	3084	data->flags \|= SCF_SEEN_ACCEPT;
	3085	}
	3086	if (data) {
	3087	if (data_fake.flags & SF_HAS_EVAL)
	3088	data->flags \|= SF_HAS_EVAL;
	3089	data->whilem_c = data_fake.whilem_c;
	3090	}
	3091	if (flags & SCF_DO_STCLASS)
	3092	cl_or(pRExC_state, &accum, &this_class);
	3093	}
	3094	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	3095	min1 = 0;
	3096	if (flags & SCF_DO_SUBSTR) {
	3097	data->pos_min += min1;
	3098	data->pos_delta += max1 - min1;
	3099	if (max1 != min1 \|\| is_inf)
	3100	data->longest = &(data->longest_float);
	3101	}
	3102	min += min1;
	3103	delta += max1 - min1;
	3104	if (flags & SCF_DO_STCLASS_OR) {
	3105	cl_or(pRExC_state, data->start_class, &accum);
	3106	if (min1) {
	3107	cl_and(data->start_class, and_withp);
	3108	flags &= ~SCF_DO_STCLASS;
	3109	}
	3110	}
	3111	else if (flags & SCF_DO_STCLASS_AND) {
	3112	if (min1) {
	3113	cl_and(data->start_class, &accum);
	3114	flags &= ~SCF_DO_STCLASS;
	3115	}
	3116	else {
	3117	/* Switch to OR mode: cache the old value of
	3118	* data->start_class */
	3119	INIT_AND_WITHP;
	3120	StructCopy(data->start_class, and_withp,
	3121	struct regnode_charclass_class);
	3122	flags &= ~SCF_DO_STCLASS_AND;
	3123	StructCopy(&accum, data->start_class,
	3124	struct regnode_charclass_class);
	3125	flags \|= SCF_DO_STCLASS_OR;
	3126	data->start_class->flags \|= ANYOF_EOS;
	3127	}
	3128	}
	3129
	3130	if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
	3131	/* demq.
	3132
	3133	Assuming this was/is a branch we are dealing with: 'scan' now
	3134	points at the item that follows the branch sequence, whatever
	3135	it is. We now start at the beginning of the sequence and look
	3136	for subsequences of
	3137
	3138	BRANCH->EXACT=>x1
	3139	BRANCH->EXACT=>x2
	3140	tail
	3141
	3142	which would be constructed from a pattern like /A\|LIST\|OF\|WORDS/
	3143
	3144	If we can find such a subsequence we need to turn the first
	3145	element into a trie and then add the subsequent branch exact
	3146	strings to the trie.
	3147
	3148	We have two cases
	3149
	3150	1. patterns where the whole set of branches can be converted.
	3151
	3152	2. patterns where only a subset can be converted.
	3153
	3154	In case 1 we can replace the whole set with a single regop
	3155	for the trie. In case 2 we need to keep the start and end
	3156	branches so
	3157
	3158	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	3159	becomes BRANCH TRIE; BRANCH X;
	3160
	3161	There is an additional case, that being where there is a
	3162	common prefix, which gets split out into an EXACT like node
	3163	preceding the TRIE node.
	3164
	3165	If x(1..n)==tail then we can do a simple trie, if not we make
	3166	a "jump" trie, such that when we match the appropriate word
	3167	we "jump" to the appropriate tail node. Essentially we turn
	3168	a nested if into a case structure of sorts.
	3169
	3170	*/
	3171
	3172	int made=0;
	3173	if (!re_trie_maxbuff) {
	3174	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	3175	if (!SvIOK(re_trie_maxbuff))
	3176	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	3177	}
	3178	if ( SvIV(re_trie_maxbuff)>=0 ) {
	3179	regnode *cur;
	3180	regnode first = (regnode )NULL;
	3181	regnode last = (regnode )NULL;
	3182	regnode *tail = scan;
	3183	U8 optype = 0;
	3184	U32 count=0;
	3185
	3186	#ifdef DEBUGGING
	3187	SV * const mysv = sv_newmortal(); /* for dumping */
	3188	#endif
	3189	/* var tail is used because there may be a TAIL
	3190	regop in the way. Ie, the exacts will point to the
	3191	thing following the TAIL, but the last branch will
	3192	point at the TAIL. So we advance tail. If we
	3193	have nested (?:) we may have to move through several
	3194	tails.
	3195	*/
	3196
	3197	while ( OP( tail ) == TAIL ) {
	3198	/* this is the TAIL generated by (?:) */
	3199	tail = regnext( tail );
	3200	}
	3201
	3202
	3203	DEBUG_OPTIMISE_r({
	3204	regprop(RExC_rx, mysv, tail );
	3205	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	3206	(int)depth * 2 + 2, "",
	3207	"Looking for TRIE'able sequences. Tail node is: ",
	3208	SvPV_nolen_const( mysv )
	3209	);
	3210	});
	3211
	3212	/*
	3213
	3214	step through the branches, cur represents each
	3215	branch, noper is the first thing to be matched
	3216	as part of that branch and noper_next is the
	3217	regnext() of that node. if noper is an EXACT
	3218	and noper_next is the same as scan (our current
	3219	position in the regex) then the EXACT branch is
	3220	a possible optimization target. Once we have
	3221	two or more consecutive such branches we can
	3222	create a trie of the EXACT's contents and stich
	3223	it in place. If the sequence represents all of
	3224	the branches we eliminate the whole thing and
	3225	replace it with a single TRIE. If it is a
	3226	subsequence then we need to stitch it in. This
	3227	means the first branch has to remain, and needs
	3228	to be repointed at the item on the branch chain
	3229	following the last branch optimized. This could
	3230	be either a BRANCH, in which case the
	3231	subsequence is internal, or it could be the
	3232	item following the branch sequence in which
	3233	case the subsequence is at the end.
	3234
	3235	*/
	3236
	3237	/* dont use tail as the end marker for this traverse */
	3238	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	3239	regnode * const noper = NEXTOPER( cur );
	3240	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	3241	regnode * const noper_next = regnext( noper );
	3242	#endif
	3243
	3244	DEBUG_OPTIMISE_r({
	3245	regprop(RExC_rx, mysv, cur);
	3246	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	3247	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	3248
	3249	regprop(RExC_rx, mysv, noper);
	3250	PerlIO_printf( Perl_debug_log, " -> %s",
	3251	SvPV_nolen_const(mysv));
	3252
	3253	if ( noper_next ) {
	3254	regprop(RExC_rx, mysv, noper_next );
	3255	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	3256	SvPV_nolen_const(mysv));
	3257	}
	3258	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
	3259	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
	3260	});
	3261	if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
	3262	: PL_regkind[ OP( noper ) ] == EXACT )
	3263	\|\| OP(noper) == NOTHING )
	3264	#ifdef NOJUMPTRIE
	3265	&& noper_next == tail
	3266	#endif
	3267	&& count < U16_MAX)
	3268	{
	3269	count++;
	3270	if ( !first \|\| optype == NOTHING ) {
	3271	if (!first) first = cur;
	3272	optype = OP( noper );
	3273	} else {
	3274	last = cur;
	3275	}
	3276	} else {
	3277	/*
	3278	Currently the trie logic handles case insensitive matching properly only
	3279	when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
	3280	semantics).
	3281
	3282	If/when this is fixed the following define can be swapped
	3283	in below to fully enable trie logic.
	3284
	3285	#define TRIE_TYPE_IS_SAFE 1
	3286
	3287	Note that join_exact() assumes that the other types of EXACTFish nodes are not
	3288	used in tries, so that would have to be updated if this changed
	3289
	3290	*/
	3291	#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) \|\| optype==EXACT)
	3292
	3293	if ( last && TRIE_TYPE_IS_SAFE ) {
	3294	make_trie( pRExC_state,
	3295	startbranch, first, cur, tail, count,
	3296	optype, depth+1 );
	3297	}
	3298	if ( PL_regkind[ OP( noper ) ] == EXACT
	3299	#ifdef NOJUMPTRIE
	3300	&& noper_next == tail
	3301	#endif
	3302	){
	3303	count = 1;
	3304	first = cur;
	3305	optype = OP( noper );
	3306	} else {
	3307	count = 0;
	3308	first = NULL;
	3309	optype = 0;
	3310	}
	3311	last = NULL;
	3312	}
	3313	}
	3314	DEBUG_OPTIMISE_r({
	3315	regprop(RExC_rx, mysv, cur);
	3316	PerlIO_printf( Perl_debug_log,
	3317	"%s- %s (%d) <SCAN FINISHED>\n", (int)depth 2 + 2,
	3318	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3319
	3320	});
	3321
	3322	if ( last && TRIE_TYPE_IS_SAFE ) {
	3323	made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
	3324	#ifdef TRIE_STUDY_OPT
	3325	if ( ((made == MADE_EXACT_TRIE &&
	3326	startbranch == first)
	3327	\|\| ( first_non_open == first )) &&
	3328	depth==0 ) {
	3329	flags \|= SCF_TRIE_RESTUDY;
	3330	if ( startbranch == first
	3331	&& scan == tail )
	3332	{
	3333	RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
	3334	}
	3335	}
	3336	#endif
	3337	}
	3338	}
	3339
	3340	} /* do trie */
	3341
	3342	}
	3343	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	3344	scan = NEXTOPER(NEXTOPER(scan));
	3345	} else /* single branch is optimized. */
	3346	scan = NEXTOPER(scan);
	3347	continue;
	3348	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	3349	scan_frame *newframe = NULL;
	3350	I32 paren;
	3351	regnode *start;
	3352	regnode *end;
	3353
	3354	if (OP(scan) != SUSPEND) {
	3355	/* set the pointer */
	3356	if (OP(scan) == GOSUB) {
	3357	paren = ARG(scan);
	3358	RExC_recurse[ARG2L(scan)] = scan;
	3359	start = RExC_open_parens[paren-1];
	3360	end = RExC_close_parens[paren-1];
	3361	} else {
	3362	paren = 0;
	3363	start = RExC_rxi->program + 1;
	3364	end = RExC_opend;
	3365	}
	3366	if (!recursed) {
	3367	Newxz(recursed, (((RExC_npar)>>3) +1), U8);
	3368	SAVEFREEPV(recursed);
	3369	}
	3370	if (!PAREN_TEST(recursed,paren+1)) {
	3371	PAREN_SET(recursed,paren+1);
	3372	Newx(newframe,1,scan_frame);
	3373	} else {
	3374	if (flags & SCF_DO_SUBSTR) {
	3375	SCAN_COMMIT(pRExC_state,data,minlenp);
	3376	data->longest = &(data->longest_float);
	3377	}
	3378	is_inf = is_inf_internal = 1;
	3379	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3380	cl_anything(pRExC_state, data->start_class);
	3381	flags &= ~SCF_DO_STCLASS;
	3382	}
	3383	} else {
	3384	Newx(newframe,1,scan_frame);
	3385	paren = stopparen;
	3386	start = scan+2;
	3387	end = regnext(scan);
	3388	}
	3389	if (newframe) {
	3390	assert(start);
	3391	assert(end);
	3392	SAVEFREEPV(newframe);
	3393	newframe->next = regnext(scan);
	3394	newframe->last = last;
	3395	newframe->stop = stopparen;
	3396	newframe->prev = frame;
	3397
	3398	frame = newframe;
	3399	scan = start;
	3400	stopparen = paren;
	3401	last = end;
	3402
	3403	continue;
	3404	}
	3405	}
	3406	else if (OP(scan) == EXACT) {
	3407	I32 l = STR_LEN(scan);
	3408	UV uc;
	3409	if (UTF) {
	3410	const U8 * const s = (U8*)STRING(scan);
	3411	l = utf8_length(s, s + l);
	3412	uc = utf8_to_uvchr(s, NULL);
	3413	} else {
	3414	uc = ((U8)STRING(scan));
	3415	}
	3416	min += l;
	3417	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	3418	/* The code below prefers earlier match for fixed
	3419	offset, later match for variable offset. */
	3420	if (data->last_end == -1) { /* Update the start info. */
	3421	data->last_start_min = data->pos_min;
	3422	data->last_start_max = is_inf
	3423	? I32_MAX : data->pos_min + data->pos_delta;
	3424	}
	3425	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	3426	if (UTF)
	3427	SvUTF8_on(data->last_found);
	3428	{
	3429	SV * const sv = data->last_found;
	3430	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3431	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3432	if (mg && mg->mg_len >= 0)
	3433	mg->mg_len += utf8_length((U8*)STRING(scan),
	3434	(U8*)STRING(scan)+STR_LEN(scan));
	3435	}
	3436	data->last_end = data->pos_min + l;
	3437	data->pos_min += l; /* As in the first entry. */
	3438	data->flags &= ~SF_BEFORE_EOL;
	3439	}
	3440	if (flags & SCF_DO_STCLASS_AND) {
	3441	/* Check whether it is compatible with what we know already! */
	3442	int compat = 1;
	3443
	3444
	3445	/* If compatible, we or it in below. It is compatible if is
	3446	* in the bitmp and either 1) its bit or its fold is set, or 2)
	3447	* it's for a locale. Even if there isn't unicode semantics
	3448	* here, at runtime there may be because of matching against a
	3449	* utf8 string, so accept a possible false positive for
	3450	* latin1-range folds */
	3451	if (uc >= 0x100 \|\|
	3452	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3453	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3454	&& (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
	3455	\|\| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3456	)
	3457	{
	3458	compat = 0;
	3459	}
	3460	ANYOF_CLASS_ZERO(data->start_class);
	3461	ANYOF_BITMAP_ZERO(data->start_class);
	3462	if (compat)
	3463	ANYOF_BITMAP_SET(data->start_class, uc);
	3464	else if (uc >= 0x100) {
	3465	int i;
	3466
	3467	/* Some Unicode code points fold to the Latin1 range; as
	3468	* XXX temporary code, instead of figuring out if this is
	3469	* one, just assume it is and set all the start class bits
	3470	* that could be some such above 255 code point's fold
	3471	* which will generate fals positives. As the code
	3472	* elsewhere that does compute the fold settles down, it
	3473	* can be extracted out and re-used here */
	3474	for (i = 0; i < 256; i++){
	3475	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3476	ANYOF_BITMAP_SET(data->start_class, i);
	3477	}
	3478	}
	3479	}
	3480	data->start_class->flags &= ~ANYOF_EOS;
	3481	if (uc < 0x100)
	3482	data->start_class->flags &= ~ANYOF_UNICODE_ALL;
	3483	}
	3484	else if (flags & SCF_DO_STCLASS_OR) {
	3485	/* false positive possible if the class is case-folded */
	3486	if (uc < 0x100)
	3487	ANYOF_BITMAP_SET(data->start_class, uc);
	3488	else
	3489	data->start_class->flags \|= ANYOF_UNICODE_ALL;
	3490	data->start_class->flags &= ~ANYOF_EOS;
	3491	cl_and(data->start_class, and_withp);
	3492	}
	3493	flags &= ~SCF_DO_STCLASS;
	3494	}
	3495	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
	3496	I32 l = STR_LEN(scan);
	3497	UV uc = ((U8)STRING(scan));
	3498
	3499	/* Search for fixed substrings supports EXACT only. */
	3500	if (flags & SCF_DO_SUBSTR) {
	3501	assert(data);
	3502	SCAN_COMMIT(pRExC_state, data, minlenp);
	3503	}
	3504	if (UTF) {
	3505	const U8 * const s = (U8 *)STRING(scan);
	3506	l = utf8_length(s, s + l);
	3507	uc = utf8_to_uvchr(s, NULL);
	3508	}
	3509	else if (has_exactf_sharp_s) {
	3510	RExC_seen \|= REG_SEEN_EXACTF_SHARP_S;
	3511	}
	3512	min += l - min_subtract;
	3513	if (min < 0) {
	3514	min = 0;
	3515	}
	3516	delta += min_subtract;
	3517	if (flags & SCF_DO_SUBSTR) {
	3518	data->pos_min += l - min_subtract;
	3519	if (data->pos_min < 0) {
	3520	data->pos_min = 0;
	3521	}
	3522	data->pos_delta += min_subtract;
	3523	if (min_subtract) {
	3524	data->longest = &(data->longest_float);
	3525	}
	3526	}
	3527	if (flags & SCF_DO_STCLASS_AND) {
	3528	/* Check whether it is compatible with what we know already! */
	3529	int compat = 1;
	3530	if (uc >= 0x100 \|\|
	3531	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3532	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3533	&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3534	{
	3535	compat = 0;
	3536	}
	3537	ANYOF_CLASS_ZERO(data->start_class);
	3538	ANYOF_BITMAP_ZERO(data->start_class);
	3539	if (compat) {
	3540	ANYOF_BITMAP_SET(data->start_class, uc);
	3541	data->start_class->flags &= ~ANYOF_EOS;
	3542	data->start_class->flags \|= ANYOF_LOC_NONBITMAP_FOLD;
	3543	if (OP(scan) == EXACTFL) {
	3544	/* XXX This set is probably no longer necessary, and
	3545	* probably wrong as LOCALE now is on in the initial
	3546	* state */
	3547	data->start_class->flags \|= ANYOF_LOCALE;
	3548	}
	3549	else {
	3550
	3551	/* Also set the other member of the fold pair. In case
	3552	* that unicode semantics is called for at runtime, use
	3553	* the full latin1 fold. (Can't do this for locale,
	3554	* because not known until runtime) */
	3555	ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
	3556
	3557	/* All other (EXACTFL handled above) folds except under
	3558	* /iaa that include s, S, and sharp_s also may include
	3559	* the others */
	3560	if (OP(scan) != EXACTFA) {
	3561	if (uc == 's' \|\| uc == 'S') {
	3562	ANYOF_BITMAP_SET(data->start_class,
	3563	LATIN_SMALL_LETTER_SHARP_S);
	3564	}
	3565	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3566	ANYOF_BITMAP_SET(data->start_class, 's');
	3567	ANYOF_BITMAP_SET(data->start_class, 'S');
	3568	}
	3569	}
	3570	}
	3571	}
	3572	else if (uc >= 0x100) {
	3573	int i;
	3574	for (i = 0; i < 256; i++){
	3575	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3576	ANYOF_BITMAP_SET(data->start_class, i);
	3577	}
	3578	}
	3579	}
	3580	}
	3581	else if (flags & SCF_DO_STCLASS_OR) {
	3582	if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
	3583	/* false positive possible if the class is case-folded.
	3584	Assume that the locale settings are the same... */
	3585	if (uc < 0x100) {
	3586	ANYOF_BITMAP_SET(data->start_class, uc);
	3587	if (OP(scan) != EXACTFL) {
	3588
	3589	/* And set the other member of the fold pair, but
	3590	* can't do that in locale because not known until
	3591	* run-time */
	3592	ANYOF_BITMAP_SET(data->start_class,
	3593	PL_fold_latin1[uc]);
	3594
	3595	/* All folds except under /iaa that include s, S,
	3596	* and sharp_s also may include the others */
	3597	if (OP(scan) != EXACTFA) {
	3598	if (uc == 's' \|\| uc == 'S') {
	3599	ANYOF_BITMAP_SET(data->start_class,
	3600	LATIN_SMALL_LETTER_SHARP_S);
	3601	}
	3602	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3603	ANYOF_BITMAP_SET(data->start_class, 's');
	3604	ANYOF_BITMAP_SET(data->start_class, 'S');
	3605	}
	3606	}
	3607	}
	3608	}
	3609	data->start_class->flags &= ~ANYOF_EOS;
	3610	}
	3611	cl_and(data->start_class, and_withp);
	3612	}
	3613	flags &= ~SCF_DO_STCLASS;
	3614	}
	3615	else if (REGNODE_VARIES(OP(scan))) {
	3616	I32 mincount, maxcount, minnext, deltanext, fl = 0;
	3617	I32 f = flags, pos_before = 0;
	3618	regnode * const oscan = scan;
	3619	struct regnode_charclass_class this_class;
	3620	struct regnode_charclass_class *oclass = NULL;
	3621	I32 next_is_eval = 0;
	3622
	3623	switch (PL_regkind[OP(scan)]) {
	3624	case WHILEM: /* End of (?:...)* . */
	3625	scan = NEXTOPER(scan);
	3626	goto finish;
	3627	case PLUS:
	3628	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	3629	next = NEXTOPER(scan);
	3630	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	3631	mincount = 1;
	3632	maxcount = REG_INFTY;
	3633	next = regnext(scan);
	3634	scan = NEXTOPER(scan);
	3635	goto do_curly;
	3636	}
	3637	}
	3638	if (flags & SCF_DO_SUBSTR)
	3639	data->pos_min++;
	3640	min++;
	3641	/* Fall through. */
	3642	case STAR:
	3643	if (flags & SCF_DO_STCLASS) {
	3644	mincount = 0;
	3645	maxcount = REG_INFTY;
	3646	next = regnext(scan);
	3647	scan = NEXTOPER(scan);
	3648	goto do_curly;
	3649	}
	3650	is_inf = is_inf_internal = 1;
	3651	scan = regnext(scan);
	3652	if (flags & SCF_DO_SUBSTR) {
	3653	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
	3654	data->longest = &(data->longest_float);
	3655	}
	3656	goto optimize_curly_tail;
	3657	case CURLY:
	3658	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	3659	&& (scan->flags == stopparen))
	3660	{
	3661	mincount = 1;
	3662	maxcount = 1;
	3663	} else {
	3664	mincount = ARG1(scan);
	3665	maxcount = ARG2(scan);
	3666	}
	3667	next = regnext(scan);
	3668	if (OP(scan) == CURLYX) {
	3669	I32 lp = (data ? *(data->last_closep) : 0);
	3670	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	3671	}
	3672	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	3673	next_is_eval = (OP(scan) == EVAL);
	3674	do_curly:
	3675	if (flags & SCF_DO_SUBSTR) {
	3676	if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
	3677	pos_before = data->pos_min;
	3678	}
	3679	if (data) {
	3680	fl = data->flags;
	3681	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	3682	if (is_inf)
	3683	data->flags \|= SF_IS_INF;
	3684	}
	3685	if (flags & SCF_DO_STCLASS) {
	3686	cl_init(pRExC_state, &this_class);
	3687	oclass = data->start_class;
	3688	data->start_class = &this_class;
	3689	f \|= SCF_DO_STCLASS_AND;
	3690	f &= ~SCF_DO_STCLASS_OR;
	3691	}
	3692	/* Exclude from super-linear cache processing any {n,m}
	3693	regops for which the combination of input pos and regex
	3694	pos is not enough information to determine if a match
	3695	will be possible.
	3696
	3697	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	3698	regex pos at the \s*, the prospects for a match depend not
	3699	only on the input position but also on how many (bar\s*)
	3700	repeats into the {4,8} we are. */
	3701	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	3702	f &= ~SCF_WHILEM_VISITED_POS;
	3703
	3704	/* This will finish on WHILEM, setting scan, or on NULL: */
	3705	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3706	last, data, stopparen, recursed, NULL,
	3707	(mincount == 0
	3708	? (f & ~SCF_DO_SUBSTR) : f),depth+1);
	3709
	3710	if (flags & SCF_DO_STCLASS)
	3711	data->start_class = oclass;
	3712	if (mincount == 0 \|\| minnext == 0) {
	3713	if (flags & SCF_DO_STCLASS_OR) {
	3714	cl_or(pRExC_state, data->start_class, &this_class);
	3715	}
	3716	else if (flags & SCF_DO_STCLASS_AND) {
	3717	/* Switch to OR mode: cache the old value of
	3718	* data->start_class */
	3719	INIT_AND_WITHP;
	3720	StructCopy(data->start_class, and_withp,
	3721	struct regnode_charclass_class);
	3722	flags &= ~SCF_DO_STCLASS_AND;
	3723	StructCopy(&this_class, data->start_class,
	3724	struct regnode_charclass_class);
	3725	flags \|= SCF_DO_STCLASS_OR;
	3726	data->start_class->flags \|= ANYOF_EOS;
	3727	}
	3728	} else { /* Non-zero len */
	3729	if (flags & SCF_DO_STCLASS_OR) {
	3730	cl_or(pRExC_state, data->start_class, &this_class);
	3731	cl_and(data->start_class, and_withp);
	3732	}
	3733	else if (flags & SCF_DO_STCLASS_AND)
	3734	cl_and(data->start_class, &this_class);
	3735	flags &= ~SCF_DO_STCLASS;
	3736	}
	3737	if (!scan) /* It was not CURLYX, but CURLY. */
	3738	scan = next;
	3739	if ( /* ? quantifier ok, except for (?{ ... }) */
	3740	(next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	3741	&& (minnext == 0) && (deltanext == 0)
	3742	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	3743	&& maxcount <= REG_INFTY/3) /* Complement check for big count */
	3744	{
	3745	ckWARNreg(RExC_parse,
	3746	"Quantifier unexpected on zero-length expression");
	3747	}
	3748
	3749	min += minnext * mincount;
	3750	is_inf_internal \|= ((maxcount == REG_INFTY
	3751	&& (minnext + deltanext) > 0)
	3752	\|\| deltanext == I32_MAX);
	3753	is_inf \|= is_inf_internal;
	3754	delta += (minnext + deltanext) * maxcount - minnext * mincount;
	3755
	3756	/* Try powerful optimization CURLYX => CURLYN. */
	3757	if ( OP(oscan) == CURLYX && data
	3758	&& data->flags & SF_IN_PAR
	3759	&& !(data->flags & SF_HAS_EVAL)
	3760	&& !deltanext && minnext == 1 ) {
	3761	/* Try to optimize to CURLYN. */
	3762	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	3763	regnode * const nxt1 = nxt;
	3764	#ifdef DEBUGGING
	3765	regnode *nxt2;
	3766	#endif
	3767
	3768	/* Skip open. */
	3769	nxt = regnext(nxt);
	3770	if (!REGNODE_SIMPLE(OP(nxt))
	3771	&& !(PL_regkind[OP(nxt)] == EXACT
	3772	&& STR_LEN(nxt) == 1))
	3773	goto nogo;
	3774	#ifdef DEBUGGING
	3775	nxt2 = nxt;
	3776	#endif
	3777	nxt = regnext(nxt);
	3778	if (OP(nxt) != CLOSE)
	3779	goto nogo;
	3780	if (RExC_open_parens) {
	3781	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3782	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	3783	}
	3784	/* Now we know that nxt2 is the only contents: */
	3785	oscan->flags = (U8)ARG(nxt);
	3786	OP(oscan) = CURLYN;
	3787	OP(nxt1) = NOTHING; /* was OPEN. */
	3788
	3789	#ifdef DEBUGGING
	3790	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3791	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	3792	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	3793	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3794	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3795	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	3796	#endif
	3797	}
	3798	nogo:
	3799
	3800	/* Try optimization CURLYX => CURLYM. */
	3801	if ( OP(oscan) == CURLYX && data
	3802	&& !(data->flags & SF_HAS_PAR)
	3803	&& !(data->flags & SF_HAS_EVAL)
	3804	&& !deltanext /* atom is fixed width */
	3805	&& minnext != 0 /* CURLYM can't handle zero width */
	3806	) {
	3807	/* XXXX How to optimize if data == 0? */
	3808	/* Optimize to a simpler form. */
	3809	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	3810	regnode *nxt2;
	3811
	3812	OP(oscan) = CURLYM;
	3813	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	3814	&& (OP(nxt2) != WHILEM))
	3815	nxt = nxt2;
	3816	OP(nxt2) = SUCCEED; /* Was WHILEM */
	3817	/* Need to optimize away parenths. */
	3818	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	3819	/* Set the parenth number. */
	3820	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	3821
	3822	oscan->flags = (U8)ARG(nxt);
	3823	if (RExC_open_parens) {
	3824	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3825	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	3826	}
	3827	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	3828	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3829
	3830	#ifdef DEBUGGING
	3831	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3832	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3833	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	3834	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	3835	#endif
	3836	#if 0
	3837	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	3838	regnode *nnxt = regnext(nxt1);
	3839	if (nnxt == nxt) {
	3840	if (reg_off_by_arg[OP(nxt1)])
	3841	ARG_SET(nxt1, nxt2 - nxt1);
	3842	else if (nxt2 - nxt1 < U16_MAX)
	3843	NEXT_OFF(nxt1) = nxt2 - nxt1;
	3844	else
	3845	OP(nxt) = NOTHING; /* Cannot beautify */
	3846	}
	3847	nxt1 = nnxt;
	3848	}
	3849	#endif
	3850	/* Optimize again: */
	3851	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	3852	NULL, stopparen, recursed, NULL, 0,depth+1);
	3853	}
	3854	else
	3855	oscan->flags = 0;
	3856	}
	3857	else if ((OP(oscan) == CURLYX)
	3858	&& (flags & SCF_WHILEM_VISITED_POS)
	3859	/* See the comment on a similar expression above.
	3860	However, this time it's not a subexpression
	3861	we care about, but the expression itself. */
	3862	&& (maxcount == REG_INFTY)
	3863	&& data && ++data->whilem_c < 16) {
	3864	/* This stays as CURLYX, we can put the count/of pair. */
	3865	/* Find WHILEM (as in regexec.c) */
	3866	regnode *nxt = oscan + NEXT_OFF(oscan);
	3867
	3868	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	3869	nxt += ARG(nxt);
	3870	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	3871	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	3872	}
	3873	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	3874	pars++;
	3875	if (flags & SCF_DO_SUBSTR) {
	3876	SV *last_str = NULL;
	3877	int counted = mincount != 0;
	3878
	3879	if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
	3880	#if defined(SPARC64_GCC_WORKAROUND)
	3881	I32 b = 0;
	3882	STRLEN l = 0;
	3883	const char *s = NULL;
	3884	I32 old = 0;
	3885
	3886	if (pos_before >= data->last_start_min)
	3887	b = pos_before;
	3888	else
	3889	b = data->last_start_min;
	3890
	3891	l = 0;
	3892	s = SvPV_const(data->last_found, l);
	3893	old = b - data->last_start_min;
	3894
	3895	#else
	3896	I32 b = pos_before >= data->last_start_min
	3897	? pos_before : data->last_start_min;
	3898	STRLEN l;
	3899	const char * const s = SvPV_const(data->last_found, l);
	3900	I32 old = b - data->last_start_min;
	3901	#endif
	3902
	3903	if (UTF)
	3904	old = utf8_hop((U8)s, old) - (U8)s;
	3905	l -= old;
	3906	/* Get the added string: */
	3907	last_str = newSVpvn_utf8(s + old, l, UTF);
	3908	if (deltanext == 0 && pos_before == b) {
	3909	/* What was added is a constant string */
	3910	if (mincount > 1) {
	3911	SvGROW(last_str, (mincount * l) + 1);
	3912	repeatcpy(SvPVX(last_str) + l,
	3913	SvPVX_const(last_str), l, mincount - 1);
	3914	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	3915	/* Add additional parts. */
	3916	SvCUR_set(data->last_found,
	3917	SvCUR(data->last_found) - l);
	3918	sv_catsv(data->last_found, last_str);
	3919	{
	3920	SV * sv = data->last_found;
	3921	MAGIC *mg =
	3922	SvUTF8(sv) && SvMAGICAL(sv) ?
	3923	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3924	if (mg && mg->mg_len >= 0)
	3925	mg->mg_len += CHR_SVLEN(last_str) - l;
	3926	}
	3927	data->last_end += l * (mincount - 1);
	3928	}
	3929	} else {
	3930	/* start offset must point into the last copy */
	3931	data->last_start_min += minnext * (mincount - 1);
	3932	data->last_start_max += is_inf ? I32_MAX
	3933	: (maxcount - 1) * (minnext + data->pos_delta);
	3934	}
	3935	}
	3936	/* It is counted once already... */
	3937	data->pos_min += minnext * (mincount - counted);
	3938	data->pos_delta += - counted * deltanext +
	3939	(minnext + deltanext) * maxcount - minnext * mincount;
	3940	if (mincount != maxcount) {
	3941	/* Cannot extend fixed substrings found inside
	3942	the group. */
	3943	SCAN_COMMIT(pRExC_state,data,minlenp);
	3944	if (mincount && last_str) {
	3945	SV * const sv = data->last_found;
	3946	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3947	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3948
	3949	if (mg)
	3950	mg->mg_len = -1;
	3951	sv_setsv(sv, last_str);
	3952	data->last_end = data->pos_min;
	3953	data->last_start_min =
	3954	data->pos_min - CHR_SVLEN(last_str);
	3955	data->last_start_max = is_inf
	3956	? I32_MAX
	3957	: data->pos_min + data->pos_delta
	3958	- CHR_SVLEN(last_str);
	3959	}
	3960	data->longest = &(data->longest_float);
	3961	}
	3962	SvREFCNT_dec(last_str);
	3963	}
	3964	if (data && (fl & SF_HAS_EVAL))
	3965	data->flags \|= SF_HAS_EVAL;
	3966	optimize_curly_tail:
	3967	if (OP(oscan) != CURLYX) {
	3968	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	3969	&& NEXT_OFF(next))
	3970	NEXT_OFF(oscan) += NEXT_OFF(next);
	3971	}
	3972	continue;
	3973	default: /* REF, ANYOFV, and CLUMP only? */
	3974	if (flags & SCF_DO_SUBSTR) {
	3975	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3976	data->longest = &(data->longest_float);
	3977	}
	3978	is_inf = is_inf_internal = 1;
	3979	if (flags & SCF_DO_STCLASS_OR)
	3980	cl_anything(pRExC_state, data->start_class);
	3981	flags &= ~SCF_DO_STCLASS;
	3982	break;
	3983	}
	3984	}
	3985	else if (OP(scan) == LNBREAK) {
	3986	if (flags & SCF_DO_STCLASS) {
	3987	int value = 0;
	3988	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3989	if (flags & SCF_DO_STCLASS_AND) {
	3990	for (value = 0; value < 256; value++)
	3991	if (!is_VERTWS_cp(value))
	3992	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3993	}
	3994	else {
	3995	for (value = 0; value < 256; value++)
	3996	if (is_VERTWS_cp(value))
	3997	ANYOF_BITMAP_SET(data->start_class, value);
	3998	}
	3999	if (flags & SCF_DO_STCLASS_OR)
	4000	cl_and(data->start_class, and_withp);
	4001	flags &= ~SCF_DO_STCLASS;
	4002	}
	4003	min += 1;
	4004	delta += 1;
	4005	if (flags & SCF_DO_SUBSTR) {
	4006	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4007	data->pos_min += 1;
	4008	data->pos_delta += 1;
	4009	data->longest = &(data->longest_float);
	4010	}
	4011	}
	4012	else if (REGNODE_SIMPLE(OP(scan))) {
	4013	int value = 0;
	4014
	4015	if (flags & SCF_DO_SUBSTR) {
	4016	SCAN_COMMIT(pRExC_state,data,minlenp);
	4017	data->pos_min++;
	4018	}
	4019	min++;
	4020	if (flags & SCF_DO_STCLASS) {
	4021	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	4022
	4023	/* Some of the logic below assumes that switching
	4024	locale on will only add false positives. */
	4025	switch (PL_regkind[OP(scan)]) {
	4026	case SANY:
	4027	default:
	4028	do_default:
	4029	/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
	4030	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4031	cl_anything(pRExC_state, data->start_class);
	4032	break;
	4033	case REG_ANY:
	4034	if (OP(scan) == SANY)
	4035	goto do_default;
	4036	if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
	4037	value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
	4038	\|\| ANYOF_CLASS_TEST_ANY_SET(data->start_class));
	4039	cl_anything(pRExC_state, data->start_class);
	4040	}
	4041	if (flags & SCF_DO_STCLASS_AND \|\| !value)
	4042	ANYOF_BITMAP_CLEAR(data->start_class,'\n');
	4043	break;
	4044	case ANYOF:
	4045	if (flags & SCF_DO_STCLASS_AND)
	4046	cl_and(data->start_class,
	4047	(struct regnode_charclass_class*)scan);
	4048	else
	4049	cl_or(pRExC_state, data->start_class,
	4050	(struct regnode_charclass_class*)scan);
	4051	break;
	4052	case ALNUM:
	4053	if (flags & SCF_DO_STCLASS_AND) {
	4054	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4055	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
	4056	if (OP(scan) == ALNUMU) {
	4057	for (value = 0; value < 256; value++) {
	4058	if (!isWORDCHAR_L1(value)) {
	4059	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4060	}
	4061	}
	4062	} else {
	4063	for (value = 0; value < 256; value++) {
	4064	if (!isALNUM(value)) {
	4065	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4066	}
	4067	}
	4068	}
	4069	}
	4070	}
	4071	else {
	4072	if (data->start_class->flags & ANYOF_LOCALE)
	4073	ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
	4074
	4075	/* Even if under locale, set the bits for non-locale
	4076	* in case it isn't a true locale-node. This will
	4077	* create false positives if it truly is locale */
	4078	if (OP(scan) == ALNUMU) {
	4079	for (value = 0; value < 256; value++) {
	4080	if (isWORDCHAR_L1(value)) {
	4081	ANYOF_BITMAP_SET(data->start_class, value);
	4082	}
	4083	}
	4084	} else {
	4085	for (value = 0; value < 256; value++) {
	4086	if (isALNUM(value)) {
	4087	ANYOF_BITMAP_SET(data->start_class, value);
	4088	}
	4089	}
	4090	}
	4091	}
	4092	break;
	4093	case NALNUM:
	4094	if (flags & SCF_DO_STCLASS_AND) {
	4095	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4096	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
	4097	if (OP(scan) == NALNUMU) {
	4098	for (value = 0; value < 256; value++) {
	4099	if (isWORDCHAR_L1(value)) {
	4100	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4101	}
	4102	}
	4103	} else {
	4104	for (value = 0; value < 256; value++) {
	4105	if (isALNUM(value)) {
	4106	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4107	}
	4108	}
	4109	}
	4110	}
	4111	}
	4112	else {
	4113	if (data->start_class->flags & ANYOF_LOCALE)
	4114	ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
	4115
	4116	/* Even if under locale, set the bits for non-locale in
	4117	* case it isn't a true locale-node. This will create
	4118	* false positives if it truly is locale */
	4119	if (OP(scan) == NALNUMU) {
	4120	for (value = 0; value < 256; value++) {
	4121	if (! isWORDCHAR_L1(value)) {
	4122	ANYOF_BITMAP_SET(data->start_class, value);
	4123	}
	4124	}
	4125	} else {
	4126	for (value = 0; value < 256; value++) {
	4127	if (! isALNUM(value)) {
	4128	ANYOF_BITMAP_SET(data->start_class, value);
	4129	}
	4130	}
	4131	}
	4132	}
	4133	break;
	4134	case SPACE:
	4135	if (flags & SCF_DO_STCLASS_AND) {
	4136	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4137	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
	4138	if (OP(scan) == SPACEU) {
	4139	for (value = 0; value < 256; value++) {
	4140	if (!isSPACE_L1(value)) {
	4141	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4142	}
	4143	}
	4144	} else {
	4145	for (value = 0; value < 256; value++) {
	4146	if (!isSPACE(value)) {
	4147	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4148	}
	4149	}
	4150	}
	4151	}
	4152	}
	4153	else {
	4154	if (data->start_class->flags & ANYOF_LOCALE) {
	4155	ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
	4156	}
	4157	if (OP(scan) == SPACEU) {
	4158	for (value = 0; value < 256; value++) {
	4159	if (isSPACE_L1(value)) {
	4160	ANYOF_BITMAP_SET(data->start_class, value);
	4161	}
	4162	}
	4163	} else {
	4164	for (value = 0; value < 256; value++) {
	4165	if (isSPACE(value)) {
	4166	ANYOF_BITMAP_SET(data->start_class, value);
	4167	}
	4168	}
	4169	}
	4170	}
	4171	break;
	4172	case NSPACE:
	4173	if (flags & SCF_DO_STCLASS_AND) {
	4174	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4175	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
	4176	if (OP(scan) == NSPACEU) {
	4177	for (value = 0; value < 256; value++) {
	4178	if (isSPACE_L1(value)) {
	4179	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4180	}
	4181	}
	4182	} else {
	4183	for (value = 0; value < 256; value++) {
	4184	if (isSPACE(value)) {
	4185	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4186	}
	4187	}
	4188	}
	4189	}
	4190	}
	4191	else {
	4192	if (data->start_class->flags & ANYOF_LOCALE)
	4193	ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
	4194	if (OP(scan) == NSPACEU) {
	4195	for (value = 0; value < 256; value++) {
	4196	if (!isSPACE_L1(value)) {
	4197	ANYOF_BITMAP_SET(data->start_class, value);
	4198	}
	4199	}
	4200	}
	4201	else {
	4202	for (value = 0; value < 256; value++) {
	4203	if (!isSPACE(value)) {
	4204	ANYOF_BITMAP_SET(data->start_class, value);
	4205	}
	4206	}
	4207	}
	4208	}
	4209	break;
	4210	case DIGIT:
	4211	if (flags & SCF_DO_STCLASS_AND) {
	4212	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4213	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
	4214	for (value = 0; value < 256; value++)
	4215	if (!isDIGIT(value))
	4216	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4217	}
	4218	}
	4219	else {
	4220	if (data->start_class->flags & ANYOF_LOCALE)
	4221	ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
	4222	for (value = 0; value < 256; value++)
	4223	if (isDIGIT(value))
	4224	ANYOF_BITMAP_SET(data->start_class, value);
	4225	}
	4226	break;
	4227	case NDIGIT:
	4228	if (flags & SCF_DO_STCLASS_AND) {
	4229	if (!(data->start_class->flags & ANYOF_LOCALE))
	4230	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
	4231	for (value = 0; value < 256; value++)
	4232	if (isDIGIT(value))
	4233	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4234	}
	4235	else {
	4236	if (data->start_class->flags & ANYOF_LOCALE)
	4237	ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
	4238	for (value = 0; value < 256; value++)
	4239	if (!isDIGIT(value))
	4240	ANYOF_BITMAP_SET(data->start_class, value);
	4241	}
	4242	break;
	4243	CASE_SYNST_FNC(VERTWS);
	4244	CASE_SYNST_FNC(HORIZWS);
	4245
	4246	}
	4247	if (flags & SCF_DO_STCLASS_OR)
	4248	cl_and(data->start_class, and_withp);
	4249	flags &= ~SCF_DO_STCLASS;
	4250	}
	4251	}
	4252	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	4253	data->flags \|= (OP(scan) == MEOL
	4254	? SF_BEFORE_MEOL
	4255	: SF_BEFORE_SEOL);
	4256	}
	4257	else if ( PL_regkind[OP(scan)] == BRANCHJ
	4258	/* Lookbehind, or need to calculate parens/evals/stclass: */
	4259	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	4260	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM)) {
	4261	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4262	\|\| OP(scan) == UNLESSM )
	4263	{
	4264	/* Negative Lookahead/lookbehind
	4265	In this case we can't do fixed string optimisation.
	4266	*/
	4267
	4268	I32 deltanext, minnext, fake = 0;
	4269	regnode *nscan;
	4270	struct regnode_charclass_class intrnl;
	4271	int f = 0;
	4272
	4273	data_fake.flags = 0;
	4274	if (data) {
	4275	data_fake.whilem_c = data->whilem_c;
	4276	data_fake.last_closep = data->last_closep;
	4277	}
	4278	else
	4279	data_fake.last_closep = &fake;
	4280	data_fake.pos_delta = delta;
	4281	if ( flags & SCF_DO_STCLASS && !scan->flags
	4282	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4283	cl_init(pRExC_state, &intrnl);
	4284	data_fake.start_class = &intrnl;
	4285	f \|= SCF_DO_STCLASS_AND;
	4286	}
	4287	if (flags & SCF_WHILEM_VISITED_POS)
	4288	f \|= SCF_WHILEM_VISITED_POS;
	4289	next = regnext(scan);
	4290	nscan = NEXTOPER(NEXTOPER(scan));
	4291	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	4292	last, &data_fake, stopparen, recursed, NULL, f, depth+1);
	4293	if (scan->flags) {
	4294	if (deltanext) {
	4295	FAIL("Variable length lookbehind not implemented");
	4296	}
	4297	else if (minnext > (I32)U8_MAX) {
	4298	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4299	}
	4300	scan->flags = (U8)minnext;
	4301	}
	4302	if (data) {
	4303	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4304	pars++;
	4305	if (data_fake.flags & SF_HAS_EVAL)
	4306	data->flags \|= SF_HAS_EVAL;
	4307	data->whilem_c = data_fake.whilem_c;
	4308	}
	4309	if (f & SCF_DO_STCLASS_AND) {
	4310	if (flags & SCF_DO_STCLASS_OR) {
	4311	/* OR before, AND after: ideally we would recurse with
	4312	* data_fake to get the AND applied by study of the
	4313	* remainder of the pattern, and then derecurse;
	4314	* * HACK * for now just treat as "no information".
	4315	* See [perl #56690].
	4316	*/
	4317	cl_init(pRExC_state, data->start_class);
	4318	} else {
	4319	/* AND before and after: combine and continue */
	4320	const int was = (data->start_class->flags & ANYOF_EOS);
	4321
	4322	cl_and(data->start_class, &intrnl);
	4323	if (was)
	4324	data->start_class->flags \|= ANYOF_EOS;
	4325	}
	4326	}
	4327	}
	4328	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4329	else {
	4330	/* Positive Lookahead/lookbehind
	4331	In this case we can do fixed string optimisation,
	4332	but we must be careful about it. Note in the case of
	4333	lookbehind the positions will be offset by the minimum
	4334	length of the pattern, something we won't know about
	4335	until after the recurse.
	4336	*/
	4337	I32 deltanext, fake = 0;
	4338	regnode *nscan;
	4339	struct regnode_charclass_class intrnl;
	4340	int f = 0;
	4341	/* We use SAVEFREEPV so that when the full compile
	4342	is finished perl will clean up the allocated
	4343	minlens when it's all done. This way we don't
	4344	have to worry about freeing them when we know
	4345	they won't be used, which would be a pain.
	4346	*/
	4347	I32 *minnextp;
	4348	Newx( minnextp, 1, I32 );
	4349	SAVEFREEPV(minnextp);
	4350
	4351	if (data) {
	4352	StructCopy(data, &data_fake, scan_data_t);
	4353	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	4354	f \|= SCF_DO_SUBSTR;
	4355	if (scan->flags)
	4356	SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
	4357	data_fake.last_found=newSVsv(data->last_found);
	4358	}
	4359	}
	4360	else
	4361	data_fake.last_closep = &fake;
	4362	data_fake.flags = 0;
	4363	data_fake.pos_delta = delta;
	4364	if (is_inf)
	4365	data_fake.flags \|= SF_IS_INF;
	4366	if ( flags & SCF_DO_STCLASS && !scan->flags
	4367	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4368	cl_init(pRExC_state, &intrnl);
	4369	data_fake.start_class = &intrnl;
	4370	f \|= SCF_DO_STCLASS_AND;
	4371	}
	4372	if (flags & SCF_WHILEM_VISITED_POS)
	4373	f \|= SCF_WHILEM_VISITED_POS;
	4374	next = regnext(scan);
	4375	nscan = NEXTOPER(NEXTOPER(scan));
	4376
	4377	*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
	4378	last, &data_fake, stopparen, recursed, NULL, f,depth+1);
	4379	if (scan->flags) {
	4380	if (deltanext) {
	4381	FAIL("Variable length lookbehind not implemented");
	4382	}
	4383	else if (*minnextp > (I32)U8_MAX) {
	4384	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4385	}
	4386	scan->flags = (U8)*minnextp;
	4387	}
	4388
	4389	*minnextp += min;
	4390
	4391	if (f & SCF_DO_STCLASS_AND) {
	4392	const int was = (data->start_class->flags & ANYOF_EOS);
	4393
	4394	cl_and(data->start_class, &intrnl);
	4395	if (was)
	4396	data->start_class->flags \|= ANYOF_EOS;
	4397	}
	4398	if (data) {
	4399	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4400	pars++;
	4401	if (data_fake.flags & SF_HAS_EVAL)
	4402	data->flags \|= SF_HAS_EVAL;
	4403	data->whilem_c = data_fake.whilem_c;
	4404	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	4405	if (RExC_rx->minlen<*minnextp)
	4406	RExC_rx->minlen=*minnextp;
	4407	SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
	4408	SvREFCNT_dec(data_fake.last_found);
	4409
	4410	if ( data_fake.minlen_fixed != minlenp )
	4411	{
	4412	data->offset_fixed= data_fake.offset_fixed;
	4413	data->minlen_fixed= data_fake.minlen_fixed;
	4414	data->lookbehind_fixed+= scan->flags;
	4415	}
	4416	if ( data_fake.minlen_float != minlenp )
	4417	{
	4418	data->minlen_float= data_fake.minlen_float;
	4419	data->offset_float_min=data_fake.offset_float_min;
	4420	data->offset_float_max=data_fake.offset_float_max;
	4421	data->lookbehind_float+= scan->flags;
	4422	}
	4423	}
	4424	}
	4425
	4426
	4427	}
	4428	#endif
	4429	}
	4430	else if (OP(scan) == OPEN) {
	4431	if (stopparen != (I32)ARG(scan))
	4432	pars++;
	4433	}
	4434	else if (OP(scan) == CLOSE) {
	4435	if (stopparen == (I32)ARG(scan)) {
	4436	break;
	4437	}
	4438	if ((I32)ARG(scan) == is_par) {
	4439	next = regnext(scan);
	4440
	4441	if ( next && (OP(next) != WHILEM) && next < last)
	4442	is_par = 0; /* Disable optimization */
	4443	}
	4444	if (data)
	4445	*(data->last_closep) = ARG(scan);
	4446	}
	4447	else if (OP(scan) == EVAL) {
	4448	if (data)
	4449	data->flags \|= SF_HAS_EVAL;
	4450	}
	4451	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	4452	if (flags & SCF_DO_SUBSTR) {
	4453	SCAN_COMMIT(pRExC_state,data,minlenp);
	4454	flags &= ~SCF_DO_SUBSTR;
	4455	}
	4456	if (data && OP(scan)==ACCEPT) {
	4457	data->flags \|= SCF_SEEN_ACCEPT;
	4458	if (stopmin > min)
	4459	stopmin = min;
	4460	}
	4461	}
	4462	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	4463	{
	4464	if (flags & SCF_DO_SUBSTR) {
	4465	SCAN_COMMIT(pRExC_state,data,minlenp);
	4466	data->longest = &(data->longest_float);
	4467	}
	4468	is_inf = is_inf_internal = 1;
	4469	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4470	cl_anything(pRExC_state, data->start_class);
	4471	flags &= ~SCF_DO_STCLASS;
	4472	}
	4473	else if (OP(scan) == GPOS) {
	4474	if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
	4475	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	4476	{
	4477	if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
	4478	RExC_rx->extflags \|= RXf_ANCH_GPOS;
	4479	if (RExC_rx->gofs < (U32)min)
	4480	RExC_rx->gofs = min;
	4481	} else {
	4482	RExC_rx->extflags \|= RXf_GPOS_FLOAT;
	4483	RExC_rx->gofs = 0;
	4484	}
	4485	}
	4486	#ifdef TRIE_STUDY_OPT
	4487	#ifdef FULL_TRIE_STUDY
	4488	else if (PL_regkind[OP(scan)] == TRIE) {
	4489	/* NOTE - There is similar code to this block above for handling
	4490	BRANCH nodes on the initial study. If you change stuff here
	4491	check there too. */
	4492	regnode *trie_node= scan;
	4493	regnode *tail= regnext(scan);
	4494	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4495	I32 max1 = 0, min1 = I32_MAX;
	4496	struct regnode_charclass_class accum;
	4497
	4498	if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
	4499	SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
	4500	if (flags & SCF_DO_STCLASS)
	4501	cl_init_zero(pRExC_state, &accum);
	4502
	4503	if (!trie->jump) {
	4504	min1= trie->minlen;
	4505	max1= trie->maxlen;
	4506	} else {
	4507	const regnode *nextbranch= NULL;
	4508	U32 word;
	4509
	4510	for ( word=1 ; word <= trie->wordcount ; word++)
	4511	{
	4512	I32 deltanext=0, minnext=0, f = 0, fake;
	4513	struct regnode_charclass_class this_class;
	4514
	4515	data_fake.flags = 0;
	4516	if (data) {
	4517	data_fake.whilem_c = data->whilem_c;
	4518	data_fake.last_closep = data->last_closep;
	4519	}
	4520	else
	4521	data_fake.last_closep = &fake;
	4522	data_fake.pos_delta = delta;
	4523	if (flags & SCF_DO_STCLASS) {
	4524	cl_init(pRExC_state, &this_class);
	4525	data_fake.start_class = &this_class;
	4526	f = SCF_DO_STCLASS_AND;
	4527	}
	4528	if (flags & SCF_WHILEM_VISITED_POS)
	4529	f \|= SCF_WHILEM_VISITED_POS;
	4530
	4531	if (trie->jump[word]) {
	4532	if (!nextbranch)
	4533	nextbranch = trie_node + trie->jump[0];
	4534	scan= trie_node + trie->jump[word];
	4535	/* We go from the jump point to the branch that follows
	4536	it. Note this means we need the vestigial unused branches
	4537	even though they aren't otherwise used.
	4538	*/
	4539	minnext = study_chunk(pRExC_state, &scan, minlenp,
	4540	&deltanext, (regnode *)nextbranch, &data_fake,
	4541	stopparen, recursed, NULL, f,depth+1);
	4542	}
	4543	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	4544	nextbranch= regnext((regnode*)nextbranch);
	4545
	4546	if (min1 > (I32)(minnext + trie->minlen))
	4547	min1 = minnext + trie->minlen;
	4548	if (max1 < (I32)(minnext + deltanext + trie->maxlen))
	4549	max1 = minnext + deltanext + trie->maxlen;
	4550	if (deltanext == I32_MAX)
	4551	is_inf = is_inf_internal = 1;
	4552
	4553	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4554	pars++;
	4555	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	4556	if ( stopmin > min + min1)
	4557	stopmin = min + min1;
	4558	flags &= ~SCF_DO_SUBSTR;
	4559	if (data)
	4560	data->flags \|= SCF_SEEN_ACCEPT;
	4561	}
	4562	if (data) {
	4563	if (data_fake.flags & SF_HAS_EVAL)
	4564	data->flags \|= SF_HAS_EVAL;
	4565	data->whilem_c = data_fake.whilem_c;
	4566	}
	4567	if (flags & SCF_DO_STCLASS)
	4568	cl_or(pRExC_state, &accum, &this_class);
	4569	}
	4570	}
	4571	if (flags & SCF_DO_SUBSTR) {
	4572	data->pos_min += min1;
	4573	data->pos_delta += max1 - min1;
	4574	if (max1 != min1 \|\| is_inf)
	4575	data->longest = &(data->longest_float);
	4576	}
	4577	min += min1;
	4578	delta += max1 - min1;
	4579	if (flags & SCF_DO_STCLASS_OR) {
	4580	cl_or(pRExC_state, data->start_class, &accum);
	4581	if (min1) {
	4582	cl_and(data->start_class, and_withp);
	4583	flags &= ~SCF_DO_STCLASS;
	4584	}
	4585	}
	4586	else if (flags & SCF_DO_STCLASS_AND) {
	4587	if (min1) {
	4588	cl_and(data->start_class, &accum);
	4589	flags &= ~SCF_DO_STCLASS;
	4590	}
	4591	else {
	4592	/* Switch to OR mode: cache the old value of
	4593	* data->start_class */
	4594	INIT_AND_WITHP;
	4595	StructCopy(data->start_class, and_withp,
	4596	struct regnode_charclass_class);
	4597	flags &= ~SCF_DO_STCLASS_AND;
	4598	StructCopy(&accum, data->start_class,
	4599	struct regnode_charclass_class);
	4600	flags \|= SCF_DO_STCLASS_OR;
	4601	data->start_class->flags \|= ANYOF_EOS;
	4602	}
	4603	}
	4604	scan= tail;
	4605	continue;
	4606	}
	4607	#else
	4608	else if (PL_regkind[OP(scan)] == TRIE) {
	4609	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4610	U8*bang=NULL;
	4611
	4612	min += trie->minlen;
	4613	delta += (trie->maxlen - trie->minlen);
	4614	flags &= ~SCF_DO_STCLASS; /* xxx */
	4615	if (flags & SCF_DO_SUBSTR) {
	4616	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4617	data->pos_min += trie->minlen;
	4618	data->pos_delta += (trie->maxlen - trie->minlen);
	4619	if (trie->maxlen != trie->minlen)
	4620	data->longest = &(data->longest_float);
	4621	}
	4622	if (trie->jump) /* no more substrings -- for now /grr*/
	4623	flags &= ~SCF_DO_SUBSTR;
	4624	}
	4625	#endif /* old or new */
	4626	#endif /* TRIE_STUDY_OPT */
	4627
	4628	/* Else: zero-length, ignore. */
	4629	scan = regnext(scan);
	4630	}
	4631	if (frame) {
	4632	last = frame->last;
	4633	scan = frame->next;
	4634	stopparen = frame->stop;
	4635	frame = frame->prev;
	4636	goto fake_study_recurse;
	4637	}
	4638
	4639	finish:
	4640	assert(!frame);
	4641	DEBUG_STUDYDATA("pre-fin:",data,depth);
	4642
	4643	*scanp = scan;
	4644	*deltap = is_inf_internal ? I32_MAX : delta;
	4645	if (flags & SCF_DO_SUBSTR && is_inf)
	4646	data->pos_delta = I32_MAX - data->pos_min;
	4647	if (is_par > (I32)U8_MAX)
	4648	is_par = 0;
	4649	if (is_par && pars==1 && data) {
	4650	data->flags \|= SF_IN_PAR;
	4651	data->flags &= ~SF_HAS_PAR;
	4652	}
	4653	else if (pars && data) {
	4654	data->flags \|= SF_HAS_PAR;
	4655	data->flags &= ~SF_IN_PAR;
	4656	}
	4657	if (flags & SCF_DO_STCLASS_OR)
	4658	cl_and(data->start_class, and_withp);
	4659	if (flags & SCF_TRIE_RESTUDY)
	4660	data->flags \|= SCF_TRIE_RESTUDY;
	4661
	4662	DEBUG_STUDYDATA("post-fin:",data,depth);
	4663
	4664	return min < stopmin ? min : stopmin;
	4665	}
	4666
	4667	STATIC U32
	4668	S_add_data(RExC_state_t pRExC_state, U32 n, const char s)
	4669	{
	4670	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	4671
	4672	PERL_ARGS_ASSERT_ADD_DATA;
	4673
	4674	Renewc(RExC_rxi->data,
	4675	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	4676	char, struct reg_data);
	4677	if(count)
	4678	Renew(RExC_rxi->data->what, count + n, U8);
	4679	else
	4680	Newx(RExC_rxi->data->what, n, U8);
	4681	RExC_rxi->data->count = count + n;
	4682	Copy(s, RExC_rxi->data->what + count, n, U8);
	4683	return count;
	4684	}
	4685
	4686	/*XXX: todo make this not included in a non debugging perl */
	4687	#ifndef PERL_IN_XSUB_RE
	4688	void
	4689	Perl_reginitcolors(pTHX)
	4690	{
	4691	dVAR;
	4692	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	4693	if (s) {
	4694	char *t = savepv(s);
	4695	int i = 0;
	4696	PL_colors[0] = t;
	4697	while (++i < 6) {
	4698	t = strchr(t, '\t');
	4699	if (t) {
	4700	*t = '\0';
	4701	PL_colors[i] = ++t;
	4702	}
	4703	else
	4704	PL_colors[i] = t = (char *)"";
	4705	}
	4706	} else {
	4707	int i = 0;
	4708	while (i < 6)
	4709	PL_colors[i++] = (char *)"";
	4710	}
	4711	PL_colorset = 1;
	4712	}
	4713	#endif
	4714
	4715
	/* CHECK_RESTUDY_GOTO: when the study data has SCF_TRIE_RESTUDY set and no
	 * restudy has happened yet, bump the restudy counter and jump to the
	 * reStudy label.  It relies on `data`, `restudied` and the `reStudy` label
	 * being in scope at the point of use, and expands to nothing when
	 * TRIE_STUDY_OPT is not defined. */
	#ifndef TRIE_STUDY_OPT
	#define CHECK_RESTUDY_GOTO
	#else
	#define CHECK_RESTUDY_GOTO \
	    if ((data.flags & SCF_TRIE_RESTUDY) && !restudied++) \
	        goto reStudy
	#endif
	4725
	4726	/*
	4727	- pregcomp - compile a regular expression into internal code
	4728	*
	4729	* We can't allocate space until we know how big the compiled form will be,
	4730	* but we can't compile it (and thus know how big it is) until we've got a
	4731	* place to put the code. So we cheat: we compile it twice, once with code
	4732	* generation turned off and size counting turned on, and once "for real".
	4733	* This also means that we don't allocate space until we are sure that the
	4734	* thing really will compile successfully, and we never have to move the
	4735	* code and thus invalidate pointers into it. (Note that it has to be in
	4736	* one piece because free() must be able to free it all.) [NB: not true in perl]
	4737	*
	4738	* Beware that the optimization-preparation code in here knows about some
	4739	* of the structure of the compiled regexp. [I'll say.]
	4740	*/
	4741
	4742
	4743
	/* RE_ENGINE_PTR: address of the regexp_engine structure that
	 * Perl_re_compile() stores in r->engine.  A build with PERL_IN_XSUB_RE
	 * defined points at its own my_reg_engine; otherwise the core engine is
	 * used. */
	#ifdef PERL_IN_XSUB_RE
	extern const struct regexp_engine my_reg_engine;
	#define RE_ENGINE_PTR &my_reg_engine
	#else
	#define RE_ENGINE_PTR &PL_core_reg_engine
	#endif
	4750
	4751	#ifndef PERL_IN_XSUB_RE
	4752	REGEXP *
	4753	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	4754	{
	4755	dVAR;
	4756	HV * const table = GvHV(PL_hintgv);
	4757
	4758	PERL_ARGS_ASSERT_PREGCOMP;
	4759
	4760	/* Dispatch a request to compile a regexp to correct
	4761	regexp engine. */
	4762	if (table) {
	4763	SV **ptr= hv_fetchs(table, "regcomp", FALSE);
	4764	GET_RE_DEBUG_FLAGS_DECL;
	4765	if (ptr && SvIOK(ptr) && SvIV(ptr)) {
	4766	const regexp_engine eng=INT2PTR(regexp_engine,SvIV(*ptr));
	4767	DEBUG_COMPILE_r({
	4768	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	4769	SvIV(*ptr));
	4770	});
	4771	return CALLREGCOMP_ENG(eng, pattern, flags);
	4772	}
	4773	}
	4774	return Perl_re_compile(aTHX_ pattern, flags);
	4775	}
	4776	#endif
	4777
	4778	REGEXP *
	4779	Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
	4780	{
	4781	dVAR;
	4782	REGEXP *rx;
	4783	struct regexp *r;
	4784	register regexp_internal *ri;
	4785	STRLEN plen;
	4786	char* VOL exp;
	4787	char* xend;
	4788	regnode *scan;
	4789	I32 flags;
	4790	I32 minlen = 0;
	4791	U32 pm_flags;
	4792
	4793	/* these are all flags - maybe they should be turned
	4794	* into a single int with different bit masks */
	4795	I32 sawlookahead = 0;
	4796	I32 sawplus = 0;
	4797	I32 sawopen = 0;
	4798	bool used_setjump = FALSE;
	4799	regex_charset initial_charset = get_regex_charset(orig_pm_flags);
	4800
	4801	U8 jump_ret = 0;
	4802	dJMPENV;
	4803	scan_data_t data;
	4804	RExC_state_t RExC_state;
	4805	RExC_state_t * const pRExC_state = &RExC_state;
	4806	#ifdef TRIE_STUDY_OPT
	4807	int restudied;
	4808	RExC_state_t copyRExC_state;
	4809	#endif
	4810	GET_RE_DEBUG_FLAGS_DECL;
	4811
	4812	PERL_ARGS_ASSERT_RE_COMPILE;
	4813
	4814	DEBUG_r(if (!PL_colorset) reginitcolors());
	4815
	4816	exp = SvPV(pattern, plen);
	4817
	4818	if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
	4819	RExC_utf8 = RExC_orig_utf8 = 0;
	4820	}
	4821	else {
	4822	RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
	4823	}
	4824	RExC_uni_semantics = 0;
	4825	RExC_contains_locale = 0;
	4826
	4827	/**************** LONG JUMP TARGET HERE*********************/
	4828	/* Longjmp back to here if have to switch in midstream to utf8 */
	4829	if (! RExC_orig_utf8) {
	4830	JMPENV_PUSH(jump_ret);
	4831	used_setjump = TRUE;
	4832	}
	4833
	4834	if (jump_ret == 0) { /* First time through */
	4835	xend = exp + plen;
	4836
	4837	DEBUG_COMPILE_r({
	4838	SV *dsv= sv_newmortal();
	4839	RE_PV_QUOTED_DECL(s, RExC_utf8,
	4840	dsv, exp, plen, 60);
	4841	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	4842	PL_colors[4],PL_colors[5],s);
	4843	});
	4844	}
	4845	else { /* longjumped back */
	4846	STRLEN len = plen;
	4847
	4848	/* If the cause for the longjmp was other than changing to utf8, pop
	4849	* our own setjmp, and longjmp to the correct handler */
	4850	if (jump_ret != UTF8_LONGJMP) {
	4851	JMPENV_POP;
	4852	JMPENV_JUMP(jump_ret);
	4853	}
	4854
	4855	GET_RE_DEBUG_FLAGS;
	4856
	4857	/* It's possible to write a regexp in ascii that represents Unicode
	4858	codepoints outside of the byte range, such as via \x{100}. If we
	4859	detect such a sequence we have to convert the entire pattern to utf8
	4860	and then recompile, as our sizing calculation will have been based
	4861	on 1 byte == 1 character, but we will need to use utf8 to encode
	4862	at least some part of the pattern, and therefore must convert the whole
	4863	thing.
	4864	-- dmq */
	4865	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	4866	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	4867	exp = (char*)Perl_bytes_to_utf8(aTHX_
	4868	(U8*)SvPV_nomg(pattern, plen),
	4869	&len);
	4870	xend = exp + len;
	4871	RExC_orig_utf8 = RExC_utf8 = 1;
	4872	SAVEFREEPV(exp);
	4873	}
	4874
	4875	#ifdef TRIE_STUDY_OPT
	4876	restudied = 0;
	4877	#endif
	4878
	4879	pm_flags = orig_pm_flags;
	4880
	4881	if (initial_charset == REGEX_LOCALE_CHARSET) {
	4882	RExC_contains_locale = 1;
	4883	}
	4884	else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
	4885
	4886	/* Set to use unicode semantics if the pattern is in utf8 and has the
	4887	* 'depends' charset specified, as it means unicode when utf8 */
	4888	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4889	}
	4890
	4891	RExC_precomp = exp;
	4892	RExC_flags = pm_flags;
	4893	RExC_sawback = 0;
	4894
	4895	RExC_seen = 0;
	4896	RExC_in_lookbehind = 0;
	4897	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	4898	RExC_seen_evals = 0;
	4899	RExC_extralen = 0;
	4900	RExC_override_recoding = 0;
	4901
	4902	/* First pass: determine size, legality. */
	4903	RExC_parse = exp;
	4904	RExC_start = exp;
	4905	RExC_end = xend;
	4906	RExC_naughty = 0;
	4907	RExC_npar = 1;
	4908	RExC_nestroot = 0;
	4909	RExC_size = 0L;
	4910	RExC_emit = &PL_regdummy;
	4911	RExC_whilem_seen = 0;
	4912	RExC_open_parens = NULL;
	4913	RExC_close_parens = NULL;
	4914	RExC_opend = NULL;
	4915	RExC_paren_names = NULL;
	4916	#ifdef DEBUGGING
	4917	RExC_paren_name_list = NULL;
	4918	#endif
	4919	RExC_recurse = NULL;
	4920	RExC_recurse_count = 0;
	4921
	4922	#if 0 /* REGC() is (currently) a NOP at the first pass.
	4923	* Clever compilers notice this and complain. --jhi */
	4924	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	4925	#endif
	4926	DEBUG_PARSE_r(
	4927	PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
	4928	RExC_lastnum=0;
	4929	RExC_lastparse=NULL;
	4930	);
	4931	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4932	RExC_precomp = NULL;
	4933	return(NULL);
	4934	}
	4935
	4936	/* Here, finished first pass. Get rid of any added setjmp */
	4937	if (used_setjump) {
	4938	JMPENV_POP;
	4939	}
	4940
	4941	DEBUG_PARSE_r({
	4942	PerlIO_printf(Perl_debug_log,
	4943	"Required size %"IVdf" nodes\n"
	4944	"Starting second pass (creation)\n",
	4945	(IV)RExC_size);
	4946	RExC_lastnum=0;
	4947	RExC_lastparse=NULL;
	4948	});
	4949
	4950	/* The first pass could have found things that force Unicode semantics */
	4951	if ((RExC_utf8 \|\| RExC_uni_semantics)
	4952	&& get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
	4953	{
	4954	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4955	}
	4956
	4957	/* Small enough for pointer-storage convention?
	4958	If extralen==0, this means that we will not need long jumps. */
	4959	if (RExC_size >= 0x10000L && RExC_extralen)
	4960	RExC_size += RExC_extralen;
	4961	else
	4962	RExC_extralen = 0;
	4963	if (RExC_whilem_seen > 15)
	4964	RExC_whilem_seen = 15;
	4965
	4966	/* Allocate space and zero-initialize. Note, the two step process
	4967	of zeroing when in debug mode, thus anything assigned has to
	4968	happen after that */
	4969	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	4970	r = (struct regexp*)SvANY(rx);
	4971	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	4972	char, regexp_internal);
	4973	if ( r == NULL \|\| ri == NULL )
	4974	FAIL("Regexp out of space");
	4975	#ifdef DEBUGGING
	4976	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	4977	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
	4978	#else
	4979	/* bulk initialize base fields with 0. */
	4980	Zero(ri, sizeof(regexp_internal), char);
	4981	#endif
	4982
	4983	/* non-zero initialization begins here */
	4984	RXi_SET( r, ri );
	4985	r->engine= RE_ENGINE_PTR;
	4986	r->extflags = pm_flags;
	4987	{
	4988	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	4989	bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
	4990
	4991	/* The caret is output if there are any defaults: if not all the STD
	4992	* flags are set, or if no character set specifier is needed */
	4993	bool has_default =
	4994	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	4995	\|\| ! has_charset);
	4996	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
	4997	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	4998	>> RXf_PMf_STD_PMMOD_SHIFT);
	4999	const char fptr = STD_PAT_MODS; /"msix"*/
	5000	char *p;
	5001	/* Allocate for the worst case, which is all the std flags are turned
	5002	* on. If more precision is desired, we could do a population count of
	5003	* the flags set. This could be done with a small lookup table, or by
	5004	* shifting, masking and adding, or even, when available, assembly
	5005	* language for a machine-language population count.
	5006	* We never output a minus, as all those are defaults, so are
	5007	* covered by the caret */
	5008	const STRLEN wraplen = plen + has_p + has_runon
	5009	+ has_default /* If needs a caret */
	5010
	5011	/* If needs a character set specifier */
	5012	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	5013	+ (sizeof(STD_PAT_MODS) - 1)
	5014	+ (sizeof("(?:)") - 1);
	5015
	5016	p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
	5017	SvPOK_on(rx);
	5018	SvFLAGS(rx) \|= SvUTF8(pattern);
	5019	p++='('; p++='?';
	5020
	5021	/* If a default, cover it using the caret */
	5022	if (has_default) {
	5023	*p++= DEFAULT_PAT_MOD;
	5024	}
	5025	if (has_charset) {
	5026	STRLEN len;
	5027	const char* const name = get_regex_charset_name(r->extflags, &len);
	5028	Copy(name, p, len, char);
	5029	p += len;
	5030	}
	5031	if (has_p)
	5032	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	5033	{
	5034	char ch;
	5035	while((ch = *fptr++)) {
	5036	if(reganch & 1)
	5037	*p++ = ch;
	5038	reganch >>= 1;
	5039	}
	5040	}
	5041
	5042	*p++ = ':';
	5043	Copy(RExC_precomp, p, plen, char);
	5044	assert ((RX_WRAPPED(rx) - p) < 16);
	5045	r->pre_prefix = p - RX_WRAPPED(rx);
	5046	p += plen;
	5047	if (has_runon)
	5048	*p++ = '\n';
	5049	*p++ = ')';
	5050	*p = 0;
	5051	SvCUR_set(rx, p - SvPVX_const(rx));
	5052	}
	5053
	5054	r->intflags = 0;
	5055	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	5056
	5057	if (RExC_seen & REG_SEEN_RECURSE) {
	5058	Newxz(RExC_open_parens, RExC_npar,regnode *);
	5059	SAVEFREEPV(RExC_open_parens);
	5060	Newxz(RExC_close_parens,RExC_npar,regnode *);
	5061	SAVEFREEPV(RExC_close_parens);
	5062	}
	5063
	5064	/* Useful during FAIL. */
	5065	#ifdef RE_TRACK_PATTERN_OFFSETS
	5066	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	5067	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	5068	"%s %"UVuf" bytes for offset annotations.\n",
	5069	ri->u.offsets ? "Got" : "Couldn't get",
	5070	(UV)((2RExC_size+1) sizeof(U32))));
	5071	#endif
	5072	SetProgLen(ri,RExC_size);
	5073	RExC_rx_sv = rx;
	5074	RExC_rx = r;
	5075	RExC_rxi = ri;
	5076
	5077	/* Second pass: emit code. */
	5078	RExC_flags = pm_flags; /* don't let top level (?i) bleed */
	5079	RExC_parse = exp;
	5080	RExC_end = xend;
	5081	RExC_naughty = 0;
	5082	RExC_npar = 1;
	5083	RExC_emit_start = ri->program;
	5084	RExC_emit = ri->program;
	5085	RExC_emit_bound = ri->program + RExC_size + 1;
	5086
	5087	/* Store the count of eval-groups for security checks: */
	5088	RExC_rx->seen_evals = RExC_seen_evals;
	5089	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	5090	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	5091	ReREFCNT_dec(rx);
	5092	return(NULL);
	5093	}
	5094	/* XXXX To minimize changes to RE engine we always allocate
	5095	3-units-long substrs field. */
	5096	Newx(r->substrs, 1, struct reg_substr_data);
	5097	if (RExC_recurse_count) {
	5098	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	5099	SAVEFREEPV(RExC_recurse);
	5100	}
	5101
	5102	reStudy:
	5103	r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
	5104	Zero(r->substrs, 1, struct reg_substr_data);
	5105
	5106	#ifdef TRIE_STUDY_OPT
	5107	if (!restudied) {
	5108	StructCopy(&zero_scan_data, &data, scan_data_t);
	5109	copyRExC_state = RExC_state;
	5110	} else {
	5111	U32 seen=RExC_seen;
	5112	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	5113
	5114	RExC_state = copyRExC_state;
	5115	if (seen & REG_TOP_LEVEL_BRANCHES)
	5116	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	5117	else
	5118	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
	5119	if (data.last_found) {
	5120	SvREFCNT_dec(data.longest_fixed);
	5121	SvREFCNT_dec(data.longest_float);
	5122	SvREFCNT_dec(data.last_found);
	5123	}
	5124	StructCopy(&zero_scan_data, &data, scan_data_t);
	5125	}
	5126	#else
	5127	StructCopy(&zero_scan_data, &data, scan_data_t);
	5128	#endif
	5129
	5130	/* Dig out information for optimizations. */
	5131	r->extflags = RExC_flags; /* was pm_op */
	5132	/*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
	5133
	5134	if (UTF)
	5135	SvUTF8_on(rx); /* Unicode in it? */
	5136	ri->regstclass = NULL;
	5137	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	5138	r->intflags \|= PREGf_NAUGHTY;
	5139	scan = ri->program + 1; /* First BRANCH. */
	5140
	5141	/* testing for BRANCH here tells us whether there is "must appear"
	5142	data in the pattern. If there is then we can use it for optimisations */
	5143	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
	5144	I32 fake;
	5145	STRLEN longest_float_length, longest_fixed_length;
	5146	struct regnode_charclass_class ch_class; /* pointed to by data */
	5147	int stclass_flag;
	5148	I32 last_close = 0; /* pointed to by data */
	5149	regnode *first= scan;
	5150	regnode *first_next= regnext(first);
	5151	/*
	5152	* Skip introductions and multiplicators >= 1
	5153	* so that we can extract the 'meat' of the pattern that must
	5154	* match in the large if() sequence following.
	5155	* NOTE that EXACT is NOT covered here, as it is normally
	5156	* picked up by the optimiser separately.
	5157	*
	5158	* This is unfortunate as the optimiser isnt handling lookahead
	5159	* properly currently.
	5160	*
	5161	*/
	5162	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	5163	/* An OR of one alternative - should not happen now. */
	5164	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	5165	/* for now we can't handle lookbehind IFMATCH*/
	5166	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	5167	(OP(first) == PLUS) \|\|
	5168	(OP(first) == MINMOD) \|\|
	5169	/* An {n,m} with n>0 */
	5170	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	5171	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	5172	{
	5173	/*
	5174	* the only op that could be a regnode is PLUS, all the rest
	5175	* will be regnode_1 or regnode_2.
	5176	*
	5177	*/
	5178	if (OP(first) == PLUS)
	5179	sawplus = 1;
	5180	else
	5181	first += regarglen[OP(first)];
	5182
	5183	first = NEXTOPER(first);
	5184	first_next= regnext(first);
	5185	}
	5186
	5187	/* Starting-point info. */
	5188	again:
	5189	DEBUG_PEEP("first:",first,0);
	5190	/* Ignore EXACT as we deal with it later. */
	5191	if (PL_regkind[OP(first)] == EXACT) {
	5192	if (OP(first) == EXACT)
	5193	NOOP; /* Empty, get anchored substr later. */
	5194	else
	5195	ri->regstclass = first;
	5196	}
	5197	#ifdef TRIE_STCLASS
	5198	else if (PL_regkind[OP(first)] == TRIE &&
	5199	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	5200	{
	5201	regnode *trie_op;
	5202	/* this can happen only on restudy */
	5203	if ( OP(first) == TRIE ) {
	5204	struct regnode_1 trieop = (struct regnode_1 )
	5205	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	5206	StructCopy(first,trieop,struct regnode_1);
	5207	trie_op=(regnode *)trieop;
	5208	} else {
	5209	struct regnode_charclass trieop = (struct regnode_charclass )
	5210	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	5211	StructCopy(first,trieop,struct regnode_charclass);
	5212	trie_op=(regnode *)trieop;
	5213	}
	5214	OP(trie_op)+=2;
	5215	make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
	5216	ri->regstclass = trie_op;
	5217	}
	5218	#endif
	5219	else if (REGNODE_SIMPLE(OP(first)))
	5220	ri->regstclass = first;
	5221	else if (PL_regkind[OP(first)] == BOUND \|\|
	5222	PL_regkind[OP(first)] == NBOUND)
	5223	ri->regstclass = first;
	5224	else if (PL_regkind[OP(first)] == BOL) {
	5225	r->extflags \|= (OP(first) == MBOL
	5226	? RXf_ANCH_MBOL
	5227	: (OP(first) == SBOL
	5228	? RXf_ANCH_SBOL
	5229	: RXf_ANCH_BOL));
	5230	first = NEXTOPER(first);
	5231	goto again;
	5232	}
	5233	else if (OP(first) == GPOS) {
	5234	r->extflags \|= RXf_ANCH_GPOS;
	5235	first = NEXTOPER(first);
	5236	goto again;
	5237	}
	5238	else if ((!sawopen \|\| !RExC_sawback) &&
	5239	(OP(first) == STAR &&
	5240	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	5241	!(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
	5242	{
	5243	/* turn .* into ^.* with an implied $*=1 */
	5244	const int type =
	5245	(OP(NEXTOPER(first)) == REG_ANY)
	5246	? RXf_ANCH_MBOL
	5247	: RXf_ANCH_SBOL;
	5248	r->extflags \|= type;
	5249	r->intflags \|= PREGf_IMPLICIT;
	5250	first = NEXTOPER(first);
	5251	goto again;
	5252	}
	5253	if (sawplus && !sawlookahead && (!sawopen \|\| !RExC_sawback)
	5254	&& !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
	5255	/* x+ must match at the 1st pos of run of x's */
	5256	r->intflags \|= PREGf_SKIP;
	5257
	5258	/* Scan is after the zeroth branch, first is atomic matcher. */
	5259	#ifdef TRIE_STUDY_OPT
	5260	DEBUG_PARSE_r(
	5261	if (!restudied)
	5262	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	5263	(IV)(first - scan + 1))
	5264	);
	5265	#else
	5266	DEBUG_PARSE_r(
	5267	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	5268	(IV)(first - scan + 1))
	5269	);
	5270	#endif
	5271
	5272
	5273	/*
	5274	* If there's something expensive in the r.e., find the
	5275	* longest literal string that must appear and make it the
	5276	* regmust. Resolve ties in favor of later strings, since
	5277	* the regstart check works with the beginning of the r.e.
	5278	* and avoiding duplication strengthens checking. Not a
	5279	* strong reason, but sufficient in the absence of others.
	5280	* [Now we resolve ties in favor of the earlier string if
	5281	* it happens that c_offset_min has been invalidated, since the
	5282	* earlier string may buy us something the later one won't.]
	5283	*/
	5284
	5285	data.longest_fixed = newSVpvs("");
	5286	data.longest_float = newSVpvs("");
	5287	data.last_found = newSVpvs("");
	5288	data.longest = &(data.longest_fixed);
	5289	first = scan;
	5290	if (!ri->regstclass) {
	5291	cl_init(pRExC_state, &ch_class);
	5292	data.start_class = &ch_class;
	5293	stclass_flag = SCF_DO_STCLASS_AND;
	5294	} else /* XXXX Check for BOUND? */
	5295	stclass_flag = 0;
	5296	data.last_closep = &last_close;
	5297
	5298	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
	5299	&data, -1, NULL, NULL,
	5300	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag,0);
	5301
	5302
	5303	CHECK_RESTUDY_GOTO;
	5304
	5305
	5306	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	5307	&& data.last_start_min == 0 && data.last_end > 0
	5308	&& !RExC_seen_zerolen
	5309	&& !(RExC_seen & REG_SEEN_VERBARG)
	5310	&& (!(RExC_seen & REG_SEEN_GPOS) \|\| (r->extflags & RXf_ANCH_GPOS)))
	5311	r->extflags \|= RXf_CHECK_ALL;
	5312	scan_commit(pRExC_state, &data,&minlen,0);
	5313	SvREFCNT_dec(data.last_found);
	5314
	5315	/* Note that code very similar to this but for anchored string
	5316	follows immediately below, changes may need to be made to both.
	5317	Be careful.
	5318	*/
	5319	longest_float_length = CHR_SVLEN(data.longest_float);
	5320	if (longest_float_length
	5321	\|\| (data.flags & SF_FL_BEFORE_EOL
	5322	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5323	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	5324	{
	5325	I32 t,ml;
	5326
	5327	/* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
	5328	if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
	5329	\|\| (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	5330	&& data.offset_fixed == data.offset_float_min
	5331	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
	5332	goto remove_float; /* As in (a)+. */
	5333
	5334	/* copy the information about the longest float from the reg_scan_data
	5335	over to the program. */
	5336	if (SvUTF8(data.longest_float)) {
	5337	r->float_utf8 = data.longest_float;
	5338	r->float_substr = NULL;
	5339	} else {
	5340	r->float_substr = data.longest_float;
	5341	r->float_utf8 = NULL;
	5342	}
	5343	/* float_end_shift is how many chars that must be matched that
	5344	follow this item. We calculate it ahead of time as once the
	5345	lookbehind offset is added in we lose the ability to correctly
	5346	calculate it.*/
	5347	ml = data.minlen_float ? *(data.minlen_float)
	5348	: (I32)longest_float_length;
	5349	r->float_end_shift = ml - data.offset_float_min
	5350	- longest_float_length + (SvTAIL(data.longest_float) != 0)
	5351	+ data.lookbehind_float;
	5352	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	5353	r->float_max_offset = data.offset_float_max;
	5354	if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
	5355	r->float_max_offset -= data.lookbehind_float;
	5356
	5357	t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
	5358	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5359	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5360	fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
	5361	}
	5362	else {
	5363	remove_float:
	5364	r->float_substr = r->float_utf8 = NULL;
	5365	SvREFCNT_dec(data.longest_float);
	5366	longest_float_length = 0;
	5367	}
	5368
	5369	/* Note that code very similar to this but for floating string
	5370	is immediately above, changes may need to be made to both.
	5371	Be careful.
	5372	*/
	5373	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	5374
	5375	/* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
	5376	if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
	5377	&& (longest_fixed_length
	5378	\|\| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
	5379	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5380	\|\| (RExC_flags & RXf_PMf_MULTILINE)))) )
	5381	{
	5382	I32 t,ml;
	5383
	5384	/* copy the information about the longest fixed
	5385	from the reg_scan_data over to the program. */
	5386	if (SvUTF8(data.longest_fixed)) {
	5387	r->anchored_utf8 = data.longest_fixed;
	5388	r->anchored_substr = NULL;
	5389	} else {
	5390	r->anchored_substr = data.longest_fixed;
	5391	r->anchored_utf8 = NULL;
	5392	}
	5393	/* fixed_end_shift is how many chars that must be matched that
	5394	follow this item. We calculate it ahead of time as once the
	5395	lookbehind offset is added in we lose the ability to correctly
	5396	calculate it.*/
	5397	ml = data.minlen_fixed ? *(data.minlen_fixed)
	5398	: (I32)longest_fixed_length;
	5399	r->anchored_end_shift = ml - data.offset_fixed
	5400	- longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
	5401	+ data.lookbehind_fixed;
	5402	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	5403
	5404	t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
	5405	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5406	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5407	fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
	5408	}
	5409	else {
	5410	r->anchored_substr = r->anchored_utf8 = NULL;
	5411	SvREFCNT_dec(data.longest_fixed);
	5412	longest_fixed_length = 0;
	5413	}
	5414	if (ri->regstclass
	5415	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	5416	ri->regstclass = NULL;
	5417
	5418	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	5419	&& stclass_flag
	5420	&& !(data.start_class->flags & ANYOF_EOS)
	5421	&& !cl_is_anything(data.start_class))
	5422	{
	5423	const U32 n = add_data(pRExC_state, 1, "f");
	5424	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5425
	5426	Newx(RExC_rxi->data->data[n], 1,
	5427	struct regnode_charclass_class);
	5428	StructCopy(data.start_class,
	5429	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5430	struct regnode_charclass_class);
	5431	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5432	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5433	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	5434	regprop(r, sv, (regnode*)data.start_class);
	5435	PerlIO_printf(Perl_debug_log,
	5436	"synthetic stclass \"%s\".\n",
	5437	SvPVX_const(sv));});
	5438	}
	5439
	5440	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
	5441	if (longest_fixed_length > longest_float_length) {
	5442	r->check_end_shift = r->anchored_end_shift;
	5443	r->check_substr = r->anchored_substr;
	5444	r->check_utf8 = r->anchored_utf8;
	5445	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	5446	if (r->extflags & RXf_ANCH_SINGLE)
	5447	r->extflags \|= RXf_NOSCAN;
	5448	}
	5449	else {
	5450	r->check_end_shift = r->float_end_shift;
	5451	r->check_substr = r->float_substr;
	5452	r->check_utf8 = r->float_utf8;
	5453	r->check_offset_min = r->float_min_offset;
	5454	r->check_offset_max = r->float_max_offset;
	5455	}
	5456	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
	5457	This should be changed ASAP! */
	5458	if ((r->check_substr \|\| r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
	5459	r->extflags \|= RXf_USE_INTUIT;
	5460	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	5461	r->extflags \|= RXf_INTUIT_TAIL;
	5462	}
	5463	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	5464	if ( (STRLEN)minlen < longest_float_length )
	5465	minlen= longest_float_length;
	5466	if ( (STRLEN)minlen < longest_fixed_length )
	5467	minlen= longest_fixed_length;
	5468	*/
	5469	}
	5470	else {
	5471	/* Several toplevels. Best we can is to set minlen. */
	5472	I32 fake;
	5473	struct regnode_charclass_class ch_class;
	5474	I32 last_close = 0;
	5475
	5476	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	5477
	5478	scan = ri->program + 1;
	5479	cl_init(pRExC_state, &ch_class);
	5480	data.start_class = &ch_class;
	5481	data.last_closep = &last_close;
	5482
	5483
	5484	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
	5485	&data, -1, NULL, NULL, SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS,0);
	5486
	5487	CHECK_RESTUDY_GOTO;
	5488
	5489	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	5490	= r->float_substr = r->float_utf8 = NULL;
	5491
	5492	if (!(data.start_class->flags & ANYOF_EOS)
	5493	&& !cl_is_anything(data.start_class))
	5494	{
	5495	const U32 n = add_data(pRExC_state, 1, "f");
	5496	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5497
	5498	Newx(RExC_rxi->data->data[n], 1,
	5499	struct regnode_charclass_class);
	5500	StructCopy(data.start_class,
	5501	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5502	struct regnode_charclass_class);
	5503	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5504	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5505	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	5506	regprop(r, sv, (regnode*)data.start_class);
	5507	PerlIO_printf(Perl_debug_log,
	5508	"synthetic stclass \"%s\".\n",
	5509	SvPVX_const(sv));});
	5510	}
	5511	}
	5512
	5513	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	5514	the "real" pattern. */
	5515	DEBUG_OPTIMISE_r({
	5516	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
	5517	(IV)minlen, (IV)r->minlen);
	5518	});
	5519	r->minlenret = minlen;
	5520	if (r->minlen < minlen)
	5521	r->minlen = minlen;
	5522
	5523	if (RExC_seen & REG_SEEN_GPOS)
	5524	r->extflags \|= RXf_GPOS_SEEN;
	5525	if (RExC_seen & REG_SEEN_LOOKBEHIND)
	5526	r->extflags \|= RXf_LOOKBEHIND_SEEN;
	5527	if (RExC_seen & REG_SEEN_EVAL)
	5528	r->extflags \|= RXf_EVAL_SEEN;
	5529	if (RExC_seen & REG_SEEN_CANY)
	5530	r->extflags \|= RXf_CANY_SEEN;
	5531	if (RExC_seen & REG_SEEN_VERBARG)
	5532	r->intflags \|= PREGf_VERBARG_SEEN;
	5533	if (RExC_seen & REG_SEEN_CUTGROUP)
	5534	r->intflags \|= PREGf_CUTGROUP_SEEN;
	5535	if (RExC_paren_names)
	5536	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	5537	else
	5538	RXp_PAREN_NAMES(r) = NULL;
	5539
	5540	#ifdef STUPID_PATTERN_CHECKS
	5541	if (RX_PRELEN(rx) == 0)
	5542	r->extflags \|= RXf_NULL;
	5543	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5544	/* XXX: this should happen BEFORE we compile */
	5545	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5546	else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
	5547	r->extflags \|= RXf_WHITE;
	5548	else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
	5549	r->extflags \|= RXf_START_ONLY;
	5550	#else
	5551	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5552	/* XXX: this should happen BEFORE we compile */
	5553	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5554	else {
	5555	regnode *first = ri->program + 1;
	5556	U8 fop = OP(first);
	5557
	5558	if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
	5559	r->extflags \|= RXf_NULL;
	5560	else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
	5561	r->extflags \|= RXf_START_ONLY;
	5562	else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
	5563	&& OP(regnext(first)) == END)
	5564	r->extflags \|= RXf_WHITE;
	5565	}
	5566	#endif
	5567	#ifdef DEBUGGING
	5568	if (RExC_paren_names) {
	5569	ri->name_list_idx = add_data( pRExC_state, 1, "a" );
	5570	ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
	5571	} else
	5572	#endif
	5573	ri->name_list_idx = 0;
	5574
	5575	if (RExC_recurse_count) {
	5576	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	5577	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	5578	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	5579	}
	5580	}
	5581	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	5582	/* assume we don't need to swap parens around before we match */
	5583
	5584	DEBUG_DUMP_r({
	5585	PerlIO_printf(Perl_debug_log,"Final program:\n");
	5586	regdump(r);
	5587	});
	5588	#ifdef RE_TRACK_PATTERN_OFFSETS
	5589	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	5590	const U32 len = ri->u.offsets[0];
	5591	U32 i;
	5592	GET_RE_DEBUG_FLAGS_DECL;
	5593	PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	5594	for (i = 1; i <= len; i++) {
	5595	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	5596	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	5597	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	5598	}
	5599	PerlIO_printf(Perl_debug_log, "\n");
	5600	});
	5601	#endif
	5602	return rx;
	5603	}
	5604
	5605	#undef RE_ENGINE_PTR
	5606
	5607
	5608	SV*
	5609	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	5610	const U32 flags)
	5611	{
	5612	PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	5613
	5614	PERL_UNUSED_ARG(value);
	5615
	5616	if (flags & RXapif_FETCH) {
	5617	return reg_named_buff_fetch(rx, key, flags);
	5618	} else if (flags & (RXapif_STORE \| RXapif_DELETE \| RXapif_CLEAR)) {
	5619	Perl_croak_no_modify(aTHX);
	5620	return NULL;
	5621	} else if (flags & RXapif_EXISTS) {
	5622	return reg_named_buff_exists(rx, key, flags)
	5623	? &PL_sv_yes
	5624	: &PL_sv_no;
	5625	} else if (flags & RXapif_REGNAMES) {
	5626	return reg_named_buff_all(rx, flags);
	5627	} else if (flags & (RXapif_SCALAR \| RXapif_REGNAMES_COUNT)) {
	5628	return reg_named_buff_scalar(rx, flags);
	5629	} else {
	5630	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	5631	return NULL;
	5632	}
	5633	}
	5634
	5635	SV*
	5636	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	5637	const U32 flags)
	5638	{
	5639	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	5640	PERL_UNUSED_ARG(lastkey);
	5641
	5642	if (flags & RXapif_FIRSTKEY)
	5643	return reg_named_buff_firstkey(rx, flags);
	5644	else if (flags & RXapif_NEXTKEY)
	5645	return reg_named_buff_nextkey(rx, flags);
	5646	else {
	5647	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
	5648	return NULL;
	5649	}
	5650	}
	5651
	5652	SV*
	5653	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	5654	const U32 flags)
	5655	{
	5656	AV *retarray = NULL;
	5657	SV *ret;
	5658	struct regexp const rx = (struct regexp )SvANY(r);
	5659
	5660	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	5661
	5662	if (flags & RXapif_ALL)
	5663	retarray=newAV();
	5664
	5665	if (rx && RXp_PAREN_NAMES(rx)) {
	5666	HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	5667	if (he_str) {
	5668	IV i;
	5669	SV* sv_dat=HeVAL(he_str);
	5670	I32 nums=(I32)SvPVX(sv_dat);
	5671	for ( i=0; i<SvIVX(sv_dat); i++ ) {
	5672	if ((I32)(rx->nparens) >= nums[i]
	5673	&& rx->offs[nums[i]].start != -1
	5674	&& rx->offs[nums[i]].end != -1)
	5675	{
	5676	ret = newSVpvs("");
	5677	CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	5678	if (!retarray)
	5679	return ret;
	5680	} else {
	5681	if (retarray)
	5682	ret = newSVsv(&PL_sv_undef);
	5683	}
	5684	if (retarray)
	5685	av_push(retarray, ret);
	5686	}
	5687	if (retarray)
	5688	return newRV_noinc(MUTABLE_SV(retarray));
	5689	}
	5690	}
	5691	return NULL;
	5692	}
	5693
	5694	bool
	5695	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	5696	const U32 flags)
	5697	{
	5698	struct regexp const rx = (struct regexp )SvANY(r);
	5699
	5700	PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	5701
	5702	if (rx && RXp_PAREN_NAMES(rx)) {
	5703	if (flags & RXapif_ALL) {
	5704	return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	5705	} else {
	5706	SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	5707	if (sv) {
	5708	SvREFCNT_dec(sv);
	5709	return TRUE;
	5710	} else {
	5711	return FALSE;
	5712	}
	5713	}
	5714	} else {
	5715	return FALSE;
	5716	}
	5717	}
	5718
	5719	SV*
	5720	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	5721	{
	5722	struct regexp const rx = (struct regexp )SvANY(r);
	5723
	5724	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	5725
	5726	if ( rx && RXp_PAREN_NAMES(rx) ) {
	5727	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
	5728
	5729	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	5730	} else {
	5731	return FALSE;
	5732	}
	5733	}
	5734
	5735	SV*
	5736	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	5737	{
	5738	struct regexp const rx = (struct regexp )SvANY(r);
	5739	GET_RE_DEBUG_FLAGS_DECL;
	5740
	5741	PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	5742
	5743	if (rx && RXp_PAREN_NAMES(rx)) {
	5744	HV *hv = RXp_PAREN_NAMES(rx);
	5745	HE *temphe;
	5746	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5747	IV i;
	5748	IV parno = 0;
	5749	SV* sv_dat = HeVAL(temphe);
	5750	I32 nums = (I32)SvPVX(sv_dat);
	5751	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5752	if ((I32)(rx->lastparen) >= nums[i] &&
	5753	rx->offs[nums[i]].start != -1 &&
	5754	rx->offs[nums[i]].end != -1)
	5755	{
	5756	parno = nums[i];
	5757	break;
	5758	}
	5759	}
	5760	if (parno \|\| flags & RXapif_ALL) {
	5761	return newSVhek(HeKEY_hek(temphe));
	5762	}
	5763	}
	5764	}
	5765	return NULL;
	5766	}
	5767
	5768	SV*
	5769	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	5770	{
	5771	SV *ret;
	5772	AV *av;
	5773	I32 length;
	5774	struct regexp const rx = (struct regexp )SvANY(r);
	5775
	5776	PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	5777
	5778	if (rx && RXp_PAREN_NAMES(rx)) {
	5779	if (flags & (RXapif_ALL \| RXapif_REGNAMES_COUNT)) {
	5780	return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	5781	} else if (flags & RXapif_ONE) {
	5782	ret = CALLREG_NAMED_BUFF_ALL(r, (flags \| RXapif_REGNAMES));
	5783	av = MUTABLE_AV(SvRV(ret));
	5784	length = av_len(av);
	5785	SvREFCNT_dec(ret);
	5786	return newSViv(length + 1);
	5787	} else {
	5788	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
	5789	return NULL;
	5790	}
	5791	}
	5792	return &PL_sv_undef;
	5793	}
	5794
	5795	SV*
	5796	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	5797	{
	5798	struct regexp const rx = (struct regexp )SvANY(r);
	5799	AV *av = newAV();
	5800
	5801	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	5802
	5803	if (rx && RXp_PAREN_NAMES(rx)) {
	5804	HV *hv= RXp_PAREN_NAMES(rx);
	5805	HE *temphe;
	5806	(void)hv_iterinit(hv);
	5807	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5808	IV i;
	5809	IV parno = 0;
	5810	SV* sv_dat = HeVAL(temphe);
	5811	I32 nums = (I32)SvPVX(sv_dat);
	5812	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5813	if ((I32)(rx->lastparen) >= nums[i] &&
	5814	rx->offs[nums[i]].start != -1 &&
	5815	rx->offs[nums[i]].end != -1)
	5816	{
	5817	parno = nums[i];
	5818	break;
	5819	}
	5820	}
	5821	if (parno \|\| flags & RXapif_ALL) {
	5822	av_push(av, newSVhek(HeKEY_hek(temphe)));
	5823	}
	5824	}
	5825	}
	5826
	5827	return newRV_noinc(MUTABLE_SV(av));
	5828	}
	5829
	5830	void
	5831	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	5832	SV * const sv)
	5833	{
	5834	struct regexp const rx = (struct regexp )SvANY(r);
	5835	char *s = NULL;
	5836	I32 i = 0;
	5837	I32 s1, t1;
	5838
	5839	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	5840
	5841	if (!rx->subbeg) {
	5842	sv_setsv(sv,&PL_sv_undef);
	5843	return;
	5844	}
	5845	else
	5846	if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
	5847	/* $` */
	5848	i = rx->offs[0].start;
	5849	s = rx->subbeg;
	5850	}
	5851	else
	5852	if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
	5853	/* $' */
	5854	s = rx->subbeg + rx->offs[0].end;
	5855	i = rx->sublen - rx->offs[0].end;
	5856	}
	5857	else
	5858	if ( 0 <= paren && paren <= (I32)rx->nparens &&
	5859	(s1 = rx->offs[paren].start) != -1 &&
	5860	(t1 = rx->offs[paren].end) != -1)
	5861	{
	5862	/* $& $1 ... */
	5863	i = t1 - s1;
	5864	s = rx->subbeg + s1;
	5865	} else {
	5866	sv_setsv(sv,&PL_sv_undef);
	5867	return;
	5868	}
	5869	assert(rx->sublen >= (s - rx->subbeg) + i );
	5870	if (i >= 0) {
	5871	const int oldtainted = PL_tainted;
	5872	TAINT_NOT;
	5873	sv_setpvn(sv, s, i);
	5874	PL_tainted = oldtainted;
	5875	if ( (rx->extflags & RXf_CANY_SEEN)
	5876	? (RXp_MATCH_UTF8(rx)
	5877	&& (!i \|\| is_utf8_string((U8*)s, i)))
	5878	: (RXp_MATCH_UTF8(rx)) )
	5879	{
	5880	SvUTF8_on(sv);
	5881	}
	5882	else
	5883	SvUTF8_off(sv);
	5884	if (PL_tainting) {
	5885	if (RXp_MATCH_TAINTED(rx)) {
	5886	if (SvTYPE(sv) >= SVt_PVMG) {
	5887	MAGIC* const mg = SvMAGIC(sv);
	5888	MAGIC* mgt;
	5889	PL_tainted = 1;
	5890	SvMAGIC_set(sv, mg->mg_moremagic);
	5891	SvTAINT(sv);
	5892	if ((mgt = SvMAGIC(sv))) {
	5893	mg->mg_moremagic = mgt;
	5894	SvMAGIC_set(sv, mg);
	5895	}
	5896	} else {
	5897	PL_tainted = 1;
	5898	SvTAINT(sv);
	5899	}
	5900	} else
	5901	SvTAINTED_off(sv);
	5902	}
	5903	} else {
	5904	sv_setsv(sv,&PL_sv_undef);
	5905	return;
	5906	}
	5907	}
	5908
	5909	void
	5910	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	5911	SV const * const value)
	5912	{
	5913	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	5914
	5915	PERL_UNUSED_ARG(rx);
	5916	PERL_UNUSED_ARG(paren);
	5917	PERL_UNUSED_ARG(value);
	5918
	5919	if (!PL_localizing)
	5920	Perl_croak_no_modify(aTHX);
	5921	}
	5922
	5923	I32
	5924	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	5925	const I32 paren)
	5926	{
	5927	struct regexp const rx = (struct regexp )SvANY(r);
	5928	I32 i;
	5929	I32 s1, t1;
	5930
	5931	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	5932
	5933	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	5934	switch (paren) {
	5935	/* $` / ${^PREMATCH} */
	5936	case RX_BUFF_IDX_PREMATCH:
	5937	if (rx->offs[0].start != -1) {
	5938	i = rx->offs[0].start;
	5939	if (i > 0) {
	5940	s1 = 0;
	5941	t1 = i;
	5942	goto getlen;
	5943	}
	5944	}
	5945	return 0;
	5946	/* $' / ${^POSTMATCH} */
	5947	case RX_BUFF_IDX_POSTMATCH:
	5948	if (rx->offs[0].end != -1) {
	5949	i = rx->sublen - rx->offs[0].end;
	5950	if (i > 0) {
	5951	s1 = rx->offs[0].end;
	5952	t1 = rx->sublen;
	5953	goto getlen;
	5954	}
	5955	}
	5956	return 0;
	5957	/* $& / ${^MATCH}, $1, $2, ... */
	5958	default:
	5959	if (paren <= (I32)rx->nparens &&
	5960	(s1 = rx->offs[paren].start) != -1 &&
	5961	(t1 = rx->offs[paren].end) != -1)
	5962	{
	5963	i = t1 - s1;
	5964	goto getlen;
	5965	} else {
	5966	if (ckWARN(WARN_UNINITIALIZED))
	5967	report_uninit((const SV *)sv);
	5968	return 0;
	5969	}
	5970	}
	5971	getlen:
	5972	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	5973	const char * const s = rx->subbeg + s1;
	5974	const U8 *ep;
	5975	STRLEN el;
	5976
	5977	i = t1 - s1;
	5978	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	5979	i = el;
	5980	}
	5981	return i;
	5982	}
	5983
	5984	SV*
	5985	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	5986	{
	5987	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	5988	PERL_UNUSED_ARG(rx);
	5989	if (0)
	5990	return NULL;
	5991	else
	5992	return newSVpvs("Regexp");
	5993	}
	5994
	5995	/* Scans the name of a named buffer from the pattern.
	5996	* If flags is REG_RSN_RETURN_NULL returns null.
	5997	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	5998	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	5999	* to the parsed name as looked up in the RExC_paren_names hash.
	6000	* If there is an error throws a vFAIL().. type exception.
	6001	*/
	6002
	6003	#define REG_RSN_RETURN_NULL 0
	6004	#define REG_RSN_RETURN_NAME 1
	6005	#define REG_RSN_RETURN_DATA 2
	6006
	6007	STATIC SV*
	6008	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	6009	{
	6010	char *name_start = RExC_parse;
	6011
	6012	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	6013
	6014	if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	6015	/* skip IDFIRST by using do...while */
	6016	if (UTF)
	6017	do {
	6018	RExC_parse += UTF8SKIP(RExC_parse);
	6019	} while (isALNUM_utf8((U8*)RExC_parse));
	6020	else
	6021	do {
	6022	RExC_parse++;
	6023	} while (isALNUM(*RExC_parse));
	6024	}
	6025
	6026	if ( flags ) {
	6027	SV* sv_name
	6028	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	6029	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	6030	if ( flags == REG_RSN_RETURN_NAME)
	6031	return sv_name;
	6032	else if (flags==REG_RSN_RETURN_DATA) {
	6033	HE *he_str = NULL;
	6034	SV *sv_dat = NULL;
	6035	if ( ! sv_name ) /* should not happen*/
	6036	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	6037	if (RExC_paren_names)
	6038	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	6039	if ( he_str )
	6040	sv_dat = HeVAL(he_str);
	6041	if ( ! sv_dat )
	6042	vFAIL("Reference to nonexistent named group");
	6043	return sv_dat;
	6044	}
	6045	else {
	6046	Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
	6047	(unsigned long) flags);
	6048	}
	6049	/* NOT REACHED */
	6050	}
	6051	return NULL;
	6052	}
	6053
	6054	#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
	6055	int rem=(int)(RExC_end - RExC_parse); \
	6056	int cut; \
	6057	int num; \
	6058	int iscut=0; \
	6059	if (rem>10) { \
	6060	rem=10; \
	6061	iscut=1; \
	6062	} \
	6063	cut=10-rem; \
	6064	if (RExC_lastparse!=RExC_parse) \
	6065	PerlIO_printf(Perl_debug_log," >%.s%-s", \
	6066	rem, RExC_parse, \
	6067	cut + 4, \
	6068	iscut ? "..." : "<" \
	6069	); \
	6070	else \
	6071	PerlIO_printf(Perl_debug_log,"%16s",""); \
	6072	\
	6073	if (SIZE_ONLY) \
	6074	num = RExC_size + 1; \
	6075	else \
	6076	num=REG_NODE_NUM(RExC_emit); \
	6077	if (RExC_lastnum!=num) \
	6078	PerlIO_printf(Perl_debug_log,"\|%4d",num); \
	6079	else \
	6080	PerlIO_printf(Perl_debug_log,"\|%4s",""); \
	6081	PerlIO_printf(Perl_debug_log,"\|%*s%-4s", \
	6082	(int)((depth*2)), "", \
	6083	(funcname) \
	6084	); \
	6085	RExC_lastnum=num; \
	6086	RExC_lastparse=RExC_parse; \
	6087	})
	6088
	6089
	6090
	6091	#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
	6092	DEBUG_PARSE_MSG((funcname)); \
	6093	PerlIO_printf(Perl_debug_log,"%4s","\n"); \
	6094	})
	6095	#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
	6096	DEBUG_PARSE_MSG((funcname)); \
	6097	PerlIO_printf(Perl_debug_log,fmt "\n",args); \
	6098	})
	6099
	6100	/* This section of code defines the inversion list object and its methods. The
	6101	* interfaces are highly subject to change, so as much as possible is static to
	6102	* this file. An inversion list is here implemented as a malloc'd C UV array
	6103	* with some added info that is placed as UVs at the beginning in a header
	6104	* portion. An inversion list for Unicode is an array of code points, sorted
	6105	* by ordinal number. The zeroth element is the first code point in the list.
	6106	* The 1th element is the first element beyond that not in the list. In other
	6107	* words, the first range is
	6108	* invlist[0]..(invlist[1]-1)
	6109	* The other ranges follow. Thus every element whose index is divisible by two
	6110	* marks the beginning of a range that is in the list, and every element not
	6111	* divisible by two marks the beginning of a range not in the list. A single
	6112	* element inversion list that contains the single code point N generally
	6113	* consists of two elements
	6114	* invlist[0] == N
	6115	* invlist[1] == N+1
	6116	* (The exception is when N is the highest representable value on the
	6117	* machine, in which case the list containing just it would be a single
	6118	* element, itself. By extension, if the last range in the list extends to
	6119	* infinity, then the first element of that range will be in the inversion list
	6120	* at a position that is divisible by two, and is the final element in the
	6121	* list.)
	6122	* Taking the complement (inverting) an inversion list is quite simple, if the
	6123	* first element is 0, remove it; otherwise add a 0 element at the beginning.
	6124	* This implementation reserves an element at the beginning of each inversion list
	6125	* to contain 0 when the list contains 0, and contains 1 otherwise. The actual
	6126	* beginning of the list is either that element if 0, or the next one if 1.
	6127	*
	6128	* More about inversion lists can be found in "Unicode Demystified"
	6129	* Chapter 13 by Richard Gillam, published by Addison-Wesley.
	6130	* More will be coming when functionality is added later.
	6131	*
	6132	* The inversion list data structure is currently implemented as an SV pointing
	6133	* to an array of UVs that the SV thinks are bytes. This allows us to have an
	6134	* array of UV whose memory management is automatically handled by the existing
	6135	* facilities for SV's.
	6136	*
	6137	* Some of the methods should always be private to the implementation, and some
	6138	* should eventually be made public */
	6139
	6140	#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
	6141	#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
	6142
	6143	#define INVLIST_ZERO_OFFSET 2 /* 0 or 1; must be last element in header */
	6144	/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
	6145	* contains the code point U+00000, and begins here. If 1, the inversion list
	6146	* doesn't contain U+0000, and it begins at the next UV in the array.
	6147	* Inverting an inversion list consists of adding or removing the 0 at the
	6148	* beginning of it. By reserving a space for that 0, inversion can be made
	6149	* very fast */
	6150
	6151	#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
	6152
	6153	/* Internally things are UVs */
	6154	#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
	6155	#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
	6156
	6157	#define INVLIST_INITIAL_LEN 10
	6158
	6159	PERL_STATIC_INLINE UV*
	6160	S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
	6161	{
	6162	/* Returns a pointer to the first element in the inversion list's array.
	6163	* This is called upon initialization of an inversion list. Where the
	6164	* array begins depends on whether the list has the code point U+0000
	6165	* in it or not. The other parameter tells it whether the code that
	6166	* follows this call is about to put a 0 in the inversion list or not.
	6167	* The first element is either the element with 0, if 0, or the next one,
	6168	* if 1 */
	6169
	6170	UV* zero = get_invlist_zero_addr(invlist);
	6171
	6172	PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
	6173
	6174	/* Must be empty */
	6175	assert(! *get_invlist_len_addr(invlist));
	6176
	6177	/* 1^1 = 0; 1^0 = 1 */
	6178	*zero = 1 ^ will_have_0;
	6179	return zero + *zero;
	6180	}
	6181
	6182	PERL_STATIC_INLINE UV*
	6183	S_invlist_array(pTHX_ SV* const invlist)
	6184	{
	6185	/* Returns the pointer to the inversion list's array. Every time the
	6186	* length changes, this needs to be called in case malloc or realloc moved
	6187	* it */
	6188
	6189	PERL_ARGS_ASSERT_INVLIST_ARRAY;
	6190
	6191	/* Must not be empty. If these fail, you probably didn't check for <len>
	6192	* being non-zero before trying to get the array */
	6193	assert(*get_invlist_len_addr(invlist));
	6194	assert(*get_invlist_zero_addr(invlist) == 0
	6195	\|\| *get_invlist_zero_addr(invlist) == 1);
	6196
	6197	/* The array begins either at the element reserved for zero if the
	6198	* list contains 0 (that element will be set to 0), or otherwise the next
	6199	* element (in which case the reserved element will be set to 1). */
	6200	return (UV *) (get_invlist_zero_addr(invlist)
	6201	+ *get_invlist_zero_addr(invlist));
	6202	}
	6203
	6204	PERL_STATIC_INLINE UV*
	6205	S_get_invlist_len_addr(pTHX_ SV* invlist)
	6206	{
	6207	/* Return the address of the UV that contains the current number
	6208	* of used elements in the inversion list */
	6209
	6210	PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
	6211
	6212	return (UV ) (SvPVX(invlist) + (INVLIST_LEN_OFFSET sizeof (UV)));
	6213	}
	6214
	6215	PERL_STATIC_INLINE UV
	6216	S_invlist_len(pTHX_ SV* const invlist)
	6217	{
	6218	/* Returns the current number of elements stored in the inversion list's
	6219	* array */
	6220
	6221	PERL_ARGS_ASSERT_INVLIST_LEN;
	6222
	6223	return *get_invlist_len_addr(invlist);
	6224	}
	6225
	6226	PERL_STATIC_INLINE void
	6227	S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
	6228	{
	6229	/* Sets the current number of elements stored in the inversion list */
	6230
	6231	PERL_ARGS_ASSERT_INVLIST_SET_LEN;
	6232
	6233	*get_invlist_len_addr(invlist) = len;
	6234
	6235	assert(len <= SvLEN(invlist));
	6236
	6237	SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
	6238	/* If the list contains U+0000, that element is part of the header,
	6239	* and should not be counted as part of the array. It will contain
	6240	* 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
	6241	* subtract:
	6242	* SvCUR_set(invlist,
	6243	* TO_INTERNAL_SIZE(len
	6244	* - (*get_invlist_zero_addr(inv_list) ^ 1)));
	6245	* But, this is only valid if len is not 0. The consequences of not doing
	6246	* this is that the memory allocation code may think that 1 more UV is
	6247	* being used than actually is, and so might do an unnecessary grow. That
	6248	* seems worth not bothering to make this the precise amount.
	6249	*
	6250	* Note that when inverting, SvCUR shouldn't change */
	6251	}
	6252
	6253	PERL_STATIC_INLINE UV
	6254	S_invlist_max(pTHX_ SV* const invlist)
	6255	{
	6256	/* Returns the maximum number of elements storable in the inversion list's
	6257	* array, without having to realloc() */
	6258
	6259	PERL_ARGS_ASSERT_INVLIST_MAX;
	6260
	6261	return FROM_INTERNAL_SIZE(SvLEN(invlist));
	6262	}
	6263
	6264	PERL_STATIC_INLINE UV*
	6265	S_get_invlist_zero_addr(pTHX_ SV* invlist)
	6266	{
	6267	/* Return the address of the UV that is reserved to hold 0 if the inversion
	6268	* list contains 0. This has to be the last element of the heading, as the
	6269	* list proper starts with either it if 0, or the next element if not.
	6270	* (But we force it to contain either 0 or 1) */
	6271
	6272	PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
	6273
	6274	return (UV ) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET sizeof (UV)));
	6275	}
	6276
	6277	#ifndef PERL_IN_XSUB_RE
	6278	SV*
	6279	Perl__new_invlist(pTHX_ IV initial_size)
	6280	{
	6281
	6282	/* Return a pointer to a newly constructed inversion list, with enough
	6283	* space to store 'initial_size' elements. If that number is negative, a
	6284	* system default is used instead */
	6285
	6286	SV* new_list;
	6287
	6288	if (initial_size < 0) {
	6289	initial_size = INVLIST_INITIAL_LEN;
	6290	}
	6291
	6292	/* Allocate the initial space */
	6293	new_list = newSV(TO_INTERNAL_SIZE(initial_size));
	6294	invlist_set_len(new_list, 0);
	6295
	6296	/* Force iterinit() to be used to get iteration to work */
	6297	*get_invlist_iter_addr(new_list) = UV_MAX;
	6298
	6299	/* This should force a segfault if a method doesn't initialize this
	6300	* properly */
	6301	*get_invlist_zero_addr(new_list) = UV_MAX;
	6302
	6303	return new_list;
	6304	}
	6305	#endif
	6306
	6307	STATIC void
	6308	S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
	6309	{
	6310	/* Grow the maximum size of an inversion list */
	6311
	6312	PERL_ARGS_ASSERT_INVLIST_EXTEND;
	6313
	6314	SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
	6315	}
	6316
	6317	PERL_STATIC_INLINE void
	6318	S_invlist_trim(pTHX_ SV* const invlist)
	6319	{
	6320	PERL_ARGS_ASSERT_INVLIST_TRIM;
	6321
	6322	/* Change the length of the inversion list to how many entries it currently
	6323	* has */
	6324
	6325	SvPV_shrink_to_cur((SV *) invlist);
	6326	}
	6327
	6328	/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
	6329	* etc */
	6330	#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
	6331	#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
	6332
	6333	#ifndef PERL_IN_XSUB_RE
	6334	void
	6335	Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
	6336	{
	6337	/* Subject to change or removal. Append the range from 'start' to 'end' at
	6338	* the end of the inversion list. The range must be above any existing
	6339	* ones. */
	6340
	6341	UV* array;
	6342	UV max = invlist_max(invlist);
	6343	UV len = invlist_len(invlist);
	6344
	6345	PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
	6346
	6347	if (len == 0) { /* Empty lists must be initialized */
	6348	array = _invlist_array_init(invlist, start == 0);
	6349	}
	6350	else {
	6351	/* Here, the existing list is non-empty. The current max entry in the
	6352	* list is generally the first value not in the set, except when the
	6353	* set extends to the end of permissible values, in which case it is
	6354	* the first entry in that final set, and so this call is an attempt to
	6355	* append out-of-order */
	6356
	6357	UV final_element = len - 1;
	6358	array = invlist_array(invlist);
	6359	if (array[final_element] > start
	6360	\|\| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
	6361	{
	6362	Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
	6363	array[final_element], start,
	6364	ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
	6365	}
	6366
	6367	/* Here, it is a legal append. If the new range begins with the first
	6368	* value not in the set, it is extending the set, so the new first
	6369	* value not in the set is one greater than the newly extended range.
	6370	* */
	6371	if (array[final_element] == start) {
	6372	if (end != UV_MAX) {
	6373	array[final_element] = end + 1;
	6374	}
	6375	else {
	6376	/* But if the end is the maximum representable on the machine,
	6377	* just let the range that this would extend to have no end */
	6378	invlist_set_len(invlist, len - 1);
	6379	}
	6380	return;
	6381	}
	6382	}
	6383
	6384	/* Here the new range doesn't extend any existing set. Add it */
	6385
	6386	len += 2; /* Includes an element each for the start and end of range */
	6387
	6388	/* If overflows the existing space, extend, which may cause the array to be
	6389	* moved */
	6390	if (max < len) {
	6391	invlist_extend(invlist, len);
	6392	invlist_set_len(invlist, len); /* Have to set len here to avoid assert
	6393	failure in invlist_array() */
	6394	array = invlist_array(invlist);
	6395	}
	6396	else {
	6397	invlist_set_len(invlist, len);
	6398	}
	6399
	6400	/* The next item on the list starts the range, the one after that is
	6401	* one past the new range. */
	6402	array[len - 2] = start;
	6403	if (end != UV_MAX) {
	6404	array[len - 1] = end + 1;
	6405	}
	6406	else {
	6407	/* But if the end is the maximum representable on the machine, just let
	6408	* the range have no end */
	6409	invlist_set_len(invlist, len - 1);
	6410	}
	6411	}
	6412
	6413	STATIC IV
	6414	S_invlist_search(pTHX_ SV* const invlist, const UV cp)
	6415	{
	6416	/* Searches the inversion list for the entry that contains the input code
	6417	* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
	6418	* return value is the index into the list's array of the range that
	6419	* contains <cp> */
	6420
	6421	IV low = 0;
	6422	IV high = invlist_len(invlist);
	6423	const UV * const array = invlist_array(invlist);
	6424
	6425	PERL_ARGS_ASSERT_INVLIST_SEARCH;
	6426
	6427	/* If list is empty or the code point is before the first element, return
	6428	* failure. */
	6429	if (high == 0 \|\| cp < array[0]) {
	6430	return -1;
	6431	}
	6432
	6433	/* Binary search. What we are looking for is <i> such that
	6434	* array[i] <= cp < array[i+1]
	6435	* The loop below converges on the i+1. */
	6436	while (low < high) {
	6437	IV mid = (low + high) / 2;
	6438	if (array[mid] <= cp) {
	6439	low = mid + 1;
	6440
	6441	/* We could do this extra test to exit the loop early.
	6442	if (cp < array[low]) {
	6443	return mid;
	6444	}
	6445	*/
	6446	}
	6447	else { /* cp < array[mid] */
	6448	high = mid;
	6449	}
	6450	}
	6451
	6452	return high - 1;
	6453	}
	6454
	6455	void
	6456	Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
	6457	{
	6458	/* populates a swatch of a swash the same way swatch_get() does in utf8.c,
	6459	* but is used when the swash has an inversion list. This makes this much
	6460	* faster, as it uses a binary search instead of a linear one. This is
	6461	* intimately tied to that function, and perhaps should be in utf8.c,
	6462	* except it is intimately tied to inversion lists as well. It assumes
	6463	* that <swatch> is all 0's on input */
	6464
	6465	UV current = start;
	6466	const IV len = invlist_len(invlist);
	6467	IV i;
	6468	const UV * array;
	6469
	6470	PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
	6471
	6472	if (len == 0) { /* Empty inversion list */
	6473	return;
	6474	}
	6475
	6476	array = invlist_array(invlist);
	6477
	6478	/* Find which element it is */
	6479	i = invlist_search(invlist, start);
	6480
	6481	/* We populate from <start> to <end> */
	6482	while (current < end) {
	6483	UV upper;
	6484
	6485	/* The inversion list gives the results for every possible code point
	6486	* after the first one in the list. Only those ranges whose index is
	6487	* even are ones that the inversion list matches. For the odd ones,
	6488	* and if the initial code point is not in the list, we have to skip
	6489	* forward to the next element */
	6490	if (i == -1 \|\| ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
	6491	i++;
	6492	if (i >= len) { /* Finished if beyond the end of the array */
	6493	return;
	6494	}
	6495	current = array[i];
	6496	if (current >= end) { /* Finished if beyond the end of what we
	6497	are populating */
	6498	return;
	6499	}
	6500	}
	6501	assert(current >= start);
	6502
	6503	/* The current range ends one below the next one, except don't go past
	6504	* <end> */
	6505	i++;
	6506	upper = (i < len && array[i] < end) ? array[i] : end;
	6507
	6508	/* Here we are in a range that matches. Populate a bit in the 3-bit U8
	6509	* for each code point in it */
	6510	for (; current < upper; current++) {
	6511	const STRLEN offset = (STRLEN)(current - start);
	6512	swatch[offset >> 3] \|= 1 << (offset & 7);
	6513	}
	6514
	6515	/* Quit if at the end of the list */
	6516	if (i >= len) {
	6517
	6518	/* But first, have to deal with the highest possible code point on
	6519	* the platform. The previous code assumes that <end> is one
	6520	* beyond where we want to populate, but that is impossible at the
	6521	* platform's infinity, so have to handle it specially */
	6522	if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
	6523	{
	6524	const STRLEN offset = (STRLEN)(end - start);
	6525	swatch[offset >> 3] \|= 1 << (offset & 7);
	6526	}
	6527	return;
	6528	}
	6529
	6530	/* Advance to the next range, which will be for code points not in the
	6531	* inversion list */
	6532	current = array[i];
	6533	}
	6534
	6535	return;
	6536	}
	6537
	6538	void
	6539	Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
	6540	{
	6541	/* Take the union of two inversion lists and point <output> to it. *output
	6542	* should be defined upon input, and if it points to one of the two lists,
	6543	* the reference count to that list will be decremented.
	6544	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	6545	* Richard Gillam, published by Addison-Wesley, and explained at some
	6546	* length there. The preface says to incorporate its examples into your
	6547	* code at your own risk.
	6548	*
	6549	* The algorithm is like a merge sort.
	6550	*
	6551	* XXX A potential performance improvement is to keep track as we go along
	6552	* if only one of the inputs contributes to the result, meaning the other
	6553	* is a subset of that one. In that case, we can skip the final copy and
	6554	* return the larger of the input lists, but then outside code might need
	6555	* to keep track of whether to free the input list or not */
	6556
	6557	UV* array_a; /* a's array */
	6558	UV* array_b;
	6559	UV len_a; /* length of a's array */
	6560	UV len_b;
	6561
	6562	SV* u; /* the resulting union */
	6563	UV* array_u;
	6564	UV len_u;
	6565
	6566	UV i_a = 0; /* current index into a's array */
	6567	UV i_b = 0;
	6568	UV i_u = 0;
	6569
	6570	/* running count, as explained in the algorithm source book; items are
	6571	* stopped accumulating and are output when the count changes to/from 0.
	6572	* The count is incremented when we start a range that's in the set, and
	6573	* decremented when we start a range that's not in the set. So its range
	6574	* is 0 to 2. Only when the count is zero is something not in the set.
	6575	*/
	6576	UV count = 0;
	6577
	6578	PERL_ARGS_ASSERT__INVLIST_UNION;
	6579	assert(a != b);
	6580
	6581	/* If either one is empty, the union is the other one */
	6582	len_a = invlist_len(a);
	6583	if (len_a == 0) {
	6584	if (*output == a) {
	6585	SvREFCNT_dec(a);
	6586	}
	6587	if (*output != b) {
	6588	*output = invlist_clone(b);
	6589	} /* else output already = b; /
	6590	return;
	6591	}
	6592	else if ((len_b = invlist_len(b)) == 0) {
	6593	if (*output == b) {
	6594	SvREFCNT_dec(b);
	6595	}
	6596	if (*output != a) {
	6597	*output = invlist_clone(a);
	6598	}
	6599	/* else output already = a; /
	6600	return;
	6601	}
	6602
	6603	/* Here both lists exist and are non-empty */
	6604	array_a = invlist_array(a);
	6605	array_b = invlist_array(b);
	6606
	6607	/* Size the union for the worst case: that the sets are completely
	6608	* disjoint */
	6609	u = _new_invlist(len_a + len_b);
	6610
	6611	/* Will contain U+0000 if either component does */
	6612	array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
	6613	\|\| (len_b > 0 && array_b[0] == 0));
	6614
	6615	/* Go through each list item by item, stopping when exhausted one of
	6616	* them */
	6617	while (i_a < len_a && i_b < len_b) {
	6618	UV cp; /* The element to potentially add to the union's array */
	6619	bool cp_in_set; /* is it in the the input list's set or not */
	6620
	6621	/* We need to take one or the other of the two inputs for the union.
	6622	* Since we are merging two sorted lists, we take the smaller of the
	6623	* next items. In case of a tie, we take the one that is in its set
	6624	* first. If we took one not in the set first, it would decrement the
	6625	* count, possibly to 0 which would cause it to be output as ending the
	6626	* range, and the next time through we would take the same number, and
	6627	* output it again as beginning the next range. By doing it the
	6628	* opposite way, there is no possibility that the count will be
	6629	* momentarily decremented to 0, and thus the two adjoining ranges will
	6630	* be seamlessly merged. (In a tie and both are in the set or both not
	6631	* in the set, it doesn't matter which we take first.) */
	6632	if (array_a[i_a] < array_b[i_b]
	6633	\|\| (array_a[i_a] == array_b[i_b]
	6634	&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	6635	{
	6636	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	6637	cp= array_a[i_a++];
	6638	}
	6639	else {
	6640	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	6641	cp= array_b[i_b++];
	6642	}
	6643
	6644	/* Here, have chosen which of the two inputs to look at. Only output
	6645	* if the running count changes to/from 0, which marks the
	6646	* beginning/end of a range in that's in the set */
	6647	if (cp_in_set) {
	6648	if (count == 0) {
	6649	array_u[i_u++] = cp;
	6650	}
	6651	count++;
	6652	}
	6653	else {
	6654	count--;
	6655	if (count == 0) {
	6656	array_u[i_u++] = cp;
	6657	}
	6658	}
	6659	}
	6660
	6661	/* Here, we are finished going through at least one of the lists, which
	6662	* means there is something remaining in at most one. We check if the list
	6663	* that hasn't been exhausted is positioned such that we are in the middle
	6664	* of a range in its set or not. (i_a and i_b point to the element beyond
	6665	* the one we care about.) If in the set, we decrement 'count'; if 0, there
	6666	* is potentially more to output.
	6667	* There are four cases:
	6668	* 1) Both weren't in their sets, count is 0, and remains 0. What's left
	6669	* in the union is entirely from the non-exhausted set.
	6670	* 2) Both were in their sets, count is 2. Nothing further should
	6671	* be output, as everything that remains will be in the exhausted
	6672	* list's set, hence in the union; decrementing to 1 but not 0 insures
	6673	* that
	6674	* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
	6675	* Nothing further should be output because the union includes
	6676	* everything from the exhausted set. Not decrementing ensures that.
	6677	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
	6678	* decrementing to 0 insures that we look at the remainder of the
	6679	* non-exhausted set */
	6680	if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	6681	\|\| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	6682	{
	6683	count--;
	6684	}
	6685
	6686	/* The final length is what we've output so far, plus what else is about to
	6687	* be output. (If 'count' is non-zero, then the input list we exhausted
	6688	* has everything remaining up to the machine's limit in its set, and hence
	6689	* in the union, so there will be no further output. */
	6690	len_u = i_u;
	6691	if (count == 0) {
	6692	/* At most one of the subexpressions will be non-zero */
	6693	len_u += (len_a - i_a) + (len_b - i_b);
	6694	}
	6695
	6696	/* Set result to final length, which can change the pointer to array_u, so
	6697	* re-find it */
	6698	if (len_u != invlist_len(u)) {
	6699	invlist_set_len(u, len_u);
	6700	invlist_trim(u);
	6701	array_u = invlist_array(u);
	6702	}
	6703
	6704	/* When 'count' is 0, the list that was exhausted (if one was shorter than
	6705	* the other) ended with everything above it not in its set. That means
	6706	* that the remaining part of the union is precisely the same as the
	6707	* non-exhausted list, so can just copy it unchanged. (If both list were
	6708	* exhausted at the same time, then the operations below will be both 0.)
	6709	*/
	6710	if (count == 0) {
	6711	IV copy_count; /* At most one will have a non-zero copy count */
	6712	if ((copy_count = len_a - i_a) > 0) {
	6713	Copy(array_a + i_a, array_u + i_u, copy_count, UV);
	6714	}
	6715	else if ((copy_count = len_b - i_b) > 0) {
	6716	Copy(array_b + i_b, array_u + i_u, copy_count, UV);
	6717	}
	6718	}
	6719
	6720	/* We may be removing a reference to one of the inputs */
	6721	if (a == output \|\| b == output) {
	6722	SvREFCNT_dec(*output);
	6723	}
	6724
	6725	*output = u;
	6726	return;
	6727	}
	6728
	6729	void
	6730	Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
	6731	{
	6732	/* Take the intersection of two inversion lists and point <i> to it. *i
	6733	* should be defined upon input, and if it points to one of the two lists,
	6734	* the reference count to that list will be decremented.
	6735	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	6736	* Richard Gillam, published by Addison-Wesley, and explained at some
	6737	* length there. The preface says to incorporate its examples into your
	6738	* code at your own risk. In fact, it had bugs
	6739	*
	6740	* The algorithm is like a merge sort, and is essentially the same as the
	6741	* union above
	6742	*/
	6743
	6744	UV* array_a; /* a's array */
	6745	UV* array_b;
	6746	UV len_a; /* length of a's array */
	6747	UV len_b;
	6748
	6749	SV* r; /* the resulting intersection */
	6750	UV* array_r;
	6751	UV len_r;
	6752
	6753	UV i_a = 0; /* current index into a's array */
	6754	UV i_b = 0;
	6755	UV i_r = 0;
	6756
	6757	/* running count, as explained in the algorithm source book; items are
	6758	* stopped accumulating and are output when the count changes to/from 2.
	6759	* The count is incremented when we start a range that's in the set, and
	6760	* decremented when we start a range that's not in the set. So its range
	6761	* is 0 to 2. Only when the count is 2 is something in the intersection.
	6762	*/
	6763	UV count = 0;
	6764
	6765	PERL_ARGS_ASSERT__INVLIST_INTERSECTION;
	6766	assert(a != b);
	6767
	6768	/* If either one is empty, the intersection is null */
	6769	len_a = invlist_len(a);
	6770	if ((len_a == 0) \|\| ((len_b = invlist_len(b)) == 0)) {
	6771
	6772	/* If the result is the same as one of the inputs, the input is being
	6773	* overwritten */
	6774	if (*i == a) {
	6775	SvREFCNT_dec(a);
	6776	}
	6777	else if (*i == b) {
	6778	SvREFCNT_dec(b);
	6779	}
	6780
	6781	*i = _new_invlist(0);
	6782	return;
	6783	}
	6784
	6785	/* Here both lists exist and are non-empty */
	6786	array_a = invlist_array(a);
	6787	array_b = invlist_array(b);
	6788
	6789	/* Size the intersection for the worst case: that the intersection ends up
	6790	* fragmenting everything to be completely disjoint */
	6791	r= _new_invlist(len_a + len_b);
	6792
	6793	/* Will contain U+0000 iff both components do */
	6794	array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
	6795	&& len_b > 0 && array_b[0] == 0);
	6796
	6797	/* Go through each list item by item, stopping when exhausted one of
	6798	* them */
	6799	while (i_a < len_a && i_b < len_b) {
	6800	UV cp; /* The element to potentially add to the intersection's
	6801	array */
	6802	bool cp_in_set; /* Is it in the input list's set or not */
	6803
	6804	/* We need to take one or the other of the two inputs for the
	6805	* intersection. Since we are merging two sorted lists, we take the
	6806	* smaller of the next items. In case of a tie, we take the one that
	6807	* is not in its set first (a difference from the union algorithm). If
	6808	* we took one in the set first, it would increment the count, possibly
	6809	* to 2 which would cause it to be output as starting a range in the
	6810	* intersection, and the next time through we would take that same
	6811	* number, and output it again as ending the set. By doing it the
	6812	* opposite of this, there is no possibility that the count will be
	6813	* momentarily incremented to 2. (In a tie and both are in the set or
	6814	* both not in the set, it doesn't matter which we take first.) */
	6815	if (array_a[i_a] < array_b[i_b]
	6816	\|\| (array_a[i_a] == array_b[i_b]
	6817	&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	6818	{
	6819	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	6820	cp= array_a[i_a++];
	6821	}
	6822	else {
	6823	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	6824	cp= array_b[i_b++];
	6825	}
	6826
	6827	/* Here, have chosen which of the two inputs to look at. Only output
	6828	* if the running count changes to/from 2, which marks the
	6829	* beginning/end of a range that's in the intersection */
	6830	if (cp_in_set) {
	6831	count++;
	6832	if (count == 2) {
	6833	array_r[i_r++] = cp;
	6834	}
	6835	}
	6836	else {
	6837	if (count == 2) {
	6838	array_r[i_r++] = cp;
	6839	}
	6840	count--;
	6841	}
	6842	}
	6843
	6844	/* Here, we are finished going through at least one of the lists, which
	6845	* means there is something remaining in at most one. We check if the list
	6846	* that has been exhausted is positioned such that we are in the middle
	6847	* of a range in its set or not. (i_a and i_b point to elements 1 beyond
	6848	* the ones we care about.) There are four cases:
	6849	* 1) Both weren't in their sets, count is 0, and remains 0. There's
	6850	* nothing left in the intersection.
	6851	* 2) Both were in their sets, count is 2 and perhaps is incremented to
	6852	* above 2. What should be output is exactly that which is in the
	6853	* non-exhausted set, as everything it has is also in the intersection
	6854	* set, and everything it doesn't have can't be in the intersection
	6855	* 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
	6856	* gets incremented to 2. Like the previous case, the intersection is
	6857	* everything that remains in the non-exhausted set.
	6858	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
	6859	* remains 1. And the intersection has nothing more. */
	6860	if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	6861	\|\| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	6862	{
	6863	count++;
	6864	}
	6865
	6866	/* The final length is what we've output so far plus what else is in the
	6867	* intersection. At most one of the subexpressions below will be non-zero */
	6868	len_r = i_r;
	6869	if (count >= 2) {
	6870	len_r += (len_a - i_a) + (len_b - i_b);
	6871	}
	6872
	6873	/* Set result to final length, which can change the pointer to array_r, so
	6874	* re-find it */
	6875	if (len_r != invlist_len(r)) {
	6876	invlist_set_len(r, len_r);
	6877	invlist_trim(r);
	6878	array_r = invlist_array(r);
	6879	}
	6880
	6881	/* Finish outputting any remaining */
	6882	if (count >= 2) { /* At most one will have a non-zero copy count */
	6883	IV copy_count;
	6884	if ((copy_count = len_a - i_a) > 0) {
	6885	Copy(array_a + i_a, array_r + i_r, copy_count, UV);
	6886	}
	6887	else if ((copy_count = len_b - i_b) > 0) {
	6888	Copy(array_b + i_b, array_r + i_r, copy_count, UV);
	6889	}
	6890	}
	6891
	6892	/* We may be removing a reference to one of the inputs */
	6893	if (a == i \|\| b == i) {
	6894	SvREFCNT_dec(*i);
	6895	}
	6896
	6897	*i = r;
	6898	return;
	6899	}
	6900
	6901	#endif
	6902
	6903	STATIC SV*
	6904	S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
	6905	{
	6906	/* Add the range from 'start' to 'end' inclusive to the inversion list's
	6907	* set. A pointer to the inversion list is returned. This may actually be
	6908	* a new list, in which case the passed in one has been destroyed. The
	6909	* passed in inversion list can be NULL, in which case a new one is created
	6910	* with just the one range in it */
	6911
	6912	SV* range_invlist;
	6913	UV len;
	6914
	6915	if (invlist == NULL) {
	6916	invlist = _new_invlist(2);
	6917	len = 0;
	6918	}
	6919	else {
	6920	len = invlist_len(invlist);
	6921	}
	6922
	6923	/* If comes after the final entry, can just append it to the end */
	6924	if (len == 0
	6925	\|\| start >= invlist_array(invlist)
	6926	[invlist_len(invlist) - 1])
	6927	{
	6928	_append_range_to_invlist(invlist, start, end);
	6929	return invlist;
	6930	}
	6931
	6932	/* Here, can't just append things, create and return a new inversion list
	6933	* which is the union of this range and the existing inversion list */
	6934	range_invlist = _new_invlist(2);
	6935	_append_range_to_invlist(range_invlist, start, end);
	6936
	6937	_invlist_union(invlist, range_invlist, &invlist);
	6938
	6939	/* The temporary can be freed */
	6940	SvREFCNT_dec(range_invlist);
	6941
	6942	return invlist;
	6943	}
	6944
	6945	PERL_STATIC_INLINE SV*
	6946	S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
	6947	return add_range_to_invlist(invlist, cp, cp);
	6948	}
	6949
	6950	#ifndef PERL_IN_XSUB_RE
	6951	void
	6952	Perl__invlist_invert(pTHX_ SV* const invlist)
	6953	{
	6954	/* Complement the input inversion list. This adds a 0 if the list didn't
	6955	* have a zero; removes it otherwise. As described above, the data
	6956	* structure is set up so that this is very efficient */
	6957
	6958	UV* len_pos = get_invlist_len_addr(invlist);
	6959
	6960	PERL_ARGS_ASSERT__INVLIST_INVERT;
	6961
	6962	/* The inverse of matching nothing is matching everything */
	6963	if (*len_pos == 0) {
	6964	_append_range_to_invlist(invlist, 0, UV_MAX);
	6965	return;
	6966	}
	6967
	6968	/* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
	6969	* zero element was a 0, so it is being removed, so the length decrements
	6970	* by 1; and vice-versa. SvCUR is unaffected */
	6971	if (*get_invlist_zero_addr(invlist) ^= 1) {
	6972	(*len_pos)--;
	6973	}
	6974	else {
	6975	(*len_pos)++;
	6976	}
	6977	}
	6978
	6979	void
	6980	Perl__invlist_invert_prop(pTHX_ SV* const invlist)
	6981	{
	6982	/* Complement the input inversion list (which must be a Unicode property,
	6983	* all of which don't match above the Unicode maximum code point.) And
	6984	* Perl has chosen to not have the inversion match above that either. This
	6985	* adds a 0x110000 if the list didn't end with it, and removes it if it did
	6986	*/
	6987
	6988	UV len;
	6989	UV* array;
	6990
	6991	PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
	6992
	6993	_invlist_invert(invlist);
	6994
	6995	len = invlist_len(invlist);
	6996
	6997	if (len != 0) { /* If empty do nothing */
	6998	array = invlist_array(invlist);
	6999	if (array[len - 1] != PERL_UNICODE_MAX + 1) {
	7000	/* Add 0x110000. First, grow if necessary */
	7001	len++;
	7002	if (invlist_max(invlist) < len) {
	7003	invlist_extend(invlist, len);
	7004	array = invlist_array(invlist);
	7005	}
	7006	invlist_set_len(invlist, len);
	7007	array[len - 1] = PERL_UNICODE_MAX + 1;
	7008	}
	7009	else { /* Remove the 0x110000 */
	7010	invlist_set_len(invlist, len - 1);
	7011	}
	7012	}
	7013
	7014	return;
	7015	}
	7016	#endif
	7017
	7018	PERL_STATIC_INLINE SV*
	7019	S_invlist_clone(pTHX_ SV* const invlist)
	7020	{
	7021
	7022	/* Return a new inversion list that is a copy of the input one, which is
	7023	* unchanged */
	7024
	7025	/* Need to allocate extra space to accommodate Perl's addition of a
	7026	* trailing NUL to SvPV's, since it thinks they are always strings */
	7027	SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
	7028	STRLEN length = SvCUR(invlist);
	7029
	7030	PERL_ARGS_ASSERT_INVLIST_CLONE;
	7031
	7032	SvCUR_set(new_invlist, length); /* This isn't done automatically */
	7033	Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
	7034
	7035	return new_invlist;
	7036	}
	7037
	7038	#ifndef PERL_IN_XSUB_RE
	7039	void
	7040	Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
	7041	{
	7042	/* Point <result> to an inversion list which consists of all elements in
	7043	* <a> that aren't also in <b>. *result should be defined upon input, and
	7044	* if it points to C<b> its reference count will be decremented. */
	7045
	7046	PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
	7047	assert(a != b);
	7048
	7049	/* Subtracting nothing retains the original */
	7050	if (invlist_len(b) == 0) {
	7051
	7052	if (*result == b) {
	7053	SvREFCNT_dec(b);
	7054	}
	7055
	7056	/* If the result is not to be the same variable as the original, create
	7057	* a copy */
	7058	if (*result != a) {
	7059	*result = invlist_clone(a);
	7060	}
	7061	} else {
	7062	SV *b_copy = invlist_clone(b);
	7063	_invlist_invert(b_copy); /* Everything not in 'b' */
	7064
	7065	if (*result == b) {
	7066	SvREFCNT_dec(b);
	7067	}
	7068
	7069	_invlist_intersection(a, b_copy, result); /* Everything in 'a' not in
	7070	'b' */
	7071	SvREFCNT_dec(b_copy);
	7072	}
	7073
	7074	return;
	7075	}
	7076	#endif
	7077
	7078	PERL_STATIC_INLINE UV*
	7079	S_get_invlist_iter_addr(pTHX_ SV* invlist)
	7080	{
	7081	/* Return the address of the UV that contains the current iteration
	7082	* position */
	7083
	7084	PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
	7085
	7086	return (UV ) (SvPVX(invlist) + (INVLIST_ITER_OFFSET sizeof (UV)));
	7087	}
	7088
	7089	PERL_STATIC_INLINE void
	7090	S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
	7091	{
	7092	PERL_ARGS_ASSERT_INVLIST_ITERINIT;
	7093
	7094	*get_invlist_iter_addr(invlist) = 0;
	7095	}
	7096
	7097	STATIC bool
	7098	S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
	7099	{
	7100	/* An C<invlist_iterinit> call on <invlist> must be used to set this up.
	7101	* This call sets in <start> and <end>, the next range in <invlist>.
	7102	* Returns <TRUE> if successful and the next call will return the next
	7103	* range; <FALSE> if was already at the end of the list. If the latter,
	7104	* <start> and <end> are unchanged, and the next call to this function
	7105	* will start over at the beginning of the list */
	7106
	7107	UV* pos = get_invlist_iter_addr(invlist);
	7108	UV len = invlist_len(invlist);
	7109	UV *array;
	7110
	7111	PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
	7112
	7113	if (*pos >= len) {
	7114	pos = UV_MAX; / Force iternit() to be required next time */
	7115	return FALSE;
	7116	}
	7117
	7118	array = invlist_array(invlist);
	7119
	7120	start = array[(pos)++];
	7121
	7122	if (*pos >= len) {
	7123	*end = UV_MAX;
	7124	}
	7125	else {
	7126	end = array[(pos)++] - 1;
	7127	}
	7128
	7129	return TRUE;
	7130	}
	7131
	7132	#ifndef PERL_IN_XSUB_RE
	7133	SV *
	7134	Perl__invlist_contents(pTHX_ SV* const invlist)
	7135	{
	7136	/* Get the contents of an inversion list into a string SV so that they can
	7137	* be printed out. It uses the format traditionally done for debug tracing
	7138	*/
	7139
	7140	UV start, end;
	7141	SV* output = newSVpvs("\n");
	7142
	7143	PERL_ARGS_ASSERT__INVLIST_CONTENTS;
	7144
	7145	invlist_iterinit(invlist);
	7146	while (invlist_iternext(invlist, &start, &end)) {
	7147	if (end == UV_MAX) {
	7148	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
	7149	}
	7150	else if (end != start) {
	7151	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
	7152	start, end);
	7153	}
	7154	else {
	7155	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
	7156	}
	7157	}
	7158
	7159	return output;
	7160	}
	7161	#endif
	7162
	7163	#if 0
	7164	void
	7165	S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
	7166	{
	7167	/* Dumps out the ranges in an inversion list. The string 'header'
	7168	* if present is output on a line before the first range */
	7169
	7170	UV start, end;
	7171
	7172	if (header && strlen(header)) {
	7173	PerlIO_printf(Perl_debug_log, "%s\n", header);
	7174	}
	7175	invlist_iterinit(invlist);
	7176	while (invlist_iternext(invlist, &start, &end)) {
	7177	if (end == UV_MAX) {
	7178	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
	7179	}
	7180	else {
	7181	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
	7182	}
	7183	}
	7184	}
	7185	#endif
	7186
	7187	#undef HEADER_LENGTH
	7188	#undef INVLIST_INITIAL_LENGTH
	7189	#undef TO_INTERNAL_SIZE
	7190	#undef FROM_INTERNAL_SIZE
	7191	#undef INVLIST_LEN_OFFSET
	7192	#undef INVLIST_ZERO_OFFSET
	7193	#undef INVLIST_ITER_OFFSET
	7194
	7195	/* End of inversion list object */
	7196
	7197	/*
	7198	- reg - regular expression, i.e. main body or parenthesized thing
	7199	*
	7200	* Caller must absorb opening parenthesis.
	7201	*
	7202	* Combining parenthesis handling with the base level of regular expression
	7203	* is a trifle forced, but the need to tie the tails of the branches to what
	7204	* follows makes it hard to avoid.
	7205	*/
	/* REGTAIL(x,y,z) forwards its three arguments to regtail(), supplying
	 * 'depth+1' as the fourth.  The expansion uses a variable named 'depth',
	 * so one must be in scope wherever the macro is invoked. */
	7206	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	/* REGTAIL_STUDY is the same, except that when DEBUGGING is defined it
	 * calls regtail_study() instead of regtail(). */
	7207	#ifdef DEBUGGING
	7208	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	7209	#else
	7210	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	7211	#endif
	7212
	7213	STATIC regnode *
	7214	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp,U32 depth)
	7215	/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
	7216	{
	7217	dVAR;
	7218	register regnode ret; / Will be the head of the group. */
	7219	register regnode *br;
	7220	register regnode *lastbr;
	7221	register regnode *ender = NULL;
	7222	register I32 parno = 0;
	7223	I32 flags;
	7224	U32 oregflags = RExC_flags;
	7225	bool have_branch = 0;
	7226	bool is_open = 0;
	7227	I32 freeze_paren = 0;
	7228	I32 after_freeze = 0;
	7229
	7230	/* for (?g), (?gc), and (?o) warnings; warning
	7231	about (?c) will warn about (?g) -- japhy */
	7232
	7233	#define WASTED_O 0x01
	7234	#define WASTED_G 0x02
	7235	#define WASTED_C 0x04
	7236	#define WASTED_GC (0x02\|0x04)
	7237	I32 wastedflags = 0x00;
	7238
	7239	char * parse_start = RExC_parse; /* MJD */
	7240	char * const oregcomp_parse = RExC_parse;
	7241
	7242	GET_RE_DEBUG_FLAGS_DECL;
	7243
	7244	PERL_ARGS_ASSERT_REG;
	7245	DEBUG_PARSE("reg ");
	7246
	7247	flagp = 0; / Tentatively. */
	7248
	7249
	7250	/* Make an OPEN node, if parenthesized. */
	7251	if (paren) {
	7252	if ( RExC_parse == '') { /* (VERB:ARG) /
	7253	char *start_verb = RExC_parse;
	7254	STRLEN verb_len = 0;
	7255	char *start_arg = NULL;
	7256	unsigned char op = 0;
	7257	int argok = 1;
	7258	int internal_argval = 0; /* internal_argval is only useful if !argok */
	7259	while ( RExC_parse && RExC_parse != ')' ) {
	7260	if ( *RExC_parse == ':' ) {
	7261	start_arg = RExC_parse + 1;
	7262	break;
	7263	}
	7264	RExC_parse++;
	7265	}
	7266	++start_verb;
	7267	verb_len = RExC_parse - start_verb;
	7268	if ( start_arg ) {
	7269	RExC_parse++;
	7270	while ( RExC_parse && RExC_parse != ')' )
	7271	RExC_parse++;
	7272	if ( *RExC_parse != ')' )
	7273	vFAIL("Unterminated verb pattern argument");
	7274	if ( RExC_parse == start_arg )
	7275	start_arg = NULL;
	7276	} else {
	7277	if ( *RExC_parse != ')' )
	7278	vFAIL("Unterminated verb pattern");
	7279	}
	7280
	7281	switch ( *start_verb ) {
	7282	case 'A': /* (ACCEPT) /
	7283	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	7284	op = ACCEPT;
	7285	internal_argval = RExC_nestroot;
	7286	}
	7287	break;
	7288	case 'C': /* (COMMIT) /
	7289	if ( memEQs(start_verb,verb_len,"COMMIT") )
	7290	op = COMMIT;
	7291	break;
	7292	case 'F': /* (FAIL) /
	7293	if ( verb_len==1 \|\| memEQs(start_verb,verb_len,"FAIL") ) {
	7294	op = OPFAIL;
	7295	argok = 0;
	7296	}
	7297	break;
	7298	case ':': /* (:NAME) /
	7299	case 'M': /* (MARK:NAME) /
	7300	if ( verb_len==0 \|\| memEQs(start_verb,verb_len,"MARK") ) {
	7301	op = MARKPOINT;
	7302	argok = -1;
	7303	}
	7304	break;
	7305	case 'P': /* (PRUNE) /
	7306	if ( memEQs(start_verb,verb_len,"PRUNE") )
	7307	op = PRUNE;
	7308	break;
	7309	case 'S': /* (SKIP) /
	7310	if ( memEQs(start_verb,verb_len,"SKIP") )
	7311	op = SKIP;
	7312	break;
	7313	case 'T': /* (THEN) /
	7314	/* [19:06] <TimToady> :: is then */
	7315	if ( memEQs(start_verb,verb_len,"THEN") ) {
	7316	op = CUTGROUP;
	7317	RExC_seen \|= REG_SEEN_CUTGROUP;
	7318	}
	7319	break;
	7320	}
	7321	if ( ! op ) {
	7322	RExC_parse++;
	7323	vFAIL3("Unknown verb pattern '%.*s'",
	7324	verb_len, start_verb);
	7325	}
	7326	if ( argok ) {
	7327	if ( start_arg && internal_argval ) {
	7328	vFAIL3("Verb pattern '%.*s' may not have an argument",
	7329	verb_len, start_verb);
	7330	} else if ( argok < 0 && !start_arg ) {
	7331	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	7332	verb_len, start_verb);
	7333	} else {
	7334	ret = reganode(pRExC_state, op, internal_argval);
	7335	if ( ! internal_argval && ! SIZE_ONLY ) {
	7336	if (start_arg) {
	7337	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	7338	ARG(ret) = add_data( pRExC_state, 1, "S" );
	7339	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	7340	ret->flags = 0;
	7341	} else {
	7342	ret->flags = 1;
	7343	}
	7344	}
	7345	}
	7346	if (!internal_argval)
	7347	RExC_seen \|= REG_SEEN_VERBARG;
	7348	} else if ( start_arg ) {
	7349	vFAIL3("Verb pattern '%.*s' may not have an argument",
	7350	verb_len, start_verb);
	7351	} else {
	7352	ret = reg_node(pRExC_state, op);
	7353	}
	7354	nextchar(pRExC_state);
	7355	return ret;
	7356	} else
	7357	if (RExC_parse == '?') { / (?...) */
	7358	bool is_logical = 0;
	7359	const char * const seqstart = RExC_parse;
	7360	bool has_use_defaults = FALSE;
	7361
	7362	RExC_parse++;
	7363	paren = *RExC_parse++;
	7364	ret = NULL; /* For look-ahead/behind. */
	7365	switch (paren) {
	7366
	7367	case 'P': /* (?P...) variants for those used to PCRE/Python */
	7368	paren = *RExC_parse++;
	7369	if ( paren == '<') /* (?P<...>) named capture */
	7370	goto named_capture;
	7371	else if (paren == '>') { /* (?P>name) named recursion */
	7372	goto named_recursion;
	7373	}
	7374	else if (paren == '=') { /* (?P=...) named backref */
	7375	/* this pretty much dupes the code for \k<NAME> in regatom(), if
	7376	you change this make sure you change that */
	7377	char* name_start = RExC_parse;
	7378	U32 num = 0;
	7379	SV *sv_dat = reg_scan_name(pRExC_state,
	7380	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7381	if (RExC_parse == name_start \|\| *RExC_parse != ')')
	7382	vFAIL2("Sequence %.3s... not terminated",parse_start);
	7383
	7384	if (!SIZE_ONLY) {
	7385	num = add_data( pRExC_state, 1, "S" );
	7386	RExC_rxi->data->data[num]=(void*)sv_dat;
	7387	SvREFCNT_inc_simple_void(sv_dat);
	7388	}
	7389	RExC_sawback = 1;
	7390	ret = reganode(pRExC_state,
	7391	((! FOLD)
	7392	? NREF
	7393	: (MORE_ASCII_RESTRICTED)
	7394	? NREFFA
	7395	: (AT_LEAST_UNI_SEMANTICS)
	7396	? NREFFU
	7397	: (LOC)
	7398	? NREFFL
	7399	: NREFF),
	7400	num);
	7401	*flagp \|= HASWIDTH;
	7402
	7403	Set_Node_Offset(ret, parse_start+1);
	7404	Set_Node_Cur_Length(ret); /* MJD */
	7405
	7406	nextchar(pRExC_state);
	7407	return ret;
	7408	}
	7409	RExC_parse++;
	7410	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7411	/NOTREACHED/
	7412	case '<': /* (?<...) */
	7413	if (*RExC_parse == '!')
	7414	paren = ',';
	7415	else if (*RExC_parse != '=')
	7416	named_capture:
	7417	{ /* (?<...>) */
	7418	char *name_start;
	7419	SV *svname;
	7420	paren= '>';
	7421	case '\'': /* (?'...') */
	7422	name_start= RExC_parse;
	7423	svname = reg_scan_name(pRExC_state,
	7424	SIZE_ONLY ? /* reverse test from the others */
	7425	REG_RSN_RETURN_NAME :
	7426	REG_RSN_RETURN_NULL);
	7427	if (RExC_parse == name_start) {
	7428	RExC_parse++;
	7429	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7430	/NOTREACHED/
	7431	}
	7432	if (*RExC_parse != paren)
	7433	vFAIL2("Sequence (?%c... not terminated",
	7434	paren=='>' ? '<' : paren);
	7435	if (SIZE_ONLY) {
	7436	HE *he_str;
	7437	SV *sv_dat = NULL;
	7438	if (!svname) /* shouldn't happen */
	7439	Perl_croak(aTHX_
	7440	"panic: reg_scan_name returned NULL");
	7441	if (!RExC_paren_names) {
	7442	RExC_paren_names= newHV();
	7443	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	7444	#ifdef DEBUGGING
	7445	RExC_paren_name_list= newAV();
	7446	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	7447	#endif
	7448	}
	7449	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	7450	if ( he_str )
	7451	sv_dat = HeVAL(he_str);
	7452	if ( ! sv_dat ) {
	7453	/* croak baby croak */
	7454	Perl_croak(aTHX_
	7455	"panic: paren_name hash element allocation failed");
	7456	} else if ( SvPOK(sv_dat) ) {
	7457	/* (?\|...) can mean we have dupes so scan to check
	7458	its already been stored. Maybe a flag indicating
	7459	we are inside such a construct would be useful,
	7460	but the arrays are likely to be quite small, so
	7461	for now we punt -- dmq */
	7462	IV count = SvIV(sv_dat);
	7463	I32 pv = (I32)SvPVX(sv_dat);
	7464	IV i;
	7465	for ( i = 0 ; i < count ; i++ ) {
	7466	if ( pv[i] == RExC_npar ) {
	7467	count = 0;
	7468	break;
	7469	}
	7470	}
	7471	if ( count ) {
	7472	pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
	7473	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	7474	pv[count] = RExC_npar;
	7475	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	7476	}
	7477	} else {
	7478	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	7479	sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
	7480	SvIOK_on(sv_dat);
	7481	SvIV_set(sv_dat, 1);
	7482	}
	7483	#ifdef DEBUGGING
	7484	/* Yes this does cause a memory leak in debugging Perls */
	7485	if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
	7486	SvREFCNT_dec(svname);
	7487	#endif
	7488
	7489	/sv_dump(sv_dat);/
	7490	}
	7491	nextchar(pRExC_state);
	7492	paren = 1;
	7493	goto capturing_parens;
	7494	}
	7495	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	7496	RExC_in_lookbehind++;
	7497	RExC_parse++;
	7498	case '=': /* (?=...) */
	7499	RExC_seen_zerolen++;
	7500	break;
	7501	case '!': /* (?!...) */
	7502	RExC_seen_zerolen++;
	7503	if (*RExC_parse == ')') {
	7504	ret=reg_node(pRExC_state, OPFAIL);
	7505	nextchar(pRExC_state);
	7506	return ret;
	7507	}
	7508	break;
	7509	case '\|': /* (?\|...) */
	7510	/* branch reset, behave like a (?:...) except that
	7511	buffers in alternations share the same numbers */
	7512	paren = ':';
	7513	after_freeze = freeze_paren = RExC_npar;
	7514	break;
	7515	case ':': /* (?:...) */
	7516	case '>': /* (?>...) */
	7517	break;
	7518	case '$': /* (?$...) */
	7519	case '@': /* (?@...) */
	7520	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	7521	break;
	7522	case '#': /* (?#...) */
	7523	while (RExC_parse && RExC_parse != ')')
	7524	RExC_parse++;
	7525	if (*RExC_parse != ')')
	7526	FAIL("Sequence (?#... not terminated");
	7527	nextchar(pRExC_state);
	7528	*flagp = TRYAGAIN;
	7529	return NULL;
	7530	case '0' : /* (?0) */
	7531	case 'R' : /* (?R) */
	7532	if (*RExC_parse != ')')
	7533	FAIL("Sequence (?R) not terminated");
	7534	ret = reg_node(pRExC_state, GOSTART);
	7535	*flagp \|= POSTPONED;
	7536	nextchar(pRExC_state);
	7537	return ret;
	7538	/notreached/
	7539	{ /* named and numeric backreferences */
	7540	I32 num;
	7541	case '&': /* (?&NAME) */
	7542	parse_start = RExC_parse - 1;
	7543	named_recursion:
	7544	{
	7545	SV *sv_dat = reg_scan_name(pRExC_state,
	7546	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7547	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	7548	}
	7549	goto gen_recurse_regop;
	7550	/* NOT REACHED */
	7551	case '+':
	7552	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	7553	RExC_parse++;
	7554	vFAIL("Illegal pattern");
	7555	}
	7556	goto parse_recursion;
	7557	/* NOT REACHED*/
	7558	case '-': /* (?-1) */
	7559	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	7560	RExC_parse--; /* rewind to let it be handled later */
	7561	goto parse_flags;
	7562	}
	7563	/FALLTHROUGH /
	7564	case '1': case '2': case '3': case '4': /* (?1) */
	7565	case '5': case '6': case '7': case '8': case '9':
	7566	RExC_parse--;
	7567	parse_recursion:
	7568	num = atoi(RExC_parse);
	7569	parse_start = RExC_parse - 1; /* MJD */
	7570	if (*RExC_parse == '-')
	7571	RExC_parse++;
	7572	while (isDIGIT(*RExC_parse))
	7573	RExC_parse++;
	7574	if (*RExC_parse!=')')
	7575	vFAIL("Expecting close bracket");
	7576
	7577	gen_recurse_regop:
	7578	if ( paren == '-' ) {
	7579	/*
	7580	Diagram of capture buffer numbering.
	7581	Top line is the normal capture buffer numbers
	7582	Bottom line is the negative indexing as from
	7583	the X (the (?-2))
	7584
	7585	+ 1 2 3 4 5 X 6 7
	7586	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	7587	- 5 4 3 2 1 X x x
	7588
	7589	*/
	7590	num = RExC_npar + num;
	7591	if (num < 1) {
	7592	RExC_parse++;
	7593	vFAIL("Reference to nonexistent group");
	7594	}
	7595	} else if ( paren == '+' ) {
	7596	num = RExC_npar + num - 1;
	7597	}
	7598
	7599	ret = reganode(pRExC_state, GOSUB, num);
	7600	if (!SIZE_ONLY) {
	7601	if (num > (I32)RExC_rx->nparens) {
	7602	RExC_parse++;
	7603	vFAIL("Reference to nonexistent group");
	7604	}
	7605	ARG2L_SET( ret, RExC_recurse_count++);
	7606	RExC_emit++;
	7607	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	7608	"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
	7609	} else {
	7610	RExC_size++;
	7611	}
	7612	RExC_seen \|= REG_SEEN_RECURSE;
	7613	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	7614	Set_Node_Offset(ret, parse_start); /* MJD */
	7615
	7616	*flagp \|= POSTPONED;
	7617	nextchar(pRExC_state);
	7618	return ret;
	7619	} /* named and numeric backreferences */
	7620	/* NOT REACHED */
	7621
	7622	case '?': /* (??...) */
	7623	is_logical = 1;
	7624	if (*RExC_parse != '{') {
	7625	RExC_parse++;
	7626	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7627	/NOTREACHED/
	7628	}
	7629	*flagp \|= POSTPONED;
	7630	paren = *RExC_parse++;
	7631	/* FALL THROUGH */
	7632	case '{': /* (?{...}) */
	7633	{
	7634	I32 count = 1;
	7635	U32 n = 0;
	7636	char c;
	7637	char *s = RExC_parse;
	7638
	7639	RExC_seen_zerolen++;
	7640	RExC_seen \|= REG_SEEN_EVAL;
	7641	while (count && (c = *RExC_parse)) {
	7642	if (c == '\\') {
	7643	if (RExC_parse[1])
	7644	RExC_parse++;
	7645	}
	7646	else if (c == '{')
	7647	count++;
	7648	else if (c == '}')
	7649	count--;
	7650	RExC_parse++;
	7651	}
	7652	if (*RExC_parse != ')') {
	7653	RExC_parse = s;
	7654	vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
	7655	}
	7656	if (!SIZE_ONLY) {
	7657	PAD *pad;
	7658	OP_4tree sop, rop;
	7659	SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
	7660
	7661	ENTER;
	7662	Perl_save_re_context(aTHX);
	7663	rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
	7664	sop->op_private \|= OPpREFCOUNTED;
	7665	/* re_dup will OpREFCNT_inc */
	7666	OpREFCNT_set(sop, 1);
	7667	LEAVE;
	7668
	7669	n = add_data(pRExC_state, 3, "nop");
	7670	RExC_rxi->data->data[n] = (void*)rop;
	7671	RExC_rxi->data->data[n+1] = (void*)sop;
	7672	RExC_rxi->data->data[n+2] = (void*)pad;
	7673	SvREFCNT_dec(sv);
	7674	}
	7675	else { /* First pass */
	7676	if (PL_reginterp_cnt < ++RExC_seen_evals
	7677	&& IN_PERL_RUNTIME)
	7678	/* No compiled RE interpolated, has runtime
	7679	components ===> unsafe. */
	7680	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	7681	if (PL_tainting && PL_tainted)
	7682	FAIL("Eval-group in insecure regular expression");
	7683	#if PERL_VERSION > 8
	7684	if (IN_PERL_COMPILETIME)
	7685	PL_cv_has_eval = 1;
	7686	#endif
	7687	}
	7688
	7689	nextchar(pRExC_state);
	7690	if (is_logical) {
	7691	ret = reg_node(pRExC_state, LOGICAL);
	7692	if (!SIZE_ONLY)
	7693	ret->flags = 2;
	7694	REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
	7695	/* deal with the length of this later - MJD */
	7696	return ret;
	7697	}
	7698	ret = reganode(pRExC_state, EVAL, n);
	7699	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	7700	Set_Node_Offset(ret, parse_start);
	7701	return ret;
	7702	}
	7703	case '(': /* (?(?{...})...) and (?(?=...)...) */
	7704	{
	7705	int is_define= 0;
	7706	if (RExC_parse[0] == '?') { /* (?(?...)) */
	7707	if (RExC_parse[1] == '=' \|\| RExC_parse[1] == '!'
	7708	\|\| RExC_parse[1] == '<'
	7709	\|\| RExC_parse[1] == '{') { /* Lookahead or eval. */
	7710	I32 flag;
	7711
	7712	ret = reg_node(pRExC_state, LOGICAL);
	7713	if (!SIZE_ONLY)
	7714	ret->flags = 1;
	7715	REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
	7716	goto insert_if;
	7717	}
	7718	}
	7719	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	7720	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	7721	{
	7722	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	7723	char *name_start= RExC_parse++;
	7724	U32 num = 0;
	7725	SV *sv_dat=reg_scan_name(pRExC_state,
	7726	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7727	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	7728	vFAIL2("Sequence (?(%c... not terminated",
	7729	(ch == '>' ? '<' : ch));
	7730	RExC_parse++;
	7731	if (!SIZE_ONLY) {
	7732	num = add_data( pRExC_state, 1, "S" );
	7733	RExC_rxi->data->data[num]=(void*)sv_dat;
	7734	SvREFCNT_inc_simple_void(sv_dat);
	7735	}
	7736	ret = reganode(pRExC_state,NGROUPP,num);
	7737	goto insert_if_check_paren;
	7738	}
	7739	else if (RExC_parse[0] == 'D' &&
	7740	RExC_parse[1] == 'E' &&
	7741	RExC_parse[2] == 'F' &&
	7742	RExC_parse[3] == 'I' &&
	7743	RExC_parse[4] == 'N' &&
	7744	RExC_parse[5] == 'E')
	7745	{
	7746	ret = reganode(pRExC_state,DEFINEP,0);
	7747	RExC_parse +=6 ;
	7748	is_define = 1;
	7749	goto insert_if_check_paren;
	7750	}
	7751	else if (RExC_parse[0] == 'R') {
	7752	RExC_parse++;
	7753	parno = 0;
	7754	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	7755	parno = atoi(RExC_parse++);
	7756	while (isDIGIT(*RExC_parse))
	7757	RExC_parse++;
	7758	} else if (RExC_parse[0] == '&') {
	7759	SV *sv_dat;
	7760	RExC_parse++;
	7761	sv_dat = reg_scan_name(pRExC_state,
	7762	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7763	parno = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	7764	}
	7765	ret = reganode(pRExC_state,INSUBP,parno);
	7766	goto insert_if_check_paren;
	7767	}
	7768	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	7769	/* (?(1)...) */
	7770	char c;
	7771	parno = atoi(RExC_parse++);
	7772
	7773	while (isDIGIT(*RExC_parse))
	7774	RExC_parse++;
	7775	ret = reganode(pRExC_state, GROUPP, parno);
	7776
	7777	insert_if_check_paren:
	7778	if ((c = *nextchar(pRExC_state)) != ')')
	7779	vFAIL("Switch condition not recognized");
	7780	insert_if:
	7781	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	7782	br = regbranch(pRExC_state, &flags, 1,depth+1);
	7783	if (br == NULL)
	7784	br = reganode(pRExC_state, LONGJMP, 0);
	7785	else
	7786	REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
	7787	c = *nextchar(pRExC_state);
	7788	if (flags&HASWIDTH)
	7789	*flagp \|= HASWIDTH;
	7790	if (c == '\|') {
	7791	if (is_define)
	7792	vFAIL("(?(DEFINE)....) does not allow branches");
	7793	lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
	7794	regbranch(pRExC_state, &flags, 1,depth+1);
	7795	REGTAIL(pRExC_state, ret, lastbr);
	7796	if (flags&HASWIDTH)
	7797	*flagp \|= HASWIDTH;
	7798	c = *nextchar(pRExC_state);
	7799	}
	7800	else
	7801	lastbr = NULL;
	7802	if (c != ')')
	7803	vFAIL("Switch (?(condition)... contains too many branches");
	7804	ender = reg_node(pRExC_state, TAIL);
	7805	REGTAIL(pRExC_state, br, ender);
	7806	if (lastbr) {
	7807	REGTAIL(pRExC_state, lastbr, ender);
	7808	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	7809	}
	7810	else
	7811	REGTAIL(pRExC_state, ret, ender);
	7812	RExC_size++; /* XXX WHY do we need this?!!
	7813	For large programs it seems to be required
	7814	but I can't figure out why. -- dmq*/
	7815	return ret;
	7816	}
	7817	else {
	7818	vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
	7819	}
	7820	}
	7821	case 0:
	7822	RExC_parse--; /* for vFAIL to print correctly */
	7823	vFAIL("Sequence (? incomplete");
	7824	break;
	7825	case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
	7826	that follow */
	7827	has_use_defaults = TRUE;
	7828	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	7829	set_regex_charset(&RExC_flags, (RExC_utf8 \|\| RExC_uni_semantics)
	7830	? REGEX_UNICODE_CHARSET
	7831	: REGEX_DEPENDS_CHARSET);
	7832	goto parse_flags;
	7833	default:
	7834	--RExC_parse;
	7835	parse_flags: /* (?i) */
	7836	{
	7837	U32 posflags = 0, negflags = 0;
	7838	U32 *flagsp = &posflags;
	7839	char has_charset_modifier = '\0';
	7840	regex_charset cs = (RExC_utf8 \|\| RExC_uni_semantics)
	7841	? REGEX_UNICODE_CHARSET
	7842	: REGEX_DEPENDS_CHARSET;
	7843
	7844	while (*RExC_parse) {
	7845	/* && strchr("iogcmsx", RExC_parse) /
	7846	/* (?g), (?gc) and (?o) are useless here
	7847	and must be globally applied -- japhy */
	7848	switch (*RExC_parse) {
	7849	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	7850	case LOCALE_PAT_MOD:
	7851	if (has_charset_modifier) {
	7852	goto excess_modifier;
	7853	}
	7854	else if (flagsp == &negflags) {
	7855	goto neg_modifier;
	7856	}
	7857	cs = REGEX_LOCALE_CHARSET;
	7858	has_charset_modifier = LOCALE_PAT_MOD;
	7859	RExC_contains_locale = 1;
	7860	break;
	7861	case UNICODE_PAT_MOD:
	7862	if (has_charset_modifier) {
	7863	goto excess_modifier;
	7864	}
	7865	else if (flagsp == &negflags) {
	7866	goto neg_modifier;
	7867	}
	7868	cs = REGEX_UNICODE_CHARSET;
	7869	has_charset_modifier = UNICODE_PAT_MOD;
	7870	break;
	7871	case ASCII_RESTRICT_PAT_MOD:
	7872	if (flagsp == &negflags) {
	7873	goto neg_modifier;
	7874	}
	7875	if (has_charset_modifier) {
	7876	if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
	7877	goto excess_modifier;
	7878	}
	7879	/* Doubled modifier implies more restricted */
	7880	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	7881	}
	7882	else {
	7883	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	7884	}
	7885	has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
	7886	break;
	7887	case DEPENDS_PAT_MOD:
	7888	if (has_use_defaults) {
	7889	goto fail_modifiers;
	7890	}
	7891	else if (flagsp == &negflags) {
	7892	goto neg_modifier;
	7893	}
	7894	else if (has_charset_modifier) {
	7895	goto excess_modifier;
	7896	}
	7897
	7898	/* The dual charset means unicode semantics if the
	7899	* pattern (or target, not known until runtime) are
	7900	* utf8, or something in the pattern indicates unicode
	7901	* semantics */
	7902	cs = (RExC_utf8 \|\| RExC_uni_semantics)
	7903	? REGEX_UNICODE_CHARSET
	7904	: REGEX_DEPENDS_CHARSET;
	7905	has_charset_modifier = DEPENDS_PAT_MOD;
	7906	break;
	7907	excess_modifier:
	7908	RExC_parse++;
	7909	if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
	7910	vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
	7911	}
	7912	else if (has_charset_modifier == *(RExC_parse - 1)) {
	7913	vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
	7914	}
	7915	else {
	7916	vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
	7917	}
	7918	/NOTREACHED/
	7919	neg_modifier:
	7920	RExC_parse++;
	7921	vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
	7922	/NOTREACHED/
	7923	case ONCE_PAT_MOD: /* 'o' */
	7924	case GLOBAL_PAT_MOD: /* 'g' */
	7925	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	7926	const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
	7927	if (! (wastedflags & wflagbit) ) {
	7928	wastedflags \|= wflagbit;
	7929	vWARN5(
	7930	RExC_parse + 1,
	7931	"Useless (%s%c) - %suse /%c modifier",
	7932	flagsp == &negflags ? "?-" : "?",
	7933	*RExC_parse,
	7934	flagsp == &negflags ? "don't " : "",
	7935	*RExC_parse
	7936	);
	7937	}
	7938	}
	7939	break;
	7940
	7941	case CONTINUE_PAT_MOD: /* 'c' */
	7942	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	7943	if (! (wastedflags & WASTED_C) ) {
	7944	wastedflags \|= WASTED_GC;
	7945	vWARN3(
	7946	RExC_parse + 1,
	7947	"Useless (%sc) - %suse /gc modifier",
	7948	flagsp == &negflags ? "?-" : "?",
	7949	flagsp == &negflags ? "don't " : ""
	7950	);
	7951	}
	7952	}
	7953	break;
	7954	case KEEPCOPY_PAT_MOD: /* 'p' */
	7955	if (flagsp == &negflags) {
	7956	if (SIZE_ONLY)
	7957	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	7958	} else {
	7959	*flagsp \|= RXf_PMf_KEEPCOPY;
	7960	}
	7961	break;
	7962	case '-':
	7963	/* A flag is a default iff it is following a minus, so
	7964	* if there is a minus, it means will be trying to
	7965	* re-specify a default which is an error */
	7966	if (has_use_defaults \|\| flagsp == &negflags) {
	7967	fail_modifiers:
	7968	RExC_parse++;
	7969	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7970	/NOTREACHED/
	7971	}
	7972	flagsp = &negflags;
	7973	wastedflags = 0; /* reset so (?g-c) warns twice */
	7974	break;
	7975	case ':':
	7976	paren = ':';
	7977	/FALLTHROUGH/
	7978	case ')':
	7979	RExC_flags \|= posflags;
	7980	RExC_flags &= ~negflags;
	7981	set_regex_charset(&RExC_flags, cs);
	7982	if (paren != ':') {
	7983	oregflags \|= posflags;
	7984	oregflags &= ~negflags;
	7985	set_regex_charset(&oregflags, cs);
	7986	}
	7987	nextchar(pRExC_state);
	7988	if (paren != ':') {
	7989	*flagp = TRYAGAIN;
	7990	return NULL;
	7991	} else {
	7992	ret = NULL;
	7993	goto parse_rest;
	7994	}
	7995	/NOTREACHED/
	7996	default:
	7997	RExC_parse++;
	7998	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7999	/NOTREACHED/
	8000	}
	8001	++RExC_parse;
	8002	}
	8003	}} /* one for the default block, one for the switch */
	8004	}
	8005	else { /* (...) */
	8006	capturing_parens:
	8007	parno = RExC_npar;
	8008	RExC_npar++;
	8009
	8010	ret = reganode(pRExC_state, OPEN, parno);
	8011	if (!SIZE_ONLY ){
	8012	if (!RExC_nestroot)
	8013	RExC_nestroot = parno;
	8014	if (RExC_seen & REG_SEEN_RECURSE
	8015	&& !RExC_open_parens[parno-1])
	8016	{
	8017	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	8018	"Setting open paren #%"IVdf" to %d\n",
	8019	(IV)parno, REG_NODE_NUM(ret)));
	8020	RExC_open_parens[parno-1]= ret;
	8021	}
	8022	}
	8023	Set_Node_Length(ret, 1); /* MJD */
	8024	Set_Node_Offset(ret, RExC_parse); /* MJD */
	8025	is_open = 1;
	8026	}
	8027	}
	8028	else /* ! paren */
	8029	ret = NULL;
	8030
	8031	parse_rest:
	8032	/* Pick up the branches, linking them together. */
	8033	parse_start = RExC_parse; /* MJD */
	8034	br = regbranch(pRExC_state, &flags, 1,depth+1);
	8035
	8036	/* branch_len = (paren != 0); */
	8037
	8038	if (br == NULL)
	8039	return(NULL);
	8040	if (*RExC_parse == '\|') {
	8041	if (!SIZE_ONLY && RExC_extralen) {
	8042	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	8043	}
	8044	else { /* MJD */
	8045	reginsert(pRExC_state, BRANCH, br, depth+1);
	8046	Set_Node_Length(br, paren != 0);
	8047	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	8048	}
	8049	have_branch = 1;
	8050	if (SIZE_ONLY)
	8051	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	8052	}
	8053	else if (paren == ':') {
	8054	*flagp \|= flags&SIMPLE;
	8055	}
	8056	if (is_open) { /* Starts with OPEN. */
	8057	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	8058	}
	8059	else if (paren != '?') /* Not Conditional */
	8060	ret = br;
	8061	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	8062	lastbr = br;
	8063	while (*RExC_parse == '\|') {
	8064	if (!SIZE_ONLY && RExC_extralen) {
	8065	ender = reganode(pRExC_state, LONGJMP,0);
	8066	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
	8067	}
	8068	if (SIZE_ONLY)
	8069	RExC_extralen += 2; /* Account for LONGJMP. */
	8070	nextchar(pRExC_state);
	8071	if (freeze_paren) {
	8072	if (RExC_npar > after_freeze)
	8073	after_freeze = RExC_npar;
	8074	RExC_npar = freeze_paren;
	8075	}
	8076	br = regbranch(pRExC_state, &flags, 0, depth+1);
	8077
	8078	if (br == NULL)
	8079	return(NULL);
	8080	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	8081	lastbr = br;
	8082	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	8083	}
	8084
	8085	if (have_branch \|\| paren != ':') {
	8086	/* Make a closing node, and hook it on the end. */
	8087	switch (paren) {
	8088	case ':':
	8089	ender = reg_node(pRExC_state, TAIL);
	8090	break;
	8091	case 1:
	8092	ender = reganode(pRExC_state, CLOSE, parno);
	8093	if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
	8094	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	8095	"Setting close paren #%"IVdf" to %d\n",
	8096	(IV)parno, REG_NODE_NUM(ender)));
	8097	RExC_close_parens[parno-1]= ender;
	8098	if (RExC_nestroot == parno)
	8099	RExC_nestroot = 0;
	8100	}
	8101	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	8102	Set_Node_Length(ender,1); /* MJD */
	8103	break;
	8104	case '<':
	8105	case ',':
	8106	case '=':
	8107	case '!':
	8108	*flagp &= ~HASWIDTH;
	8109	/* FALL THROUGH */
	8110	case '>':
	8111	ender = reg_node(pRExC_state, SUCCEED);
	8112	break;
	8113	case 0:
	8114	ender = reg_node(pRExC_state, END);
	8115	if (!SIZE_ONLY) {
	8116	assert(!RExC_opend); /* there can only be one! */
	8117	RExC_opend = ender;
	8118	}
	8119	break;
	8120	}
	8121	REGTAIL(pRExC_state, lastbr, ender);
	8122
	8123	if (have_branch && !SIZE_ONLY) {
	8124	if (depth==1)
	8125	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	8126
	8127	/* Hook the tails of the branches to the closing node. */
	8128	for (br = ret; br; br = regnext(br)) {
	8129	const U8 op = PL_regkind[OP(br)];
	8130	if (op == BRANCH) {
	8131	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	8132	}
	8133	else if (op == BRANCHJ) {
	8134	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	8135	}
	8136	}
	8137	}
	8138	}
	8139
	8140	{
	8141	const char *p;
	8142	static const char parens[] = "=!<,>";
	8143
	8144	if (paren && (p = strchr(parens, paren))) {
	8145	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	8146	int flag = (p - parens) > 1;
	8147
	8148	if (paren == '>')
	8149	node = SUSPEND, flag = 0;
	8150	reginsert(pRExC_state, node,ret, depth+1);
	8151	Set_Node_Cur_Length(ret);
	8152	Set_Node_Offset(ret, parse_start + 1);
	8153	ret->flags = flag;
	8154	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	8155	}
	8156	}
	8157
	8158	/* Check for proper termination. */
	8159	if (paren) {
	8160	RExC_flags = oregflags;
	8161	if (RExC_parse >= RExC_end \|\| *nextchar(pRExC_state) != ')') {
	8162	RExC_parse = oregcomp_parse;
	8163	vFAIL("Unmatched (");
	8164	}
	8165	}
	8166	else if (!paren && RExC_parse < RExC_end) {
	8167	if (*RExC_parse == ')') {
	8168	RExC_parse++;
	8169	vFAIL("Unmatched )");
	8170	}
	8171	else
	8172	FAIL("Junk on end of regexp"); /* "Can't happen". */
	8173	/* NOTREACHED */
	8174	}
	8175
	8176	if (RExC_in_lookbehind) {
	8177	RExC_in_lookbehind--;
	8178	}
	8179	if (after_freeze > RExC_npar)
	8180	RExC_npar = after_freeze;
	8181	return(ret);
	8182	}
	8183
	8184	/*
	8185	- regbranch - one alternative of an \| operator
	8186	*
	8187	* Implements the concatenation operator.
	8188	*/
	8189	STATIC regnode *
	8190	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	8191	{
	8192	dVAR;
	8193	register regnode *ret;
	8194	register regnode *chain = NULL;
	8195	register regnode *latest;
	8196	I32 flags = 0, c = 0;
	8197	GET_RE_DEBUG_FLAGS_DECL;
	8198
	8199	PERL_ARGS_ASSERT_REGBRANCH;
	8200
	8201	DEBUG_PARSE("brnc");
	8202
	8203	if (first)
	8204	ret = NULL;
	8205	else {
	8206	if (!SIZE_ONLY && RExC_extralen)
	8207	ret = reganode(pRExC_state, BRANCHJ,0);
	8208	else {
	8209	ret = reg_node(pRExC_state, BRANCH);
	8210	Set_Node_Length(ret, 1);
	8211	}
	8212	}
	8213
	8214	if (!first && SIZE_ONLY)
	8215	RExC_extralen += 1; /* BRANCHJ */
	8216
	8217	flagp = WORST; / Tentatively. */
	8218
	8219	RExC_parse--;
	8220	nextchar(pRExC_state);
	8221	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	8222	flags &= ~TRYAGAIN;
	8223	latest = regpiece(pRExC_state, &flags,depth+1);
	8224	if (latest == NULL) {
	8225	if (flags & TRYAGAIN)
	8226	continue;
	8227	return(NULL);
	8228	}
	8229	else if (ret == NULL)
	8230	ret = latest;
	8231	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	8232	if (chain == NULL) /* First piece. */
	8233	*flagp \|= flags&SPSTART;
	8234	else {
	8235	RExC_naughty++;
	8236	REGTAIL(pRExC_state, chain, latest);
	8237	}
	8238	chain = latest;
	8239	c++;
	8240	}
	8241	if (chain == NULL) { /* Loop ran zero times. */
	8242	chain = reg_node(pRExC_state, NOTHING);
	8243	if (ret == NULL)
	8244	ret = chain;
	8245	}
	8246	if (c == 1) {
	8247	*flagp \|= flags&SIMPLE;
	8248	}
	8249
	8250	return ret;
	8251	}
	8252
	8253	/*
	8254	- regpiece - something followed by possible [*+?]
	8255	*
	8256	* Note that the branching code sequences used for ? and the general cases
	8257	* of * and + are somewhat optimized: they use the same NOTHING node as
	8258	* both the endmarker for their branch list and the body of the last branch.
	8259	* It might seem that this node could be dispensed with entirely, but the
	8260	* endmarker role is not redundant.
	8261	*/
	8262	STATIC regnode *
	8263	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	8264	{
	8265	dVAR;
	8266	register regnode *ret;
	8267	register char op;
	8268	register char *next;
	8269	I32 flags;
	8270	const char * const origparse = RExC_parse;
	8271	I32 min;
	8272	I32 max = REG_INFTY;
	8273	#ifdef RE_TRACK_PATTERN_OFFSETS
	8274	char *parse_start;
	8275	#endif
	8276	const char *maxpos = NULL;
	8277	GET_RE_DEBUG_FLAGS_DECL;
	8278
	8279	PERL_ARGS_ASSERT_REGPIECE;
	8280
	8281	DEBUG_PARSE("piec");
	8282
	8283	ret = regatom(pRExC_state, &flags,depth+1);
	8284	if (ret == NULL) {
	8285	if (flags & TRYAGAIN)
	8286	*flagp \|= TRYAGAIN;
	8287	return(NULL);
	8288	}
	8289
	8290	op = *RExC_parse;
	8291
	8292	if (op == '{' && regcurly(RExC_parse)) {
	8293	maxpos = NULL;
	8294	#ifdef RE_TRACK_PATTERN_OFFSETS
	8295	parse_start = RExC_parse; /* MJD */
	8296	#endif
	8297	next = RExC_parse + 1;
	8298	while (isDIGIT(next) \|\| next == ',') {
	8299	if (*next == ',') {
	8300	if (maxpos)
	8301	break;
	8302	else
	8303	maxpos = next;
	8304	}
	8305	next++;
	8306	}
	8307	if (next == '}') { / got one */
	8308	if (!maxpos)
	8309	maxpos = next;
	8310	RExC_parse++;
	8311	min = atoi(RExC_parse);
	8312	if (*maxpos == ',')
	8313	maxpos++;
	8314	else
	8315	maxpos = RExC_parse;
	8316	max = atoi(maxpos);
	8317	if (!max && *maxpos != '0')
	8318	max = REG_INFTY; /* meaning "infinity" */
	8319	else if (max >= REG_INFTY)
	8320	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	8321	RExC_parse = next;
	8322	nextchar(pRExC_state);
	8323
	8324	do_curly:
	8325	if ((flags&SIMPLE)) {
	8326	RExC_naughty += 2 + RExC_naughty / 2;
	8327	reginsert(pRExC_state, CURLY, ret, depth+1);
	8328	Set_Node_Offset(ret, parse_start+1); /* MJD */
	8329	Set_Node_Cur_Length(ret);
	8330	}
	8331	else {
	8332	regnode * const w = reg_node(pRExC_state, WHILEM);
	8333
	8334	w->flags = 0;
	8335	REGTAIL(pRExC_state, ret, w);
	8336	if (!SIZE_ONLY && RExC_extralen) {
	8337	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	8338	reginsert(pRExC_state, NOTHING,ret, depth+1);
	8339	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	8340	}
	8341	reginsert(pRExC_state, CURLYX,ret, depth+1);
	8342	/* MJD hk */
	8343	Set_Node_Offset(ret, parse_start+1);
	8344	Set_Node_Length(ret,
	8345	op == '{' ? (RExC_parse - parse_start) : 1);
	8346
	8347	if (!SIZE_ONLY && RExC_extralen)
	8348	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	8349	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	8350	if (SIZE_ONLY)
	8351	RExC_whilem_seen++, RExC_extralen += 3;
	8352	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	8353	}
	8354	ret->flags = 0;
	8355
	8356	if (min > 0)
	8357	*flagp = WORST;
	8358	if (max > 0)
	8359	*flagp \|= HASWIDTH;
	8360	if (max < min)
	8361	vFAIL("Can't do {n,m} with n > m");
	8362	if (!SIZE_ONLY) {
	8363	ARG1_SET(ret, (U16)min);
	8364	ARG2_SET(ret, (U16)max);
	8365	}
	8366
	8367	goto nest_check;
	8368	}
	8369	}
	8370
	8371	if (!ISMULT1(op)) {
	8372	*flagp = flags;
	8373	return(ret);
	8374	}
	8375
	8376	#if 0 /* Now runtime fix should be reliable. */
	8377
	8378	/* if this is reinstated, don't forget to put this back into perldiag:
	8379
	8380	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	8381
	8382	(F) The part of the regexp subject to either the * or + quantifier
	8383	could match an empty string. The {#} shows in the regular
	8384	expression about where the problem was discovered.
	8385
	8386	*/
	8387
	8388	if (!(flags&HASWIDTH) && op != '?')
	8389	vFAIL("Regexp *+ operand could be empty");
	8390	#endif
	8391
	8392	#ifdef RE_TRACK_PATTERN_OFFSETS
	8393	parse_start = RExC_parse;
	8394	#endif
	8395	nextchar(pRExC_state);
	8396
	8397	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	8398
	8399	if (op == '*' && (flags&SIMPLE)) {
	8400	reginsert(pRExC_state, STAR, ret, depth+1);
	8401	ret->flags = 0;
	8402	RExC_naughty += 4;
	8403	}
	8404	else if (op == '*') {
	8405	min = 0;
	8406	goto do_curly;
	8407	}
	8408	else if (op == '+' && (flags&SIMPLE)) {
	8409	reginsert(pRExC_state, PLUS, ret, depth+1);
	8410	ret->flags = 0;
	8411	RExC_naughty += 3;
	8412	}
	8413	else if (op == '+') {
	8414	min = 1;
	8415	goto do_curly;
	8416	}
	8417	else if (op == '?') {
	8418	min = 0; max = 1;
	8419	goto do_curly;
	8420	}
	8421	nest_check:
	8422	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	8423	ckWARN3reg(RExC_parse,
	8424	"%.*s matches null string many times",
	8425	(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
	8426	origparse);
	8427	}
	8428
	8429	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	8430	nextchar(pRExC_state);
	8431	reginsert(pRExC_state, MINMOD, ret, depth+1);
	8432	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	8433	}
	8434	#ifndef REG_ALLOW_MINMOD_SUSPEND
	8435	else
	8436	#endif
	8437	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	8438	regnode *ender;
	8439	nextchar(pRExC_state);
	8440	ender = reg_node(pRExC_state, SUCCEED);
	8441	REGTAIL(pRExC_state, ret, ender);
	8442	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	8443	ret->flags = 0;
	8444	ender = reg_node(pRExC_state, TAIL);
	8445	REGTAIL(pRExC_state, ret, ender);
	8446	/ret= ender;/
	8447	}
	8448
	8449	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	8450	RExC_parse++;
	8451	vFAIL("Nested quantifiers");
	8452	}
	8453
	8454	return(ret);
	8455	}
	8456
	8457
	8458	/* reg_namedseq(pRExC_state,UVp, UV depth)
	8459
	8460	This is expected to be called by a parser routine that has
	8461	recognized '\N' and needs to handle the rest. RExC_parse is
	8462	expected to point at the first char following the N at the time
	8463	of the call.
	8464
	8465	The \N may be inside (indicated by valuep not being NULL) or outside a
	8466	character class.
	8467
	8468	\N may begin either a named sequence, or if outside a character class, mean
	8469	to match a non-newline. For non single-quoted regexes, the tokenizer has
	8470	attempted to decide which, and in the case of a named sequence converted it
	8471	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	8472	where c1... are the characters in the sequence. For single-quoted regexes,
	8473	the tokenizer passes the \N sequence through unchanged; this code will not
	8474	attempt to determine this nor expand those. The net effect is that if the
	8475	beginning of the passed-in pattern isn't '{U+' or there is no '}', it
	8476	signals that this \N occurrence means to match a non-newline.
	8477
	8478	Only the \N{U+...} form should occur in a character class, for the same
	8479	reason that '.' inside a character class means to just match a period: it
	8480	just doesn't make sense.
	8481
	8482	If valuep is non-null then it is assumed that we are parsing inside
	8483	of a charclass definition and the first codepoint in the resolved
	8484	string is returned via *valuep and the routine will return NULL.
	8485	In this mode if a multichar string is returned from the charnames
	8486	handler, a warning will be issued, and only the first char in the
	8487	sequence will be examined. If the string returned is zero length
	8488	then the value of *valuep is undefined and NON-NULL will
	8489	be returned to indicate failure. (This will NOT be a valid pointer
	8490	to a regnode.)
	8491
	8492	If valuep is null then it is assumed that we are parsing normal text and a
	8493	new EXACT node is inserted into the program containing the resolved string,
	8494	and a pointer to the new node is returned. But if the string is zero length
	8495	a NOTHING node is emitted instead.
	8496
	8497	On success RExC_parse is set to the char following the endbrace.
	8498	Parsing failures will generate a fatal error via vFAIL(...)
	8499	*/
	8500	STATIC regnode *
	8501	S_reg_namedseq(pTHX_ RExC_state_t pRExC_state, UV valuep, I32 *flagp, U32 depth)
	8502	{
	8503	char * endbrace; /* '}' following the name */
	8504	regnode *ret = NULL;
	8505	char* p;
	8506
	8507	GET_RE_DEBUG_FLAGS_DECL;
	8508
	8509	PERL_ARGS_ASSERT_REG_NAMEDSEQ;
	8510
	8511	GET_RE_DEBUG_FLAGS;
	8512
	8513	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	8514	* modifier. The other meaning does not */
	8515	p = (RExC_flags & RXf_PMf_EXTENDED)
	8516	? regwhite( pRExC_state, RExC_parse )
	8517	: RExC_parse;
	8518
	8519	/* Disambiguate between \N meaning a named character versus \N meaning
	8520	* [^\n]. The former is assumed when it can't be the latter. */
	8521	if (*p != '{' \|\| regcurly(p)) {
	8522	RExC_parse = p;
	8523	if (valuep) {
	8524	/* no bare \N in a charclass */
	8525	vFAIL("\\N in a character class must be a named character: \\N{...}");
	8526	}
	8527	nextchar(pRExC_state);
	8528	ret = reg_node(pRExC_state, REG_ANY);
	8529	*flagp \|= HASWIDTH\|SIMPLE;
	8530	RExC_naughty++;
	8531	RExC_parse--;
	8532	Set_Node_Length(ret, 1); /* MJD */
	8533	return ret;
	8534	}
	8535
	8536	/* Here, we have decided it should be a named sequence */
	8537
	8538	/* The test above made sure that the next real character is a '{', but
	8539	* under the /x modifier, it could be separated by space (or a comment and
	8540	* \n) and this is not allowed (for consistency with \x{...} and the
	8541	* tokenizer handling of \N{NAME}). */
	8542	if (*RExC_parse != '{') {
	8543	vFAIL("Missing braces on \\N{}");
	8544	}
	8545
	8546	RExC_parse++; /* Skip past the '{' */
	8547
	8548	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	8549	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	8550	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
	8551	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
	8552	{
	8553	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	8554	vFAIL("\\N{NAME} must be resolved by the lexer");
	8555	}
	8556
	8557	if (endbrace == RExC_parse) { /* empty: \N{} */
	8558	if (! valuep) {
	8559	RExC_parse = endbrace + 1;
	8560	return reg_node(pRExC_state,NOTHING);
	8561	}
	8562
	8563	if (SIZE_ONLY) {
	8564	ckWARNreg(RExC_parse,
	8565	"Ignoring zero length \\N{} in character class"
	8566	);
	8567	RExC_parse = endbrace + 1;
	8568	}
	8569	*valuep = 0;
	8570	return (regnode ) &RExC_parse; / Invalid regnode pointer */
	8571	}
	8572
	8573	REQUIRE_UTF8; /* named sequences imply Unicode semantics */
	8574	RExC_parse += 2; /* Skip past the 'U+' */
	8575
	8576	if (valuep) { /* In a bracketed char class */
	8577	/* We only pay attention to the first char of
	8578	multichar strings being returned. I kinda wonder
	8579	if this makes sense as it does change the behaviour
	8580	from earlier versions, OTOH that behaviour was broken
	8581	as well. XXX Solution is to recharacterize as
	8582	[rest-of-class]\|multi1\|multi2... */
	8583
	8584	STRLEN length_of_hex;
	8585	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	8586	\| PERL_SCAN_DISALLOW_PREFIX
	8587	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	8588
	8589	char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
	8590	if (endchar < endbrace) {
	8591	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	8592	}
	8593
	8594	length_of_hex = (STRLEN)(endchar - RExC_parse);
	8595	*valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
	8596
	8597	/* The tokenizer should have guaranteed validity, but it's possible to
	8598	* bypass it by using single quoting, so check */
	8599	if (length_of_hex == 0
	8600	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	8601	{
	8602	RExC_parse += length_of_hex; /* Includes all the valid */
	8603	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	8604	? UTF8SKIP(RExC_parse)
	8605	: 1;
	8606	/* Guard against malformed utf8 */
	8607	if (RExC_parse >= endchar) RExC_parse = endchar;
	8608	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	8609	}
	8610
	8611	RExC_parse = endbrace + 1;
	8612	if (endchar == endbrace) return NULL;
	8613
	8614	ret = (regnode ) &RExC_parse; / Invalid regnode pointer */
	8615	}
	8616	else { /* Not a char class */
	8617
	8618	/* What is done here is to convert this to a sub-pattern of the form
	8619	* (?:\x{char1}\x{char2}...)
	8620	* and then call reg recursively. That way, it retains its atomicness,
	8621	* while not having to worry about special handling that some code
	8622	* points may have. toke.c has converted the original Unicode values
	8623	* to native, so that we can just pass on the hex values unchanged. We
	8624	* do have to set a flag to keep recoding from happening in the
	8625	* recursion */
	8626
	8627	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8\|SVs_TEMP);
	8628	STRLEN len;
	8629	char endchar; / Points to '.' or '}' ending cur char in the input
	8630	stream */
	8631	char *orig_end = RExC_end;
	8632
	8633	while (RExC_parse < endbrace) {
	8634
	8635	/* Code points are separated by dots. If none, there is only one
	8636	* code point, and is terminated by the brace */
	8637	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	8638
	8639	/* Convert to notation the rest of the code understands */
	8640	sv_catpv(substitute_parse, "\\x{");
	8641	sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
	8642	sv_catpv(substitute_parse, "}");
	8643
	8644	/* Point to the beginning of the next character in the sequence. */
	8645	RExC_parse = endchar + 1;
	8646	}
	8647	sv_catpv(substitute_parse, ")");
	8648
	8649	RExC_parse = SvPV(substitute_parse, len);
	8650
	8651	/* Don't allow empty number */
	8652	if (len < 8) {
	8653	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	8654	}
	8655	RExC_end = RExC_parse + len;
	8656
	8657	/* The values are Unicode, and therefore not subject to recoding */
	8658	RExC_override_recoding = 1;
	8659
	8660	ret = reg(pRExC_state, 1, flagp, depth+1);
	8661
	8662	RExC_parse = endbrace;
	8663	RExC_end = orig_end;
	8664	RExC_override_recoding = 0;
	8665
	8666	nextchar(pRExC_state);
	8667	}
	8668
	8669	return ret;
	8670	}
	8671
	8672
	8673	/*
	8674	* reg_recode
	8675	*
	8676	* It returns the code point in utf8 for the value in *encp.
	8677	* value: a code value in the source encoding
	8678	* encp: a pointer to an Encode object
	8679	*
	8680	* If the result from Encode is not a single character,
	8681	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	8682	*/
	8683	STATIC UV
	8684	S_reg_recode(pTHX_ const char value, SV **encp)
	8685	{
	8686	STRLEN numlen = 1;
	8687	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	8688	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	8689	const STRLEN newlen = SvCUR(sv);
	8690	UV uv = UNICODE_REPLACEMENT;
	8691
	8692	PERL_ARGS_ASSERT_REG_RECODE;
	8693
	8694	if (newlen)
	8695	uv = SvUTF8(sv)
	8696	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	8697	: (U8)s;
	8698
	8699	if (!newlen \|\| numlen != newlen) {
	8700	uv = UNICODE_REPLACEMENT;
	8701	*encp = NULL;
	8702	}
	8703	return uv;
	8704	}
	8705
	8706
	8707	/*
	8708	- regatom - the lowest level
	8709
	8710	Try to identify anything special at the start of the pattern. If there
	8711	is, then handle it as required. This may involve generating a single regop,
	8712	such as for an assertion; or it may involve recursing, such as to
	8713	handle a () structure.
	8714
	8715	If the string doesn't start with something special then we gobble up
	8716	as much literal text as we can.
	8717
	8718	Once we have been able to handle whatever type of thing started the
	8719	sequence, we return.
	8720
	8721	Note: we have to be careful with escapes, as they can be both literal
	8722	and in the case of \10 and friends can be either, depending
	8723	on context. Specifically there are two separate switches for handling
	8724	escape sequences, with the one for handling literal escapes requiring
	8725	a dummy entry for all of the special escapes that are actually handled
	8726	by the other.
	8727	*/
	8728
	8729	STATIC regnode *
	8730	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	8731	{
	8732	dVAR;
	8733	register regnode *ret = NULL;
	8734	I32 flags;
	8735	char *parse_start = RExC_parse;
	8736	U8 op;
	8737	GET_RE_DEBUG_FLAGS_DECL;
	8738	DEBUG_PARSE("atom");
	8739	flagp = WORST; / Tentatively. */
	8740
	8741	PERL_ARGS_ASSERT_REGATOM;
	8742
	8743	tryagain:
	8744	switch ((U8)*RExC_parse) {
	8745	case '^':
	8746	RExC_seen_zerolen++;
	8747	nextchar(pRExC_state);
	8748	if (RExC_flags & RXf_PMf_MULTILINE)
	8749	ret = reg_node(pRExC_state, MBOL);
	8750	else if (RExC_flags & RXf_PMf_SINGLELINE)
	8751	ret = reg_node(pRExC_state, SBOL);
	8752	else
	8753	ret = reg_node(pRExC_state, BOL);
	8754	Set_Node_Length(ret, 1); /* MJD */
	8755	break;
	8756	case '$':
	8757	nextchar(pRExC_state);
	8758	if (*RExC_parse)
	8759	RExC_seen_zerolen++;
	8760	if (RExC_flags & RXf_PMf_MULTILINE)
	8761	ret = reg_node(pRExC_state, MEOL);
	8762	else if (RExC_flags & RXf_PMf_SINGLELINE)
	8763	ret = reg_node(pRExC_state, SEOL);
	8764	else
	8765	ret = reg_node(pRExC_state, EOL);
	8766	Set_Node_Length(ret, 1); /* MJD */
	8767	break;
	8768	case '.':
	8769	nextchar(pRExC_state);
	8770	if (RExC_flags & RXf_PMf_SINGLELINE)
	8771	ret = reg_node(pRExC_state, SANY);
	8772	else
	8773	ret = reg_node(pRExC_state, REG_ANY);
	8774	*flagp \|= HASWIDTH\|SIMPLE;
	8775	RExC_naughty++;
	8776	Set_Node_Length(ret, 1); /* MJD */
	8777	break;
	8778	case '[':
	8779	{
	8780	char * const oregcomp_parse = ++RExC_parse;
	8781	ret = regclass(pRExC_state,depth+1);
	8782	if (*RExC_parse != ']') {
	8783	RExC_parse = oregcomp_parse;
	8784	vFAIL("Unmatched [");
	8785	}
	8786	nextchar(pRExC_state);
	8787	*flagp \|= HASWIDTH\|SIMPLE;
	8788	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	8789	break;
	8790	}
	8791	case '(':
	8792	nextchar(pRExC_state);
	8793	ret = reg(pRExC_state, 1, &flags,depth+1);
	8794	if (ret == NULL) {
	8795	if (flags & TRYAGAIN) {
	8796	if (RExC_parse == RExC_end) {
	8797	/* Make parent create an empty node if needed. */
	8798	*flagp \|= TRYAGAIN;
	8799	return(NULL);
	8800	}
	8801	goto tryagain;
	8802	}
	8803	return(NULL);
	8804	}
	8805	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	8806	break;
	8807	case '\|':
	8808	case ')':
	8809	if (flags & TRYAGAIN) {
	8810	*flagp \|= TRYAGAIN;
	8811	return NULL;
	8812	}
	8813	vFAIL("Internal urp");
	8814	/* Supposed to be caught earlier. */
	8815	break;
	8816	case '{':
	8817	if (!regcurly(RExC_parse)) {
	8818	RExC_parse++;
	8819	goto defchar;
	8820	}
	8821	/* FALL THROUGH */
	8822	case '?':
	8823	case '+':
	8824	case '*':
	8825	RExC_parse++;
	8826	vFAIL("Quantifier follows nothing");
	8827	break;
	8828	case '\\':
	8829	/* Special Escapes
	8830
	8831	This switch handles escape sequences that resolve to some kind
	8832	of special regop and not to literal text. Escape sequnces that
	8833	resolve to literal text are handled below in the switch marked
	8834	"Literal Escapes".
	8835
	8836	Every entry in this switch must have a corresponding entry
	8837	in the literal escape switch. However, the opposite is not
	8838	required, as the default for this switch is to jump to the
	8839	literal text handling code.
	8840	*/
	8841	switch ((U8)*++RExC_parse) {
	8842	/* Special Escapes */
	8843	case 'A':
	8844	RExC_seen_zerolen++;
	8845	ret = reg_node(pRExC_state, SBOL);
	8846	*flagp \|= SIMPLE;
	8847	goto finish_meta_pat;
	8848	case 'G':
	8849	ret = reg_node(pRExC_state, GPOS);
	8850	RExC_seen \|= REG_SEEN_GPOS;
	8851	*flagp \|= SIMPLE;
	8852	goto finish_meta_pat;
	8853	case 'K':
	8854	RExC_seen_zerolen++;
	8855	ret = reg_node(pRExC_state, KEEPS);
	8856	*flagp \|= SIMPLE;
	8857	/* XXX:dmq : disabling in-place substitution seems to
	8858	* be necessary here to avoid cases of memory corruption, as
	8859	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	8860	*/
	8861	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8862	goto finish_meta_pat;
	8863	case 'Z':
	8864	ret = reg_node(pRExC_state, SEOL);
	8865	*flagp \|= SIMPLE;
	8866	RExC_seen_zerolen++; /* Do not optimize RE away */
	8867	goto finish_meta_pat;
	8868	case 'z':
	8869	ret = reg_node(pRExC_state, EOS);
	8870	*flagp \|= SIMPLE;
	8871	RExC_seen_zerolen++; /* Do not optimize RE away */
	8872	goto finish_meta_pat;
	8873	case 'C':
	8874	ret = reg_node(pRExC_state, CANY);
	8875	RExC_seen \|= REG_SEEN_CANY;
	8876	*flagp \|= HASWIDTH\|SIMPLE;
	8877	goto finish_meta_pat;
	8878	case 'X':
	8879	ret = reg_node(pRExC_state, CLUMP);
	8880	*flagp \|= HASWIDTH;
	8881	goto finish_meta_pat;
	8882	case 'w':
	8883	switch (get_regex_charset(RExC_flags)) {
	8884	case REGEX_LOCALE_CHARSET:
	8885	op = ALNUML;
	8886	break;
	8887	case REGEX_UNICODE_CHARSET:
	8888	op = ALNUMU;
	8889	break;
	8890	case REGEX_ASCII_RESTRICTED_CHARSET:
	8891	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8892	op = ALNUMA;
	8893	break;
	8894	case REGEX_DEPENDS_CHARSET:
	8895	op = ALNUM;
	8896	break;
	8897	default:
	8898	goto bad_charset;
	8899	}
	8900	ret = reg_node(pRExC_state, op);
	8901	*flagp \|= HASWIDTH\|SIMPLE;
	8902	goto finish_meta_pat;
	8903	case 'W':
	8904	switch (get_regex_charset(RExC_flags)) {
	8905	case REGEX_LOCALE_CHARSET:
	8906	op = NALNUML;
	8907	break;
	8908	case REGEX_UNICODE_CHARSET:
	8909	op = NALNUMU;
	8910	break;
	8911	case REGEX_ASCII_RESTRICTED_CHARSET:
	8912	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8913	op = NALNUMA;
	8914	break;
	8915	case REGEX_DEPENDS_CHARSET:
	8916	op = NALNUM;
	8917	break;
	8918	default:
	8919	goto bad_charset;
	8920	}
	8921	ret = reg_node(pRExC_state, op);
	8922	*flagp \|= HASWIDTH\|SIMPLE;
	8923	goto finish_meta_pat;
	8924	case 'b':
	8925	RExC_seen_zerolen++;
	8926	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8927	switch (get_regex_charset(RExC_flags)) {
	8928	case REGEX_LOCALE_CHARSET:
	8929	op = BOUNDL;
	8930	break;
	8931	case REGEX_UNICODE_CHARSET:
	8932	op = BOUNDU;
	8933	break;
	8934	case REGEX_ASCII_RESTRICTED_CHARSET:
	8935	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8936	op = BOUNDA;
	8937	break;
	8938	case REGEX_DEPENDS_CHARSET:
	8939	op = BOUND;
	8940	break;
	8941	default:
	8942	goto bad_charset;
	8943	}
	8944	ret = reg_node(pRExC_state, op);
	8945	FLAGS(ret) = get_regex_charset(RExC_flags);
	8946	*flagp \|= SIMPLE;
	8947	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	8948	ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
	8949	}
	8950	goto finish_meta_pat;
	8951	case 'B':
	8952	RExC_seen_zerolen++;
	8953	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8954	switch (get_regex_charset(RExC_flags)) {
	8955	case REGEX_LOCALE_CHARSET:
	8956	op = NBOUNDL;
	8957	break;
	8958	case REGEX_UNICODE_CHARSET:
	8959	op = NBOUNDU;
	8960	break;
	8961	case REGEX_ASCII_RESTRICTED_CHARSET:
	8962	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8963	op = NBOUNDA;
	8964	break;
	8965	case REGEX_DEPENDS_CHARSET:
	8966	op = NBOUND;
	8967	break;
	8968	default:
	8969	goto bad_charset;
	8970	}
	8971	ret = reg_node(pRExC_state, op);
	8972	FLAGS(ret) = get_regex_charset(RExC_flags);
	8973	*flagp \|= SIMPLE;
	8974	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	8975	ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
	8976	}
	8977	goto finish_meta_pat;
	8978	case 's':
	8979	switch (get_regex_charset(RExC_flags)) {
	8980	case REGEX_LOCALE_CHARSET:
	8981	op = SPACEL;
	8982	break;
	8983	case REGEX_UNICODE_CHARSET:
	8984	op = SPACEU;
	8985	break;
	8986	case REGEX_ASCII_RESTRICTED_CHARSET:
	8987	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8988	op = SPACEA;
	8989	break;
	8990	case REGEX_DEPENDS_CHARSET:
	8991	op = SPACE;
	8992	break;
	8993	default:
	8994	goto bad_charset;
	8995	}
	8996	ret = reg_node(pRExC_state, op);
	8997	*flagp \|= HASWIDTH\|SIMPLE;
	8998	goto finish_meta_pat;
	8999	case 'S':
	9000	switch (get_regex_charset(RExC_flags)) {
	9001	case REGEX_LOCALE_CHARSET:
	9002	op = NSPACEL;
	9003	break;
	9004	case REGEX_UNICODE_CHARSET:
	9005	op = NSPACEU;
	9006	break;
	9007	case REGEX_ASCII_RESTRICTED_CHARSET:
	9008	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9009	op = NSPACEA;
	9010	break;
	9011	case REGEX_DEPENDS_CHARSET:
	9012	op = NSPACE;
	9013	break;
	9014	default:
	9015	goto bad_charset;
	9016	}
	9017	ret = reg_node(pRExC_state, op);
	9018	*flagp \|= HASWIDTH\|SIMPLE;
	9019	goto finish_meta_pat;
	9020	case 'd':
	9021	switch (get_regex_charset(RExC_flags)) {
	9022	case REGEX_LOCALE_CHARSET:
	9023	op = DIGITL;
	9024	break;
	9025	case REGEX_ASCII_RESTRICTED_CHARSET:
	9026	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9027	op = DIGITA;
	9028	break;
	9029	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	9030	case REGEX_UNICODE_CHARSET:
	9031	op = DIGIT;
	9032	break;
	9033	default:
	9034	goto bad_charset;
	9035	}
	9036	ret = reg_node(pRExC_state, op);
	9037	*flagp \|= HASWIDTH\|SIMPLE;
	9038	goto finish_meta_pat;
	9039	case 'D':
	9040	switch (get_regex_charset(RExC_flags)) {
	9041	case REGEX_LOCALE_CHARSET:
	9042	op = NDIGITL;
	9043	break;
	9044	case REGEX_ASCII_RESTRICTED_CHARSET:
	9045	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	9046	op = NDIGITA;
	9047	break;
	9048	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	9049	case REGEX_UNICODE_CHARSET:
	9050	op = NDIGIT;
	9051	break;
	9052	default:
	9053	goto bad_charset;
	9054	}
	9055	ret = reg_node(pRExC_state, op);
	9056	*flagp \|= HASWIDTH\|SIMPLE;
	9057	goto finish_meta_pat;
	9058	case 'R':
	9059	ret = reg_node(pRExC_state, LNBREAK);
	9060	*flagp \|= HASWIDTH\|SIMPLE;
	9061	goto finish_meta_pat;
	9062	case 'h':
	9063	ret = reg_node(pRExC_state, HORIZWS);
	9064	*flagp \|= HASWIDTH\|SIMPLE;
	9065	goto finish_meta_pat;
	9066	case 'H':
	9067	ret = reg_node(pRExC_state, NHORIZWS);
	9068	*flagp \|= HASWIDTH\|SIMPLE;
	9069	goto finish_meta_pat;
	9070	case 'v':
	9071	ret = reg_node(pRExC_state, VERTWS);
	9072	*flagp \|= HASWIDTH\|SIMPLE;
	9073	goto finish_meta_pat;
	9074	case 'V':
	9075	ret = reg_node(pRExC_state, NVERTWS);
	9076	*flagp \|= HASWIDTH\|SIMPLE;
	9077	finish_meta_pat:
	9078	nextchar(pRExC_state);
	9079	Set_Node_Length(ret, 2); /* MJD */
	9080	break;
	9081	case 'p':
	9082	case 'P':
	9083	{
	9084	char* const oldregxend = RExC_end;
	9085	#ifdef DEBUGGING
	9086	char* parse_start = RExC_parse - 2;
	9087	#endif
	9088
	9089	if (RExC_parse[1] == '{') {
	9090	/* a lovely hack--pretend we saw [\pX] instead */
	9091	RExC_end = strchr(RExC_parse, '}');
	9092	if (!RExC_end) {
	9093	const U8 c = (U8)*RExC_parse;
	9094	RExC_parse += 2;
	9095	RExC_end = oldregxend;
	9096	vFAIL2("Missing right brace on \\%c{}", c);
	9097	}
	9098	RExC_end++;
	9099	}
	9100	else {
	9101	RExC_end = RExC_parse + 2;
	9102	if (RExC_end > oldregxend)
	9103	RExC_end = oldregxend;
	9104	}
	9105	RExC_parse--;
	9106
	9107	ret = regclass(pRExC_state,depth+1);
	9108
	9109	RExC_end = oldregxend;
	9110	RExC_parse--;
	9111
	9112	Set_Node_Offset(ret, parse_start + 2);
	9113	Set_Node_Cur_Length(ret);
	9114	nextchar(pRExC_state);
	9115	*flagp \|= HASWIDTH\|SIMPLE;
	9116	}
	9117	break;
	9118	case 'N':
	9119	/* Handle \N and \N{NAME} here and not below because it can be
	9120	multicharacter. join_exact() will join them up later on.
	9121	Also this makes sure that things like /\N{BLAH}+/ and
	9122	\N{BLAH} being multi char Just Happen. dmq*/
	9123	++RExC_parse;
	9124	ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
	9125	break;
	9126	case 'k': /* Handle \k<NAME> and \k'NAME' */
	9127	parse_named_seq:
	9128	{
	9129	char ch= RExC_parse[1];
	9130	if (ch != '<' && ch != '\'' && ch != '{') {
	9131	RExC_parse++;
	9132	vFAIL2("Sequence %.2s... not terminated",parse_start);
	9133	} else {
	9134	/* this pretty much dupes the code for (?P=...) in reg(), if
	9135	you change this make sure you change that */
	9136	char* name_start = (RExC_parse += 2);
	9137	U32 num = 0;
	9138	SV *sv_dat = reg_scan_name(pRExC_state,
	9139	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	9140	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	9141	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	9142	vFAIL2("Sequence %.3s... not terminated",parse_start);
	9143
	9144	if (!SIZE_ONLY) {
	9145	num = add_data( pRExC_state, 1, "S" );
	9146	RExC_rxi->data->data[num]=(void*)sv_dat;
	9147	SvREFCNT_inc_simple_void(sv_dat);
	9148	}
	9149
	9150	RExC_sawback = 1;
	9151	ret = reganode(pRExC_state,
	9152	((! FOLD)
	9153	? NREF
	9154	: (MORE_ASCII_RESTRICTED)
	9155	? NREFFA
	9156	: (AT_LEAST_UNI_SEMANTICS)
	9157	? NREFFU
	9158	: (LOC)
	9159	? NREFFL
	9160	: NREFF),
	9161	num);
	9162	*flagp \|= HASWIDTH;
	9163
	9164	/* override incorrect value set in reganode MJD */
	9165	Set_Node_Offset(ret, parse_start+1);
	9166	Set_Node_Cur_Length(ret); /* MJD */
	9167	nextchar(pRExC_state);
	9168
	9169	}
	9170	break;
	9171	}
	9172	case 'g':
	9173	case '1': case '2': case '3': case '4':
	9174	case '5': case '6': case '7': case '8': case '9':
	9175	{
	9176	I32 num;
	9177	bool isg = *RExC_parse == 'g';
	9178	bool isrel = 0;
	9179	bool hasbrace = 0;
	9180	if (isg) {
	9181	RExC_parse++;
	9182	if (*RExC_parse == '{') {
	9183	RExC_parse++;
	9184	hasbrace = 1;
	9185	}
	9186	if (*RExC_parse == '-') {
	9187	RExC_parse++;
	9188	isrel = 1;
	9189	}
	9190	if (hasbrace && !isDIGIT(*RExC_parse)) {
	9191	if (isrel) RExC_parse--;
	9192	RExC_parse -= 2;
	9193	goto parse_named_seq;
	9194	} }
	9195	num = atoi(RExC_parse);
	9196	if (isg && num == 0)
	9197	vFAIL("Reference to invalid group 0");
	9198	if (isrel) {
	9199	num = RExC_npar - num;
	9200	if (num < 1)
	9201	vFAIL("Reference to nonexistent or unclosed group");
	9202	}
	9203	if (!isg && num > 9 && num >= RExC_npar)
	9204	goto defchar;
	9205	else {
	9206	char * const parse_start = RExC_parse - 1; /* MJD */
	9207	while (isDIGIT(*RExC_parse))
	9208	RExC_parse++;
	9209	if (parse_start == RExC_parse - 1)
	9210	vFAIL("Unterminated \\g... pattern");
	9211	if (hasbrace) {
	9212	if (*RExC_parse != '}')
	9213	vFAIL("Unterminated \\g{...} pattern");
	9214	RExC_parse++;
	9215	}
	9216	if (!SIZE_ONLY) {
	9217	if (num > (I32)RExC_rx->nparens)
	9218	vFAIL("Reference to nonexistent group");
	9219	}
	9220	RExC_sawback = 1;
	9221	ret = reganode(pRExC_state,
	9222	((! FOLD)
	9223	? REF
	9224	: (MORE_ASCII_RESTRICTED)
	9225	? REFFA
	9226	: (AT_LEAST_UNI_SEMANTICS)
	9227	? REFFU
	9228	: (LOC)
	9229	? REFFL
	9230	: REFF),
	9231	num);
	9232	*flagp \|= HASWIDTH;
	9233
	9234	/* override incorrect value set in reganode MJD */
	9235	Set_Node_Offset(ret, parse_start+1);
	9236	Set_Node_Cur_Length(ret); /* MJD */
	9237	RExC_parse--;
	9238	nextchar(pRExC_state);
	9239	}
	9240	}
	9241	break;
	9242	case '\0':
	9243	if (RExC_parse >= RExC_end)
	9244	FAIL("Trailing \\");
	9245	/* FALL THROUGH */
	9246	default:
	9247	/* Do not generate "unrecognized" warnings here, we fall
	9248	back into the quick-grab loop below */
	9249	parse_start--;
	9250	goto defchar;
	9251	}
	9252	break;
	9253
	9254	case '#':
	9255	if (RExC_flags & RXf_PMf_EXTENDED) {
	9256	if ( reg_skipcomment( pRExC_state ) )
	9257	goto tryagain;
	9258	}
	9259	/* FALL THROUGH */
	9260
	9261	default:
	9262
	9263	parse_start = RExC_parse - 1;
	9264
	9265	RExC_parse++;
	9266
	9267	defchar: {
	9268	register STRLEN len;
	9269	register UV ender;
	9270	register char *p;
	9271	char *s;
	9272	STRLEN foldlen;
	9273	U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
	9274	U8 node_type;
	9275
	9276	/* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
	9277	* it is folded to 'ss' even if not utf8 */
	9278	bool is_exactfu_sharp_s;
	9279
	9280	ender = 0;
	9281	node_type = ((! FOLD) ? EXACT
	9282	: (LOC)
	9283	? EXACTFL
	9284	: (MORE_ASCII_RESTRICTED)
	9285	? EXACTFA
	9286	: (AT_LEAST_UNI_SEMANTICS)
	9287	? EXACTFU
	9288	: EXACTF);
	9289	ret = reg_node(pRExC_state, node_type);
	9290	s = STRING(ret);
	9291
	9292	/* XXX The node can hold up to 255 bytes, yet this only goes to
	9293	* 127. I (khw) do not know why. Keeping it somewhat less than
	9294	* 255 allows us to not have to worry about overflow due to
	9295	* converting to utf8 and fold expansion, but that value is
	9296	* 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
	9297	* split up by this limit into a single one using the real max of
	9298	* 255. Even at 127, this breaks under rare circumstances. If
	9299	* folding, we do not want to split a node at a character that is a
	9300	* non-final in a multi-char fold, as an input string could just
	9301	* happen to want to match across the node boundary. The join
	9302	* would solve that problem if the join actually happens. But a
	9303	* series of more than two nodes in a row each of 127 would cause
	9304	* the first join to succeed to get to 254, but then there wouldn't
	9305	* be room for the next one, which could at be one of those split
	9306	* multi-char folds. I don't know of any fool-proof solution. One
	9307	* could back off to end with only a code point that isn't such a
	9308	* non-final, but it is possible for there not to be any in the
	9309	* entire node. */
	9310	for (len = 0, p = RExC_parse - 1;
	9311	len < 127 && p < RExC_end;
	9312	len++)
	9313	{
	9314	char * const oldp = p;
	9315
	9316	if (RExC_flags & RXf_PMf_EXTENDED)
	9317	p = regwhite( pRExC_state, p );
	9318	switch ((U8)*p) {
	9319	case '^':
	9320	case '$':
	9321	case '.':
	9322	case '[':
	9323	case '(':
	9324	case ')':
	9325	case '\|':
	9326	goto loopdone;
	9327	case '\\':
	9328	/* Literal Escapes Switch
	9329
	9330	This switch is meant to handle escape sequences that
	9331	resolve to a literal character.
	9332
	9333	Every escape sequence that represents something
	9334	else, like an assertion or a char class, is handled
	9335	in the switch marked 'Special Escapes' above in this
	9336	routine, but also has an entry here as anything that
	9337	isn't explicitly mentioned here will be treated as
	9338	an unescaped equivalent literal.
	9339	*/
	9340
	9341	switch ((U8)*++p) {
	9342	/* These are all the special escapes. */
	9343	case 'A': /* Start assertion */
	9344	case 'b': case 'B': /* Word-boundary assertion*/
	9345	case 'C': /* Single char !DANGEROUS! */
	9346	case 'd': case 'D': /* digit class */
	9347	case 'g': case 'G': /* generic-backref, pos assertion */
	9348	case 'h': case 'H': /* HORIZWS */
	9349	case 'k': case 'K': /* named backref, keep marker */
	9350	case 'N': /* named char sequence */
	9351	case 'p': case 'P': /* Unicode property */
	9352	case 'R': /* LNBREAK */
	9353	case 's': case 'S': /* space class */
	9354	case 'v': case 'V': /* VERTWS */
	9355	case 'w': case 'W': /* word class */
	9356	case 'X': /* eXtended Unicode "combining character sequence" */
	9357	case 'z': case 'Z': /* End of line/string assertion */
	9358	--p;
	9359	goto loopdone;
	9360
	9361	/* Anything after here is an escape that resolves to a
	9362	literal. (Except digits, which may or may not)
	9363	*/
	9364	case 'n':
	9365	ender = '\n';
	9366	p++;
	9367	break;
	9368	case 'r':
	9369	ender = '\r';
	9370	p++;
	9371	break;
	9372	case 't':
	9373	ender = '\t';
	9374	p++;
	9375	break;
	9376	case 'f':
	9377	ender = '\f';
	9378	p++;
	9379	break;
	9380	case 'e':
	9381	ender = ASCII_TO_NATIVE('\033');
	9382	p++;
	9383	break;
	9384	case 'a':
	9385	ender = ASCII_TO_NATIVE('\007');
	9386	p++;
	9387	break;
	9388	case 'o':
	9389	{
	9390	STRLEN brace_len = len;
	9391	UV result;
	9392	const char* error_msg;
	9393
	9394	bool valid = grok_bslash_o(p,
	9395	&result,
	9396	&brace_len,
	9397	&error_msg,
	9398	1);
	9399	p += brace_len;
	9400	if (! valid) {
	9401	RExC_parse = p; /* going to die anyway; point
	9402	to exact spot of failure */
	9403	vFAIL(error_msg);
	9404	}
	9405	else
	9406	{
	9407	ender = result;
	9408	}
	9409	if (PL_encoding && ender < 0x100) {
	9410	goto recode_encoding;
	9411	}
	9412	if (ender > 0xff) {
	9413	REQUIRE_UTF8;
	9414	}
	9415	break;
	9416	}
	9417	case 'x':
	9418	if (*++p == '{') {
	9419	char* const e = strchr(p, '}');
	9420
	9421	if (!e) {
	9422	RExC_parse = p + 1;
	9423	vFAIL("Missing right brace on \\x{}");
	9424	}
	9425	else {
	9426	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	9427	\| PERL_SCAN_DISALLOW_PREFIX;
	9428	STRLEN numlen = e - p - 1;
	9429	ender = grok_hex(p + 1, &numlen, &flags, NULL);
	9430	if (ender > 0xff)
	9431	REQUIRE_UTF8;
	9432	p = e + 1;
	9433	}
	9434	}
	9435	else {
	9436	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	9437	STRLEN numlen = 2;
	9438	ender = grok_hex(p, &numlen, &flags, NULL);
	9439	p += numlen;
	9440	}
	9441	if (PL_encoding && ender < 0x100)
	9442	goto recode_encoding;
	9443	break;
	9444	case 'c':
	9445	p++;
	9446	ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
	9447	break;
	9448	case '0': case '1': case '2': case '3':case '4':
	9449	case '5': case '6': case '7': case '8':case '9':
	9450	if (*p == '0' \|\|
	9451	(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
	9452	{
	9453	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	9454	STRLEN numlen = 3;
	9455	ender = grok_oct(p, &numlen, &flags, NULL);
	9456	if (ender > 0xff) {
	9457	REQUIRE_UTF8;
	9458	}
	9459	p += numlen;
	9460	}
	9461	else {
	9462	--p;
	9463	goto loopdone;
	9464	}
	9465	if (PL_encoding && ender < 0x100)
	9466	goto recode_encoding;
	9467	break;
	9468	recode_encoding:
	9469	if (! RExC_override_recoding) {
	9470	SV* enc = PL_encoding;
	9471	ender = reg_recode((const char)(U8)ender, &enc);
	9472	if (!enc && SIZE_ONLY)
	9473	ckWARNreg(p, "Invalid escape in the specified encoding");
	9474	REQUIRE_UTF8;
	9475	}
	9476	break;
	9477	case '\0':
	9478	if (p >= RExC_end)
	9479	FAIL("Trailing \\");
	9480	/* FALL THROUGH */
	9481	default:
	9482	if (!SIZE_ONLY&& isALPHA(*p)) {
	9483	/* Include any { following the alpha to emphasize
	9484	* that it could be part of an escape at some point
	9485	* in the future */
	9486	int len = (*(p + 1) == '{') ? 2 : 1;
	9487	ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
	9488	}
	9489	goto normal_default;
	9490	}
	9491	break;
	9492	default:
	9493	normal_default:
	9494	if (UTF8_IS_START(*p) && UTF) {
	9495	STRLEN numlen;
	9496	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	9497	&numlen, UTF8_ALLOW_DEFAULT);
	9498	p += numlen;
	9499	}
	9500	else
	9501	ender = (U8) *p++;
	9502	break;
	9503	} /* End of switch on the literal */
	9504
	9505	is_exactfu_sharp_s = (node_type == EXACTFU
	9506	&& ender == LATIN_SMALL_LETTER_SHARP_S);
	9507	if ( RExC_flags & RXf_PMf_EXTENDED)
	9508	p = regwhite( pRExC_state, p );
	9509	if ((UTF && FOLD) \|\| is_exactfu_sharp_s) {
	9510	/* Prime the casefolded buffer. Locale rules, which apply
	9511	* only to code points < 256, aren't known until execution,
	9512	* so for them, just output the original character using
	9513	* utf8. If we start to fold non-UTF patterns, be sure to
	9514	* update join_exact() */
	9515	if (LOC && ender < 256) {
	9516	if (UNI_IS_INVARIANT(ender)) {
	9517	*tmpbuf = (U8) ender;
	9518	foldlen = 1;
	9519	} else {
	9520	*tmpbuf = UTF8_TWO_BYTE_HI(ender);
	9521	*(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
	9522	foldlen = 2;
	9523	}
	9524	}
	9525	else if (isASCII(ender)) { /* Note: Here can't also be LOC
	9526	*/
	9527	ender = toLOWER(ender);
	9528	*tmpbuf = (U8) ender;
	9529	foldlen = 1;
	9530	}
	9531	else if (! MORE_ASCII_RESTRICTED && ! LOC) {
	9532
	9533	/* Locale and /aa require more selectivity about the
	9534	* fold, so are handled below. Otherwise, here, just
	9535	* use the fold */
	9536	ender = toFOLD_uni(ender, tmpbuf, &foldlen);
	9537	}
	9538	else {
	9539	/* Under locale rules or /aa we are not to mix,
	9540	* respectively, ords < 256 or ASCII with non-. So
	9541	* reject folds that mix them, using only the
	9542	* non-folded code point. So do the fold to a
	9543	* temporary, and inspect each character in it. */
	9544	U8 trialbuf[UTF8_MAXBYTES_CASE+1];
	9545	U8* s = trialbuf;
	9546	UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
	9547	U8* e = s + foldlen;
	9548	bool fold_ok = TRUE;
	9549
	9550	while (s < e) {
	9551	if (isASCII(*s)
	9552	\|\| (LOC && (UTF8_IS_INVARIANT(*s)
	9553	\|\| UTF8_IS_DOWNGRADEABLE_START(*s))))
	9554	{
	9555	fold_ok = FALSE;
	9556	break;
	9557	}
	9558	s += UTF8SKIP(s);
	9559	}
	9560	if (fold_ok) {
	9561	Copy(trialbuf, tmpbuf, foldlen, U8);
	9562	ender = tmpender;
	9563	}
	9564	else {
	9565	uvuni_to_utf8(tmpbuf, ender);
	9566	foldlen = UNISKIP(ender);
	9567	}
	9568	}
	9569	}
	9570	if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+. /
	9571	if (len)
	9572	p = oldp;
	9573	else if (UTF \|\| is_exactfu_sharp_s) {
	9574	if (FOLD) {
	9575	/* Emit all the Unicode characters. */
	9576	STRLEN numlen;
	9577	for (foldbuf = tmpbuf;
	9578	foldlen;
	9579	foldlen -= numlen) {
	9580	ender = utf8_to_uvchr(foldbuf, &numlen);
	9581	if (numlen > 0) {
	9582	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9583	s += unilen;
	9584	len += unilen;
	9585	/* In EBCDIC the numlen
	9586	* and unilen can differ. */
	9587	foldbuf += numlen;
	9588	if (numlen >= foldlen)
	9589	break;
	9590	}
	9591	else
	9592	break; /* "Can't happen." */
	9593	}
	9594	}
	9595	else {
	9596	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9597	if (unilen > 0) {
	9598	s += unilen;
	9599	len += unilen;
	9600	}
	9601	}
	9602	}
	9603	else {
	9604	len++;
	9605	REGC((char)ender, s++);
	9606	}
	9607	break;
	9608	}
	9609	if (UTF \|\| is_exactfu_sharp_s) {
	9610	if (FOLD) {
	9611	/* Emit all the Unicode characters. */
	9612	STRLEN numlen;
	9613	for (foldbuf = tmpbuf;
	9614	foldlen;
	9615	foldlen -= numlen) {
	9616	ender = utf8_to_uvchr(foldbuf, &numlen);
	9617	if (numlen > 0) {
	9618	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9619	len += unilen;
	9620	s += unilen;
	9621	/* In EBCDIC the numlen
	9622	* and unilen can differ. */
	9623	foldbuf += numlen;
	9624	if (numlen >= foldlen)
	9625	break;
	9626	}
	9627	else
	9628	break;
	9629	}
	9630	}
	9631	else {
	9632	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9633	if (unilen > 0) {
	9634	s += unilen;
	9635	len += unilen;
	9636	}
	9637	}
	9638	len--;
	9639	}
	9640	else {
	9641	REGC((char)ender, s++);
	9642	}
	9643	}
	9644	loopdone: /* Jumped to when encounters something that shouldn't be in
	9645	the node */
	9646	RExC_parse = p - 1;
	9647	Set_Node_Cur_Length(ret); /* MJD */
	9648	nextchar(pRExC_state);
	9649	{
	9650	/* len is STRLEN which is unsigned, need to copy to signed */
	9651	IV iv = len;
	9652	if (iv < 0)
	9653	vFAIL("Internal disaster");
	9654	}
	9655	if (len > 0)
	9656	*flagp \|= HASWIDTH;
	9657	if (len == 1 && UNI_IS_INVARIANT(ender))
	9658	*flagp \|= SIMPLE;
	9659
	9660	if (SIZE_ONLY)
	9661	RExC_size += STR_SZ(len);
	9662	else {
	9663	STR_LEN(ret) = len;
	9664	RExC_emit += STR_SZ(len);
	9665	}
	9666	}
	9667	break;
	9668	}
	9669
	9670	return(ret);
	9671
	9672	/* Jumped to when an unrecognized character set is encountered */
	9673	bad_charset:
	9674	Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
	9675	return(NULL);
	9676	}
	9677
	9678	STATIC char *
	9679	S_regwhite( RExC_state_t pRExC_state, char p )
	9680	{
	9681	const char *e = RExC_end;
	9682
	9683	PERL_ARGS_ASSERT_REGWHITE;
	9684
	9685	while (p < e) {
	9686	if (isSPACE(*p))
	9687	++p;
	9688	else if (*p == '#') {
	9689	bool ended = 0;
	9690	do {
	9691	if (*p++ == '\n') {
	9692	ended = 1;
	9693	break;
	9694	}
	9695	} while (p < e);
	9696	if (!ended)
	9697	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	9698	}
	9699	else
	9700	break;
	9701	}
	9702	return p;
	9703	}
	9704
	9705	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	9706	Character classes ([:foo:]) can also be negated ([:^foo:]).
	9707	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	9708	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	9709	but trigger failures because they are currently unimplemented. */
	9710
	9711	#define POSIXCC_DONE(c) ((c) == ':')
	9712	#define POSIXCC_NOTYET(c) ((c) == '=' \|\| (c) == '.')
	9713	#define POSIXCC(c) (POSIXCC_DONE(c) \|\| POSIXCC_NOTYET(c))
	9714
	9715	STATIC I32
	9716	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
	9717	{
	9718	dVAR;
	9719	I32 namedclass = OOB_NAMEDCLASS;
	9720
	9721	PERL_ARGS_ASSERT_REGPPOSIXCC;
	9722
	9723	if (value == '[' && RExC_parse + 1 < RExC_end &&
	9724	/* I smell either [: or [= or [. -- POSIX has been here, right? */
	9725	POSIXCC(UCHARAT(RExC_parse))) {
	9726	const char c = UCHARAT(RExC_parse);
	9727	char* const s = RExC_parse++;
	9728
	9729	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	9730	RExC_parse++;
	9731	if (RExC_parse == RExC_end)
	9732	/* Grandfather lone [:, [=, [. */
	9733	RExC_parse = s;
	9734	else {
	9735	const char* const t = RExC_parse++; /* skip over the c */
	9736	assert(*t == c);
	9737
	9738	if (UCHARAT(RExC_parse) == ']') {
	9739	const char *posixcc = s + 1;
	9740	RExC_parse++; /* skip over the ending ] */
	9741
	9742	if (*s == ':') {
	9743	const I32 complement = posixcc == '^' ? posixcc++ : 0;
	9744	const I32 skip = t - posixcc;
	9745
	9746	/* Initially switch on the length of the name. */
	9747	switch (skip) {
	9748	case 4:
	9749	if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
	9750	namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
	9751	break;
	9752	case 5:
	9753	/* Names all of length 5. */
	9754	/* alnum alpha ascii blank cntrl digit graph lower
	9755	print punct space upper */
	9756	/* Offset 4 gives the best switch position. */
	9757	switch (posixcc[4]) {
	9758	case 'a':
	9759	if (memEQ(posixcc, "alph", 4)) /* alpha */
	9760	namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
	9761	break;
	9762	case 'e':
	9763	if (memEQ(posixcc, "spac", 4)) /* space */
	9764	namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
	9765	break;
	9766	case 'h':
	9767	if (memEQ(posixcc, "grap", 4)) /* graph */
	9768	namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
	9769	break;
	9770	case 'i':
	9771	if (memEQ(posixcc, "asci", 4)) /* ascii */
	9772	namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
	9773	break;
	9774	case 'k':
	9775	if (memEQ(posixcc, "blan", 4)) /* blank */
	9776	namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
	9777	break;
	9778	case 'l':
	9779	if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	9780	namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
	9781	break;
	9782	case 'm':
	9783	if (memEQ(posixcc, "alnu", 4)) /* alnum */
	9784	namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
	9785	break;
	9786	case 'r':
	9787	if (memEQ(posixcc, "lowe", 4)) /* lower */
	9788	namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
	9789	else if (memEQ(posixcc, "uppe", 4)) /* upper */
	9790	namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
	9791	break;
	9792	case 't':
	9793	if (memEQ(posixcc, "digi", 4)) /* digit */
	9794	namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
	9795	else if (memEQ(posixcc, "prin", 4)) /* print */
	9796	namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
	9797	else if (memEQ(posixcc, "punc", 4)) /* punct */
	9798	namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
	9799	break;
	9800	}
	9801	break;
	9802	case 6:
	9803	if (memEQ(posixcc, "xdigit", 6))
	9804	namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
	9805	break;
	9806	}
	9807
	9808	if (namedclass == OOB_NAMEDCLASS)
	9809	Simple_vFAIL3("POSIX class [:%.*s:] unknown",
	9810	t - s - 1, s + 1);
	9811	assert (posixcc[skip] == ':');
	9812	assert (posixcc[skip+1] == ']');
	9813	} else if (!SIZE_ONLY) {
	9814	/* [[=foo=]] and [[.foo.]] are still future. */
	9815
	9816	/* adjust RExC_parse so the warning shows after
	9817	the class closes */
	9818	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	9819	RExC_parse++;
	9820	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	9821	}
	9822	} else {
	9823	/* Maternal grandfather:
	9824	* "[:" ending in ":" but not in ":]" */
	9825	RExC_parse = s;
	9826	}
	9827	}
	9828	}
	9829
	9830	return namedclass;
	9831	}
	9832
	9833	STATIC void
	9834	S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
	9835	{
	9836	dVAR;
	9837
	9838	PERL_ARGS_ASSERT_CHECKPOSIXCC;
	9839
	9840	if (POSIXCC(UCHARAT(RExC_parse))) {
	9841	const char *s = RExC_parse;
	9842	const char c = *s++;
	9843
	9844	while (isALNUM(*s))
	9845	s++;
	9846	if (s && c == s && s[1] == ']') {
	9847	ckWARN3reg(s+2,
	9848	"POSIX syntax [%c %c] belongs inside character classes",
	9849	c, c);
	9850
	9851	/* [[=foo=]] and [[.foo.]] are still future. */
	9852	if (POSIXCC_NOTYET(c)) {
	9853	/* adjust RExC_parse so the error shows after
	9854	the class closes */
	9855	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
	9856	NOOP;
	9857	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	9858	}
	9859	}
	9860	}
	9861	}
	9862
	9863	/* No locale test, and always Unicode semantics, no ignore-case differences */
	9864	#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
	9865	ANYOF_##NAME: \
	9866	for (value = 0; value < 256; value++) \
	9867	if (TEST) \
	9868	stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9869	yesno = '+'; \
	9870	what = WORD; \
	9871	break; \
	9872	case ANYOF_N##NAME: \
	9873	for (value = 0; value < 256; value++) \
	9874	if (!TEST) \
	9875	stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9876	yesno = '!'; \
	9877	what = WORD; \
	9878	break
	9879
	9880	/* Like the above, but there are differences if we are in uni-8-bit or not, so
	9881	* there are two tests passed in, to use depending on that. There aren't any
	9882	* cases where the label is different from the name, so no need for that
	9883	* parameter.
	9884	* Sets 'what' to WORD which is the property name for non-bitmap code points;
	9885	* But, uses FOLD_WORD instead if /i has been selected, to allow a different
	9886	* property name */
	9887	#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD) \
	9888	ANYOF_##NAME: \
	9889	if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
	9890	else if (UNI_SEMANTICS) { \
	9891	for (value = 0; value < 256; value++) { \
	9892	if (TEST_8(value)) stored += \
	9893	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9894	} \
	9895	} \
	9896	else { \
	9897	for (value = 0; value < 128; value++) { \
	9898	if (TEST_7(UNI_TO_NATIVE(value))) stored += \
	9899	set_regclass_bit(pRExC_state, ret, \
	9900	(U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	9901	} \
	9902	} \
	9903	yesno = '+'; \
	9904	if (FOLD) { \
	9905	what = FOLD_WORD; \
	9906	} \
	9907	else { \
	9908	what = WORD; \
	9909	} \
	9910	break; \
	9911	case ANYOF_N##NAME: \
	9912	if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
	9913	else if (UNI_SEMANTICS) { \
	9914	for (value = 0; value < 256; value++) { \
	9915	if (! TEST_8(value)) stored += \
	9916	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9917	} \
	9918	} \
	9919	else { \
	9920	for (value = 0; value < 128; value++) { \
	9921	if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
	9922	pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	9923	} \
	9924	if (AT_LEAST_ASCII_RESTRICTED) { \
	9925	for (value = 128; value < 256; value++) { \
	9926	stored += set_regclass_bit( \
	9927	pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	9928	} \
	9929	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL; \
	9930	} \
	9931	else { \
	9932	/* For a non-ut8 target string with DEPENDS semantics, all above \
	9933	* ASCII Latin1 code points match the complement of any of the \
	9934	* classes. But in utf8, they have their Unicode semantics, so \
	9935	* can't just set them in the bitmap, or else regexec.c will think \
	9936	* they matched when they shouldn't. */ \
	9937	ANYOF_FLAGS(ret) \|= ANYOF_NON_UTF8_LATIN1_ALL; \
	9938	} \
	9939	} \
	9940	yesno = '!'; \
	9941	if (FOLD) { \
	9942	what = FOLD_WORD; \
	9943	} \
	9944	else { \
	9945	what = WORD; \
	9946	} \
	9947	break
	9948
	9949	STATIC U8
	9950	S_set_regclass_bit_fold(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value, SV invlist_ptr, AV alternate_ptr)
	9951	{
	9952
	9953	/* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
	9954	* Locale folding is done at run-time, so this function should not be
	9955	* called for nodes that are for locales.
	9956	*
	9957	* This function sets the bit corresponding to the fold of the input
	9958	* 'value', if not already set. The fold of 'f' is 'F', and the fold of
	9959	* 'F' is 'f'.
	9960	*
	9961	* It also knows about the characters that are in the bitmap that have
	9962	* folds that are matchable only outside it, and sets the appropriate lists
	9963	* and flags.
	9964	*
	9965	* It returns the number of bits that actually changed from 0 to 1 */
	9966
	9967	U8 stored = 0;
	9968	U8 fold;
	9969
	9970	PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
	9971
	9972	fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
	9973	: PL_fold[value];
	9974
	9975	/* It assumes the bit for 'value' has already been set */
	9976	if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
	9977	ANYOF_BITMAP_SET(node, fold);
	9978	stored++;
	9979	}
	9980	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) \|\| ! MORE_ASCII_RESTRICTED)) {
	9981	/* Certain Latin1 characters have matches outside the bitmap. To get
	9982	* here, 'value' is one of those characters. None of these matches is
	9983	* valid for ASCII characters under /aa, which have been excluded by
	9984	* the 'if' above. The matches fall into three categories:
	9985	* 1) They are singly folded-to or -from an above 255 character, as
	9986	* LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
	9987	* WITH DIAERESIS;
	9988	* 2) They are part of a multi-char fold with another character in the
	9989	* bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
	9990	* 3) They are part of a multi-char fold with a character not in the
	9991	* bitmap, such as various ligatures.
	9992	* We aren't dealing fully with multi-char folds, except we do deal
	9993	* with the pattern containing a character that has a multi-char fold
	9994	* (not so much the inverse).
	9995	* For types 1) and 3), the matches only happen when the target string
	9996	* is utf8; that's not true for 2), and we set a flag for it.
	9997	*
	9998	* The code below adds to the passed in inversion list the single fold
	9999	* closures for 'value'. The values are hard-coded here so that an
	10000	* innocent-looking character class, like /[ks]/i won't have to go out
	10001	* to disk to find the possible matches. XXX It would be better to
	10002	* generate these via regen, in case a new version of the Unicode
	10003	* standard adds new mappings, though that is not really likely. */
	10004	switch (value) {
	10005	case 'k':
	10006	case 'K':
	10007	/* KELVIN SIGN */
	10008	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x212A);
	10009	break;
	10010	case 's':
	10011	case 'S':
	10012	/* LATIN SMALL LETTER LONG S */
	10013	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x017F);
	10014	break;
	10015	case MICRO_SIGN:
	10016	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10017	GREEK_SMALL_LETTER_MU);
	10018	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10019	GREEK_CAPITAL_LETTER_MU);
	10020	break;
	10021	case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	10022	case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	10023	/* ANGSTROM SIGN */
	10024	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x212B);
	10025	if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
	10026	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10027	PL_fold_latin1[value]);
	10028	}
	10029	break;
	10030	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	10031	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10032	LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	10033	break;
	10034	case LATIN_SMALL_LETTER_SHARP_S:
	10035	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	10036	LATIN_CAPITAL_LETTER_SHARP_S);
	10037
	10038	/* Under /a, /d, and /u, this can match the two chars "ss" */
	10039	if (! MORE_ASCII_RESTRICTED) {
	10040	add_alternate(alternate_ptr, (U8 *) "ss", 2);
	10041
	10042	/* And under /u or /a, it can match even if the target is
	10043	* not utf8 */
	10044	if (AT_LEAST_UNI_SEMANTICS) {
	10045	ANYOF_FLAGS(node) \|= ANYOF_NONBITMAP_NON_UTF8;
	10046	}
	10047	}
	10048	break;
	10049	case 'F': case 'f':
	10050	case 'I': case 'i':
	10051	case 'L': case 'l':
	10052	case 'T': case 't':
	10053	case 'A': case 'a':
	10054	case 'H': case 'h':
	10055	case 'J': case 'j':
	10056	case 'N': case 'n':
	10057	case 'W': case 'w':
	10058	case 'Y': case 'y':
	10059	/* These all are targets of multi-character folds from code
	10060	* points that require UTF8 to express, so they can't match
	10061	* unless the target string is in UTF-8, so no action here is
	10062	* necessary, as regexec.c properly handles the general case
	10063	* for UTF-8 matching */
	10064	break;
	10065	default:
	10066	/* Use deprecated warning to increase the chances of this
	10067	* being output */
	10068	ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
	10069	break;
	10070	}
	10071	}
	10072	else if (DEPENDS_SEMANTICS
	10073	&& ! isASCII(value)
	10074	&& PL_fold_latin1[value] != value)
	10075	{
	10076	/* Under DEPENDS rules, non-ASCII Latin1 characters match their
	10077	* folds only when the target string is in UTF-8. We add the fold
	10078	* here to the list of things to match outside the bitmap, which
	10079	* won't be looked at unless it is UTF8 (or else if something else
	10080	* says to look even if not utf8, but those things better not happen
	10081	* under DEPENDS semantics. */
	10082	invlist_ptr = add_cp_to_invlist(invlist_ptr, PL_fold_latin1[value]);
	10083	}
	10084
	10085	return stored;
	10086	}
	10087
	10088
	10089	PERL_STATIC_INLINE U8
	10090	S_set_regclass_bit(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value, SV invlist_ptr, AV alternate_ptr)
	10091	{
	10092	/* This inline function sets a bit in the bitmap if not already set, and if
	10093	* appropriate, its fold, returning the number of bits that actually
	10094	* changed from 0 to 1 */
	10095
	10096	U8 stored;
	10097
	10098	PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
	10099
	10100	if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
	10101	return 0;
	10102	}
	10103
	10104	ANYOF_BITMAP_SET(node, value);
	10105	stored = 1;
	10106
	10107	if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
	10108	stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
	10109	}
	10110
	10111	return stored;
	10112	}
	10113
	10114	STATIC void
	10115	S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
	10116	{
	10117	/* Adds input 'string' with length 'len' to the ANYOF node's unicode
	10118	* alternate list, pointed to by 'alternate_ptr'. This is an array of
	10119	* the multi-character folds of characters in the node */
	10120	SV *sv;
	10121
	10122	PERL_ARGS_ASSERT_ADD_ALTERNATE;
	10123
	10124	if (! *alternate_ptr) {
	10125	*alternate_ptr = newAV();
	10126	}
	10127	sv = newSVpvn_utf8((char*)string, len, TRUE);
	10128	av_push(*alternate_ptr, sv);
	10129	return;
	10130	}
	10131
	10132	/*
	10133	parse a class specification and produce either an ANYOF node that
	10134	matches the pattern or perhaps will be optimized into an EXACTish node
	10135	instead. The node contains a bit map for the first 256 characters, with the
	10136	corresponding bit set if that character is in the list. For characters
	10137	above 255, a range list is used */
	10138
	10139	STATIC regnode *
	10140	S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
	10141	{
	10142	dVAR;
	10143	register UV nextvalue;
	10144	register IV prevvalue = OOB_UNICODE;
	10145	register IV range = 0;
	10146	UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
	10147	register regnode *ret;
	10148	STRLEN numlen;
	10149	IV namedclass;
	10150	char *rangebegin = NULL;
	10151	bool need_class = 0;
	10152	bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
	10153	SV *listsv = NULL;
	10154	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	10155	than just initialized. */
	10156	SV* properties = NULL; /* Code points that match \p{} \P{} */
	10157	UV element_count = 0; /* Number of distinct elements in the class.
	10158	Optimizations may be possible if this is tiny */
	10159	UV n;
	10160
	10161	/* Unicode properties are stored in a swash; this holds the current one
	10162	* being parsed. If this swash is the only above-latin1 component of the
	10163	* character class, an optimization is to pass it directly on to the
	10164	* execution engine. Otherwise, it is set to NULL to indicate that there
	10165	* are other things in the class that have to be dealt with at execution
	10166	* time */
	10167	SV* swash = NULL; /* Code points that match \p{} \P{} */
	10168
	10169	/* Set if a component of this character class is user-defined; just passed
	10170	* on to the engine */
	10171	UV has_user_defined_property = 0;
	10172
	10173	/* code points this node matches that can't be stored in the bitmap */
	10174	SV* nonbitmap = NULL;
	10175
	10176	/* The items that are to match that aren't stored in the bitmap, but are a
	10177	* result of things that are stored there. This is the fold closure of
	10178	* such a character, either because it has DEPENDS semantics and shouldn't
	10179	* be matched unless the target string is utf8, or is a code point that is
	10180	* too large for the bit map, as for example, the fold of the MICRO SIGN is
	10181	* above 255. This all is solely for performance reasons. By having this
	10182	* code know the outside-the-bitmap folds that the bitmapped characters are
	10183	* involved with, we don't have to go out to disk to find the list of
	10184	* matches, unless the character class includes code points that aren't
	10185	* storable in the bit map. That means that a character class with an 's'
	10186	* in it, for example, doesn't need to go out to disk to find everything
	10187	* that matches. A 2nd list is used so that the 'nonbitmap' list is kept
	10188	* empty unless there is something whose fold we don't know about, and will
	10189	* have to go out to the disk to find. */
	10190	SV* l1_fold_invlist = NULL;
	10191
	10192	/* List of multi-character folds that are matched by this node */
	10193	AV* unicode_alternate = NULL;
	10194	#ifdef EBCDIC
	10195	UV literal_endpoint = 0;
	10196	#endif
	10197	UV stored = 0; /* how many chars stored in the bitmap */
	10198
	10199	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	10200	case we need to change the emitted regop to an EXACT. */
	10201	const char * orig_parse = RExC_parse;
	10202	GET_RE_DEBUG_FLAGS_DECL;
	10203
	10204	PERL_ARGS_ASSERT_REGCLASS;
	10205	#ifndef DEBUGGING
	10206	PERL_UNUSED_ARG(depth);
	10207	#endif
	10208
	10209	DEBUG_PARSE("clas");
	10210
	10211	/* Assume we are going to generate an ANYOF node. */
	10212	ret = reganode(pRExC_state, ANYOF, 0);
	10213
	10214
	10215	if (!SIZE_ONLY) {
	10216	ANYOF_FLAGS(ret) = 0;
	10217	}
	10218
	10219	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	10220	RExC_naughty++;
	10221	RExC_parse++;
	10222	if (!SIZE_ONLY)
	10223	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	10224
	10225	/* We have decided to not allow multi-char folds in inverted character
	10226	* classes, due to the confusion that can happen, especially with
	10227	* classes that are designed for a non-Unicode world: You have the
	10228	* peculiar case that:
	10229	"s s" =~ /^[^\xDF]+$/i => Y
	10230	"ss" =~ /^[^\xDF]+$/i => N
	10231	*
	10232	* See [perl #89750] */
	10233	allow_full_fold = FALSE;
	10234	}
	10235
	10236	if (SIZE_ONLY) {
	10237	RExC_size += ANYOF_SKIP;
	10238	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	10239	}
	10240	else {
	10241	RExC_emit += ANYOF_SKIP;
	10242	if (LOC) {
	10243	ANYOF_FLAGS(ret) \|= ANYOF_LOCALE;
	10244	}
	10245	ANYOF_BITMAP_ZERO(ret);
	10246	listsv = newSVpvs("# comment\n");
	10247	initial_listsv_len = SvCUR(listsv);
	10248	}
	10249
	10250	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	10251
	10252	if (!SIZE_ONLY && POSIXCC(nextvalue))
	10253	checkposixcc(pRExC_state);
	10254
	10255	/* allow 1st char to be ] (allowing it to be - is dealt with later) */
	10256	if (UCHARAT(RExC_parse) == ']')
	10257	goto charclassloop;
	10258
	10259	parseit:
	10260	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	10261
	10262	charclassloop:
	10263
	10264	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	10265
	10266	if (!range) {
	10267	rangebegin = RExC_parse;
	10268	element_count++;
	10269	}
	10270	if (UTF) {
	10271	value = utf8n_to_uvchr((U8*)RExC_parse,
	10272	RExC_end - RExC_parse,
	10273	&numlen, UTF8_ALLOW_DEFAULT);
	10274	RExC_parse += numlen;
	10275	}
	10276	else
	10277	value = UCHARAT(RExC_parse++);
	10278
	10279	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	10280	if (value == '[' && POSIXCC(nextvalue))
	10281	namedclass = regpposixcc(pRExC_state, value);
	10282	else if (value == '\\') {
	10283	if (UTF) {
	10284	value = utf8n_to_uvchr((U8*)RExC_parse,
	10285	RExC_end - RExC_parse,
	10286	&numlen, UTF8_ALLOW_DEFAULT);
	10287	RExC_parse += numlen;
	10288	}
	10289	else
	10290	value = UCHARAT(RExC_parse++);
	10291	/* Some compilers cannot handle switching on 64-bit integer
	10292	* values, therefore value cannot be an UV. Yes, this will
	10293	* be a problem later if we want switch on Unicode.
	10294	* A similar issue a little bit later when switching on
	10295	* namedclass. --jhi */
	10296	switch ((I32)value) {
	10297	case 'w': namedclass = ANYOF_ALNUM; break;
	10298	case 'W': namedclass = ANYOF_NALNUM; break;
	10299	case 's': namedclass = ANYOF_SPACE; break;
	10300	case 'S': namedclass = ANYOF_NSPACE; break;
	10301	case 'd': namedclass = ANYOF_DIGIT; break;
	10302	case 'D': namedclass = ANYOF_NDIGIT; break;
	10303	case 'v': namedclass = ANYOF_VERTWS; break;
	10304	case 'V': namedclass = ANYOF_NVERTWS; break;
	10305	case 'h': namedclass = ANYOF_HORIZWS; break;
	10306	case 'H': namedclass = ANYOF_NHORIZWS; break;
	10307	case 'N': /* Handle \N{NAME} in class */
	10308	{
	10309	/* We only pay attention to the first char of
	10310	multichar strings being returned. I kinda wonder
	10311	if this makes sense as it does change the behaviour
	10312	from earlier versions, OTOH that behaviour was broken
	10313	as well. */
	10314	UV v; /* value is register so we cant & it /grrr */
	10315	if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
	10316	goto parseit;
	10317	}
	10318	value= v;
	10319	}
	10320	break;
	10321	case 'p':
	10322	case 'P':
	10323	{
	10324	char *e;
	10325	if (RExC_parse >= RExC_end)
	10326	vFAIL2("Empty \\%c{}", (U8)value);
	10327	if (*RExC_parse == '{') {
	10328	const U8 c = (U8)value;
	10329	e = strchr(RExC_parse++, '}');
	10330	if (!e)
	10331	vFAIL2("Missing right brace on \\%c{}", c);
	10332	while (isSPACE(UCHARAT(RExC_parse)))
	10333	RExC_parse++;
	10334	if (e == RExC_parse)
	10335	vFAIL2("Empty \\%c{}", c);
	10336	n = e - RExC_parse;
	10337	while (isSPACE(UCHARAT(RExC_parse + n - 1)))
	10338	n--;
	10339	}
	10340	else {
	10341	e = RExC_parse;
	10342	n = 1;
	10343	}
	10344	if (!SIZE_ONLY) {
	10345	SV** invlistsvp;
	10346	SV* invlist;
	10347	char* name;
	10348	if (UCHARAT(RExC_parse) == '^') {
	10349	RExC_parse++;
	10350	n--;
	10351	value = value == 'p' ? 'P' : 'p'; /* toggle */
	10352	while (isSPACE(UCHARAT(RExC_parse))) {
	10353	RExC_parse++;
	10354	n--;
	10355	}
	10356	}
	10357	/* Try to get the definition of the property into
	10358	* <invlist>. If /i is in effect, the effective property
	10359	* will have its name be <__NAME_i>. The design is
	10360	* discussed in commit
	10361	* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
	10362	Newx(name, n + sizeof("_i__\n"), char);
	10363
	10364	sprintf(name, "%s%.*s%s\n",
	10365	(FOLD) ? "__" : "",
	10366	(int)n,
	10367	RExC_parse,
	10368	(FOLD) ? "_i" : ""
	10369	);
	10370
	10371	/* Look up the property name, and get its swash and
	10372	* inversion list, if the property is found */
	10373	if (swash) {
	10374	SvREFCNT_dec(swash);
	10375	}
	10376	swash = _core_swash_init("utf8", name, &PL_sv_undef,
	10377	1, /* binary */
	10378	0, /* not tr/// */
	10379	TRUE, /* this routine will handle
	10380	undefined properties */
	10381	NULL, FALSE /* No inversion list */
	10382	);
	10383	if ( ! swash
	10384	\|\| ! SvROK(swash)
	10385	\|\| ! SvTYPE(SvRV(swash)) == SVt_PVHV
	10386	\|\| ! (invlistsvp =
	10387	hv_fetchs(MUTABLE_HV(SvRV(swash)),
	10388	"INVLIST", FALSE))
	10389	\|\| ! (invlist = *invlistsvp))
	10390	{
	10391	if (swash) {
	10392	SvREFCNT_dec(swash);
	10393	swash = NULL;
	10394	}
	10395
	10396	/* Here didn't find it. It could be a user-defined
	10397	* property that will be available at run-time. Add it
	10398	* to the list to look up then */
	10399	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
	10400	(value == 'p' ? '+' : '!'),
	10401	name);
	10402	has_user_defined_property = 1;
	10403
	10404	/* We don't know yet, so have to assume that the
	10405	* property could match something in the Latin1 range,
	10406	* hence something that isn't utf8 */
	10407	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	10408	}
	10409	else {
	10410
	10411	/* Here, did get the swash and its inversion list. If
	10412	* the swash is from a user-defined property, then this
	10413	* whole character class should be regarded as such */
	10414	SV** user_defined_svp =
	10415	hv_fetchs(MUTABLE_HV(SvRV(swash)),
	10416	"USER_DEFINED", FALSE);
	10417	if (user_defined_svp) {
	10418	has_user_defined_property
	10419	\|= SvUV(*user_defined_svp);
	10420	}
	10421
	10422	/* Invert if asking for the complement */
	10423	if (value == 'P') {
	10424
	10425	/* Add to any existing list */
	10426	if (! properties) {
	10427	properties = invlist_clone(invlist);
	10428	_invlist_invert(properties);
	10429	}
	10430	else {
	10431	invlist = invlist_clone(invlist);
	10432	_invlist_invert(invlist);
	10433	_invlist_union(properties, invlist, &properties);
	10434	SvREFCNT_dec(invlist);
	10435	}
	10436
	10437	/* The swash can't be used as-is, because we've
	10438	* inverted things; delay removing it to here after
	10439	* have copied its invlist above */
	10440	SvREFCNT_dec(swash);
	10441	swash = NULL;
	10442	}
	10443	else {
	10444	if (! properties) {
	10445	properties = invlist_clone(invlist);
	10446	}
	10447	else {
	10448	_invlist_union(properties, invlist, &properties);
	10449	}
	10450	}
	10451	}
	10452	Safefree(name);
	10453	}
	10454	RExC_parse = e + 1;
	10455	namedclass = ANYOF_MAX; /* no official name, but it's named */
	10456
	10457	/* \p means they want Unicode semantics */
	10458	RExC_uni_semantics = 1;
	10459	}
	10460	break;
	10461	case 'n': value = '\n'; break;
	10462	case 'r': value = '\r'; break;
	10463	case 't': value = '\t'; break;
	10464	case 'f': value = '\f'; break;
	10465	case 'b': value = '\b'; break;
	10466	case 'e': value = ASCII_TO_NATIVE('\033');break;
	10467	case 'a': value = ASCII_TO_NATIVE('\007');break;
	10468	case 'o':
	10469	RExC_parse--; /* function expects to be pointed at the 'o' */
	10470	{
	10471	const char* error_msg;
	10472	bool valid = grok_bslash_o(RExC_parse,
	10473	&value,
	10474	&numlen,
	10475	&error_msg,
	10476	SIZE_ONLY);
	10477	RExC_parse += numlen;
	10478	if (! valid) {
	10479	vFAIL(error_msg);
	10480	}
	10481	}
	10482	if (PL_encoding && value < 0x100) {
	10483	goto recode_encoding;
	10484	}
	10485	break;
	10486	case 'x':
	10487	if (*RExC_parse == '{') {
	10488	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	10489	\| PERL_SCAN_DISALLOW_PREFIX;
	10490	char * const e = strchr(RExC_parse++, '}');
	10491	if (!e)
	10492	vFAIL("Missing right brace on \\x{}");
	10493
	10494	numlen = e - RExC_parse;
	10495	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	10496	RExC_parse = e + 1;
	10497	}
	10498	else {
	10499	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	10500	numlen = 2;
	10501	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	10502	RExC_parse += numlen;
	10503	}
	10504	if (PL_encoding && value < 0x100)
	10505	goto recode_encoding;
	10506	break;
	10507	case 'c':
	10508	value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
	10509	break;
	10510	case '0': case '1': case '2': case '3': case '4':
	10511	case '5': case '6': case '7':
	10512	{
	10513	/* Take 1-3 octal digits */
	10514	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	10515	numlen = 3;
	10516	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	10517	RExC_parse += numlen;
	10518	if (PL_encoding && value < 0x100)
	10519	goto recode_encoding;
	10520	break;
	10521	}
	10522	recode_encoding:
	10523	if (! RExC_override_recoding) {
	10524	SV* enc = PL_encoding;
	10525	value = reg_recode((const char)(U8)value, &enc);
	10526	if (!enc && SIZE_ONLY)
	10527	ckWARNreg(RExC_parse,
	10528	"Invalid escape in the specified encoding");
	10529	break;
	10530	}
	10531	default:
	10532	/* Allow \_ to not give an error */
	10533	if (!SIZE_ONLY && isALNUM(value) && value != '_') {
	10534	ckWARN2reg(RExC_parse,
	10535	"Unrecognized escape \\%c in character class passed through",
	10536	(int)value);
	10537	}
	10538	break;
	10539	}
	10540	} /* end of \blah */
	10541	#ifdef EBCDIC
	10542	else
	10543	literal_endpoint++;
	10544	#endif
	10545
	10546	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	10547
	10548	/* What matches in a locale is not known until runtime, so need to
	10549	* (one time per class) allocate extra space to pass to regexec.
	10550	* The space will contain a bit for each named class that is to be
	10551	* matched against. This isn't needed for \p{} and pseudo-classes,
	10552	* as they are not affected by locale, and hence are dealt with
	10553	* separately */
	10554	if (LOC && namedclass < ANYOF_MAX && ! need_class) {
	10555	need_class = 1;
	10556	if (SIZE_ONLY) {
	10557	RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	10558	}
	10559	else {
	10560	RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	10561	ANYOF_CLASS_ZERO(ret);
	10562	}
	10563	ANYOF_FLAGS(ret) \|= ANYOF_CLASS;
	10564	}
	10565
	10566	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	10567	* literal, as is the character that began the false range, i.e.
	10568	* the 'a' in the examples */
	10569	if (range) {
	10570	if (!SIZE_ONLY) {
	10571	const int w =
	10572	RExC_parse >= rangebegin ?
	10573	RExC_parse - rangebegin : 0;
	10574	ckWARN4reg(RExC_parse,
	10575	"False [] range \"%.s\"",
	10576	w, w, rangebegin);
	10577
	10578	stored +=
	10579	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	10580	if (prevvalue < 256) {
	10581	stored +=
	10582	set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
	10583	}
	10584	else {
	10585	nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
	10586	}
	10587	}
	10588
	10589	range = 0; /* this was not a true range */
	10590	}
	10591
	10592	if (!SIZE_ONLY) {
	10593	const char *what = NULL;
	10594	char yesno = 0;
	10595
	10596	/* Possible truncation here but in some 64-bit environments
	10597	* the compiler gets heartburn about switch on 64-bit values.
	10598	* A similar issue a little earlier when switching on value.
	10599	* --jhi */
	10600	switch ((I32)namedclass) {
	10601
	10602	case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
	10603	case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
	10604	case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
	10605	case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
	10606	case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
	10607	case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
	10608	case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
	10609	case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
	10610	case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
	10611	case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
	10612	/* \s, \w match all unicode if utf8. */
	10613	case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
	10614	case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
	10615	case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
	10616	case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
	10617	case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
	10618	case ANYOF_ASCII:
	10619	if (LOC)
	10620	ANYOF_CLASS_SET(ret, ANYOF_ASCII);
	10621	else {
	10622	for (value = 0; value < 128; value++)
	10623	stored +=
	10624	set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
	10625	}
	10626	yesno = '+';
	10627	what = NULL; /* Doesn't match outside ascii, so
	10628	don't want to add +utf8:: */
	10629	break;
	10630	case ANYOF_NASCII:
	10631	if (LOC)
	10632	ANYOF_CLASS_SET(ret, ANYOF_NASCII);
	10633	else {
	10634	for (value = 128; value < 256; value++)
	10635	stored +=
	10636	set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
	10637	}
	10638	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	10639	yesno = '!';
	10640	what = "ASCII";
	10641	break;
	10642	case ANYOF_DIGIT:
	10643	if (LOC)
	10644	ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
	10645	else {
	10646	/* consecutive digits assumed */
	10647	for (value = '0'; value <= '9'; value++)
	10648	stored +=
	10649	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	10650	}
	10651	yesno = '+';
	10652	what = "Digit";
	10653	break;
	10654	case ANYOF_NDIGIT:
	10655	if (LOC)
	10656	ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
	10657	else {
	10658	/* consecutive digits assumed */
	10659	for (value = 0; value < '0'; value++)
	10660	stored +=
	10661	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	10662	for (value = '9' + 1; value < 256; value++)
	10663	stored +=
	10664	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	10665	}
	10666	yesno = '!';
	10667	what = "Digit";
	10668	if (AT_LEAST_ASCII_RESTRICTED ) {
	10669	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	10670	}
	10671	break;
	10672	case ANYOF_MAX:
	10673	/* this is to handle \p and \P */
	10674	break;
	10675	default:
	10676	vFAIL("Invalid [::] class");
	10677	break;
	10678	}
	10679	if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
	10680	/* Strings such as "+utf8::isWord\n" */
	10681	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
	10682	}
	10683
	10684	continue;
	10685	}
	10686	} /* end of namedclass \blah */
	10687
	10688	if (range) {
	10689	if (prevvalue > (IV)value) /* b-a */ {
	10690	const int w = RExC_parse - rangebegin;
	10691	Simple_vFAIL4("Invalid [] range \"%.s\"", w, w, rangebegin);
	10692	range = 0; /* not a valid range */
	10693	}
	10694	}
	10695	else {
	10696	prevvalue = value; /* save the beginning of the range */
	10697	if (RExC_parse+1 < RExC_end
	10698	&& *RExC_parse == '-'
	10699	&& RExC_parse[1] != ']')
	10700	{
	10701	RExC_parse++;
	10702
	10703	/* a bad range like \w-, [:word:]- ? */
	10704	if (namedclass > OOB_NAMEDCLASS) {
	10705	if (ckWARN(WARN_REGEXP)) {
	10706	const int w =
	10707	RExC_parse >= rangebegin ?
	10708	RExC_parse - rangebegin : 0;
	10709	vWARN4(RExC_parse,
	10710	"False [] range \"%.s\"",
	10711	w, w, rangebegin);
	10712	}
	10713	if (!SIZE_ONLY)
	10714	stored +=
	10715	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	10716	} else
	10717	range = 1; /* yeah, it's a range! */
	10718	continue; /* but do it the next time */
	10719	}
	10720	}
	10721
	10722	/* non-Latin1 code point implies unicode semantics. Must be set in
	10723	* pass1 so is there for the whole of pass 2 */
	10724	if (value > 255) {
	10725	RExC_uni_semantics = 1;
	10726	}
	10727
	10728	/* now is the next time */
	10729	if (!SIZE_ONLY) {
	10730	if (prevvalue < 256) {
	10731	const IV ceilvalue = value < 256 ? value : 255;
	10732	IV i;
	10733	#ifdef EBCDIC
	10734	/* In EBCDIC [\x89-\x91] should include
	10735	* the \x8e but [i-j] should not. */
	10736	if (literal_endpoint == 2 &&
	10737	((isLOWER(prevvalue) && isLOWER(ceilvalue)) \|\|
	10738	(isUPPER(prevvalue) && isUPPER(ceilvalue))))
	10739	{
	10740	if (isLOWER(prevvalue)) {
	10741	for (i = prevvalue; i <= ceilvalue; i++)
	10742	if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	10743	stored +=
	10744	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10745	}
	10746	} else {
	10747	for (i = prevvalue; i <= ceilvalue; i++)
	10748	if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	10749	stored +=
	10750	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10751	}
	10752	}
	10753	}
	10754	else
	10755	#endif
	10756	for (i = prevvalue; i <= ceilvalue; i++) {
	10757	stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10758	}
	10759	}
	10760	if (value > 255) {
	10761	const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
	10762	const UV natvalue = NATIVE_TO_UNI(value);
	10763	nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
	10764	}
	10765	#ifdef EBCDIC
	10766	literal_endpoint = 0;
	10767	#endif
	10768	}
	10769
	10770	range = 0; /* this range (if it was one) is done now */
	10771	}
	10772
	10773
	10774
	10775	if (SIZE_ONLY)
	10776	return ret;
	10777	/**** !SIZE_ONLY AFTER HERE *******/
	10778
	10779	/* If folding and there are code points above 255, we calculate all
	10780	* characters that could fold to or from the ones already on the list */
	10781	if (FOLD && nonbitmap) {
	10782	UV start, end; /* End points of code point ranges */
	10783
	10784	SV* fold_intersection = NULL;
	10785
	10786	/* This is a list of all the characters that participate in folds
	10787	* (except marks, etc in multi-char folds */
	10788	if (! PL_utf8_foldable) {
	10789	SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
	10790	PL_utf8_foldable = _swash_to_invlist(swash);
	10791	SvREFCNT_dec(swash);
	10792	}
	10793
	10794	/* This is a hash that for a particular fold gives all characters
	10795	* that are involved in it */
	10796	if (! PL_utf8_foldclosures) {
	10797
	10798	/* If we were unable to find any folds, then we likely won't be
	10799	* able to find the closures. So just create an empty list.
	10800	* Folding will effectively be restricted to the non-Unicode rules
	10801	* hard-coded into Perl. (This case happens legitimately during
	10802	* compilation of Perl itself before the Unicode tables are
	10803	* generated) */
	10804	if (invlist_len(PL_utf8_foldable) == 0) {
	10805	PL_utf8_foldclosures = newHV();
	10806	} else {
	10807	/* If the folds haven't been read in, call a fold function
	10808	* to force that */
	10809	if (! PL_utf8_tofold) {
	10810	U8 dummy[UTF8_MAXBYTES+1];
	10811	STRLEN dummy_len;
	10812
	10813	/* This particular string is above \xff in both UTF-8 and
	10814	* UTFEBCDIC */
	10815	to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
	10816	assert(PL_utf8_tofold); /* Verify that worked */
	10817	}
	10818	PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
	10819	}
	10820	}
	10821
	10822	/* Only the characters in this class that participate in folds need be
	10823	* checked. Get the intersection of this class and all the possible
	10824	* characters that are foldable. This can quickly narrow down a large
	10825	* class */
	10826	_invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
	10827
	10828	/* Now look at the foldable characters in this class individually */
	10829	invlist_iterinit(fold_intersection);
	10830	while (invlist_iternext(fold_intersection, &start, &end)) {
	10831	UV j;
	10832
	10833	/* Look at every character in the range */
	10834	for (j = start; j <= end; j++) {
	10835
	10836	/* Get its fold */
	10837	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	10838	STRLEN foldlen;
	10839	const UV f =
	10840	_to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
	10841
	10842	if (foldlen > (STRLEN)UNISKIP(f)) {
	10843
	10844	/* Any multicharacter foldings (disallowed in lookbehind
	10845	* patterns) require the following transform: [ABCDEF] ->
	10846	* (?:[ABCabcDEFd]\|pq\|rst) where E folds into "pq" and F
	10847	* folds into "rst", all other characters fold to single
	10848	* characters. We save away these multicharacter foldings,
	10849	* to be later saved as part of the additional "s" data. */
	10850	if (! RExC_in_lookbehind) {
	10851	U8* loc = foldbuf;
	10852	U8* e = foldbuf + foldlen;
	10853
	10854	/* If any of the folded characters of this are in the
	10855	* Latin1 range, tell the regex engine that this can
	10856	* match a non-utf8 target string. The only multi-byte
	10857	* fold whose source is in the Latin1 range (U+00DF)
	10858	* applies only when the target string is utf8, or
	10859	* under unicode rules */
	10860	if (j > 255 \|\| AT_LEAST_UNI_SEMANTICS) {
	10861	while (loc < e) {
	10862
	10863	/* Can't mix ascii with non- under /aa */
	10864	if (MORE_ASCII_RESTRICTED
	10865	&& (isASCII(*loc) != isASCII(j)))
	10866	{
	10867	goto end_multi_fold;
	10868	}
	10869	if (UTF8_IS_INVARIANT(*loc)
	10870	\|\| UTF8_IS_DOWNGRADEABLE_START(*loc))
	10871	{
	10872	/* Can't mix above and below 256 under LOC
	10873	*/
	10874	if (LOC) {
	10875	goto end_multi_fold;
	10876	}
	10877	ANYOF_FLAGS(ret)
	10878	\|= ANYOF_NONBITMAP_NON_UTF8;
	10879	break;
	10880	}
	10881	loc += UTF8SKIP(loc);
	10882	}
	10883	}
	10884
	10885	add_alternate(&unicode_alternate, foldbuf, foldlen);
	10886	end_multi_fold: ;
	10887	}
	10888
	10889	/* This is special-cased, as it is the only letter which
	10890	* has both a multi-fold and single-fold in Latin1. All
	10891	* the other chars that have single and multi-folds are
	10892	* always in utf8, and the utf8 folding algorithm catches
	10893	* them */
	10894	if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
	10895	stored += set_regclass_bit(pRExC_state,
	10896	ret,
	10897	LATIN_SMALL_LETTER_SHARP_S,
	10898	&l1_fold_invlist, &unicode_alternate);
	10899	}
	10900	}
	10901	else {
	10902	/* Single character fold. Add everything in its fold
	10903	* closure to the list that this node should match */
	10904	SV** listp;
	10905
	10906	/* The fold closures data structure is a hash with the keys
	10907	* being every character that is folded to, like 'k', and
	10908	* the values each an array of everything that folds to its
	10909	* key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
	10910	if ((listp = hv_fetch(PL_utf8_foldclosures,
	10911	(char *) foldbuf, foldlen, FALSE)))
	10912	{
	10913	AV* list = (AV) listp;
	10914	IV k;
	10915	for (k = 0; k <= av_len(list); k++) {
	10916	SV** c_p = av_fetch(list, k, FALSE);
	10917	UV c;
	10918	if (c_p == NULL) {
	10919	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	10920	}
	10921	c = SvUV(*c_p);
	10922
	10923	/* /aa doesn't allow folds between ASCII and non-;
	10924	* /l doesn't allow them between above and below
	10925	* 256 */
	10926	if ((MORE_ASCII_RESTRICTED
	10927	&& (isASCII(c) != isASCII(j)))
	10928	\|\| (LOC && ((c < 256) != (j < 256))))
	10929	{
	10930	continue;
	10931	}
	10932
	10933	if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
	10934	stored += set_regclass_bit(pRExC_state,
	10935	ret,
	10936	(U8) c,
	10937	&l1_fold_invlist, &unicode_alternate);
	10938	}
	10939	/* It may be that the code point is already in
	10940	* this range or already in the bitmap, in
	10941	* which case we need do nothing */
	10942	else if ((c < start \|\| c > end)
	10943	&& (c > 255
	10944	\|\| ! ANYOF_BITMAP_TEST(ret, c)))
	10945	{
	10946	nonbitmap = add_cp_to_invlist(nonbitmap, c);
	10947	}
	10948	}
	10949	}
	10950	}
	10951	}
	10952	}
	10953	SvREFCNT_dec(fold_intersection);
	10954	}
	10955
	10956	/* Combine the two lists into one. */
	10957	if (l1_fold_invlist) {
	10958	if (nonbitmap) {
	10959	_invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
	10960	SvREFCNT_dec(l1_fold_invlist);
	10961	}
	10962	else {
	10963	nonbitmap = l1_fold_invlist;
	10964	}
	10965	}
	10966
	10967	/* And combine the result (if any) with any inversion list from properties.
	10968	* The lists are kept separate up to now because we don't want to fold the
	10969	* properties */
	10970	if (properties) {
	10971	if (nonbitmap) {
	10972	_invlist_union(nonbitmap, properties, &nonbitmap);
	10973	SvREFCNT_dec(properties);
	10974	}
	10975	else {
	10976	nonbitmap = properties;
	10977	}
	10978	}
	10979
	10980	/* Here, <nonbitmap> contains all the code points we can determine at
	10981	* compile time that we haven't put into the bitmap. Go through it, and
	10982	* for things that belong in the bitmap, put them there, and delete from
	10983	* <nonbitmap> */
	10984	if (nonbitmap) {
	10985
	10986	/* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
	10987	* possibly only should match when the target string is UTF-8 */
	10988	UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
	10989
	10990	/* This gets set if we actually need to modify things */
	10991	bool change_invlist = FALSE;
	10992
	10993	UV start, end;
	10994
	10995	/* Start looking through <nonbitmap> */
	10996	invlist_iterinit(nonbitmap);
	10997	while (invlist_iternext(nonbitmap, &start, &end)) {
	10998	UV high;
	10999	int i;
	11000
	11001	/* Quit if are above what we should change */
	11002	if (start > max_cp_to_set) {
	11003	break;
	11004	}
	11005
	11006	change_invlist = TRUE;
	11007
	11008	/* Set all the bits in the range, up to the max that we are doing */
	11009	high = (end < max_cp_to_set) ? end : max_cp_to_set;
	11010	for (i = start; i <= (int) high; i++) {
	11011	if (! ANYOF_BITMAP_TEST(ret, i)) {
	11012	ANYOF_BITMAP_SET(ret, i);
	11013	stored++;
	11014	prevvalue = value;
	11015	value = i;
	11016	}
	11017	}
	11018	}
	11019
	11020	/* Done with loop; set <nonbitmap> to not include any code points that
	11021	* are in the bitmap */
	11022	if (change_invlist) {
	11023	SV* keep_list = _new_invlist(2);
	11024	_append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX);
	11025	_invlist_intersection(nonbitmap, keep_list, &nonbitmap);
	11026	SvREFCNT_dec(keep_list);
	11027	}
	11028
	11029	/* If have completely emptied it, remove it completely */
	11030	if (invlist_len(nonbitmap) == 0) {
	11031	SvREFCNT_dec(nonbitmap);
	11032	nonbitmap = NULL;
	11033	}
	11034	}
	11035
	11036	/* Here, we have calculated what code points should be in the character
	11037	* class. <nonbitmap> does not overlap the bitmap except possibly in the
	11038	* case of DEPENDS rules.
	11039	*
	11040	* Now we can see about various optimizations. Fold calculation (which we
	11041	* did above) needs to take place before inversion. Otherwise /[^k]/i
	11042	* would invert to include K, which under /i would match k, which it
	11043	* shouldn't. */
	11044
	11045	/* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
	11046	* set the FOLD flag yet, so this does optimize those. It doesn't
	11047	* optimize locale. Doing so perhaps could be done as long as there is
	11048	* nothing like \w in it; some thought also would have to be given to the
	11049	* interaction with above 0x100 chars */
	11050	if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
	11051	&& ! LOC
	11052	&& ! unicode_alternate
	11053	/* In case of /d, there are some things that should match only when in
	11054	* not in the bitmap, i.e., they require UTF8 to match. These are
	11055	* listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
	11056	* case, they don't require UTF8, so can invert here */
	11057	&& (! nonbitmap
	11058	\|\| ! DEPENDS_SEMANTICS
	11059	\|\| (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
	11060	&& SvCUR(listsv) == initial_listsv_len)
	11061	{
	11062	int i;
	11063	if (! nonbitmap) {
	11064	for (i = 0; i < 256; ++i) {
	11065	if (ANYOF_BITMAP_TEST(ret, i)) {
	11066	ANYOF_BITMAP_CLEAR(ret, i);
	11067	}
	11068	else {
	11069	ANYOF_BITMAP_SET(ret, i);
	11070	prevvalue = value;
	11071	value = i;
	11072	}
	11073	}
	11074	/* The inversion means that everything above 255 is matched */
	11075	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	11076	}
	11077	else {
	11078	/* Here, also has things outside the bitmap that may overlap with
	11079	* the bitmap. We have to sync them up, so that they get inverted
	11080	* in both places. Earlier, we removed all overlaps except in the
	11081	* case of /d rules, so no syncing is needed except for this case
	11082	*/
	11083	SV *remove_list = NULL;
	11084
	11085	if (DEPENDS_SEMANTICS) {
	11086	UV start, end;
	11087
	11088	/* Set the bits that correspond to the ones that aren't in the
	11089	* bitmap. Otherwise, when we invert, we'll miss these.
	11090	* Earlier, we removed from the nonbitmap all code points
	11091	* < 128, so there is no extra work here */
	11092	invlist_iterinit(nonbitmap);
	11093	while (invlist_iternext(nonbitmap, &start, &end)) {
	11094	if (start > 255) { /* The bit map goes to 255 */
	11095	break;
	11096	}
	11097	if (end > 255) {
	11098	end = 255;
	11099	}
	11100	for (i = start; i <= (int) end; ++i) {
	11101	ANYOF_BITMAP_SET(ret, i);
	11102	prevvalue = value;
	11103	value = i;
	11104	}
	11105	}
	11106	}
	11107
	11108	/* Now invert both the bitmap and the nonbitmap. Anything in the
	11109	* bitmap has to also be removed from the non-bitmap, but again,
	11110	* there should not be overlap unless is /d rules. */
	11111	_invlist_invert(nonbitmap);
	11112
	11113	for (i = 0; i < 256; ++i) {
	11114	if (ANYOF_BITMAP_TEST(ret, i)) {
	11115	ANYOF_BITMAP_CLEAR(ret, i);
	11116	if (DEPENDS_SEMANTICS) {
	11117	if (! remove_list) {
	11118	remove_list = _new_invlist(2);
	11119	}
	11120	remove_list = add_cp_to_invlist(remove_list, i);
	11121	}
	11122	}
	11123	else {
	11124	ANYOF_BITMAP_SET(ret, i);
	11125	prevvalue = value;
	11126	value = i;
	11127	}
	11128	}
	11129
	11130	/* And do the removal */
	11131	if (DEPENDS_SEMANTICS) {
	11132	if (remove_list) {
	11133	_invlist_subtract(nonbitmap, remove_list, &nonbitmap);
	11134	SvREFCNT_dec(remove_list);
	11135	}
	11136	}
	11137	else {
	11138	/* There is no overlap for non-/d, so just delete anything
	11139	* below 256 */
	11140	SV* keep_list = _new_invlist(2);
	11141	_append_range_to_invlist(keep_list, 256, UV_MAX);
	11142	_invlist_intersection(nonbitmap, keep_list, &nonbitmap);
	11143	SvREFCNT_dec(keep_list);
	11144	}
	11145	}
	11146
	11147	stored = 256 - stored;
	11148
	11149	/* Clear the invert flag since have just done it here */
	11150	ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
	11151	}
	11152
	11153	/* Folding in the bitmap is taken care of above, but not for locale (for
	11154	* which we have to wait to see what folding is in effect at runtime), and
	11155	* for some things not in the bitmap (only the upper latin folds in this
	11156	* case, as all other single-char folding has been set above). Set
	11157	* run-time fold flag for these */
	11158	if (FOLD && (LOC
	11159	\|\| (DEPENDS_SEMANTICS
	11160	&& nonbitmap
	11161	&& ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
	11162	\|\| unicode_alternate))
	11163	{
	11164	ANYOF_FLAGS(ret) \|= ANYOF_LOC_NONBITMAP_FOLD;
	11165	}
	11166
	11167	/* A single character class can be "optimized" into an EXACTish node.
	11168	* Note that since we don't currently count how many characters there are
	11169	* outside the bitmap, we are XXX missing optimization possibilities for
	11170	* them. This optimization can't happen unless this is a truly single
	11171	* character class, which means that it can't be an inversion into a
	11172	* many-character class, and there must be no possibility of there being
	11173	* things outside the bitmap. 'stored' (only) for locales doesn't include
	11174	* \w, etc, so have to make a special test that they aren't present
	11175	*
	11176	* Similarly A 2-character class of the very special form like [bB] can be
	11177	* optimized into an EXACTFish node, but only for non-locales, and for
	11178	* characters which only have the two folds; so things like 'fF' and 'Ii'
	11179	* wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
	11180	* FI'. */
	11181	if (! nonbitmap
	11182	&& ! unicode_alternate
	11183	&& SvCUR(listsv) == initial_listsv_len
	11184	&& ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT\|ANYOF_UNICODE_ALL))
	11185	&& (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	11186	\|\| (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
	11187	\|\| (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	11188	&& (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
	11189	/* If the latest code point has a fold whose
	11190	* bit is set, it must be the only other one */
	11191	&& ((prevvalue = PL_fold_latin1[value]) != (IV)value)
	11192	&& ANYOF_BITMAP_TEST(ret, prevvalue)))))
	11193	{
	11194	/* Note that the information needed to decide to do this optimization
	11195	* is not currently available until the 2nd pass, and that the actually
	11196	* used EXACTish node takes less space than the calculated ANYOF node,
	11197	* and hence the amount of space calculated in the first pass is larger
	11198	* than actually used, so this optimization doesn't gain us any space.
	11199	* But an EXACT node is faster than an ANYOF node, and can be combined
	11200	* with any adjacent EXACT nodes later by the optimizer for further
	11201	* gains. The speed of executing an EXACTF is similar to an ANYOF
	11202	* node, so the optimization advantage comes from the ability to join
	11203	* it to adjacent EXACT nodes */
	11204
	11205	const char * cur_parse= RExC_parse;
	11206	U8 op;
	11207	RExC_emit = (regnode *)orig_emit;
	11208	RExC_parse = (char *)orig_parse;
	11209
	11210	if (stored == 1) {
	11211
	11212	/* A locale node with one point can be folded; all the other cases
	11213	* with folding will have two points, since we calculate them above
	11214	*/
	11215	if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
	11216	op = EXACTFL;
	11217	}
	11218	else {
	11219	op = EXACT;
	11220	}
	11221	}
	11222	else { /* else 2 chars in the bit map: the folds of each other */
	11223
	11224	/* Use the folded value, which for the cases where we get here,
	11225	* is just the lower case of the current one (which may resolve to
	11226	* itself, or to the other one */
	11227	value = toLOWER_LATIN1(value);
	11228
	11229	/* To join adjacent nodes, they must be the exact EXACTish type.
	11230	* Try to use the most likely type, by using EXACTFA if possible,
	11231	* then EXACTFU if the regex calls for it, or is required because
	11232	* the character is non-ASCII. (If <value> is ASCII, its fold is
	11233	* also ASCII for the cases where we get here.) */
	11234	if (MORE_ASCII_RESTRICTED && isASCII(value)) {
	11235	op = EXACTFA;
	11236	}
	11237	else if (AT_LEAST_UNI_SEMANTICS \|\| !isASCII(value)) {
	11238	op = EXACTFU;
	11239	}
	11240	else { /* Otherwise, more likely to be EXACTF type */
	11241	op = EXACTF;
	11242	}
	11243	}
	11244
	11245	ret = reg_node(pRExC_state, op);
	11246	RExC_parse = (char *)cur_parse;
	11247	if (UTF && ! NATIVE_IS_INVARIANT(value)) {
	11248	*STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
	11249	*(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
	11250	STR_LEN(ret)= 2;
	11251	RExC_emit += STR_SZ(2);
	11252	}
	11253	else {
	11254	*STRING(ret)= (char)value;
	11255	STR_LEN(ret)= 1;
	11256	RExC_emit += STR_SZ(1);
	11257	}
	11258	SvREFCNT_dec(listsv);
	11259	return ret;
	11260	}
	11261
	11262	/* If there is a swash and more than one element, we can't use the swash in
	11263	* the optimization below. */
	11264	if (swash && element_count > 1) {
	11265	SvREFCNT_dec(swash);
	11266	swash = NULL;
	11267	}
	11268	if (! nonbitmap
	11269	&& SvCUR(listsv) == initial_listsv_len
	11270	&& ! unicode_alternate)
	11271	{
	11272	ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
	11273	SvREFCNT_dec(listsv);
	11274	SvREFCNT_dec(unicode_alternate);
	11275	}
	11276	else {
	11277	/* av[0] stores the character class description in its textual form:
	11278	* used later (regexec.c:Perl_regclass_swash()) to initialize the
	11279	* appropriate swash, and is also useful for dumping the regnode.
	11280	* av[1] if NULL, is a placeholder to later contain the swash computed
	11281	* from av[0]. But if no further computation need be done, the
	11282	* swash is stored there now.
	11283	* av[2] stores the multicharacter foldings, used later in
	11284	* regexec.c:S_reginclass().
	11285	* av[3] stores the nonbitmap inversion list for use in addition or
	11286	* instead of av[0]; not used if av[1] isn't NULL
	11287	* av[4] is set if any component of the class is from a user-defined
	11288	* property; not used if av[1] isn't NULL */
	11289	AV * const av = newAV();
	11290	SV *rv;
	11291
	11292	av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
	11293	? &PL_sv_undef
	11294	: listsv);
	11295	if (swash) {
	11296	av_store(av, 1, swash);
	11297	SvREFCNT_dec(nonbitmap);
	11298	}
	11299	else {
	11300	av_store(av, 1, NULL);
	11301	if (nonbitmap) {
	11302	av_store(av, 3, nonbitmap);
	11303	av_store(av, 4, newSVuv(has_user_defined_property));
	11304	}
	11305	}
	11306
	11307	/* Store any computed multi-char folds only if we are allowing
	11308	* them */
	11309	if (allow_full_fold) {
	11310	av_store(av, 2, MUTABLE_SV(unicode_alternate));
	11311	if (unicode_alternate) { /* This node is variable length */
	11312	OP(ret) = ANYOFV;
	11313	}
	11314	}
	11315	else {
	11316	av_store(av, 2, NULL);
	11317	}
	11318	rv = newRV_noinc(MUTABLE_SV(av));
	11319	n = add_data(pRExC_state, 1, "s");
	11320	RExC_rxi->data->data[n] = (void*)rv;
	11321	ARG_SET(ret, n);
	11322	}
	11323	return ret;
	11324	}
	11325	#undef _C_C_T_
	11326
	11327
	11328	/* reg_skipcomment()
	11329
	11330	Absorbs an /x style # comment from the input stream.
	11331	Returns true if a newline ended the comment (RExC_parse is then just
	11332	past that newline); returns false, after setting the
	11333	REG_SEEN_RUN_ON_COMMENT flag, if the pattern ended first.
	11334
	11335	Note it's the caller's responsibility to ensure that we are
	11336	actually in /x mode
	11337
	11338	*/
	11339
	11340	STATIC bool
	11341	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
	11342	{
	11343	bool ended = 0;
	11344
	11345	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	11346
	11347	while (RExC_parse < RExC_end) /* consume up to and including the newline */
	11348	if (*RExC_parse++ == '\n') {
	11349	ended = 1;
	11350	break;
	11351	}
	11352	if (!ended) {
	11353	/* we ran off the end of the pattern without ending
	11354	the comment, so we have to add an \n when wrapping */
	11355	RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
	11356	return 0;
	11357	} else
	11358	return 1;
	11359	}
	11360
	11361	/* nextchar()
	11362
	11363	Advances the parse position by one, and then absorbs any
	11364	"whitespace" that follows in the input stream.
	11365
	11366	Without /x "whitespace" means (?#...) style comments only,
	11367	with /x this means (?#...) and # comments and whitespace proper.
	11368
	11369	Returns the RExC_parse point from BEFORE the scan occurs.
	11370
	11371	This is the /x friendly way of saying RExC_parse++.
	11372	*/
	11373
	11374	STATIC char*
	11375	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	11376	{
	11377	char* const retval = RExC_parse++; /* remember the old position, step over one char */
	11378
	11379	PERL_ARGS_ASSERT_NEXTCHAR;
	11380
	11381	for (;;) { /* repeat until nothing skippable is found at RExC_parse */
	11382	if (RExC_end - RExC_parse >= 3
	11383	&& *RExC_parse == '('
	11384	&& RExC_parse[1] == '?'
	11385	&& RExC_parse[2] == '#')
	11386	{
	11387	while (*RExC_parse != ')') { /* NOTE(review): the byte is read before the end check on the next line; presumably relies on a terminator byte at RExC_end -- confirm */
	11388	if (RExC_parse == RExC_end)
	11389	FAIL("Sequence (?#... not terminated");
	11390	RExC_parse++;
	11391	}
	11392	RExC_parse++; /* step over the closing ')' */
	11393	continue;
	11394	}
	11395	if (RExC_flags & RXf_PMf_EXTENDED) { /* under /x also skip spaces and # comments */
	11396	if (isSPACE(*RExC_parse)) {
	11397	RExC_parse++;
	11398	continue;
	11399	}
	11400	else if (*RExC_parse == '#') {
	11401	if ( reg_skipcomment( pRExC_state ) ) /* false means the comment ran to the end of the pattern */
	11402	continue;
	11403	}
	11404	}
	11405	return retval;
	11406	}
	11407	}
	11408
	11409	/*
	11410	- reg_node - emit a node for opcode 'op' at RExC_emit and return where it was put; in the SIZE_ONLY pass nothing is written and RExC_size just grows by 1
	11411	*/
	11412	STATIC regnode * /* Location. */
	11413	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	11414	{
	11415	dVAR;
	11416	register regnode *ptr;
	11417	regnode * const ret = RExC_emit; /* the new node goes at the current emit position */
	11418	GET_RE_DEBUG_FLAGS_DECL;
	11419
	11420	PERL_ARGS_ASSERT_REG_NODE;
	11421
	11422	if (SIZE_ONLY) { /* sizing pass: account for the node, emit nothing */
	11423	SIZE_ALIGN(RExC_size);
	11424	RExC_size += 1;
	11425	return(ret);
	11426	}
	11427	if (RExC_emit >= RExC_emit_bound) /* no room left in the program buffer */
	11428	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	11429	op, RExC_emit, RExC_emit_bound);
	11430
	11431	NODE_ALIGN_FILL(ret);
	11432	ptr = ret;
	11433	FILL_ADVANCE_NODE(ptr, op);
	11434	#ifdef RE_TRACK_PATTERN_OFFSETS
	11435	if (RExC_offsets) { /* MJD */
	11436	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	11437	"reg_node", __LINE__,
	11438	PL_reg_name[op],
	11439	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	11440	? "Overwriting end of array!\n" : "OK",
	11441	(UV)(RExC_emit - RExC_emit_start),
	11442	(UV)(RExC_parse - RExC_start),
	11443	(UV)RExC_offsets[0]));
	11444	Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); /* an END node is recorded one position further on */
	11445	}
	11446	#endif
	11447	RExC_emit = ptr; /* ptr as left by FILL_ADVANCE_NODE becomes the new emit position */
	11448	return(ret);
	11449	}
	11450
	11451	/*
	11452	- reganode - emit a node for opcode 'op' carrying argument 'arg' at RExC_emit and return where it was put; in the SIZE_ONLY pass nothing is written and RExC_size just grows by 2
	11453	*/
	11454	STATIC regnode * /* Location. */
	11455	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	11456	{
	11457	dVAR;
	11458	register regnode *ptr;
	11459	regnode * const ret = RExC_emit; /* the new node goes at the current emit position */
	11460	GET_RE_DEBUG_FLAGS_DECL;
	11461
	11462	PERL_ARGS_ASSERT_REGANODE;
	11463
	11464	if (SIZE_ONLY) { /* sizing pass: account for the node, emit nothing */
	11465	SIZE_ALIGN(RExC_size);
	11466	RExC_size += 2;
	11467	/*
	11468	We can't do this:
	11469
	11470	assert(2==regarglen[op]+1);
	11471
	11472	Anything larger than this has to allocate the extra amount.
	11473	If we changed this to be:
	11474
	11475	RExC_size += (1 + regarglen[op]);
	11476
	11477	then it wouldn't matter. It's not clear what side effect
	11478	might come from that so it's not done so far.
	11479	-- dmq
	11480	*/
	11481	return(ret);
	11482	}
	11483	if (RExC_emit >= RExC_emit_bound) /* no room left in the program buffer */
	11484	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	11485	op, RExC_emit, RExC_emit_bound); /* NOTE(review): the text names reg_node although this is reganode; presumably shared with reg_node on purpose -- confirm before changing */
	11486
	11487	NODE_ALIGN_FILL(ret);
	11488	ptr = ret;
	11489	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	11490	#ifdef RE_TRACK_PATTERN_OFFSETS
	11491	if (RExC_offsets) { /* MJD */
	11492	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	11493	"reganode",
	11494	__LINE__,
	11495	PL_reg_name[op],
	11496	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	11497	"Overwriting end of array!\n" : "OK",
	11498	(UV)(RExC_emit - RExC_emit_start),
	11499	(UV)(RExC_parse - RExC_start),
	11500	(UV)RExC_offsets[0]));
	11501	Set_Cur_Node_Offset;
	11502	}
	11503	#endif
	11504	RExC_emit = ptr; /* ptr as left by FILL_ADVANCE_NODE_ARG becomes the new emit position */
	11505	return(ret);
	11506	}
	11507
	11508	/*
	11509	- reguni - emit (if appropriate) a Unicode character: returns the number of bytes uv occupies; outside the SIZE_ONLY pass the bytes are also written starting at s
	11510	*/
	11511	STATIC STRLEN
	11512	S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
	11513	{
	11514	dVAR;
	11515
	11516	PERL_ARGS_ASSERT_REGUNI;
	11517
	11518	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); /* length = pointer returned by uvchr_to_utf8 minus the start */
	11519	}
	11520
	11521	/*
	11522	- reginsert - insert an operator in front of already-emitted operand
	11523	*
	11524	* Means relocating the operand: every node from opnd up to RExC_emit is moved up by the size of the new node, recorded paren positions at or after opnd are shifted by the same amount, and the new node (with a zeroed argument area) is written at opnd. In the SIZE_ONLY pass only RExC_size grows.
	11525	*/
	11526	STATIC void
	11527	S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
	11528	{
	11529	dVAR;
	11530	register regnode *src;
	11531	register regnode *dst;
	11532	register regnode *place;
	11533	const int offset = regarglen[(U8)op];
	11534	const int size = NODE_STEP_REGNODE + offset; /* regnodes taken by the inserted node */
	11535	GET_RE_DEBUG_FLAGS_DECL;
	11536
	11537	PERL_ARGS_ASSERT_REGINSERT;
	11538	PERL_UNUSED_ARG(depth);
	11539	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	11540	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	11541	if (SIZE_ONLY) {
	11542	RExC_size += size;
	11543	return;
	11544	}
	11545
	11546	src = RExC_emit;
	11547	RExC_emit += size;
	11548	dst = RExC_emit;
	11549	if (RExC_open_parens) { /* keep recorded paren node positions in step with the move */
	11550	int paren;
	11551	/*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
	11552	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	11553	if ( RExC_open_parens[paren] >= opnd ) {
	11554	/*DEBUG_PARSE_FMT("open"," - %d",size);*/
	11555	RExC_open_parens[paren] += size;
	11556	} else {
	11557	/*DEBUG_PARSE_FMT("open"," - %s","ok");*/
	11558	}
	11559	if ( RExC_close_parens[paren] >= opnd ) {
	11560	/*DEBUG_PARSE_FMT("close"," - %d",size);*/
	11561	RExC_close_parens[paren] += size;
	11562	} else {
	11563	/*DEBUG_PARSE_FMT("close"," - %s","ok");*/
	11564	}
	11565	}
	11566	}
	11567
	11568	while (src > opnd) { /* copy from the top down so overlapping nodes are not clobbered */
	11569	StructCopy(--src, --dst, regnode);
	11570	#ifdef RE_TRACK_PATTERN_OFFSETS
	11571	if (RExC_offsets) { /* MJD 20010112 */
	11572	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	11573	"reg_insert",
	11574	__LINE__,
	11575	PL_reg_name[op],
	11576	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	11577	? "Overwriting end of array!\n" : "OK",
	11578	(UV)(src - RExC_emit_start),
	11579	(UV)(dst - RExC_emit_start),
	11580	(UV)RExC_offsets[0]));
	11581	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	11582	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	11583	}
	11584	#endif
	11585	}
	11586
	11587
	11588	place = opnd; /* Op node, where operand used to be. */
	11589	#ifdef RE_TRACK_PATTERN_OFFSETS
	11590	if (RExC_offsets) { /* MJD */
	11591	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	11592	"reginsert",
	11593	__LINE__,
	11594	PL_reg_name[op],
	11595	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	11596	? "Overwriting end of array!\n" : "OK",
	11597	(UV)(place - RExC_emit_start),
	11598	(UV)(RExC_parse - RExC_start),
	11599	(UV)RExC_offsets[0]));
	11600	Set_Node_Offset(place, RExC_parse);
	11601	Set_Node_Length(place, 1);
	11602	}
	11603	#endif
	11604	src = NEXTOPER(place);
	11605	FILL_ADVANCE_NODE(place, op);
	11606	Zero(src, offset, regnode); /* clear the argument area of the new node */
	11607	}
	11608
	11609	/*
	11610	- regtail - set the next-pointer at the end of a node chain of p to val; does nothing in the SIZE_ONLY pass.
	11611	- SEE ALSO: regtail_study
	11612	*/
	11613	/* TODO: All three parms should be const */
	11614	STATIC void
	11615	S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
	11616	{
	11617	dVAR;
	11618	register regnode *scan;
	11619	GET_RE_DEBUG_FLAGS_DECL;
	11620
	11621	PERL_ARGS_ASSERT_REGTAIL;
	11622	#ifndef DEBUGGING
	11623	PERL_UNUSED_ARG(depth);
	11624	#endif
	11625
	11626	if (SIZE_ONLY)
	11627	return;
	11628
	11629	/* Find last node. */
	11630	scan = p;
	11631	for (;;) {
	11632	regnode * const temp = regnext(scan);
	11633	DEBUG_PARSE_r({
	11634	SV * const mysv=sv_newmortal();
	11635	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	11636	regprop(RExC_rx, mysv, scan);
	11637	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	11638	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	11639	(temp == NULL ? "->" : ""),
	11640	(temp == NULL ? PL_reg_name[OP(val)] : "")
	11641	);
	11642	});
	11643	if (temp == NULL) /* scan is the last node of the chain */
	11644	break;
	11645	scan = temp;
	11646	}
	11647
	11648	if (reg_off_by_arg[OP(scan)]) { /* this opcode keeps its offset in the argument field */
	11649	ARG_SET(scan, val - scan);
	11650	}
	11651	else {
	11652	NEXT_OFF(scan) = val - scan;
	11653	}
	11654	}
	11655
	11656	#ifdef DEBUGGING
	11657	/*
	11658	- regtail_study - set the next-pointer at the end of a node chain of p to val.
	11659	- Look for optimizable sequences at the same time.
	11660	- currently only looks for EXACT chains.
	11661
	11662	This is experimental code. The idea is to use this routine to perform
	11663	in place optimizations on branches and groups as they are constructed,
	11664	with the long term intention of removing optimization from study_chunk so
	11665	that it is purely analytical.
	11666
	11667	Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	11668	to control which is which.
	11669
	11670	Returns the single EXACT-type opcode shared by every non-NOTHING node of the chain, 0 if the chain holds anything else or mixes such opcodes, or PSEUDO if no such node was seen (also returned in the SIZE_ONLY pass).
	11671	/* TODO: All four parms should be const */
	11672
	11673	STATIC U8
	11674	S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
	11675	{
	11676	dVAR;
	11677	register regnode *scan;
	11678	U8 exact = PSEUDO;
	11679	#ifdef EXPERIMENTAL_INPLACESCAN
	11680	I32 min = 0;
	11681	#endif
	11682	GET_RE_DEBUG_FLAGS_DECL;
	11683
	11684	PERL_ARGS_ASSERT_REGTAIL_STUDY;
	11685
	11686
	11687	if (SIZE_ONLY)
	11688	return exact;
	11689
	11690	/* Find last node. */
	11691
	11692	scan = p;
	11693	for (;;) {
	11694	regnode * const temp = regnext(scan);
	11695	#ifdef EXPERIMENTAL_INPLACESCAN
	11696	if (PL_regkind[OP(scan)] == EXACT) {
	11697	bool has_exactf_sharp_s; /* Unexamined in this routine */
	11698	if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
	11699	return EXACT;
	11700	}
	11701	#endif
	11702	if ( exact ) { /* once cleared to 0 the chain is known not to be uniform */
	11703	switch (OP(scan)) {
	11704	case EXACT:
	11705	case EXACTF:
	11706	case EXACTFA:
	11707	case EXACTFU:
	11708	case EXACTFU_SS:
	11709	case EXACTFU_NO_TRIE:
	11710	case EXACTFL:
	11711	if( exact == PSEUDO )
	11712	exact= OP(scan);
	11713	else if ( exact != OP(scan) )
	11714	exact= 0; /* FALL THROUGH */
	11715	case NOTHING:
	11716	break;
	11717	default:
	11718	exact= 0;
	11719	}
	11720	}
	11721	DEBUG_PARSE_r({
	11722	SV * const mysv=sv_newmortal();
	11723	DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	11724	regprop(RExC_rx, mysv, scan);
	11725	PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	11726	SvPV_nolen_const(mysv),
	11727	REG_NODE_NUM(scan),
	11728	PL_reg_name[exact]);
	11729	});
	11730	if (temp == NULL) /* scan is the last node of the chain */
	11731	break;
	11732	scan = temp;
	11733	}
	11734	DEBUG_PARSE_r({
	11735	SV * const mysv_val=sv_newmortal();
	11736	DEBUG_PARSE_MSG("");
	11737	regprop(RExC_rx, mysv_val, val);
	11738	PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	11739	SvPV_nolen_const(mysv_val),
	11740	(IV)REG_NODE_NUM(val),
	11741	(IV)(val - scan)
	11742	);
	11743	});
	11744	if (reg_off_by_arg[OP(scan)]) { /* this opcode keeps its offset in the argument field */
	11745	ARG_SET(scan, val - scan);
	11746	}
	11747	else {
	11748	NEXT_OFF(scan) = val - scan;
	11749	}
	11750
	11751	return exact;
	11752	}
	11753	#endif
	11754
	11755	/*
	11756	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	11757	*/
	11758	#ifdef DEBUGGING
	11759	static void
	11760	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	11761	{ /* Print to Perl_debug_log the names of the bits set in flags, then the character set name; lead, if non-NULL, is printed first and the line is finished with a newline (or "[none-set]") */
	11762	int bit;
	11763	int set=0;
	11764	regex_charset cs;
	11765
	11766	for (bit=0; bit<32; bit++) {
	11767	if (flags & (1U<<bit)) { /* unsigned 1: shifting a signed 1 into bit 31 is undefined */
	11768	if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
	11769	continue;
	11770	}
	11771	if (!set++ && lead)
	11772	PerlIO_printf(Perl_debug_log, "%s",lead);
	11773	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	11774	}
	11775	}
	11776	if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	11777	if (!set++ && lead) {
	11778	PerlIO_printf(Perl_debug_log, "%s",lead);
	11779	}
	11780	switch (cs) {
	11781	case REGEX_UNICODE_CHARSET:
	11782	PerlIO_printf(Perl_debug_log, "UNICODE");
	11783	break;
	11784	case REGEX_LOCALE_CHARSET:
	11785	PerlIO_printf(Perl_debug_log, "LOCALE");
	11786	break;
	11787	case REGEX_ASCII_RESTRICTED_CHARSET:
	11788	PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	11789	break;
	11790	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	11791	PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
	11792	break;
	11793	default:
	11794	PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	11795	break;
	11796	}
	11797	}
	11798	if (lead) {
	11799	if (set)
	11800	PerlIO_printf(Perl_debug_log, "\n");
	11801	else
	11802	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	11803	}
	11804	}
	11805	#endif
	11806
	11807	void
	11808	Perl_regdump(pTHX_ const regexp *r)
	11809	{ /* Under DEBUGGING: dump the program of r via dumpuntil(), then print one line of header fields to Perl_debug_log; otherwise does nothing */
	11810	#ifdef DEBUGGING
	11811	dVAR;
	11812	SV * const sv = sv_newmortal();
	11813	SV *dsv= sv_newmortal();
	11814	RXi_GET_DECL(r,ri);
	11815	GET_RE_DEBUG_FLAGS_DECL;
	11816
	11817	PERL_ARGS_ASSERT_REGDUMP;
	11818
	11819	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	11820
	11821	/* Header fields of interest. */
	11822	if (r->anchored_substr) {
	11823	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	11824	RE_SV_DUMPLEN(r->anchored_substr), 30);
	11825	PerlIO_printf(Perl_debug_log,
	11826	"anchored %s%s at %"IVdf" ",
	11827	s, RE_SV_TAIL(r->anchored_substr),
	11828	(IV)r->anchored_offset);
	11829	} else if (r->anchored_utf8) {
	11830	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	11831	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	11832	PerlIO_printf(Perl_debug_log,
	11833	"anchored utf8 %s%s at %"IVdf" ",
	11834	s, RE_SV_TAIL(r->anchored_utf8),
	11835	(IV)r->anchored_offset);
	11836	}
	11837	if (r->float_substr) {
	11838	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	11839	RE_SV_DUMPLEN(r->float_substr), 30);
	11840	PerlIO_printf(Perl_debug_log,
	11841	"floating %s%s at %"IVdf"..%"UVuf" ",
	11842	s, RE_SV_TAIL(r->float_substr),
	11843	(IV)r->float_min_offset, (UV)r->float_max_offset);
	11844	} else if (r->float_utf8) {
	11845	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	11846	RE_SV_DUMPLEN(r->float_utf8), 30);
	11847	PerlIO_printf(Perl_debug_log,
	11848	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	11849	s, RE_SV_TAIL(r->float_utf8),
	11850	(IV)r->float_min_offset, (UV)r->float_max_offset);
	11851	}
	11852	if (r->check_substr || r->check_utf8) /* opens the "(checking ..." group closed below */
	11853	PerlIO_printf(Perl_debug_log,
	11854	(const char *)
	11855	(r->check_substr == r->float_substr
	11856	&& r->check_utf8 == r->float_utf8
	11857	? "(checking floating" : "(checking anchored"));
	11858	if (r->extflags & RXf_NOSCAN)
	11859	PerlIO_printf(Perl_debug_log, " noscan");
	11860	if (r->extflags & RXf_CHECK_ALL)
	11861	PerlIO_printf(Perl_debug_log, " isall");
	11862	if (r->check_substr || r->check_utf8)
	11863	PerlIO_printf(Perl_debug_log, ") ");
	11864
	11865	if (ri->regstclass) {
	11866	regprop(r, sv, ri->regstclass);
	11867	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	11868	}
	11869	if (r->extflags & RXf_ANCH) {
	11870	PerlIO_printf(Perl_debug_log, "anchored");
	11871	if (r->extflags & RXf_ANCH_BOL)
	11872	PerlIO_printf(Perl_debug_log, "(BOL)");
	11873	if (r->extflags & RXf_ANCH_MBOL)
	11874	PerlIO_printf(Perl_debug_log, "(MBOL)");
	11875	if (r->extflags & RXf_ANCH_SBOL)
	11876	PerlIO_printf(Perl_debug_log, "(SBOL)");
	11877	if (r->extflags & RXf_ANCH_GPOS)
	11878	PerlIO_printf(Perl_debug_log, "(GPOS)");
	11879	PerlIO_putc(Perl_debug_log, ' ');
	11880	}
	11881	if (r->extflags & RXf_GPOS_SEEN)
	11882	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	11883	if (r->intflags & PREGf_SKIP)
	11884	PerlIO_printf(Perl_debug_log, "plus ");
	11885	if (r->intflags & PREGf_IMPLICIT)
	11886	PerlIO_printf(Perl_debug_log, "implicit ");
	11887	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	11888	if (r->extflags & RXf_EVAL_SEEN)
	11889	PerlIO_printf(Perl_debug_log, "with eval ");
	11890	PerlIO_printf(Perl_debug_log, "\n");
	11891	DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
	11892	#else
	11893	PERL_ARGS_ASSERT_REGDUMP;
	11894	PERL_UNUSED_CONTEXT;
	11895	PERL_UNUSED_ARG(r);
	11896	#endif /* DEBUGGING */
	11897	}
	11898
	11899	/*
	11900	- regprop - printable representation of opcode
	11901	*/
	11902	#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
	11903	STMT_START { \
	11904	if (do_sep) { \
	11905	Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
	11906	if ((flags) & ANYOF_INVERT) \
	11907	/* make sure the invert info is in each; do_sep must be an lvalue, it is reset below */ \
	11908	sv_catpvs(sv, "^"); \
	11909	do_sep = 0; \
	11910	} \
	11911	} STMT_END
	11912
	11913	void
	11914	Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
	11915	{ /* Under DEBUGGING: set sv to a printable description of regnode o (opcode name plus kind-specific details); croaks on an opcode above REGNODE_MAX. Otherwise does nothing */
	11916	#ifdef DEBUGGING
	11917	dVAR;
	11918	register int k;
	11919	RXi_GET_DECL(prog,progi);
	11920	GET_RE_DEBUG_FLAGS_DECL;
	11921
	11922	PERL_ARGS_ASSERT_REGPROP;
	11923
	11924	sv_setpvs(sv, "");
	11925
	11926	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	11927	/* It would be nice to FAIL() here, but this may be called from
	11928	regexec.c, and it would be hard to supply pRExC_state. */
	11929	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
	11930	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	11931
	11932	k = PL_regkind[OP(o)];
	11933
	11934	if (k == EXACT) {
	11935	sv_catpvs(sv, " ");
	11936	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	11937	* is a crude hack but it may be the best for now since
	11938	* we have no flag "this EXACTish node was UTF-8"
	11939	* --jhi */
	11940	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	11941	PERL_PV_ESCAPE_UNI_DETECT |
	11942	PERL_PV_ESCAPE_NONASCII |
	11943	PERL_PV_PRETTY_ELLIPSES |
	11944	PERL_PV_PRETTY_LTGT |
	11945	PERL_PV_PRETTY_NOCLEAR
	11946	);
	11947	} else if (k == TRIE) {
	11948	/* print the details of the trie in dumpuntil instead, as
	11949	* progi->data isn't available here */
	11950	const char op = OP(o);
	11951	const U32 n = ARG(o);
	11952	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	11953	(reg_ac_data *)progi->data->data[n] :
	11954	NULL;
	11955	const reg_trie_data * const trie
	11956	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	11957
	11958	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	11959	DEBUG_TRIE_COMPILE_r(
	11960	Perl_sv_catpvf(aTHX_ sv,
	11961	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	11962	(UV)trie->startstate,
	11963	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	11964	(UV)trie->wordcount,
	11965	(UV)trie->minlen,
	11966	(UV)trie->maxlen,
	11967	(UV)TRIE_CHARCOUNT(trie),
	11968	(UV)trie->uniquecharcount
	11969	)
	11970	);
	11971	if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
	11972	int i;
	11973	int rangestart = -1;
	11974	U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
	11975	sv_catpvs(sv, "[");
	11976	for (i = 0; i <= 256; i++) { /* i == 256 only flushes a range still open at 255 */
	11977	if (i < 256 && BITMAP_TEST(bitmap,i)) {
	11978	if (rangestart == -1)
	11979	rangestart = i;
	11980	} else if (rangestart != -1) {
	11981	if (i <= rangestart + 3) /* short runs are listed char by char */
	11982	for (; rangestart < i; rangestart++)
	11983	put_byte(sv, rangestart);
	11984	else {
	11985	put_byte(sv, rangestart);
	11986	sv_catpvs(sv, "-");
	11987	put_byte(sv, i - 1);
	11988	}
	11989	rangestart = -1;
	11990	}
	11991	}
	11992	sv_catpvs(sv, "]");
	11993	}
	11994
	11995	} else if (k == CURLY) {
	11996	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
	11997	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	11998	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	11999	}
	12000	else if (k == WHILEM && o->flags) /* Ordinal/of */
	12001	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	12002	else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
	12003	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	12004	if ( RXp_PAREN_NAMES(prog) ) {
	12005	if ( k != REF || (OP(o) < NREF)) {
	12006	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	12007	SV **name= av_fetch(list, ARG(o), 0 );
	12008	if (name)
	12009	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	12010	}
	12011	else {
	12012	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	12013	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	12014	I32 *nums=(I32*)SvPVX(sv_dat); /* the string buffer of sv_dat is read as an array of I32, SvIVX(sv_dat) entries long */
	12015	SV **name= av_fetch(list, nums[0], 0 );
	12016	I32 n;
	12017	if (name) {
	12018	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	12019	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	12020	(n ? "," : ""), (IV)nums[n]);
	12021	}
	12022	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	12023	}
	12024	}
	12025	}
	12026	} else if (k == GOSUB)
	12027	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
	12028	else if (k == VERB) {
	12029	if (!o->flags)
	12030	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	12031	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	12032	} else if (k == LOGICAL)
	12033	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
	12034	else if (k == ANYOF) {
	12035	int i, rangestart = -1;
	12036	const U8 flags = ANYOF_FLAGS(o);
	12037	int do_sep = 0;
	12038
	12039	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	12040	static const char * const anyofs[] = {
	12041	"\\w",
	12042	"\\W",
	12043	"\\s",
	12044	"\\S",
	12045	"\\d",
	12046	"\\D",
	12047	"[:alnum:]",
	12048	"[:^alnum:]",
	12049	"[:alpha:]",
	12050	"[:^alpha:]",
	12051	"[:ascii:]",
	12052	"[:^ascii:]",
	12053	"[:cntrl:]",
	12054	"[:^cntrl:]",
	12055	"[:graph:]",
	12056	"[:^graph:]",
	12057	"[:lower:]",
	12058	"[:^lower:]",
	12059	"[:print:]",
	12060	"[:^print:]",
	12061	"[:punct:]",
	12062	"[:^punct:]",
	12063	"[:upper:]",
	12064	"[:^upper:]",
	12065	"[:xdigit:]",
	12066	"[:^xdigit:]",
	12067	"[:space:]",
	12068	"[:^space:]",
	12069	"[:blank:]",
	12070	"[:^blank:]"
	12071	};
	12072
	12073	if (flags & ANYOF_LOCALE)
	12074	sv_catpvs(sv, "{loc}");
	12075	if (flags & ANYOF_LOC_NONBITMAP_FOLD)
	12076	sv_catpvs(sv, "{i}");
	12077	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	12078	if (flags & ANYOF_INVERT)
	12079	sv_catpvs(sv, "^");
	12080
	12081	/* output what the standard cp 0-255 bitmap matches */
	12082	for (i = 0; i <= 256; i++) { /* i == 256 only flushes a range still open at 255 */
	12083	if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
	12084	if (rangestart == -1)
	12085	rangestart = i;
	12086	} else if (rangestart != -1) {
	12087	if (i <= rangestart + 3)
	12088	for (; rangestart < i; rangestart++)
	12089	put_byte(sv, rangestart);
	12090	else {
	12091	put_byte(sv, rangestart);
	12092	sv_catpvs(sv, "-");
	12093	put_byte(sv, i - 1);
	12094	}
	12095	do_sep = 1;
	12096	rangestart = -1;
	12097	}
	12098	}
	12099
	12100	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	12101	/* output any special charclass tests (used entirely under use locale) */
	12102	if (ANYOF_CLASS_TEST_ANY_SET(o))
	12103	for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
	12104	if (ANYOF_CLASS_TEST(o,i)) {
	12105	sv_catpv(sv, anyofs[i]);
	12106	do_sep = 1;
	12107	}
	12108
	12109	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	12110
	12111	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	12112	sv_catpvs(sv, "{non-utf8-latin1-all}");
	12113	}
	12114
	12115	/* output information about the unicode matching */
	12116	if (flags & ANYOF_UNICODE_ALL)
	12117	sv_catpvs(sv, "{unicode_all}");
	12118	else if (ANYOF_NONBITMAP(o))
	12119	sv_catpvs(sv, "{unicode}");
	12120	if (flags & ANYOF_NONBITMAP_NON_UTF8)
	12121	sv_catpvs(sv, "{outside bitmap}");
	12122
	12123	if (ANYOF_NONBITMAP(o)) {
	12124	SV *lv; /* Set if there is something outside the bit map */
	12125	SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
	12126	bool byte_output = FALSE; /* If something in the bitmap has been
	12127	output */
	12128
	12129	if (lv && lv != &PL_sv_undef) {
	12130	if (sw) {
	12131	U8 s[UTF8_MAXBYTES_CASE+1];
	12132
	12133	for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
	12134	uvchr_to_utf8(s, i);
	12135
	12136	if (i < 256
	12137	&& ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
	12138	things already
	12139	output as part
	12140	of the bitmap */
	12141	&& swash_fetch(sw, s, TRUE))
	12142	{
	12143	if (rangestart == -1)
	12144	rangestart = i;
	12145	} else if (rangestart != -1) {
	12146	byte_output = TRUE;
	12147	if (i <= rangestart + 3)
	12148	for (; rangestart < i; rangestart++) {
	12149	put_byte(sv, rangestart);
	12150	}
	12151	else {
	12152	put_byte(sv, rangestart);
	12153	sv_catpvs(sv, "-");
	12154	put_byte(sv, i-1);
	12155	}
	12156	rangestart = -1;
	12157	}
	12158	}
	12159	}
	12160
	12161	{
	12162	char *s = savesvpv(lv);
	12163	char * const origs = s;
	12164
	12165	while (*s && *s != '\n') /* skip the first line; test the character, not the pointer */
	12166	s++;
	12167
	12168	if (*s == '\n') {
	12169	const char * const t = ++s;
	12170
	12171	if (byte_output) {
	12172	sv_catpvs(sv, " ");
	12173	}
	12174
	12175	while (*s) { /* rewrite the copy in place: newline -> space, tab -> '-' */
	12176	if (*s == '\n') {
	12177
	12178	/* Truncate very long output */
	12179	if (s - origs > 256) {
	12180	Perl_sv_catpvf(aTHX_ sv,
	12181	"%.*s...",
	12182	(int) (s - origs - 1),
	12183	t);
	12184	goto out_dump;
	12185	}
	12186	*s = ' ';
	12187	}
	12188	else if (*s == '\t') {
	12189	*s = '-';
	12190	}
	12191	s++;
	12192	}
	12193	if (s[-1] == ' ')
	12194	s[-1] = 0;
	12195
	12196	sv_catpv(sv, t);
	12197	}
	12198
	12199	out_dump:
	12200
	12201	Safefree(origs);
	12202	}
	12203	SvREFCNT_dec(lv);
	12204	}
	12205	}
	12206
	12207	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	12208	}
	12209	else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
	12210	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	12211	#else
	12212	PERL_UNUSED_CONTEXT;
	12213	PERL_UNUSED_ARG(sv);
	12214	PERL_UNUSED_ARG(o);
	12215	PERL_UNUSED_ARG(prog);
	12216	#endif /* DEBUGGING */
	12217	}
	12218
	12219	SV *
	12220	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	12221	{ /* Assume that RE_INTUIT is set. Returns check_substr of r if set, else check_utf8; the debug block only logs that string */
	12222	dVAR;
	12223	struct regexp *const prog = (struct regexp *)SvANY(r);
	12224	GET_RE_DEBUG_FLAGS_DECL;
	12225
	12226	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	12227	PERL_UNUSED_CONTEXT;
	12228
	12229	DEBUG_COMPILE_r(
	12230	{
	12231	const char * const s = SvPV_nolen_const(prog->check_substr
	12232	? prog->check_substr : prog->check_utf8);
	12233
	12234	if (!PL_colorset) reginitcolors();
	12235	PerlIO_printf(Perl_debug_log,
	12236	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	12237	PL_colors[4],
	12238	prog->check_substr ? "" : "utf8 ",
	12239	PL_colors[5],PL_colors[0],
	12240	s,
	12241	PL_colors[1],
	12242	(strlen(s) > 60 ? "..." : ""));
	12243	} );
	12244
	12245	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	12246	}
	12247
	12248	/*
	12249	pregfree()
	12250
	12251	handles refcounting and freeing the perl core regexp structure. When
	12252	it is necessary to actually free the structure the first thing it
	12253	does is call the 'free' method of the regexp_engine associated to
	12254	the regexp, allowing the handling of the void *pprivate; member
	12255	first. (This routine is not overridable by extensions, which is why
	12256	the extensions free is called first.)
	12257
	12258	See regdupe and regdupe_internal if you change anything here.
	12259	*/
	12260	#ifndef PERL_IN_XSUB_RE
	12261	void
	12262	Perl_pregfree(pTHX_ REGEXP *r)
	12263	{ /* Releases the caller's reference to r */
	12264	SvREFCNT_dec(r); /* the only action taken here is dropping one reference count */
	12265	}
	12266
	12267	void
	12268	Perl_pregfree2(pTHX_ REGEXP *rx)
	12269	{ /* Releases what the regexp structure behind rx holds: the engine-private data and paren names (or, for a copy, the reference to mother_re), the substring data, any saved match copy, and the offs array */
	12270	dVAR;
	12271	struct regexp *const r = (struct regexp *)SvANY(rx);
	12272	GET_RE_DEBUG_FLAGS_DECL;
	12273
	12274	PERL_ARGS_ASSERT_PREGFREE2;
	12275
	12276	if (r->mother_re) { /* a copy made from another regexp: only give back the reference to it */
	12277	ReREFCNT_dec(r->mother_re);
	12278	} else {
	12279	CALLREGFREE_PVT(rx); /* free the private data */
	12280	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	12281	}
	12282	if (r->substrs) {
	12283	SvREFCNT_dec(r->anchored_substr);
	12284	SvREFCNT_dec(r->anchored_utf8);
	12285	SvREFCNT_dec(r->float_substr);
	12286	SvREFCNT_dec(r->float_utf8);
	12287	Safefree(r->substrs);
	12288	}
	12289	RX_MATCH_COPY_FREE(rx);
	12290	#ifdef PERL_OLD_COPY_ON_WRITE
	12291	SvREFCNT_dec(r->saved_copy);
	12292	#endif
	12293	Safefree(r->offs);
	12294	}
	12295
	12296	/* reg_temp_copy()
	12297
	12298	This is a hacky workaround to the structural issue of match results
	12299	being stored in the regexp structure which is in turn stored in
	12300	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	12301	could be PL_curpm in multiple contexts, and could require multiple
	12302	result sets being associated with the pattern simultaneously, such
	12303	as when doing a recursive match with (??{$qr})
	12304
	12305	The solution is to make a lightweight copy of the regexp structure
	12306	when a qr// is returned from the code executed by (??{$qr}) this
	12307	lightweight copy doesn't actually own any of its data except for
	12308	the startp/endp and the actual regexp structure itself.
	12309
	12310	If ret_x is NULL a new SVt_REGEXP SV is created; the copy is returned and holds a reference to rx in its mother_re field.
	12311
	12312
	12313	REGEXP *
	12314	Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
	12315	{
	12316	struct regexp *ret;
	12317	struct regexp *const r = (struct regexp *)SvANY(rx);
	12318	register const I32 npar = r->nparens+1;
	12319
	12320	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	12321
	12322	if (!ret_x)
	12323	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	12324	ret = (struct regexp *)SvANY(ret_x);
	12325
	12326	(void)ReREFCNT_inc(rx); /* this reference is stored in ret->mother_re below */
	12327	/* We can take advantage of the existing "copied buffer" mechanism in SVs
	12328	by pointing directly at the buffer, but flagging that the allocated
	12329	space in the copy is zero. As we've just done a struct copy, it's now
	12330	a case of zero-ing that, rather than copying the current length. */
	12331	SvPV_set(ret_x, RX_WRAPPED(rx));
	12332	SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
	12333	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	12334	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	12335	SvLEN_set(ret_x, 0);
	12336	SvSTASH_set(ret_x, NULL);
	12337	SvMAGIC_set(ret_x, NULL);
	12338	Newx(ret->offs, npar, regexp_paren_pair); /* the copy gets its own offsets array */
	12339	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	12340	if (r->substrs) {
	12341	Newx(ret->substrs, 1, struct reg_substr_data);
	12342	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	12343
	12344	SvREFCNT_inc_void(ret->anchored_substr);
	12345	SvREFCNT_inc_void(ret->anchored_utf8);
	12346	SvREFCNT_inc_void(ret->float_substr);
	12347	SvREFCNT_inc_void(ret->float_utf8);
	12348
	12349	/* check_substr and check_utf8, if non-NULL, point to either their
	12350	anchored or float namesakes, and don't hold a second reference. */
	12351	}
	12352	RX_MATCH_COPIED_off(ret_x);
	12353	#ifdef PERL_OLD_COPY_ON_WRITE
	12354	ret->saved_copy = NULL;
	12355	#endif
	12356	ret->mother_re = rx;
	12357
	12358	return ret_x;
	12359	}
	12360	#endif
	12361
	12362	/* regfree_internal()
	12363
	12364	Free the private data in a regexp. This is overloadable by
	12365	extensions. Perl takes care of the regexp structure in pregfree(),
	12366	this covers the *pprivate pointer which technically perl doesn't
	12367	know about, however of course we have to handle the
	12368	regexp_internal structure when no extension is in use.
	12369
	12370	Note this is called before freeing anything in the regexp
	12371	structure.
	12372	*/
	12373
	12374	void
	12375	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	12376	{
	12377	dVAR;
	12378	struct regexp const r = (struct regexp )SvANY(rx);
	12379	RXi_GET_DECL(r,ri);
	12380	GET_RE_DEBUG_FLAGS_DECL;
	12381
	12382	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	12383
	12384	DEBUG_COMPILE_r({
	12385	if (!PL_colorset)
	12386	reginitcolors();
	12387	{
	12388	SV *dsv= sv_newmortal();
	12389	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	12390	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	12391	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	12392	PL_colors[4],PL_colors[5],s);
	12393	}
	12394	});
	12395	#ifdef RE_TRACK_PATTERN_OFFSETS
	12396	if (ri->u.offsets)
	12397	Safefree(ri->u.offsets); /* 20010421 MJD */
	12398	#endif
	12399	if (ri->data) {
	12400	int n = ri->data->count;
	12401	PAD* new_comppad = NULL;
	12402	PAD* old_comppad;
	12403	PADOFFSET refcnt;
	12404
	12405	while (--n >= 0) {
	12406	/* If you add a ->what type here, update the comment in regcomp.h */
	12407	switch (ri->data->what[n]) {
	12408	case 'a':
	12409	case 's':
	12410	case 'S':
	12411	case 'u':
	12412	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	12413	break;
	12414	case 'f':
	12415	Safefree(ri->data->data[n]);
	12416	break;
	12417	case 'p':
	12418	new_comppad = MUTABLE_AV(ri->data->data[n]);
	12419	break;
	12420	case 'o':
	12421	if (new_comppad == NULL)
	12422	Perl_croak(aTHX_ "panic: pregfree comppad");
	12423	PAD_SAVE_LOCAL(old_comppad,
	12424	/* Watch out for global destruction's random ordering. */
	12425	(SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
	12426	);
	12427	OP_REFCNT_LOCK;
	12428	refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
	12429	OP_REFCNT_UNLOCK;
	12430	if (!refcnt)
	12431	op_free((OP_4tree*)ri->data->data[n]);
	12432
	12433	PAD_RESTORE_LOCAL(old_comppad);
	12434	SvREFCNT_dec(MUTABLE_SV(new_comppad));
	12435	new_comppad = NULL;
	12436	break;
	12437	case 'n':
	12438	break;
	12439	case 'T':
	12440	{ /* Aho Corasick add-on structure for a trie node.
	12441	Used in stclass optimization only */
	12442	U32 refcount;
	12443	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	12444	OP_REFCNT_LOCK;
	12445	refcount = --aho->refcount;
	12446	OP_REFCNT_UNLOCK;
	12447	if ( !refcount ) {
	12448	PerlMemShared_free(aho->states);
	12449	PerlMemShared_free(aho->fail);
	12450	/* do this last!!!! */
	12451	PerlMemShared_free(ri->data->data[n]);
	12452	PerlMemShared_free(ri->regstclass);
	12453	}
	12454	}
	12455	break;
	12456	case 't':
	12457	{
	12458	/* trie structure. */
	12459	U32 refcount;
	12460	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	12461	OP_REFCNT_LOCK;
	12462	refcount = --trie->refcount;
	12463	OP_REFCNT_UNLOCK;
	12464	if ( !refcount ) {
	12465	PerlMemShared_free(trie->charmap);
	12466	PerlMemShared_free(trie->states);
	12467	PerlMemShared_free(trie->trans);
	12468	if (trie->bitmap)
	12469	PerlMemShared_free(trie->bitmap);
	12470	if (trie->jump)
	12471	PerlMemShared_free(trie->jump);
	12472	PerlMemShared_free(trie->wordinfo);
	12473	/* do this last!!!! */
	12474	PerlMemShared_free(ri->data->data[n]);
	12475	}
	12476	}
	12477	break;
	12478	default:
	12479	Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
	12480	}
	12481	}
	12482	Safefree(ri->data->what);
	12483	Safefree(ri->data);
	12484	}
	12485
	12486	Safefree(ri);
	12487	}
	12488
	/* Typed wrappers around sv_dup_inc() for AVs and HVs.  The arguments are
	   parenthesised so that the cast applies to the whole of an expression
	   argument rather than only to its first operand. */
	#define av_dup_inc(s,t)	MUTABLE_AV(sv_dup_inc((const SV *)(s),(t)))
	#define hv_dup_inc(s,t)	MUTABLE_HV(sv_dup_inc((const SV *)(s),(t)))
	/* savepvn() that passes a NULL pointer through as NULL.  Note that p is
	   evaluated twice, so it must not have side effects. */
	#define SAVEPVN(p,n)	((p) ? savepvn((p),(n)) : NULL)
	12492
	12493	/*
	12494	re_dup - duplicate a regexp.
	12495
	12496	This routine is expected to clone a given regexp structure. It is only
	12497	compiled under USE_ITHREADS.
	12498
	12499	After all of the core data stored in struct regexp is duplicated
	12500	the regexp_engine.dupe method is used to copy any private data
	12501	stored in the *pprivate pointer. This allows extensions to handle
	12502	any duplication they need to do.
	12503
	12504	See pregfree() and regfree_internal() if you change anything here.
	12505	*/
	12506	#if defined(USE_ITHREADS)
	12507	#ifndef PERL_IN_XSUB_RE
	12508	void
	12509	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	12510	{
	12511	dVAR;
	12512	I32 npar;
	12513	const struct regexp r = (const struct regexp )SvANY(sstr);
	12514	struct regexp ret = (struct regexp )SvANY(dstr);
	12515
	12516	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	12517
	12518	npar = r->nparens+1;
	12519	Newx(ret->offs, npar, regexp_paren_pair);
	12520	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	12521	if(ret->swap) {
	12522	/* no need to copy these */
	12523	Newx(ret->swap, npar, regexp_paren_pair);
	12524	}
	12525
	12526	if (ret->substrs) {
	12527	/* Do it this way to avoid reading from *r after the StructCopy().
	12528	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	12529	cache, it doesn't matter. */
	12530	const bool anchored = r->check_substr
	12531	? r->check_substr == r->anchored_substr
	12532	: r->check_utf8 == r->anchored_utf8;
	12533	Newx(ret->substrs, 1, struct reg_substr_data);
	12534	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	12535
	12536	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	12537	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	12538	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	12539	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	12540
	12541	/* check_substr and check_utf8, if non-NULL, point to either their
	12542	anchored or float namesakes, and don't hold a second reference. */
	12543
	12544	if (ret->check_substr) {
	12545	if (anchored) {
	12546	assert(r->check_utf8 == r->anchored_utf8);
	12547	ret->check_substr = ret->anchored_substr;
	12548	ret->check_utf8 = ret->anchored_utf8;
	12549	} else {
	12550	assert(r->check_substr == r->float_substr);
	12551	assert(r->check_utf8 == r->float_utf8);
	12552	ret->check_substr = ret->float_substr;
	12553	ret->check_utf8 = ret->float_utf8;
	12554	}
	12555	} else if (ret->check_utf8) {
	12556	if (anchored) {
	12557	ret->check_utf8 = ret->anchored_utf8;
	12558	} else {
	12559	ret->check_utf8 = ret->float_utf8;
	12560	}
	12561	}
	12562	}
	12563
	12564	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	12565
	12566	if (ret->pprivate)
	12567	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	12568
	12569	if (RX_MATCH_COPIED(dstr))
	12570	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	12571	else
	12572	ret->subbeg = NULL;
	12573	#ifdef PERL_OLD_COPY_ON_WRITE
	12574	ret->saved_copy = NULL;
	12575	#endif
	12576
	12577	if (ret->mother_re) {
	12578	if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
	12579	/* Our storage points directly to our mother regexp, but that's
	12580	1: a buffer in a different thread
	12581	2: something we no longer hold a reference on
	12582	so we need to copy it locally. */
	12583	/* Note we need to use SvCUR(), rather than
	12584	SvLEN(), on our mother_re, because it, in
	12585	turn, may well be pointing to its own mother_re. */
	12586	SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
	12587	SvCUR(ret->mother_re)+1));
	12588	SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
	12589	}
	12590	ret->mother_re = NULL;
	12591	}
	12592	ret->gofs = 0;
	12593	}
	12594	#endif /* PERL_IN_XSUB_RE */
	12595
	12596	/*
	12597	regdupe_internal()
	12598
	12599	This is the internal complement to regdupe() which is used to copy
	12600	the structure pointed to by the *pprivate pointer in the regexp.
	12601	This is the core version of the extension overridable cloning hook.
	12602	The regexp structure being duplicated will be copied by perl prior
	12603	to this and will be provided as the regexp *r argument, however
	12604	with the /old/ structures pprivate pointer value. Thus this routine
	12605	may override any copying normally done by perl.
	12606
	12607	It returns a pointer to the new regexp_internal structure.
	12608	*/
	12609
	12610	void *
	12611	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	12612	{
	12613	dVAR;
	12614	struct regexp const r = (struct regexp )SvANY(rx);
	12615	regexp_internal *reti;
	12616	int len;
	12617	RXi_GET_DECL(r,ri);
	12618
	12619	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	12620
	12621	len = ProgLen(ri);
	12622
	12623	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
	12624	Copy(ri->program, reti->program, len+1, regnode);
	12625
	12626
	12627	reti->regstclass = NULL;
	12628
	12629	if (ri->data) {
	12630	struct reg_data *d;
	12631	const int count = ri->data->count;
	12632	int i;
	12633
	12634	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	12635	char, struct reg_data);
	12636	Newx(d->what, count, U8);
	12637
	12638	d->count = count;
	12639	for (i = 0; i < count; i++) {
	12640	d->what[i] = ri->data->what[i];
	12641	switch (d->what[i]) {
	12642	/* legal options are one of: sSfpontTua
	12643	see also regcomp.h and pregfree() */
	12644	case 'a': /* actually an AV, but the dup function is identical. */
	12645	case 's':
	12646	case 'S':
	12647	case 'p': /* actually an AV, but the dup function is identical. */
	12648	case 'u': /* actually an HV, but the dup function is identical. */
	12649	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	12650	break;
	12651	case 'f':
	12652	/* This is cheating. */
	12653	Newx(d->data[i], 1, struct regnode_charclass_class);
	12654	StructCopy(ri->data->data[i], d->data[i],
	12655	struct regnode_charclass_class);
	12656	reti->regstclass = (regnode*)d->data[i];
	12657	break;
	12658	case 'o':
	12659	/* Compiled op trees are readonly and in shared memory,
	12660	and can thus be shared without duplication. */
	12661	OP_REFCNT_LOCK;
	12662	d->data[i] = (void)OpREFCNT_inc((OP)ri->data->data[i]);
	12663	OP_REFCNT_UNLOCK;
	12664	break;
	12665	case 'T':
	12666	/* Trie stclasses are readonly and can thus be shared
	12667	* without duplication. We free the stclass in pregfree
	12668	* when the corresponding reg_ac_data struct is freed.
	12669	*/
	12670	reti->regstclass= ri->regstclass;
	12671	/* Fall through */
	12672	case 't':
	12673	OP_REFCNT_LOCK;
	12674	((reg_trie_data*)ri->data->data[i])->refcount++;
	12675	OP_REFCNT_UNLOCK;
	12676	/* Fall through */
	12677	case 'n':
	12678	d->data[i] = ri->data->data[i];
	12679	break;
	12680	default:
	12681	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
	12682	}
	12683	}
	12684
	12685	reti->data = d;
	12686	}
	12687	else
	12688	reti->data = NULL;
	12689
	12690	reti->name_list_idx = ri->name_list_idx;
	12691
	12692	#ifdef RE_TRACK_PATTERN_OFFSETS
	12693	if (ri->u.offsets) {
	12694	Newx(reti->u.offsets, 2*len+1, U32);
	12695	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	12696	}
	12697	#else
	12698	SetProgLen(reti,len);
	12699	#endif
	12700
	12701	return (void*)reti;
	12702	}
	12703
	12704	#endif /* USE_ITHREADS */
	12705
	12706	#ifndef PERL_IN_XSUB_RE
	12707
	12708	/*
	12709	- regnext - dig the "next" pointer out of a node
	12710	*/
	12711	regnode *
	12712	Perl_regnext(pTHX_ register regnode *p)
	12713	{
	12714	dVAR;
	12715	register I32 offset;
	12716
	12717	if (!p)
	12718	return(NULL);
	12719
	12720	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	12721	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
	12722	}
	12723
	12724	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	12725	if (offset == 0)
	12726	return(NULL);
	12727
	12728	return(p+offset);
	12729	}
	12730	#endif
	12731
	12732	STATIC void
	12733	S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
	12734	{
	12735	va_list args;
	12736	STRLEN l1 = strlen(pat1);
	12737	STRLEN l2 = strlen(pat2);
	12738	char buf[512];
	12739	SV *msv;
	12740	const char *message;
	12741
	12742	PERL_ARGS_ASSERT_RE_CROAK2;
	12743
	12744	if (l1 > 510)
	12745	l1 = 510;
	12746	if (l1 + l2 > 510)
	12747	l2 = 510 - l1;
	12748	Copy(pat1, buf, l1 , char);
	12749	Copy(pat2, buf + l1, l2 , char);
	12750	buf[l1 + l2] = '\n';
	12751	buf[l1 + l2 + 1] = '\0';
	12752	#ifdef I_STDARG
	12753	/* ANSI variant takes additional second argument */
	12754	va_start(args, pat2);
	12755	#else
	12756	va_start(args);
	12757	#endif
	12758	msv = vmess(buf, &args);
	12759	va_end(args);
	12760	message = SvPV_const(msv,l1);
	12761	if (l1 > 512)
	12762	l1 = 512;
	12763	Copy(message, buf, l1 , char);
	12764	buf[l1-1] = '\0'; /* Overwrite \n */
	12765	Perl_croak(aTHX_ "%s", buf);
	12766	}
	12767
	12768	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	12769
	12770	#ifndef PERL_IN_XSUB_RE
	12771	void
	12772	Perl_save_re_context(pTHX)
	12773	{
	12774	dVAR;
	12775
	12776	struct re_save_state *state;
	12777
	12778	SAVEVPTR(PL_curcop);
	12779	SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
	12780
	12781	state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
	12782	PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
	12783	SSPUSHUV(SAVEt_RE_STATE);
	12784
	12785	Copy(&PL_reg_state, state, 1, struct re_save_state);
	12786
	12787	PL_reg_start_tmp = 0;
	12788	PL_reg_start_tmpl = 0;
	12789	PL_reg_oldsaved = NULL;
	12790	PL_reg_oldsavedlen = 0;
	12791	PL_reg_maxiter = 0;
	12792	PL_reg_leftiter = 0;
	12793	PL_reg_poscache = NULL;
	12794	PL_reg_poscache_size = 0;
	12795	#ifdef PERL_OLD_COPY_ON_WRITE
	12796	PL_nrs = NULL;
	12797	#endif
	12798
	12799	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	12800	if (PL_curpm) {
	12801	const REGEXP * const rx = PM_GETRE(PL_curpm);
	12802	if (rx) {
	12803	U32 i;
	12804	for (i = 1; i <= RX_NPARENS(rx); i++) {
	12805	char digits[TYPE_CHARS(long)];
	12806	const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
	12807	GV const const gvp
	12808	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	12809
	12810	if (gvp) {
	12811	GV * const gv = *gvp;
	12812	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	12813	save_scalar(gv);
	12814	}
	12815	}
	12816	}
	12817	}
	12818	}
	12819	#endif
	12820
	12821	static void
	12822	clear_re(pTHX_ void *r)
	12823	{
	12824	dVAR;
	12825	ReREFCNT_dec((REGEXP *)r);
	12826	}
	12827
	12828	#ifdef DEBUGGING
	12829
	12830	STATIC void
	12831	S_put_byte(pTHX_ SV *sv, int c)
	12832	{
	12833	PERL_ARGS_ASSERT_PUT_BYTE;
	12834
	12835	/* Our definition of isPRINT() ignores locales, so only bytes that are
	12836	not part of UTF-8 are considered printable. I assume that the same
	12837	holds for UTF-EBCDIC.
	12838	Also, code point 255 is not printable in either (it's E0 in EBCDIC,
	12839	which Wikipedia says:
	12840
	12841	EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
	12842	ones (binary 1111 1111, hexadecimal FF). It is similar, but not
	12843	identical, to the ASCII delete (DEL) or rubout control character.
	12844	) So the old condition can be simplified to !isPRINT(c) */
	12845	if (!isPRINT(c)) {
	12846	if (c < 256) {
	12847	Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
	12848	}
	12849	else {
	12850	Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
	12851	}
	12852	}
	12853	else {
	12854	const char string = c;
	12855	if (c == '-' \|\| c == ']' \|\| c == '\\' \|\| c == '^')
	12856	sv_catpvs(sv, "\\");
	12857	sv_catpvn(sv, &string, 1);
	12858	}
	12859	}
	12860
	12861
	12862	#define CLEAR_OPTSTART \
	12863	if (optstart) STMT_START { \
	12864	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	12865	optstart=NULL; \
	12866	} STMT_END
	12867
	12868	#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
	12869
	12870	STATIC const regnode *
	12871	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	12872	const regnode last, const regnode plast,
	12873	SV* sv, I32 indent, U32 depth)
	12874	{
	12875	dVAR;
	12876	register U8 op = PSEUDO; /* Arbitrary non-END op. */
	12877	register const regnode *next;
	12878	const regnode *optstart= NULL;
	12879
	12880	RXi_GET_DECL(r,ri);
	12881	GET_RE_DEBUG_FLAGS_DECL;
	12882
	12883	PERL_ARGS_ASSERT_DUMPUNTIL;
	12884
	12885	#ifdef DEBUG_DUMPUNTIL
	12886	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	12887	last ? last-start : 0,plast ? plast-start : 0);
	12888	#endif
	12889
	12890	if (plast && plast < last)
	12891	last= plast;
	12892
	12893	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	12894	/* While that wasn't END last time... */
	12895	NODE_ALIGN(node);
	12896	op = OP(node);
	12897	if (op == CLOSE \|\| op == WHILEM)
	12898	indent--;
	12899	next = regnext((regnode *)node);
	12900
	12901	/* Where, what. */
	12902	if (OP(node) == OPTIMIZED) {
	12903	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	12904	optstart = node;
	12905	else
	12906	goto after_print;
	12907	} else
	12908	CLEAR_OPTSTART;
	12909
	12910	regprop(r, sv, node);
	12911	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	12912	(int)(2*indent + 1), "", SvPVX_const(sv));
	12913
	12914	if (OP(node) != OPTIMIZED) {
	12915	if (next == NULL) /* Next ptr. */
	12916	PerlIO_printf(Perl_debug_log, " (0)");
	12917	else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
	12918	PerlIO_printf(Perl_debug_log, " (FAIL)");
	12919	else
	12920	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	12921	(void)PerlIO_putc(Perl_debug_log, '\n');
	12922	}
	12923
	12924	after_print:
	12925	if (PL_regkind[(U8)op] == BRANCHJ) {
	12926	assert(next);
	12927	{
	12928	register const regnode *nnode = (OP(next) == LONGJMP
	12929	? regnext((regnode *)next)
	12930	: next);
	12931	if (last && nnode > last)
	12932	nnode = last;
	12933	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	12934	}
	12935	}
	12936	else if (PL_regkind[(U8)op] == BRANCH) {
	12937	assert(next);
	12938	DUMPUNTIL(NEXTOPER(node), next);
	12939	}
	12940	else if ( PL_regkind[(U8)op] == TRIE ) {
	12941	const regnode *this_trie = node;
	12942	const char op = OP(node);
	12943	const U32 n = ARG(node);
	12944	const reg_ac_data * const ac = op>=AHOCORASICK ?
	12945	(reg_ac_data *)ri->data->data[n] :
	12946	NULL;
	12947	const reg_trie_data * const trie =
	12948	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	12949	#ifdef DEBUGGING
	12950	AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	12951	#endif
	12952	const regnode *nextbranch= NULL;
	12953	I32 word_idx;
	12954	sv_setpvs(sv, "");
	12955	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	12956	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	12957
	12958	PerlIO_printf(Perl_debug_log, "%*s%s ",
	12959	(int)(2*(indent+3)), "",
	12960	elem_ptr ? pv_pretty(sv, SvPV_nolen_const(elem_ptr), SvCUR(elem_ptr), 60,
	12961	PL_colors[0], PL_colors[1],
	12962	(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) \|
	12963	PERL_PV_PRETTY_ELLIPSES \|
	12964	PERL_PV_PRETTY_LTGT
	12965	)
	12966	: "???"
	12967	);
	12968	if (trie->jump) {
	12969	U16 dist= trie->jump[word_idx+1];
	12970	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	12971	(UV)((dist ? this_trie + dist : next) - start));
	12972	if (dist) {
	12973	if (!nextbranch)
	12974	nextbranch= this_trie + trie->jump[0];
	12975	DUMPUNTIL(this_trie + dist, nextbranch);
	12976	}
	12977	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	12978	nextbranch= regnext((regnode *)nextbranch);
	12979	} else {
	12980	PerlIO_printf(Perl_debug_log, "\n");
	12981	}
	12982	}
	12983	if (last && next > last)
	12984	node= last;
	12985	else
	12986	node= next;
	12987	}
	12988	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	12989	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	12990	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	12991	}
	12992	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	12993	assert(next);
	12994	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	12995	}
	12996	else if ( op == PLUS \|\| op == STAR) {
	12997	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	12998	}
	12999	else if (PL_regkind[(U8)op] == ANYOF) {
	13000	/* arglen 1 + class block */
	13001	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
	13002	? ANYOF_CLASS_SKIP : ANYOF_SKIP);
	13003	node = NEXTOPER(node);
	13004	}
	13005	else if (PL_regkind[(U8)op] == EXACT) {
	13006	/* Literal string, where present. */
	13007	node += NODE_SZ_STR(node) - 1;
	13008	node = NEXTOPER(node);
	13009	}
	13010	else {
	13011	node = NEXTOPER(node);
	13012	node += regarglen[(U8)op];
	13013	}
	13014	if (op == CURLYX \|\| op == OPEN)
	13015	indent++;
	13016	}
	13017	CLEAR_OPTSTART;
	13018	#ifdef DEBUG_DUMPUNTIL
	13019	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	13020	#endif
	13021	return node;
	13022	}
	13023
	13024	#endif /* DEBUGGING */
	13025
	13026	/*
	13027	* Local variables:
	13028	* c-indentation-style: bsd
	13029	* c-basic-offset: 4
	13030	* indent-tabs-mode: t
	13031	* End:
	13032	*
	13033	* ex: set ts=8 sts=4 sw=4 noet:
	13034	*/