perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	#else
	85	# include "regcomp.h"
	86	#endif
	87
	88	#include "dquote_static.c"
	89
	90	#ifdef op
	91	#undef op
	92	#endif /* op */
	93
	94	#ifdef MSDOS
	95	# if defined(BUGGY_MSC6)
	96	/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
	97	# pragma optimize("a",off)
	98	/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
	99	# pragma optimize("w",on )
	100	# endif /* BUGGY_MSC6 */
	101	#endif /* MSDOS */
	102
	103	#ifndef STATIC
	104	#define STATIC static
	105	#endif
	106
	107	typedef struct RExC_state_t {
	108	U32 flags; /* are we folding, multilining? */
	109	char precomp; / uncompiled string. */
	110	REGEXP rx_sv; / The SV that is the regexp. */
	111	regexp rx; / perl core regexp structure */
	112	regexp_internal rxi; / internal data for regexp object pprivate field */
	113	char start; / Start of input for compile */
	114	char end; / End of input for compile */
	115	char parse; / Input-scan pointer. */
	116	I32 whilem_seen; /* number of WHILEM in this expr */
	117	regnode emit_start; / Start of emitted-code area */
	118	regnode emit_bound; / First regnode outside of the allocated space */
	119	regnode emit; / Code-emit pointer; &regdummy = don't = compiling */
	120	I32 naughty; /* How bad is this pattern? */
	121	I32 sawback; /* Did we see \1, ...? */
	122	U32 seen;
	123	I32 size; /* Code size. */
	124	I32 npar; /* Capture buffer count, (OPEN). */
	125	I32 cpar; /* Capture buffer count, (CLOSE). */
	126	I32 nestroot; /* root parens we are in - used by accept */
	127	I32 extralen;
	128	I32 seen_zerolen;
	129	I32 seen_evals;
	130	regnode *open_parens; / pointers to open parens */
	131	regnode *close_parens; / pointers to close parens */
	132	regnode opend; / END node in program */
	133	I32 utf8; /* whether the pattern is utf8 or not */
	134	I32 orig_utf8; /* whether the pattern was originally in utf8 */
	135	/* XXX use this for future optimisation of case
	136	* where pattern must be upgraded to utf8. */
	137	HV paren_names; / Paren names */
	138
	139	regnode *recurse; / Recurse regops */
	140	I32 recurse_count; /* Number of recurse regops */
	141	#if ADD_TO_REGEXEC
	142	char starttry; / -Dr: where regtry was called. */
	143	#define RExC_starttry (pRExC_state->starttry)
	144	#endif
	145	#ifdef DEBUGGING
	146	const char *lastparse;
	147	I32 lastnum;
	148	AV paren_name_list; / idx -> name */
	149	#define RExC_lastparse (pRExC_state->lastparse)
	150	#define RExC_lastnum (pRExC_state->lastnum)
	151	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	152	#endif
	153	} RExC_state_t;
	154
	155	#define RExC_flags (pRExC_state->flags)
	156	#define RExC_precomp (pRExC_state->precomp)
	157	#define RExC_rx_sv (pRExC_state->rx_sv)
	158	#define RExC_rx (pRExC_state->rx)
	159	#define RExC_rxi (pRExC_state->rxi)
	160	#define RExC_start (pRExC_state->start)
	161	#define RExC_end (pRExC_state->end)
	162	#define RExC_parse (pRExC_state->parse)
	163	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	164	#ifdef RE_TRACK_PATTERN_OFFSETS
	165	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
	166	#endif
	167	#define RExC_emit (pRExC_state->emit)
	168	#define RExC_emit_start (pRExC_state->emit_start)
	169	#define RExC_emit_bound (pRExC_state->emit_bound)
	170	#define RExC_naughty (pRExC_state->naughty)
	171	#define RExC_sawback (pRExC_state->sawback)
	172	#define RExC_seen (pRExC_state->seen)
	173	#define RExC_size (pRExC_state->size)
	174	#define RExC_npar (pRExC_state->npar)
	175	#define RExC_nestroot (pRExC_state->nestroot)
	176	#define RExC_extralen (pRExC_state->extralen)
	177	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	178	#define RExC_seen_evals (pRExC_state->seen_evals)
	179	#define RExC_utf8 (pRExC_state->utf8)
	180	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	181	#define RExC_open_parens (pRExC_state->open_parens)
	182	#define RExC_close_parens (pRExC_state->close_parens)
	183	#define RExC_opend (pRExC_state->opend)
	184	#define RExC_paren_names (pRExC_state->paren_names)
	185	#define RExC_recurse (pRExC_state->recurse)
	186	#define RExC_recurse_count (pRExC_state->recurse_count)
	187
	188
	189	#define ISMULT1(c) ((c) == '*' \|\| (c) == '+' \|\| (c) == '?')
	190	#define ISMULT2(s) ((s) == '' \|\| (s) == '+' \|\| (s) == '?' \|\| \
	191	((*s) == '{' && regcurly(s)))
	192
	193	#ifdef SPSTART
	194	#undef SPSTART /* dratted cpp namespace... */
	195	#endif
	196	/*
	197	* Flags to be passed up and down.
	198	*/
	199	#define WORST 0 /* Worst case. */
	200	#define HASWIDTH 0x01 /* Known to match non-null strings. */
	201
	202	/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
	203	* character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
	204	#define SIMPLE 0x02
	205	#define SPSTART 0x04 /* Starts with * or +. */
	206	#define TRYAGAIN 0x08 /* Weeded out a declaration. */
	207	#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
	208
	209	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
	210
	211	/* whether trie related optimizations are enabled */
	212	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	213	#define TRIE_STUDY_OPT
	214	#define FULL_TRIE_STUDY
	215	#define TRIE_STCLASS
	216	#endif
	217
	218
	219
	220	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
	221	#define PBITVAL(paren) (1 << ((paren) & 7))
	222	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
	223	#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) \|= PBITVAL(paren)
	224	#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
	225
	226	/* If not already in utf8, do a longjmp back to the beginning */
	227	#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
	228	#define REQUIRE_UTF8 STMT_START { \
	229	if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
	230	} STMT_END
	231
	232	/* About scan_data_t.
	233
	234	During optimisation we recurse through the regexp program performing
	235	various inplace (keyhole style) optimisations. In addition study_chunk
	236	and scan_commit populate this data structure with information about
	237	what strings MUST appear in the pattern. We look for the longest
	238	string that must appear at a fixed location, and we look for the
	239	longest string that may appear at a floating location. So for instance
	240	in the pattern:
	241
	242	/FOO[xX]A.*B[xX]BAR/
	243
	244	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	245	strings (because they follow a .* construct). study_chunk will identify
	246	both FOO and BAR as being the longest fixed and floating strings respectively.
	247
	248	The strings can be composites, for instance
	249
	250	/(f)(o)(o)/
	251
	252	will result in a composite fixed substring 'foo'.
	253
	254	For each string some basic information is maintained:
	255
	256	- offset or min_offset
	257	This is the position the string must appear at, or not before.
	258	It also implicitly (when combined with minlenp) tells us how many
	259	characters must match before the string we are searching for.
	260	Likewise when combined with minlenp and the length of the string it
	261	tells us how many characters must appear after the string we have
	262	found.
	263
	264	- max_offset
	265	Only used for floating strings. This is the rightmost point that
	266	the string can appear at. If set to I32 max it indicates that the
	267	string can occur infinitely far to the right.
	268
	269	- minlenp
	270	A pointer to the minimum length of the pattern that the string
	271	was found inside. This is important as in the case of positive
	272	lookahead or positive lookbehind we can have multiple patterns
	273	involved. Consider
	274
	275	/(?=FOO).*F/
	276
	277	The minimum length of the pattern overall is 3, the minimum length
	278	of the lookahead part is 3, but the minimum length of the part that
	279	will actually match is 1. So 'FOO's minimum length is 3, but the
	280	minimum length for the F is 1. This is important as the minimum length
	281	is used to determine offsets in front of and behind the string being
	282	looked for. Since strings can be composites this is the length of the
	283	pattern at the time it was committed with a scan_commit. Note that
	284	the length is calculated by study_chunk, so that the minimum lengths
	285	are not known until the full pattern has been compiled, thus the
	286	pointer to the value.
	287
	288	- lookbehind
	289
	290	In the case of lookbehind the string being searched for can be
	291	offset past the start point of the final matching string.
	292	If this value was just blithely removed from the min_offset it would
	293	invalidate some of the calculations for how many chars must match
	294	before or after (as they are derived from min_offset and minlen and
	295	the length of the string being searched for).
	296	When the final pattern is compiled and the data is moved from the
	297	scan_data_t structure into the regexp structure the information
	298	about lookbehind is factored in, with the information that would
	299	have been lost precalculated in the end_shift field for the
	300	associated string.
	301
	302	The fields pos_min and pos_delta are used to store the minimum offset
	303	and the delta to the maximum offset at the current point in the pattern.
	304
	305	*/
	306
	307	typedef struct scan_data_t {
	308	/I32 len_min; unused /
	309	/I32 len_delta; unused /
	310	I32 pos_min;
	311	I32 pos_delta;
	312	SV *last_found;
	313	I32 last_end; /* min value, <0 unless valid. */
	314	I32 last_start_min;
	315	I32 last_start_max;
	316	SV *longest; / Either &l_fixed, or &l_float. */
	317	SV longest_fixed; / longest fixed string found in pattern */
	318	I32 offset_fixed; /* offset where it starts */
	319	I32 minlen_fixed; / pointer to the minlen relevant to the string */
	320	I32 lookbehind_fixed; /* is the position of the string modfied by LB */
	321	SV longest_float; / longest floating string found in pattern */
	322	I32 offset_float_min; /* earliest point in string it can appear */
	323	I32 offset_float_max; /* latest point in string it can appear */
	324	I32 minlen_float; / pointer to the minlen relevant to the string */
	325	I32 lookbehind_float; /* is the position of the string modified by LB */
	326	I32 flags;
	327	I32 whilem_c;
	328	I32 *last_closep;
	329	struct regnode_charclass_class *start_class;
	330	} scan_data_t;
	331
	332	/*
	333	* Forward declarations for pregcomp()'s friends.
	334	*/
	335
	336	static const scan_data_t zero_scan_data =
	337	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	338
	339	#define SF_BEFORE_EOL (SF_BEFORE_SEOL\|SF_BEFORE_MEOL)
	340	#define SF_BEFORE_SEOL 0x0001
	341	#define SF_BEFORE_MEOL 0x0002
	342	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL\|SF_FIX_BEFORE_MEOL)
	343	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL\|SF_FL_BEFORE_MEOL)
	344
	345	#ifdef NO_UNARY_PLUS
	346	# define SF_FIX_SHIFT_EOL (0+2)
	347	# define SF_FL_SHIFT_EOL (0+4)
	348	#else
	349	# define SF_FIX_SHIFT_EOL (+2)
	350	# define SF_FL_SHIFT_EOL (+4)
	351	#endif
	352
	353	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
	354	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
	355
	356	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
	357	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	358	#define SF_IS_INF 0x0040
	359	#define SF_HAS_PAR 0x0080
	360	#define SF_IN_PAR 0x0100
	361	#define SF_HAS_EVAL 0x0200
	362	#define SCF_DO_SUBSTR 0x0400
	363	#define SCF_DO_STCLASS_AND 0x0800
	364	#define SCF_DO_STCLASS_OR 0x1000
	365	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND\|SCF_DO_STCLASS_OR)
	366	#define SCF_WHILEM_VISITED_POS 0x2000
	367
	368	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	369	#define SCF_SEEN_ACCEPT 0x8000
	370
	371	#define UTF cBOOL(RExC_utf8)
	372	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	373	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	374	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
	375	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
	376	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
	377
	378	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
	379
	380	#define OOB_UNICODE 12345678
	381	#define OOB_NAMEDCLASS -1
	382
	383	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
	384	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
	385
	386
	387	/* length of regex to show in messages that don't mark a position within */
	388	#define RegexLengthToShowInErrorMessages 127
	389
	390	/*
	391	* If MARKER[12] are adjusted, be sure to adjust the constants at the top
	392	* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	393	* op/pragma/warn/regcomp.
	394	*/
	395	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	396	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
	397
	398	#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
	399
	400	/*
	401	* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	402	* arg. Show regex, up to a maximum length. If it's too long, chop and add
	403	* "...".
	404	*/
	405	#define _FAIL(code) STMT_START { \
	406	const char *ellipses = ""; \
	407	IV len = RExC_end - RExC_precomp; \
	408	\
	409	if (!SIZE_ONLY) \
	410	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	411	if (len > RegexLengthToShowInErrorMessages) { \
	412	/* chop 10 shorter than the max, to ensure meaning of "..." */ \
	413	len = RegexLengthToShowInErrorMessages - 10; \
	414	ellipses = "..."; \
	415	} \
	416	code; \
	417	} STMT_END
	418
	419	#define FAIL(msg) _FAIL( \
	420	Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
	421	msg, (int)len, RExC_precomp, ellipses))
	422
	423	#define FAIL2(msg,arg) _FAIL( \
	424	Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
	425	arg, (int)len, RExC_precomp, ellipses))
	426
	427	/*
	428	* Simple_vFAIL -- like FAIL, but marks the current location in the scan
	429	*/
	430	#define Simple_vFAIL(m) STMT_START { \
	431	const IV offset = RExC_parse - RExC_precomp; \
	432	Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	433	m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	434	} STMT_END
	435
	436	/*
	437	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	438	*/
	439	#define vFAIL(m) STMT_START { \
	440	if (!SIZE_ONLY) \
	441	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	442	Simple_vFAIL(m); \
	443	} STMT_END
	444
	445	/*
	446	* Like Simple_vFAIL(), but accepts two arguments.
	447	*/
	448	#define Simple_vFAIL2(m,a1) STMT_START { \
	449	const IV offset = RExC_parse - RExC_precomp; \
	450	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
	451	(int)offset, RExC_precomp, RExC_precomp + offset); \
	452	} STMT_END
	453
	454	/*
	455	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	456	*/
	457	#define vFAIL2(m,a1) STMT_START { \
	458	if (!SIZE_ONLY) \
	459	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	460	Simple_vFAIL2(m, a1); \
	461	} STMT_END
	462
	463
	464	/*
	465	* Like Simple_vFAIL(), but accepts three arguments.
	466	*/
	467	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	468	const IV offset = RExC_parse - RExC_precomp; \
	469	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
	470	(int)offset, RExC_precomp, RExC_precomp + offset); \
	471	} STMT_END
	472
	473	/*
	474	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	475	*/
	476	#define vFAIL3(m,a1,a2) STMT_START { \
	477	if (!SIZE_ONLY) \
	478	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	479	Simple_vFAIL3(m, a1, a2); \
	480	} STMT_END
	481
	482	/*
	483	* Like Simple_vFAIL(), but accepts four arguments.
	484	*/
	485	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	486	const IV offset = RExC_parse - RExC_precomp; \
	487	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
	488	(int)offset, RExC_precomp, RExC_precomp + offset); \
	489	} STMT_END
	490
	491	#define ckWARNreg(loc,m) STMT_START { \
	492	const IV offset = loc - RExC_precomp; \
	493	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	494	(int)offset, RExC_precomp, RExC_precomp + offset); \
	495	} STMT_END
	496
	497	#define ckWARNregdep(loc,m) STMT_START { \
	498	const IV offset = loc - RExC_precomp; \
	499	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	500	m REPORT_LOCATION, \
	501	(int)offset, RExC_precomp, RExC_precomp + offset); \
	502	} STMT_END
	503
	504	#define ckWARN2reg(loc, m, a1) STMT_START { \
	505	const IV offset = loc - RExC_precomp; \
	506	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	507	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	508	} STMT_END
	509
	510	#define vWARN3(loc, m, a1, a2) STMT_START { \
	511	const IV offset = loc - RExC_precomp; \
	512	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	513	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	514	} STMT_END
	515
	516	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	517	const IV offset = loc - RExC_precomp; \
	518	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	519	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	520	} STMT_END
	521
	522	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	523	const IV offset = loc - RExC_precomp; \
	524	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	525	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	526	} STMT_END
	527
	528	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	529	const IV offset = loc - RExC_precomp; \
	530	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	531	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	532	} STMT_END
	533
	534	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	535	const IV offset = loc - RExC_precomp; \
	536	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	537	a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
	538	} STMT_END
	539
	540
	541	/* Allow for side effects in s */
	542	#define REGC(c,s) STMT_START { \
	543	if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	544	} STMT_END
	545
	546	/* Macros for recording node offsets. 20001227 mjd@plover.com
	547	* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	548	* element 2*n-1 of the array. Element #2n holds the byte length node #n.
	549	* Element 0 holds the number n.
	550	* Position is 1 indexed.
	551	*/
	552	#ifndef RE_TRACK_PATTERN_OFFSETS
	553	#define Set_Node_Offset_To_R(node,byte)
	554	#define Set_Node_Offset(node,byte)
	555	#define Set_Cur_Node_Offset
	556	#define Set_Node_Length_To_R(node,len)
	557	#define Set_Node_Length(node,len)
	558	#define Set_Node_Cur_Length(node)
	559	#define Node_Offset(n)
	560	#define Node_Length(n)
	561	#define Set_Node_Offset_Length(node,offset,len)
	562	#define ProgLen(ri) ri->u.proglen
	563	#define SetProgLen(ri,x) ri->u.proglen = x
	564	#else
	565	#define ProgLen(ri) ri->u.offsets[0]
	566	#define SetProgLen(ri,x) ri->u.offsets[0] = x
	567	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	568	if (! SIZE_ONLY) { \
	569	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	570	__LINE__, (int)(node), (int)(byte))); \
	571	if((node) < 0) { \
	572	Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
	573	} else { \
	574	RExC_offsets[2*(node)-1] = (byte); \
	575	} \
	576	} \
	577	} STMT_END
	578
	579	#define Set_Node_Offset(node,byte) \
	580	Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	581	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
	582
	583	#define Set_Node_Length_To_R(node,len) STMT_START { \
	584	if (! SIZE_ONLY) { \
	585	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	586	__LINE__, (int)(node), (int)(len))); \
	587	if((node) < 0) { \
	588	Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
	589	} else { \
	590	RExC_offsets[2*(node)] = (len); \
	591	} \
	592	} \
	593	} STMT_END
	594
	595	#define Set_Node_Length(node,len) \
	596	Set_Node_Length_To_R((node)-RExC_emit_start, len)
	597	#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
	598	#define Set_Node_Cur_Length(node) \
	599	Set_Node_Length(node, RExC_parse - parse_start)
	600
	601	/* Get offsets and lengths */
	602	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	603	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
	604
	605	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	606	Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	607	Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	608	} STMT_END
	609	#endif
	610
	611	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	612	#define EXPERIMENTAL_INPLACESCAN
	613	#endif /PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS/
	614
	615	#define DEBUG_STUDYDATA(str,data,depth) \
	616	DEBUG_OPTIMISE_MORE_r(if(data){ \
	617	PerlIO_printf(Perl_debug_log, \
	618	"%*s" str "Pos:%"IVdf"/%"IVdf \
	619	" Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	620	(int)(depth)*2, "", \
	621	(IV)((data)->pos_min), \
	622	(IV)((data)->pos_delta), \
	623	(UV)((data)->flags), \
	624	(IV)((data)->whilem_c), \
	625	(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	626	is_inf ? "INF " : "" \
	627	); \
	628	if ((data)->last_found) \
	629	PerlIO_printf(Perl_debug_log, \
	630	"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	631	" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	632	SvPVX_const((data)->last_found), \
	633	(IV)((data)->last_end), \
	634	(IV)((data)->last_start_min), \
	635	(IV)((data)->last_start_max), \
	636	((data)->longest && \
	637	(data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	638	SvPVX_const((data)->longest_fixed), \
	639	(IV)((data)->offset_fixed), \
	640	((data)->longest && \
	641	(data)->longest==&((data)->longest_float)) ? "*" : "", \
	642	SvPVX_const((data)->longest_float), \
	643	(IV)((data)->offset_float_min), \
	644	(IV)((data)->offset_float_max) \
	645	); \
	646	PerlIO_printf(Perl_debug_log,"\n"); \
	647	});
	648
	649	static void clear_re(pTHX_ void *r);
	650
	651	/* Mark that we cannot extend a found fixed substring at this point.
	652	Update the longest found anchored substring and the longest found
	653	floating substrings if needed. */
	654
	655	STATIC void
	656	S_scan_commit(pTHX_ const RExC_state_t pRExC_state, scan_data_t data, I32 *minlenp, int is_inf)
	657	{
	658	const STRLEN l = CHR_SVLEN(data->last_found);
	659	const STRLEN old_l = CHR_SVLEN(*data->longest);
	660	GET_RE_DEBUG_FLAGS_DECL;
	661
	662	PERL_ARGS_ASSERT_SCAN_COMMIT;
	663
	664	if ((l >= old_l) && ((l > old_l) \|\| (data->flags & SF_BEFORE_EOL))) {
	665	SvSetMagicSV(*data->longest, data->last_found);
	666	if (*data->longest == data->longest_fixed) {
	667	data->offset_fixed = l ? data->last_start_min : data->pos_min;
	668	if (data->flags & SF_BEFORE_EOL)
	669	data->flags
	670	\|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
	671	else
	672	data->flags &= ~SF_FIX_BEFORE_EOL;
	673	data->minlen_fixed=minlenp;
	674	data->lookbehind_fixed=0;
	675	}
	676	else { /* data->longest == data->longest_float /
	677	data->offset_float_min = l ? data->last_start_min : data->pos_min;
	678	data->offset_float_max = (l
	679	? data->last_start_max
	680	: data->pos_min + data->pos_delta);
	681	if (is_inf \|\| (U32)data->offset_float_max > (U32)I32_MAX)
	682	data->offset_float_max = I32_MAX;
	683	if (data->flags & SF_BEFORE_EOL)
	684	data->flags
	685	\|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
	686	else
	687	data->flags &= ~SF_FL_BEFORE_EOL;
	688	data->minlen_float=minlenp;
	689	data->lookbehind_float=0;
	690	}
	691	}
	692	SvCUR_set(data->last_found, 0);
	693	{
	694	SV * const sv = data->last_found;
	695	if (SvUTF8(sv) && SvMAGICAL(sv)) {
	696	MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	697	if (mg)
	698	mg->mg_len = 0;
	699	}
	700	}
	701	data->last_end = -1;
	702	data->flags &= ~SF_BEFORE_EOL;
	703	DEBUG_STUDYDATA("commit: ",data,0);
	704	}
	705
	706	/* Can match anything (initialization) */
	707	STATIC void
	708	S_cl_anything(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	709	{
	710	PERL_ARGS_ASSERT_CL_ANYTHING;
	711
	712	ANYOF_CLASS_ZERO(cl);
	713	ANYOF_BITMAP_SETALL(cl);
	714	cl->flags = ANYOF_EOS\|ANYOF_UNICODE_ALL\|ANYOF_LOC_NONBITMAP_FOLD\|ANYOF_NON_UTF8_LATIN1_ALL;
	715	if (LOC)
	716	cl->flags \|= ANYOF_LOCALE;
	717	}
	718
	719	/* Can match anything (initialization) */
	720	STATIC int
	721	S_cl_is_anything(const struct regnode_charclass_class *cl)
	722	{
	723	int value;
	724
	725	PERL_ARGS_ASSERT_CL_IS_ANYTHING;
	726
	727	for (value = 0; value <= ANYOF_MAX; value += 2)
	728	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
	729	return 1;
	730	if (!(cl->flags & ANYOF_UNICODE_ALL))
	731	return 0;
	732	if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
	733	return 0;
	734	return 1;
	735	}
	736
	737	/* Can match anything (initialization) */
	738	STATIC void
	739	S_cl_init(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	740	{
	741	PERL_ARGS_ASSERT_CL_INIT;
	742
	743	Zero(cl, 1, struct regnode_charclass_class);
	744	cl->type = ANYOF;
	745	cl_anything(pRExC_state, cl);
	746	}
	747
	748	STATIC void
	749	S_cl_init_zero(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	750	{
	751	PERL_ARGS_ASSERT_CL_INIT_ZERO;
	752
	753	Zero(cl, 1, struct regnode_charclass_class);
	754	cl->type = ANYOF;
	755	cl_anything(pRExC_state, cl);
	756	if (LOC)
	757	cl->flags \|= ANYOF_LOCALE;
	758	}
	759
	760	/* 'And' a given class with another one. Can create false positives */
	761	/* We assume that cl is not inverted */
	762	STATIC void
	763	S_cl_and(struct regnode_charclass_class *cl,
	764	const struct regnode_charclass_class *and_with)
	765	{
	766	PERL_ARGS_ASSERT_CL_AND;
	767
	768	assert(and_with->type == ANYOF);
	769
	770	if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
	771	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
	772	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	773	&& !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	774	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
	775	int i;
	776
	777	if (and_with->flags & ANYOF_INVERT)
	778	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	779	cl->bitmap[i] &= ~and_with->bitmap[i];
	780	else
	781	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	782	cl->bitmap[i] &= and_with->bitmap[i];
	783	} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
	784	if (!(and_with->flags & ANYOF_EOS))
	785	cl->flags &= ~ANYOF_EOS;
	786
	787	if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
	788	cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
	789	if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
	790	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
	791
	792	if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
	793	!(and_with->flags & ANYOF_INVERT)) {
	794	cl->flags &= ~ANYOF_UNICODE_ALL;
	795	cl->flags \|= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
	796	only the one(s)
	797	actually set */
	798	ARG_SET(cl, ARG(and_with));
	799	}
	800	if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
	801	!(and_with->flags & ANYOF_INVERT))
	802	cl->flags &= ~ANYOF_UNICODE_ALL;
	803	if (!(and_with->flags & (ANYOF_NONBITMAP\|ANYOF_UNICODE_ALL)) &&
	804	!(and_with->flags & ANYOF_INVERT))
	805	cl->flags &= ~ANYOF_NONBITMAP;
	806	}
	807
	808	/* 'OR' a given class with another one. Can create false positives */
	809	/* We assume that cl is not inverted */
	810	STATIC void
	811	S_cl_or(const RExC_state_t pRExC_state, struct regnode_charclass_class cl, const struct regnode_charclass_class *or_with)
	812	{
	813	PERL_ARGS_ASSERT_CL_OR;
	814
	815	if (or_with->flags & ANYOF_INVERT) {
	816	/* We do not use
	817	* (B1 \| CL1) \| (!B2 & !CL2) = (B1 \| !B2 & !CL2) \| (CL1 \| (!B2 & !CL2))
	818	* <= (B1 \| !B2) \| (CL1 \| !CL2)
	819	* which is wasteful if CL2 is small, but we ignore CL2:
	820	* (B1 \| CL1) \| (!B2 & !CL2) <= (B1 \| CL1) \| !B2 = (B1 \| !B2) \| CL1
	821	* XXXX Can we handle case-fold? Unclear:
	822	* (OK1(i) \| OK1(i')) \| !(OK1(i) \| OK1(i')) =
	823	* (OK1(i) \| OK1(i')) \| (!OK1(i) & !OK1(i'))
	824	*/
	825	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	826	&& !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	827	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
	828	int i;
	829
	830	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	831	cl->bitmap[i] \|= ~or_with->bitmap[i];
	832	} /* XXXX: logic is complicated otherwise */
	833	else {
	834	cl_anything(pRExC_state, cl);
	835	}
	836	} else {
	837	/* (B1 \| CL1) \| (B2 \| CL2) = (B1 \| B2) \| (CL1 \| CL2)) */
	838	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	839	&& (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	840	\|\| (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
	841	int i;
	842
	843	/* OR char bitmap and class bitmap separately */
	844	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	845	cl->bitmap[i] \|= or_with->bitmap[i];
	846	if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
	847	for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
	848	cl->classflags[i] \|= or_with->classflags[i];
	849	cl->flags \|= ANYOF_CLASS;
	850	}
	851	}
	852	else { /* XXXX: logic is complicated, leave it along for a moment. */
	853	cl_anything(pRExC_state, cl);
	854	}
	855	}
	856	if (or_with->flags & ANYOF_EOS)
	857	cl->flags \|= ANYOF_EOS;
	858	if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
	859	cl->flags \|= ANYOF_NON_UTF8_LATIN1_ALL;
	860
	861	if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	862	cl->flags \|= ANYOF_LOC_NONBITMAP_FOLD;
	863
	864	/* If both nodes match something outside the bitmap, but what they match
	865	* outside is not the same pointer, and hence not easily compared, give up
	866	* and allow the start class to match everything outside the bitmap */
	867	if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
	868	ARG(cl) != ARG(or_with)) {
	869	cl->flags \|= ANYOF_UNICODE_ALL;
	870	}
	871
	872	if (or_with->flags & ANYOF_UNICODE_ALL) {
	873	cl->flags \|= ANYOF_UNICODE_ALL;
	874	}
	875	}
	876
	877	#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
	878	#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
	879	#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
	880	#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
	881
	882
	883	#ifdef DEBUGGING
	884	/*
	885	dump_trie(trie,widecharmap,revcharmap)
	886	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	887	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	888
	889	These routines dump out a trie in a somewhat readable format.
	890	The _interim_ variants are used for debugging the interim
	891	tables that are used to generate the final compressed
	892	representation which is what dump_trie expects.
	893
	894	Part of the reason for their existence is to provide a form
	895	of documentation as to how the different representations function.
	896
	897	*/
	898
	899	/*
	900	Dumps the final compressed table form of the trie to Perl_debug_log.
	901	Used for debugging make_trie().
	902	*/
	903
	904	STATIC void
	905	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	906	AV *revcharmap, U32 depth)
	907	{
	908	U32 state;
	909	SV *sv=sv_newmortal();
	910	int colwidth= widecharmap ? 6 : 4;
	911	U16 word;
	912	GET_RE_DEBUG_FLAGS_DECL;
	913
	914	PERL_ARGS_ASSERT_DUMP_TRIE;
	915
	916	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	917	(int)depth * 2 + 2,"",
	918	"Match","Base","Ofs" );
	919
	920	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	921	SV ** const tmp = av_fetch( revcharmap, state, 0);
	922	if ( tmp ) {
	923	PerlIO_printf( Perl_debug_log, "%*s",
	924	colwidth,
	925	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	926	PL_colors[0], PL_colors[1],
	927	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	928	PERL_PV_ESCAPE_FIRSTCHAR
	929	)
	930	);
	931	}
	932	}
	933	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	934	(int)depth * 2 + 2,"");
	935
	936	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	937	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	938	PerlIO_printf( Perl_debug_log, "\n");
	939
	940	for( state = 1 ; state < trie->statecount ; state++ ) {
	941	const U32 base = trie->states[ state ].trans.base;
	942
	943	PerlIO_printf( Perl_debug_log, "%s#%4"UVXf"\|", (int)depth 2 + 2,"", (UV)state);
	944
	945	if ( trie->states[ state ].wordnum ) {
	946	PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
	947	} else {
	948	PerlIO_printf( Perl_debug_log, "%6s", "" );
	949	}
	950
	951	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	952
	953	if ( base ) {
	954	U32 ofs = 0;
	955
	956	while( ( base + ofs < trie->uniquecharcount ) \|\|
	957	( base + ofs - trie->uniquecharcount < trie->lasttrans
	958	&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
	959	ofs++;
	960
	961	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	962
	963	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	964	if ( ( base + ofs >= trie->uniquecharcount ) &&
	965	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	966	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	967	{
	968	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	969	colwidth,
	970	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
	971	} else {
	972	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	973	}
	974	}
	975
	976	PerlIO_printf( Perl_debug_log, "]");
	977
	978	}
	979	PerlIO_printf( Perl_debug_log, "\n" );
	980	}
	981	PerlIO_printf(Perl_debug_log, "%sword_info N:(prev,len)=", (int)depth2, "");
	982	for (word=1; word <= trie->wordcount; word++) {
	983	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	984	(int)word, (int)(trie->wordinfo[word].prev),
	985	(int)(trie->wordinfo[word].len));
	986	}
	987	PerlIO_printf(Perl_debug_log, "\n" );
	988	}
	989	/*
	990	Dumps a fully constructed but uncompressed trie in list form.
	991	List tries normally only are used for construction when the number of
	992	possible chars (trie->uniquecharcount) is very high.
	993	Used for debugging make_trie().
	994	*/
	995	STATIC void
	996	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	997	HV widecharmap, AV revcharmap, U32 next_alloc,
	998	U32 depth)
	999	{
	1000	U32 state;
	1001	SV *sv=sv_newmortal();
	1002	int colwidth= widecharmap ? 6 : 4;
	1003	GET_RE_DEBUG_FLAGS_DECL;
	1004
	1005	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1006
	1007	/* print out the table precompression. */
	1008	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1009	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1010	"------:-----+-----------------\n" );
	1011
	1012	for( state=1 ; state < next_alloc ; state ++ ) {
	1013	U16 charid;
	1014
	1015	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1016	(int)depth * 2 + 2,"", (UV)state );
	1017	if ( ! trie->states[ state ].wordnum ) {
	1018	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1019	} else {
	1020	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1021	trie->states[ state ].wordnum
	1022	);
	1023	}
	1024	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1025	SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
	1026	if ( tmp ) {
	1027	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1028	colwidth,
	1029	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1030	PL_colors[0], PL_colors[1],
	1031	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1032	PERL_PV_ESCAPE_FIRSTCHAR
	1033	) ,
	1034	TRIE_LIST_ITEM(state,charid).forid,
	1035	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1036	);
	1037	if (!(charid % 10))
	1038	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1039	(int)((depth * 2) + 14), "");
	1040	}
	1041	}
	1042	PerlIO_printf( Perl_debug_log, "\n");
	1043	}
	1044	}
	1045
	1046	/*
	1047	Dumps a fully constructed but uncompressed trie in table form.
	1048	This is the normal DFA style state transition table, with a few
	1049	twists to facilitate compression later.
	1050	Used for debugging make_trie().
	1051	*/
	1052	STATIC void
	1053	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1054	HV widecharmap, AV revcharmap, U32 next_alloc,
	1055	U32 depth)
	1056	{
	1057	U32 state;
	1058	U16 charid;
	1059	SV *sv=sv_newmortal();
	1060	int colwidth= widecharmap ? 6 : 4;
	1061	GET_RE_DEBUG_FLAGS_DECL;
	1062
	1063	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1064
	1065	/*
	1066	print out the table precompression so that we can do a visual check
	1067	that they are identical.
	1068	*/
	1069
	1070	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1071
	1072	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1073	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1074	if ( tmp ) {
	1075	PerlIO_printf( Perl_debug_log, "%*s",
	1076	colwidth,
	1077	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1078	PL_colors[0], PL_colors[1],
	1079	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1080	PERL_PV_ESCAPE_FIRSTCHAR
	1081	)
	1082	);
	1083	}
	1084	}
	1085
	1086	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1087
	1088	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1089	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1090	}
	1091
	1092	PerlIO_printf( Perl_debug_log, "\n" );
	1093
	1094	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1095
	1096	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1097	(int)depth * 2 + 2,"",
	1098	(UV)TRIE_NODENUM( state ) );
	1099
	1100	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1101	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1102	if (v)
	1103	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1104	else
	1105	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1106	}
	1107	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1108	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
	1109	} else {
	1110	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
	1111	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1112	}
	1113	}
	1114	}
	1115
	1116	#endif
	1117
	1118
	1119	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1120	startbranch: the first branch in the whole branch sequence
	1121	first : start branch of sequence of branch-exact nodes.
	1122	May be the same as startbranch
	1123	last : Thing following the last branch.
	1124	May be the same as tail.
	1125	tail : item following the branch sequence
	1126	count : words in the sequence
	1127	flags : currently the OP() type we will be building one of /EXACT(\|F\|Fl)/
	1128	depth : indent depth
	1129
	1130	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1131
	1132	A trie is an N'ary tree where the branches are determined by digital
	1133	decomposition of the key. IE, at the root node you look up the 1st character and
	1134	follow that branch repeat until you find the end of the branches. Nodes can be
	1135	marked as "accepting" meaning they represent a complete word. Eg:
	1136
	1137	/he\|she\|his\|hers/
	1138
	1139	would convert into the following structure. Numbers represent states, letters
	1140	following numbers represent valid transitions on the letter from that state, if
	1141	the number is in square brackets it represents an accepting state, otherwise it
	1142	will be in parenthesis.
	1143
	1144	+-h->+-e->[3]-+-r->(8)-+-s->[9]
	1145	\| \|
	1146	\| (2)
	1147	\| \|
	1148	(1) +-i->(6)-+-s->[7]
	1149	\|
	1150	+-s->(3)-+-h->(4)-+-e->[5]
	1151
	1152	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1153
	1154	This shows that when matching against the string 'hers' we will begin at state 1
	1155	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1156	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1157	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1158	single traverse. We store a mapping from accepting to state to which word was
	1159	matched, and then when we have multiple possibilities we try to complete the
	1160	rest of the regex in the order in which they occured in the alternation.
	1161
	1162	The only prior NFA like behaviour that would be changed by the TRIE support is
	1163	the silent ignoring of duplicate alternations which are of the form:
	1164
	1165	/ (DUPE\|DUPE) X? (?{ ... }) Y /x
	1166
	1167	Thus EVAL blocks following a trie may be called a different number of times with
	1168	and without the optimisation. With the optimisations dupes will be silently
	1169	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1170	the following demonstrates:
	1171
	1172	'words'=~/(word\|word\|word)(?{ print $1 })[xyz]/
	1173
	1174	which prints out 'word' three times, but
	1175
	1176	'words'=~/(word\|word\|word)(?{ print $1 })S/
	1177
	1178	which doesnt print it out at all. This is due to other optimisations kicking in.
	1179
	1180	Example of what happens on a structural level:
	1181
	1182	The regexp /(ac\|ad\|ab)+/ will produce the following debug output:
	1183
	1184	1: CURLYM[1] {1,32767}(18)
	1185	5: BRANCH(8)
	1186	6: EXACT <ac>(16)
	1187	8: BRANCH(11)
	1188	9: EXACT <ad>(16)
	1189	11: BRANCH(14)
	1190	12: EXACT <ab>(16)
	1191	16: SUCCEED(0)
	1192	17: NOTHING(18)
	1193	18: END(0)
	1194
	1195	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1196	and should turn into:
	1197
	1198	1: CURLYM[1] {1,32767}(18)
	1199	5: TRIE(16)
	1200	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1201	<ac>
	1202	<ad>
	1203	<ab>
	1204	16: SUCCEED(0)
	1205	17: NOTHING(18)
	1206	18: END(0)
	1207
	1208	Cases where tail != last would be like /(?foo\|bar)baz/:
	1209
	1210	1: BRANCH(4)
	1211	2: EXACT <foo>(8)
	1212	4: BRANCH(7)
	1213	5: EXACT <bar>(8)
	1214	7: TAIL(8)
	1215	8: EXACT <baz>(10)
	1216	10: END(0)
	1217
	1218	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1219	and would end up looking like:
	1220
	1221	1: TRIE(8)
	1222	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1223	<foo>
	1224	<bar>
	1225	7: TAIL(8)
	1226	8: EXACT <baz>(10)
	1227	10: END(0)
	1228
	1229	d = uvuni_to_utf8_flags(d, uv, 0);
	1230
	1231	is the recommended Unicode-aware way of saying
	1232
	1233	*(d++) = uv;
	1234	*/
	1235
	1236	#define TRIE_STORE_REVCHAR \
	1237	STMT_START { \
	1238	if (UTF) { \
	1239	SV *zlopp = newSV(2); \
	1240	unsigned char flrbbbbb = (unsigned char ) SvPVX(zlopp); \
	1241	unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
	1242	SvCUR_set(zlopp, kapow - flrbbbbb); \
	1243	SvPOK_on(zlopp); \
	1244	SvUTF8_on(zlopp); \
	1245	av_push(revcharmap, zlopp); \
	1246	} else { \
	1247	char ooooff = (char)uvc; \
	1248	av_push(revcharmap, newSVpvn(&ooooff, 1)); \
	1249	} \
	1250	} STMT_END
	1251
	1252	#define TRIE_READ_CHAR STMT_START { \
	1253	wordlen++; \
	1254	if ( UTF ) { \
	1255	if ( folder ) { \
	1256	if ( foldlen > 0 ) { \
	1257	uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
	1258	foldlen -= len; \
	1259	scan += len; \
	1260	len = 0; \
	1261	} else { \
	1262	uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
	1263	uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
	1264	foldlen -= UNISKIP( uvc ); \
	1265	scan = foldbuf + UNISKIP( uvc ); \
	1266	} \
	1267	} else { \
	1268	uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
	1269	} \
	1270	} else { \
	1271	uvc = (U32)*uc; \
	1272	len = 1; \
	1273	} \
	1274	} STMT_END
	1275
	1276
	1277
	1278	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
	1279	if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
	1280	U32 ging = TRIE_LIST_LEN( state ) *= 2; \
	1281	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
	1282	} \
	1283	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
	1284	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
	1285	TRIE_LIST_CUR( state )++; \
	1286	} STMT_END
	1287
	1288	#define TRIE_LIST_NEW(state) STMT_START { \
	1289	Newxz( trie->states[ state ].trans.list, \
	1290	4, reg_trie_trans_le ); \
	1291	TRIE_LIST_CUR( state ) = 1; \
	1292	TRIE_LIST_LEN( state ) = 4; \
	1293	} STMT_END
	1294
	1295	#define TRIE_HANDLE_WORD(state) STMT_START { \
	1296	U16 dupe= trie->states[ state ].wordnum; \
	1297	regnode * const noper_next = regnext( noper ); \
	1298	\
	1299	DEBUG_r({ \
	1300	/* store the word for dumping */ \
	1301	SV* tmp; \
	1302	if (OP(noper) != NOTHING) \
	1303	tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
	1304	else \
	1305	tmp = newSVpvn_utf8( "", 0, UTF ); \
	1306	av_push( trie_words, tmp ); \
	1307	}); \
	1308	\
	1309	curword++; \
	1310	trie->wordinfo[curword].prev = 0; \
	1311	trie->wordinfo[curword].len = wordlen; \
	1312	trie->wordinfo[curword].accept = state; \
	1313	\
	1314	if ( noper_next < tail ) { \
	1315	if (!trie->jump) \
	1316	trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
	1317	trie->jump[curword] = (U16)(noper_next - convert); \
	1318	if (!jumper) \
	1319	jumper = noper_next; \
	1320	if (!nextbranch) \
	1321	nextbranch= regnext(cur); \
	1322	} \
	1323	\
	1324	if ( dupe ) { \
	1325	/* It's a dupe. Pre-insert into the wordinfo[].prev */\
	1326	/* chain, so that when the bits of chain are later */\
	1327	/* linked together, the dups appear in the chain */\
	1328	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
	1329	trie->wordinfo[dupe].prev = curword; \
	1330	} else { \
	1331	/* we haven't inserted this word yet. */ \
	1332	trie->states[ state ].wordnum = curword; \
	1333	} \
	1334	} STMT_END
	1335
	1336
	1337	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
	1338	( ( base + charid >= ucharcount \
	1339	&& base + charid < ubound \
	1340	&& state == trie->trans[ base - ucharcount + charid ].check \
	1341	&& trie->trans[ base - ucharcount + charid ].next ) \
	1342	? trie->trans[ base - ucharcount + charid ].next \
	1343	: ( state==1 ? special : 0 ) \
	1344	)
	1345
	1346	#define MADE_TRIE 1
	1347	#define MADE_JUMP_TRIE 2
	1348	#define MADE_EXACT_TRIE 4
	1349
	1350	STATIC I32
	1351	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch, regnode first, regnode last, regnode *tail, U32 word_count, U32 flags, U32 depth)
	1352	{
	1353	dVAR;
	1354	/* first pass, loop through and scan words */
	1355	reg_trie_data *trie;
	1356	HV *widecharmap = NULL;
	1357	AV *revcharmap = newAV();
	1358	regnode *cur;
	1359	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1360	STRLEN len = 0;
	1361	UV uvc = 0;
	1362	U16 curword = 0;
	1363	U32 next_alloc = 0;
	1364	regnode *jumper = NULL;
	1365	regnode *nextbranch = NULL;
	1366	regnode *convert = NULL;
	1367	U32 prev_states; / temp array mapping each state to previous one */
	1368	/* we just use folder as a flag in utf8 */
	1369	const U8 * folder = NULL;
	1370
	1371	#ifdef DEBUGGING
	1372	const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
	1373	AV *trie_words = NULL;
	1374	/* along with revcharmap, this only used during construction but both are
	1375	* useful during debugging so we store them in the struct when debugging.
	1376	*/
	1377	#else
	1378	const U32 data_slot = add_data( pRExC_state, 2, "tu" );
	1379	STRLEN trie_charcount=0;
	1380	#endif
	1381	SV *re_trie_maxbuff;
	1382	GET_RE_DEBUG_FLAGS_DECL;
	1383
	1384	PERL_ARGS_ASSERT_MAKE_TRIE;
	1385	#ifndef DEBUGGING
	1386	PERL_UNUSED_ARG(depth);
	1387	#endif
	1388
	1389	switch (flags) {
	1390	case EXACTFU: folder = PL_fold_latin1; break;
	1391	case EXACTF: folder = PL_fold; break;
	1392	case EXACTFL: folder = PL_fold_locale; break;
	1393	}
	1394
	1395	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1396	trie->refcount = 1;
	1397	trie->startstate = 1;
	1398	trie->wordcount = word_count;
	1399	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1400	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1401	if (!(UTF && folder))
	1402	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1403	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1404	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1405
	1406	DEBUG_r({
	1407	trie_words = newAV();
	1408	});
	1409
	1410	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1411	if (!SvIOK(re_trie_maxbuff)) {
	1412	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1413	}
	1414	DEBUG_OPTIMISE_r({
	1415	PerlIO_printf( Perl_debug_log,
	1416	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	1417	(int)depth * 2 + 2, "",
	1418	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	1419	REG_NODE_NUM(last), REG_NODE_NUM(tail),
	1420	(int)depth);
	1421	});
	1422
	1423	/* Find the node we are going to overwrite */
	1424	if ( first == startbranch && OP( last ) != BRANCH ) {
	1425	/* whole branch chain */
	1426	convert = first;
	1427	} else {
	1428	/* branch sub-chain */
	1429	convert = NEXTOPER( first );
	1430	}
	1431
	1432	/* -- First loop and Setup --
	1433
	1434	We first traverse the branches and scan each word to determine if it
	1435	contains widechars, and how many unique chars there are, this is
	1436	important as we have to build a table with at least as many columns as we
	1437	have unique chars.
	1438
	1439	We use an array of integers to represent the character codes 0..255
	1440	(trie->charmap) and we use a an HV* to store Unicode characters. We use the
	1441	native representation of the character value as the key and IV's for the
	1442	coded index.
	1443
	1444	TODO If we keep track of how many times each character is used we can
	1445	remap the columns so that the table compression later on is more
	1446	efficient in terms of memory by ensuring the most common value is in the
	1447	middle and the least common are on the outside. IMO this would be better
	1448	than a most to least common mapping as theres a decent chance the most
	1449	common letter will share a node with the least common, meaning the node
	1450	will not be compressible. With a middle is most common approach the worst
	1451	case is when we have the least common nodes twice.
	1452
	1453	*/
	1454
	1455	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1456	regnode * const noper = NEXTOPER( cur );
	1457	const U8 uc = (U8)STRING( noper );
	1458	const U8 * const e = uc + STR_LEN( noper );
	1459	STRLEN foldlen = 0;
	1460	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1461	const U8 scan = (U8)NULL;
	1462	U32 wordlen = 0; /* required init */
	1463	STRLEN chars = 0;
	1464	bool set_bit = trie->bitmap ? 1 : 0; /store the first char in the bitmap?/
	1465
	1466	if (OP(noper) == NOTHING) {
	1467	trie->minlen= 0;
	1468	continue;
	1469	}
	1470	if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
	1471	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	1472	regardless of encoding */
	1473
	1474	for ( ; uc < e ; uc += len ) {
	1475	TRIE_CHARCOUNT(trie)++;
	1476	TRIE_READ_CHAR;
	1477	chars++;
	1478	if ( uvc < 256 ) {
	1479	if ( !trie->charmap[ uvc ] ) {
	1480	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	1481	if ( folder )
	1482	trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
	1483	TRIE_STORE_REVCHAR;
	1484	}
	1485	if ( set_bit ) {
	1486	/* store the codepoint in the bitmap, and its folded
	1487	* equivalent. */
	1488	TRIE_BITMAP_SET(trie,uvc);
	1489
	1490	/* store the folded codepoint */
	1491	if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
	1492
	1493	if ( !UTF ) {
	1494	/* store first byte of utf8 representation of
	1495	variant codepoints */
	1496	if (! UNI_IS_INVARIANT(uvc)) {
	1497	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	1498	}
	1499	}
	1500	set_bit = 0; /* We've done our bit :-) */
	1501	}
	1502	} else {
	1503	SV** svpp;
	1504	if ( !widecharmap )
	1505	widecharmap = newHV();
	1506
	1507	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	1508
	1509	if ( !svpp )
	1510	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	1511
	1512	if ( !SvTRUE( *svpp ) ) {
	1513	sv_setiv( *svpp, ++trie->uniquecharcount );
	1514	TRIE_STORE_REVCHAR;
	1515	}
	1516	}
	1517	}
	1518	if( cur == first ) {
	1519	trie->minlen=chars;
	1520	trie->maxlen=chars;
	1521	} else if (chars < trie->minlen) {
	1522	trie->minlen=chars;
	1523	} else if (chars > trie->maxlen) {
	1524	trie->maxlen=chars;
	1525	}
	1526
	1527	} /* end first pass */
	1528	DEBUG_TRIE_COMPILE_r(
	1529	PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	1530	(int)depth * 2 + 2,"",
	1531	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	1532	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	1533	(int)trie->minlen, (int)trie->maxlen )
	1534	);
	1535
	1536	/*
	1537	We now know what we are dealing with in terms of unique chars and
	1538	string sizes so we can calculate how much memory a naive
	1539	representation using a flat table will take. If it's over a reasonable
	1540	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	1541	conservative but potentially much slower representation using an array
	1542	of lists.
	1543
	1544	At the end we convert both representations into the same compressed
	1545	form that will be used in regexec.c for matching with. The latter
	1546	is a form that cannot be used to construct with but has memory
	1547	properties similar to the list form and access properties similar
	1548	to the table form making it both suitable for fast searches and
	1549	small enough that its feasable to store for the duration of a program.
	1550
	1551	See the comment in the code where the compressed table is produced
	1552	inplace from the flat tabe representation for an explanation of how
	1553	the compression works.
	1554
	1555	*/
	1556
	1557
	1558	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	1559	prev_states[1] = 0;
	1560
	1561	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
	1562	/*
	1563	Second Pass -- Array Of Lists Representation
	1564
	1565	Each state will be represented by a list of charid:state records
	1566	(reg_trie_trans_le) the first such element holds the CUR and LEN
	1567	points of the allocated array. (See defines above).
	1568
	1569	We build the initial structure using the lists, and then convert
	1570	it into the compressed table form which allows faster lookups
	1571	(but cant be modified once converted).
	1572	*/
	1573
	1574	STRLEN transcount = 1;
	1575
	1576	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1577	"%*sCompiling trie using list compiler\n",
	1578	(int)depth * 2 + 2, ""));
	1579
	1580	trie->states = (reg_trie_state *)
	1581	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1582	sizeof(reg_trie_state) );
	1583	TRIE_LIST_NEW(1);
	1584	next_alloc = 2;
	1585
	1586	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1587
	1588	regnode * const noper = NEXTOPER( cur );
	1589	U8 uc = (U8)STRING( noper );
	1590	const U8 * const e = uc + STR_LEN( noper );
	1591	U32 state = 1; /* required init */
	1592	U16 charid = 0; /* sanity init */
	1593	U8 scan = (U8)NULL; /* sanity init */
	1594	STRLEN foldlen = 0; /* required init */
	1595	U32 wordlen = 0; /* required init */
	1596	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1597
	1598	if (OP(noper) != NOTHING) {
	1599	for ( ; uc < e ; uc += len ) {
	1600
	1601	TRIE_READ_CHAR;
	1602
	1603	if ( uvc < 256 ) {
	1604	charid = trie->charmap[ uvc ];
	1605	} else {
	1606	SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1607	if ( !svpp ) {
	1608	charid = 0;
	1609	} else {
	1610	charid=(U16)SvIV( *svpp );
	1611	}
	1612	}
	1613	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1614	if ( charid ) {
	1615
	1616	U16 check;
	1617	U32 newstate = 0;
	1618
	1619	charid--;
	1620	if ( !trie->states[ state ].trans.list ) {
	1621	TRIE_LIST_NEW( state );
	1622	}
	1623	for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
	1624	if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
	1625	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	1626	break;
	1627	}
	1628	}
	1629	if ( ! newstate ) {
	1630	newstate = next_alloc++;
	1631	prev_states[newstate] = state;
	1632	TRIE_LIST_PUSH( state, charid, newstate );
	1633	transcount++;
	1634	}
	1635	state = newstate;
	1636	} else {
	1637	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1638	}
	1639	}
	1640	}
	1641	TRIE_HANDLE_WORD(state);
	1642
	1643	} /* end second pass */
	1644
	1645	/* next alloc is the NEXT state to be allocated */
	1646	trie->statecount = next_alloc;
	1647	trie->states = (reg_trie_state *)
	1648	PerlMemShared_realloc( trie->states,
	1649	next_alloc
	1650	* sizeof(reg_trie_state) );
	1651
	1652	/* and now dump it out before we compress it */
	1653	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	1654	revcharmap, next_alloc,
	1655	depth+1)
	1656	);
	1657
	1658	trie->trans = (reg_trie_trans *)
	1659	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	1660	{
	1661	U32 state;
	1662	U32 tp = 0;
	1663	U32 zp = 0;
	1664
	1665
	1666	for( state=1 ; state < next_alloc ; state ++ ) {
	1667	U32 base=0;
	1668
	1669	/*
	1670	DEBUG_TRIE_COMPILE_MORE_r(
	1671	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	1672	);
	1673	*/
	1674
	1675	if (trie->states[state].trans.list) {
	1676	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1677	U16 maxid=minid;
	1678	U16 idx;
	1679
	1680	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1681	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1682	if ( forid < minid ) {
	1683	minid=forid;
	1684	} else if ( forid > maxid ) {
	1685	maxid=forid;
	1686	}
	1687	}
	1688	if ( transcount < tp + maxid - minid + 1) {
	1689	transcount *= 2;
	1690	trie->trans = (reg_trie_trans *)
	1691	PerlMemShared_realloc( trie->trans,
	1692	transcount
	1693	* sizeof(reg_trie_trans) );
	1694	Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
	1695	}
	1696	base = trie->uniquecharcount + tp - minid;
	1697	if ( maxid == minid ) {
	1698	U32 set = 0;
	1699	for ( ; zp < tp ; zp++ ) {
	1700	if ( ! trie->trans[ zp ].next ) {
	1701	base = trie->uniquecharcount + zp - minid;
	1702	trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1703	trie->trans[ zp ].check = state;
	1704	set = 1;
	1705	break;
	1706	}
	1707	}
	1708	if ( !set ) {
	1709	trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1710	trie->trans[ tp ].check = state;
	1711	tp++;
	1712	zp = tp;
	1713	}
	1714	} else {
	1715	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1716	const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
	1717	trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
	1718	trie->trans[ tid ].check = state;
	1719	}
	1720	tp += ( maxid - minid + 1 );
	1721	}
	1722	Safefree(trie->states[ state ].trans.list);
	1723	}
	1724	/*
	1725	DEBUG_TRIE_COMPILE_MORE_r(
	1726	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	1727	);
	1728	*/
	1729	trie->states[ state ].trans.base=base;
	1730	}
	1731	trie->lasttrans = tp + 1;
	1732	}
	1733	} else {
	1734	/*
	1735	Second Pass -- Flat Table Representation.
	1736
	1737	we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
	1738	We know that we will need Charcount+1 trans at most to store the data
	1739	(one row per char at worst case) So we preallocate both structures
	1740	assuming worst case.
	1741
	1742	We then construct the trie using only the .next slots of the entry
	1743	structs.
	1744
	1745	We use the .check field of the first entry of the node temporarily to
	1746	make compression both faster and easier by keeping track of how many non
	1747	zero fields are in the node.
	1748
	1749	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	1750	transition.
	1751
	1752	There are two terms at use here: state as a TRIE_NODEIDX() which is a
	1753	number representing the first entry of the node, and state as a
	1754	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
	1755	TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
	1756	are 2 entrys per node. eg:
	1757
	1758	A B A B
	1759	1. 2 4 1. 3 7
	1760	2. 0 3 3. 0 5
	1761	3. 0 0 5. 0 0
	1762	4. 0 0 7. 0 0
	1763
	1764	The table is internally in the right hand, idx form. However as we also
	1765	have to deal with the states array which is indexed by nodenum we have to
	1766	use TRIE_NODENUM() to convert.
	1767
	1768	*/
	1769	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1770	"%*sCompiling trie using table compiler\n",
	1771	(int)depth * 2 + 2, ""));
	1772
	1773	trie->trans = (reg_trie_trans *)
	1774	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	1775	* trie->uniquecharcount + 1,
	1776	sizeof(reg_trie_trans) );
	1777	trie->states = (reg_trie_state *)
	1778	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1779	sizeof(reg_trie_state) );
	1780	next_alloc = trie->uniquecharcount + 1;
	1781
	1782
	1783	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1784
	1785	regnode * const noper = NEXTOPER( cur );
	1786	const U8 uc = (U8)STRING( noper );
	1787	const U8 * const e = uc + STR_LEN( noper );
	1788
	1789	U32 state = 1; /* required init */
	1790
	1791	U16 charid = 0; /* sanity init */
	1792	U32 accept_state = 0; /* sanity init */
	1793	U8 scan = (U8)NULL; /* sanity init */
	1794
	1795	STRLEN foldlen = 0; /* required init */
	1796	U32 wordlen = 0; /* required init */
	1797	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1798
	1799	if ( OP(noper) != NOTHING ) {
	1800	for ( ; uc < e ; uc += len ) {
	1801
	1802	TRIE_READ_CHAR;
	1803
	1804	if ( uvc < 256 ) {
	1805	charid = trie->charmap[ uvc ];
	1806	} else {
	1807	SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1808	charid = svpp ? (U16)SvIV(*svpp) : 0;
	1809	}
	1810	if ( charid ) {
	1811	charid--;
	1812	if ( !trie->trans[ state + charid ].next ) {
	1813	trie->trans[ state + charid ].next = next_alloc;
	1814	trie->trans[ state ].check++;
	1815	prev_states[TRIE_NODENUM(next_alloc)]
	1816	= TRIE_NODENUM(state);
	1817	next_alloc += trie->uniquecharcount;
	1818	}
	1819	state = trie->trans[ state + charid ].next;
	1820	} else {
	1821	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1822	}
	1823	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1824	}
	1825	}
	1826	accept_state = TRIE_NODENUM( state );
	1827	TRIE_HANDLE_WORD(accept_state);
	1828
	1829	} /* end second pass */
	1830
	1831	/* and now dump it out before we compress it */
	1832	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	1833	revcharmap,
	1834	next_alloc, depth+1));
	1835
	1836	{
	1837	/*
	1838	* Inplace compress the table.*
	1839
	1840	For sparse data sets the table constructed by the trie algorithm will
	1841	be mostly 0/FAIL transitions or to put it another way mostly empty.
	1842	(Note that leaf nodes will not contain any transitions.)
	1843
	1844	This algorithm compresses the tables by eliminating most such
	1845	transitions, at the cost of a modest bit of extra work during lookup:
	1846
	1847	- Each states[] entry contains a .base field which indicates the
	1848	index in the state[] array wheres its transition data is stored.
	1849
	1850	- If .base is 0 there are no valid transitions from that node.
	1851
	1852	- If .base is nonzero then charid is added to it to find an entry in
	1853	the trans array.
	1854
	1855	-If trans[states[state].base+charid].check!=state then the
	1856	transition is taken to be a 0/Fail transition. Thus if there are fail
	1857	transitions at the front of the node then the .base offset will point
	1858	somewhere inside the previous nodes data (or maybe even into a node
	1859	even earlier), but the .check field determines if the transition is
	1860	valid.
	1861
	1862	XXX - wrong maybe?
	1863	The following process inplace converts the table to the compressed
	1864	table: We first do not compress the root node 1,and mark all its
	1865	.check pointers as 1 and set its .base pointer as 1 as well. This
	1866	allows us to do a DFA construction from the compressed table later,
	1867	and ensures that any .base pointers we calculate later are greater
	1868	than 0.
	1869
	1870	- We set 'pos' to indicate the first entry of the second node.
	1871
	1872	- We then iterate over the columns of the node, finding the first and
	1873	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	1874	and set the .check pointers accordingly, and advance pos
	1875	appropriately and repreat for the next node. Note that when we copy
	1876	the next pointers we have to convert them from the original
	1877	NODEIDX form to NODENUM form as the former is not valid post
	1878	compression.
	1879
	1880	- If a node has no transitions used we mark its base as 0 and do not
	1881	advance the pos pointer.
	1882
	1883	- If a node only has one transition we use a second pointer into the
	1884	structure to fill in allocated fail transitions from other states.
	1885	This pointer is independent of the main pointer and scans forward
	1886	looking for null transitions that are allocated to a state. When it
	1887	finds one it writes the single transition into the "hole". If the
	1888	pointer doesnt find one the single transition is appended as normal.
	1889
	1890	- Once compressed we can Renew/realloc the structures to release the
	1891	excess space.
	1892
	1893	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	1894	specifically Fig 3.47 and the associated pseudocode.
	1895
	1896	demq
	1897	*/
	1898	const U32 laststate = TRIE_NODENUM( next_alloc );
	1899	U32 state, charid;
	1900	U32 pos = 0, zp=0;
	1901	trie->statecount = laststate;
	1902
	1903	for ( state = 1 ; state < laststate ; state++ ) {
	1904	U8 flag = 0;
	1905	const U32 stateidx = TRIE_NODEIDX( state );
	1906	const U32 o_used = trie->trans[ stateidx ].check;
	1907	U32 used = trie->trans[ stateidx ].check;
	1908	trie->trans[ stateidx ].check = 0;
	1909
	1910	for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
	1911	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	1912	if ( trie->trans[ stateidx + charid ].next ) {
	1913	if (o_used == 1) {
	1914	for ( ; zp < pos ; zp++ ) {
	1915	if ( ! trie->trans[ zp ].next ) {
	1916	break;
	1917	}
	1918	}
	1919	trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
	1920	trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	1921	trie->trans[ zp ].check = state;
	1922	if ( ++zp > pos ) pos = zp;
	1923	break;
	1924	}
	1925	used--;
	1926	}
	1927	if ( !flag ) {
	1928	flag = 1;
	1929	trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
	1930	}
	1931	trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	1932	trie->trans[ pos ].check = state;
	1933	pos++;
	1934	}
	1935	}
	1936	}
	1937	trie->lasttrans = pos + 1;
	1938	trie->states = (reg_trie_state *)
	1939	PerlMemShared_realloc( trie->states, laststate
	1940	* sizeof(reg_trie_state) );
	1941	DEBUG_TRIE_COMPILE_MORE_r(
	1942	PerlIO_printf( Perl_debug_log,
	1943	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	1944	(int)depth * 2 + 2,"",
	1945	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
	1946	(IV)next_alloc,
	1947	(IV)pos,
	1948	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	1949	);
	1950
	1951	} /* end table compress */
	1952	}
	1953	DEBUG_TRIE_COMPILE_MORE_r(
	1954	PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	1955	(int)depth * 2 + 2, "",
	1956	(UV)trie->statecount,
	1957	(UV)trie->lasttrans)
	1958	);
	1959	/* resize the trans array to remove unused space */
	1960	trie->trans = (reg_trie_trans *)
	1961	PerlMemShared_realloc( trie->trans, trie->lasttrans
	1962	* sizeof(reg_trie_trans) );
	1963
	1964	{ /* Modify the program and insert the new TRIE node */
	1965	U8 nodetype =(U8)(flags & 0xFF);
	1966	char *str=NULL;
	1967
	1968	#ifdef DEBUGGING
	1969	regnode *optimize = NULL;
	1970	#ifdef RE_TRACK_PATTERN_OFFSETS
	1971
	1972	U32 mjd_offset = 0;
	1973	U32 mjd_nodelen = 0;
	1974	#endif /* RE_TRACK_PATTERN_OFFSETS */
	1975	#endif /* DEBUGGING */
	1976	/*
	1977	This means we convert either the first branch or the first Exact,
	1978	depending on whether the thing following (in 'last') is a branch
	1979	or not and whther first is the startbranch (ie is it a sub part of
	1980	the alternation or is it the whole thing.)
	1981	Assuming its a sub part we convert the EXACT otherwise we convert
	1982	the whole branch sequence, including the first.
	1983	*/
	1984	/* Find the node we are going to overwrite */
	1985	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	1986	/* branch sub-chain */
	1987	NEXT_OFF( first ) = (U16)(last - first);
	1988	#ifdef RE_TRACK_PATTERN_OFFSETS
	1989	DEBUG_r({
	1990	mjd_offset= Node_Offset((convert));
	1991	mjd_nodelen= Node_Length((convert));
	1992	});
	1993	#endif
	1994	/* whole branch chain */
	1995	}
	1996	#ifdef RE_TRACK_PATTERN_OFFSETS
	1997	else {
	1998	DEBUG_r({
	1999	const regnode *nop = NEXTOPER( convert );
	2000	mjd_offset= Node_Offset((nop));
	2001	mjd_nodelen= Node_Length((nop));
	2002	});
	2003	}
	2004	DEBUG_OPTIMISE_r(
	2005	PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2006	(int)depth * 2 + 2, "",
	2007	(UV)mjd_offset, (UV)mjd_nodelen)
	2008	);
	2009	#endif
	2010	/* But first we check to see if there is a common prefix we can
	2011	split out as an EXACT and put in front of the TRIE node. */
	2012	trie->startstate= 1;
	2013	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2014	U32 state;
	2015	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2016	U32 ofs = 0;
	2017	I32 idx = -1;
	2018	U32 count = 0;
	2019	const U32 base = trie->states[ state ].trans.base;
	2020
	2021	if ( trie->states[state].wordnum )
	2022	count = 1;
	2023
	2024	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2025	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2026	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2027	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2028	{
	2029	if ( ++count > 1 ) {
	2030	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2031	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2032	if ( state == 1 ) break;
	2033	if ( count == 2 ) {
	2034	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2035	DEBUG_OPTIMISE_r(
	2036	PerlIO_printf(Perl_debug_log,
	2037	"%*sNew Start State=%"UVuf" Class: [",
	2038	(int)depth * 2 + 2, "",
	2039	(UV)state));
	2040	if (idx >= 0) {
	2041	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2042	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2043
	2044	TRIE_BITMAP_SET(trie,*ch);
	2045	if ( folder )
	2046	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2047	DEBUG_OPTIMISE_r(
	2048	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2049	);
	2050	}
	2051	}
	2052	TRIE_BITMAP_SET(trie,*ch);
	2053	if ( folder )
	2054	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2055	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2056	}
	2057	idx = ofs;
	2058	}
	2059	}
	2060	if ( count == 1 ) {
	2061	SV **tmp = av_fetch( revcharmap, idx, 0);
	2062	STRLEN len;
	2063	char ch = SvPV( tmp, len );
	2064	DEBUG_OPTIMISE_r({
	2065	SV *sv=sv_newmortal();
	2066	PerlIO_printf( Perl_debug_log,
	2067	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2068	(int)depth * 2 + 2, "",
	2069	(UV)state, (UV)idx,
	2070	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2071	PL_colors[0], PL_colors[1],
	2072	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2073	PERL_PV_ESCAPE_FIRSTCHAR
	2074	)
	2075	);
	2076	});
	2077	if ( state==1 ) {
	2078	OP( convert ) = nodetype;
	2079	str=STRING(convert);
	2080	STR_LEN(convert)=0;
	2081	}
	2082	STR_LEN(convert) += len;
	2083	while (len--)
	2084	str++ = ch++;
	2085	} else {
	2086	#ifdef DEBUGGING
	2087	if (state>1)
	2088	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2089	#endif
	2090	break;
	2091	}
	2092	}
	2093	trie->prefixlen = (state-1);
	2094	if (str) {
	2095	regnode *n = convert+NODE_SZ_STR(convert);
	2096	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2097	trie->startstate = state;
	2098	trie->minlen -= (state - 1);
	2099	trie->maxlen -= (state - 1);
	2100	#ifdef DEBUGGING
	2101	/* At least the UNICOS C compiler choked on this
	2102	* being argument to DEBUG_r(), so let's just have
	2103	* it right here. */
	2104	if (
	2105	#ifdef PERL_EXT_RE_BUILD
	2106	1
	2107	#else
	2108	DEBUG_r_TEST
	2109	#endif
	2110	) {
	2111	regnode *fix = convert;
	2112	U32 word = trie->wordcount;
	2113	mjd_nodelen++;
	2114	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2115	while( ++fix < n ) {
	2116	Set_Node_Offset_Length(fix, 0, 0);
	2117	}
	2118	while (word--) {
	2119	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2120	if (tmp) {
	2121	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2122	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2123	else
	2124	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2125	}
	2126	}
	2127	}
	2128	#endif
	2129	if (trie->maxlen) {
	2130	convert = n;
	2131	} else {
	2132	NEXT_OFF(convert) = (U16)(tail - convert);
	2133	DEBUG_r(optimize= n);
	2134	}
	2135	}
	2136	}
	2137	if (!jumper)
	2138	jumper = last;
	2139	if ( trie->maxlen ) {
	2140	NEXT_OFF( convert ) = (U16)(tail - convert);
	2141	ARG_SET( convert, data_slot );
	2142	/* Store the offset to the first unabsorbed branch in
	2143	jump[0], which is otherwise unused by the jump logic.
	2144	We use this when dumping a trie and during optimisation. */
	2145	if (trie->jump)
	2146	trie->jump[0] = (U16)(nextbranch - convert);
	2147
	2148	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2149	* and there is a bitmap
	2150	* and the first "jump target" node we found leaves enough room
	2151	* then convert the TRIE node into a TRIEC node, with the bitmap
	2152	* embedded inline in the opcode - this is hypothetically faster.
	2153	*/
	2154	if ( !trie->states[trie->startstate].wordnum
	2155	&& trie->bitmap
	2156	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2157	{
	2158	OP( convert ) = TRIEC;
	2159	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2160	PerlMemShared_free(trie->bitmap);
	2161	trie->bitmap= NULL;
	2162	} else
	2163	OP( convert ) = TRIE;
	2164
	2165	/* store the type in the flags */
	2166	convert->flags = nodetype;
	2167	DEBUG_r({
	2168	optimize = convert
	2169	+ NODE_STEP_REGNODE
	2170	+ regarglen[ OP( convert ) ];
	2171	});
	2172	/* XXX We really should free up the resource in trie now,
	2173	as we won't use them - (which resources?) dmq */
	2174	}
	2175	/* needed for dumping*/
	2176	DEBUG_r(if (optimize) {
	2177	regnode *opt = convert;
	2178
	2179	while ( ++opt < optimize) {
	2180	Set_Node_Offset_Length(opt,0,0);
	2181	}
	2182	/*
	2183	Try to clean up some of the debris left after the
	2184	optimisation.
	2185	*/
	2186	while( optimize < jumper ) {
	2187	mjd_nodelen += Node_Length((optimize));
	2188	OP( optimize ) = OPTIMIZED;
	2189	Set_Node_Offset_Length(optimize,0,0);
	2190	optimize++;
	2191	}
	2192	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2193	});
	2194	} /* end node insert */
	2195
	2196	/* Finish populating the prev field of the wordinfo array. Walk back
	2197	* from each accept state until we find another accept state, and if
	2198	* so, point the first word's .prev field at the second word. If the
	2199	* second already has a .prev field set, stop now. This will be the
	2200	* case either if we've already processed that word's accept state,
	2201	* or that state had multiple words, and the overspill words were
	2202	* already linked up earlier.
	2203	*/
	2204	{
	2205	U16 word;
	2206	U32 state;
	2207	U16 prev;
	2208
	2209	for (word=1; word <= trie->wordcount; word++) {
	2210	prev = 0;
	2211	if (trie->wordinfo[word].prev)
	2212	continue;
	2213	state = trie->wordinfo[word].accept;
	2214	while (state) {
	2215	state = prev_states[state];
	2216	if (!state)
	2217	break;
	2218	prev = trie->states[state].wordnum;
	2219	if (prev)
	2220	break;
	2221	}
	2222	trie->wordinfo[word].prev = prev;
	2223	}
	2224	Safefree(prev_states);
	2225	}
	2226
	2227
	2228	/* and now dump out the compressed format */
	2229	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2230
	2231	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2232	#ifdef DEBUGGING
	2233	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2234	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2235	#else
	2236	SvREFCNT_dec(revcharmap);
	2237	#endif
	2238	return trie->jump
	2239	? MADE_JUMP_TRIE
	2240	: trie->startstate>1
	2241	? MADE_EXACT_TRIE
	2242	: MADE_TRIE;
	2243	}
	2244
	2245	STATIC void
	2246	S_make_trie_failtable(pTHX_ RExC_state_t pRExC_state, regnode source, regnode *stclass, U32 depth)
	2247	{
	2248	/* The Trie is constructed and compressed now so we can build a fail array if it's needed
	2249
	2250	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
	2251	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
	2252	ISBN 0-201-10088-6
	2253
	2254	We find the fail state for each state in the trie, this state is the longest proper
	2255	suffix of the current state's 'word' that is also a proper prefix of another word in our
	2256	trie. State 1 represents the word '' and is thus the default fail state. This allows
	2257	the DFA not to have to restart after its tried and failed a word at a given point, it
	2258	simply continues as though it had been matching the other word in the first place.
	2259	Consider
	2260	'abcdgu'=~/abcdefg\|cdgu/
	2261	When we get to 'd' we are still matching the first word, we would encounter 'g' which would
	2262	fail, which would bring us to the state representing 'd' in the second word where we would
	2263	try 'g' and succeed, proceeding to match 'cdgu'.
	2264	*/
	2265	/* add a fail transition */
	2266	const U32 trie_offset = ARG(source);
	2267	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	2268	U32 *q;
	2269	const U32 ucharcount = trie->uniquecharcount;
	2270	const U32 numstates = trie->statecount;
	2271	const U32 ubound = trie->lasttrans + ucharcount;
	2272	U32 q_read = 0;
	2273	U32 q_write = 0;
	2274	U32 charid;
	2275	U32 base = trie->states[ 1 ].trans.base;
	2276	U32 *fail;
	2277	reg_ac_data *aho;
	2278	const U32 data_slot = add_data( pRExC_state, 1, "T" );
	2279	GET_RE_DEBUG_FLAGS_DECL;
	2280
	2281	PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
	2282	#ifndef DEBUGGING
	2283	PERL_UNUSED_ARG(depth);
	2284	#endif
	2285
	2286
	2287	ARG_SET( stclass, data_slot );
	2288	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	2289	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	2290	aho->trie=trie_offset;
	2291	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	2292	Copy( trie->states, aho->states, numstates, reg_trie_state );
	2293	Newxz( q, numstates, U32);
	2294	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	2295	aho->refcount = 1;
	2296	fail = aho->fail;
	2297	/* initialize fail[0..1] to be 1 so that we always have
	2298	a valid final fail state */
	2299	fail[ 0 ] = fail[ 1 ] = 1;
	2300
	2301	for ( charid = 0; charid < ucharcount ; charid++ ) {
	2302	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	2303	if ( newstate ) {
	2304	q[ q_write ] = newstate;
	2305	/* set to point at the root */
	2306	fail[ q[ q_write++ ] ]=1;
	2307	}
	2308	}
	2309	while ( q_read < q_write) {
	2310	const U32 cur = q[ q_read++ % numstates ];
	2311	base = trie->states[ cur ].trans.base;
	2312
	2313	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	2314	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	2315	if (ch_state) {
	2316	U32 fail_state = cur;
	2317	U32 fail_base;
	2318	do {
	2319	fail_state = fail[ fail_state ];
	2320	fail_base = aho->states[ fail_state ].trans.base;
	2321	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	2322
	2323	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	2324	fail[ ch_state ] = fail_state;
	2325	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	2326	{
	2327	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	2328	}
	2329	q[ q_write++ % numstates] = ch_state;
	2330	}
	2331	}
	2332	}
	2333	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	2334	when we fail in state 1, this allows us to use the
	2335	charclass scan to find a valid start char. This is based on the principle
	2336	that theres a good chance the string being searched contains lots of stuff
	2337	that cant be a start char.
	2338	*/
	2339	fail[ 0 ] = fail[ 1 ] = 0;
	2340	DEBUG_TRIE_COMPILE_r({
	2341	PerlIO_printf(Perl_debug_log,
	2342	"%*sStclass Failtable (%"UVuf" states): 0",
	2343	(int)(depth * 2), "", (UV)numstates
	2344	);
	2345	for( q_read=1; q_read<numstates; q_read++ ) {
	2346	PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	2347	}
	2348	PerlIO_printf(Perl_debug_log, "\n");
	2349	});
	2350	Safefree(q);
	2351	/RExC_seen \|= REG_SEEN_TRIEDFA;/
	2352	}
	2353
	2354
	2355	/*
	2356	* There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
	2357	* These need to be revisited when a newer toolchain becomes available.
	2358	*/
	2359	#if defined(__sparc64__) && defined(__GNUC__)
	2360	# if __GNUC__ < 2 \|\| (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
	2361	# undef SPARC64_GCC_WORKAROUND
	2362	# define SPARC64_GCC_WORKAROUND 1
	2363	# endif
	2364	#endif
	2365
	2366	#define DEBUG_PEEP(str,scan,depth) \
	2367	DEBUG_OPTIMISE_r({if (scan){ \
	2368	SV * const mysv=sv_newmortal(); \
	2369	regnode *Next = regnext(scan); \
	2370	regprop(RExC_rx, mysv, scan); \
	2371	PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	2372	(int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	2373	Next ? (REG_NODE_NUM(Next)) : 0 ); \
	2374	}});
	2375
	2376
	2377
	2378
	2379
	2380	#define JOIN_EXACT(scan,min,flags) \
	2381	if (PL_regkind[OP(scan)] == EXACT) \
	2382	join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
	2383
	2384	STATIC U32
	2385	S_join_exact(pTHX_ RExC_state_t pRExC_state, regnode scan, I32 min, U32 flags,regnode val, U32 depth) {
	2386	/* Merge several consecutive EXACTish nodes into one. */
	2387	regnode *n = regnext(scan);
	2388	U32 stringok = 1;
	2389	regnode *next = scan + NODE_SZ_STR(scan);
	2390	U32 merged = 0;
	2391	U32 stopnow = 0;
	2392	#ifdef DEBUGGING
	2393	regnode *stop = scan;
	2394	GET_RE_DEBUG_FLAGS_DECL;
	2395	#else
	2396	PERL_UNUSED_ARG(depth);
	2397	#endif
	2398
	2399	PERL_ARGS_ASSERT_JOIN_EXACT;
	2400	#ifndef EXPERIMENTAL_INPLACESCAN
	2401	PERL_UNUSED_ARG(flags);
	2402	PERL_UNUSED_ARG(val);
	2403	#endif
	2404	DEBUG_PEEP("join",scan,depth);
	2405
	2406	/* Skip NOTHING, merge EXACT. /
	2407	while (n &&
	2408	( PL_regkind[OP(n)] == NOTHING \|\|
	2409	(stringok && (OP(n) == OP(scan))))
	2410	&& NEXT_OFF(n)
	2411	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
	2412
	2413	if (OP(n) == TAIL \|\| n > next)
	2414	stringok = 0;
	2415	if (PL_regkind[OP(n)] == NOTHING) {
	2416	DEBUG_PEEP("skip:",n,depth);
	2417	NEXT_OFF(scan) += NEXT_OFF(n);
	2418	next = n + NODE_STEP_REGNODE;
	2419	#ifdef DEBUGGING
	2420	if (stringok)
	2421	stop = n;
	2422	#endif
	2423	n = regnext(n);
	2424	}
	2425	else if (stringok) {
	2426	const unsigned int oldl = STR_LEN(scan);
	2427	regnode * const nnext = regnext(n);
	2428
	2429	DEBUG_PEEP("merg",n,depth);
	2430
	2431	merged++;
	2432	if (oldl + STR_LEN(n) > U8_MAX)
	2433	break;
	2434	NEXT_OFF(scan) += NEXT_OFF(n);
	2435	STR_LEN(scan) += STR_LEN(n);
	2436	next = n + NODE_SZ_STR(n);
	2437	/* Now we can overwrite n : /
	2438	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	2439	#ifdef DEBUGGING
	2440	stop = next - 1;
	2441	#endif
	2442	n = nnext;
	2443	if (stopnow) break;
	2444	}
	2445
	2446	#ifdef EXPERIMENTAL_INPLACESCAN
	2447	if (flags && !NEXT_OFF(n)) {
	2448	DEBUG_PEEP("atch", val, depth);
	2449	if (reg_off_by_arg[OP(n)]) {
	2450	ARG_SET(n, val - n);
	2451	}
	2452	else {
	2453	NEXT_OFF(n) = val - n;
	2454	}
	2455	stopnow = 1;
	2456	}
	2457	#endif
	2458	}
	2459	#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
	2460	#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
	2461	#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
	2462	#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
	2463
	2464	if (UTF
	2465	&& ( OP(scan) == EXACTF \|\| OP(scan) == EXACTFU)
	2466	&& ( STR_LEN(scan) >= 6 ) )
	2467	{
	2468	/*
	2469	Two problematic code points in Unicode casefolding of EXACT nodes:
	2470
	2471	U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	2472	U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
	2473
	2474	which casefold to
	2475
	2476	Unicode UTF-8
	2477
	2478	U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
	2479	U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
	2480
	2481	This means that in case-insensitive matching (or "loose matching",
	2482	as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
	2483	length of the above casefolded versions) can match a target string
	2484	of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
	2485	This would rather mess up the minimum length computation.
	2486
	2487	What we'll do is to look for the tail four bytes, and then peek
	2488	at the preceding two bytes to see whether we need to decrease
	2489	the minimum length by four (six minus two).
	2490
	2491	Thanks to the design of UTF-8, there cannot be false matches:
	2492	A sequence of valid UTF-8 bytes cannot be a subsequence of
	2493	another valid sequence of UTF-8 bytes.
	2494
	2495	*/
	2496	char * const s0 = STRING(scan), s, t;
	2497	char * const s1 = s0 + STR_LEN(scan) - 1;
	2498	char * const s2 = s1 - 4;
	2499	#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
	2500	const char t0[] = "\xaf\x49\xaf\x42";
	2501	#else
	2502	const char t0[] = "\xcc\x88\xcc\x81";
	2503	#endif
	2504	const char * const t1 = t0 + 3;
	2505
	2506	for (s = s0 + 2;
	2507	s < s2 && (t = ninstr(s, s1, t0, t1));
	2508	s = t + 4) {
	2509	#ifdef EBCDIC
	2510	if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) \|\|
	2511	((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
	2512	#else
	2513	if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) \|\|
	2514	((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
	2515	#endif
	2516	*min -= 4;
	2517	}
	2518	}
	2519
	2520	#ifdef DEBUGGING
	2521	/* Allow dumping */
	2522	n = scan + NODE_SZ_STR(scan);
	2523	while (n <= stop) {
	2524	if (PL_regkind[OP(n)] != NOTHING \|\| OP(n) == NOTHING) {
	2525	OP(n) = OPTIMIZED;
	2526	NEXT_OFF(n) = 0;
	2527	}
	2528	n++;
	2529	}
	2530	#endif
	2531	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	2532	return stopnow;
	2533	}
	2534
	2535	/* REx optimizer. Converts nodes into quicker variants "in place".
	2536	Finds fixed substrings. */
	2537
	2538	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	2539	to the position after last scanned or to NULL. */
	2540
	2541	#define INIT_AND_WITHP \
	2542	assert(!and_withp); \
	2543	Newx(and_withp,1,struct regnode_charclass_class); \
	2544	SAVEFREEPV(and_withp)
	2545
	2546	/* this is a chain of data about sub patterns we are processing that
	2547	need to be handled separately/specially in study_chunk. Its so
	2548	we can simulate recursion without losing state. */
	2549	struct scan_frame;
	2550	typedef struct scan_frame {
	2551	regnode last; / last node to process in this frame */
	2552	regnode next; / next node to process when last is reached */
	2553	struct scan_frame prev; /previous frame*/
	2554	I32 stop; /* what stopparen do we use */
	2555	} scan_frame;
	2556
	2557
	2558	#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
	2559
	2560	#define CASE_SYNST_FNC(nAmE) \
	2561	case nAmE: \
	2562	if (flags & SCF_DO_STCLASS_AND) { \
	2563	for (value = 0; value < 256; value++) \
	2564	if (!is_ ## nAmE ## _cp(value)) \
	2565	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2566	} \
	2567	else { \
	2568	for (value = 0; value < 256; value++) \
	2569	if (is_ ## nAmE ## _cp(value)) \
	2570	ANYOF_BITMAP_SET(data->start_class, value); \
	2571	} \
	2572	break; \
	2573	case N ## nAmE: \
	2574	if (flags & SCF_DO_STCLASS_AND) { \
	2575	for (value = 0; value < 256; value++) \
	2576	if (is_ ## nAmE ## _cp(value)) \
	2577	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2578	} \
	2579	else { \
	2580	for (value = 0; value < 256; value++) \
	2581	if (!is_ ## nAmE ## _cp(value)) \
	2582	ANYOF_BITMAP_SET(data->start_class, value); \
	2583	} \
	2584	break
	2585
	2586
	2587
	2588	STATIC I32
	2589	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	2590	I32 minlenp, I32 deltap,
	2591	regnode *last,
	2592	scan_data_t *data,
	2593	I32 stopparen,
	2594	U8* recursed,
	2595	struct regnode_charclass_class *and_withp,
	2596	U32 flags, U32 depth)
	2597	/* scanp: Start here (read-write). */
	2598	/* deltap: Write maxlen-minlen here. */
	2599	/* last: Stop before this one. */
	2600	/* data: string data about the pattern */
	2601	/* stopparen: treat close N as END */
	2602	/* recursed: which subroutines have we recursed into */
	2603	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	2604	{
	2605	dVAR;
	2606	I32 min = 0, pars = 0, code;
	2607	regnode scan = scanp, *next;
	2608	I32 delta = 0;
	2609	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	2610	int is_inf_internal = 0; /* The studied chunk is infinite */
	2611	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	2612	scan_data_t data_fake;
	2613	SV *re_trie_maxbuff = NULL;
	2614	regnode *first_non_open = scan;
	2615	I32 stopmin = I32_MAX;
	2616	scan_frame *frame = NULL;
	2617	GET_RE_DEBUG_FLAGS_DECL;
	2618
	2619	PERL_ARGS_ASSERT_STUDY_CHUNK;
	2620
	2621	#ifdef DEBUGGING
	2622	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	2623	#endif
	2624
	2625	if ( depth == 0 ) {
	2626	while (first_non_open && OP(first_non_open) == OPEN)
	2627	first_non_open=regnext(first_non_open);
	2628	}
	2629
	2630
	2631	fake_study_recurse:
	2632	while ( scan && OP(scan) != END && scan < last ){
	2633	/* Peephole optimizer: */
	2634	DEBUG_STUDYDATA("Peep:", data,depth);
	2635	DEBUG_PEEP("Peep",scan,depth);
	2636	JOIN_EXACT(scan,&min,0);
	2637
	2638	/* Follow the next-chain of the current node and optimize
	2639	away all the NOTHINGs from it. */
	2640	if (OP(scan) != CURLYX) {
	2641	const int max = (reg_off_by_arg[OP(scan)]
	2642	? I32_MAX
	2643	/* I32 may be smaller than U16 on CRAYs! */
	2644	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	2645	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	2646	int noff;
	2647	regnode *n = scan;
	2648
	2649	/* Skip NOTHING and LONGJMP. */
	2650	while ((n = regnext(n))
	2651	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	2652	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	2653	&& off + noff < max)
	2654	off += noff;
	2655	if (reg_off_by_arg[OP(scan)])
	2656	ARG(scan) = off;
	2657	else
	2658	NEXT_OFF(scan) = off;
	2659	}
	2660
	2661
	2662
	2663	/* The principal pseudo-switch. Cannot be a switch, since we
	2664	look into several different things. */
	2665	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	2666	\|\| OP(scan) == IFTHEN) {
	2667	next = regnext(scan);
	2668	code = OP(scan);
	2669	/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
	2670
	2671	if (OP(next) == code \|\| code == IFTHEN) {
	2672	/* NOTE - There is similar code to this block below for handling
	2673	TRIE nodes on a re-study. If you change stuff here check there
	2674	too. */
	2675	I32 max1 = 0, min1 = I32_MAX, num = 0;
	2676	struct regnode_charclass_class accum;
	2677	regnode * const startbranch=scan;
	2678
	2679	if (flags & SCF_DO_SUBSTR)
	2680	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
	2681	if (flags & SCF_DO_STCLASS)
	2682	cl_init_zero(pRExC_state, &accum);
	2683
	2684	while (OP(scan) == code) {
	2685	I32 deltanext, minnext, f = 0, fake;
	2686	struct regnode_charclass_class this_class;
	2687
	2688	num++;
	2689	data_fake.flags = 0;
	2690	if (data) {
	2691	data_fake.whilem_c = data->whilem_c;
	2692	data_fake.last_closep = data->last_closep;
	2693	}
	2694	else
	2695	data_fake.last_closep = &fake;
	2696
	2697	data_fake.pos_delta = delta;
	2698	next = regnext(scan);
	2699	scan = NEXTOPER(scan);
	2700	if (code != BRANCH)
	2701	scan = NEXTOPER(scan);
	2702	if (flags & SCF_DO_STCLASS) {
	2703	cl_init(pRExC_state, &this_class);
	2704	data_fake.start_class = &this_class;
	2705	f = SCF_DO_STCLASS_AND;
	2706	}
	2707	if (flags & SCF_WHILEM_VISITED_POS)
	2708	f \|= SCF_WHILEM_VISITED_POS;
	2709
	2710	/* we suppose the run is continuous, last=next...*/
	2711	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	2712	next, &data_fake,
	2713	stopparen, recursed, NULL, f,depth+1);
	2714	if (min1 > minnext)
	2715	min1 = minnext;
	2716	if (max1 < minnext + deltanext)
	2717	max1 = minnext + deltanext;
	2718	if (deltanext == I32_MAX)
	2719	is_inf = is_inf_internal = 1;
	2720	scan = next;
	2721	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	2722	pars++;
	2723	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	2724	if ( stopmin > minnext)
	2725	stopmin = min + min1;
	2726	flags &= ~SCF_DO_SUBSTR;
	2727	if (data)
	2728	data->flags \|= SCF_SEEN_ACCEPT;
	2729	}
	2730	if (data) {
	2731	if (data_fake.flags & SF_HAS_EVAL)
	2732	data->flags \|= SF_HAS_EVAL;
	2733	data->whilem_c = data_fake.whilem_c;
	2734	}
	2735	if (flags & SCF_DO_STCLASS)
	2736	cl_or(pRExC_state, &accum, &this_class);
	2737	}
	2738	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	2739	min1 = 0;
	2740	if (flags & SCF_DO_SUBSTR) {
	2741	data->pos_min += min1;
	2742	data->pos_delta += max1 - min1;
	2743	if (max1 != min1 \|\| is_inf)
	2744	data->longest = &(data->longest_float);
	2745	}
	2746	min += min1;
	2747	delta += max1 - min1;
	2748	if (flags & SCF_DO_STCLASS_OR) {
	2749	cl_or(pRExC_state, data->start_class, &accum);
	2750	if (min1) {
	2751	cl_and(data->start_class, and_withp);
	2752	flags &= ~SCF_DO_STCLASS;
	2753	}
	2754	}
	2755	else if (flags & SCF_DO_STCLASS_AND) {
	2756	if (min1) {
	2757	cl_and(data->start_class, &accum);
	2758	flags &= ~SCF_DO_STCLASS;
	2759	}
	2760	else {
	2761	/* Switch to OR mode: cache the old value of
	2762	* data->start_class */
	2763	INIT_AND_WITHP;
	2764	StructCopy(data->start_class, and_withp,
	2765	struct regnode_charclass_class);
	2766	flags &= ~SCF_DO_STCLASS_AND;
	2767	StructCopy(&accum, data->start_class,
	2768	struct regnode_charclass_class);
	2769	flags \|= SCF_DO_STCLASS_OR;
	2770	data->start_class->flags \|= ANYOF_EOS;
	2771	}
	2772	}
	2773
	2774	if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
	2775	/* demq.
	2776
	2777	Assuming this was/is a branch we are dealing with: 'scan' now
	2778	points at the item that follows the branch sequence, whatever
	2779	it is. We now start at the beginning of the sequence and look
	2780	for subsequences of
	2781
	2782	BRANCH->EXACT=>x1
	2783	BRANCH->EXACT=>x2
	2784	tail
	2785
	2786	which would be constructed from a pattern like /A\|LIST\|OF\|WORDS/
	2787
	2788	If we can find such a subsequence we need to turn the first
	2789	element into a trie and then add the subsequent branch exact
	2790	strings to the trie.
	2791
	2792	We have two cases
	2793
	2794	1. patterns where the whole set of branches can be converted.
	2795
	2796	2. patterns where only a subset can be converted.
	2797
	2798	In case 1 we can replace the whole set with a single regop
	2799	for the trie. In case 2 we need to keep the start and end
	2800	branches so
	2801
	2802	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	2803	becomes BRANCH TRIE; BRANCH X;
	2804
	2805	There is an additional case, that being where there is a
	2806	common prefix, which gets split out into an EXACT like node
	2807	preceding the TRIE node.
	2808
	2809	If x(1..n)==tail then we can do a simple trie, if not we make
	2810	a "jump" trie, such that when we match the appropriate word
	2811	we "jump" to the appropriate tail node. Essentially we turn
	2812	a nested if into a case structure of sorts.
	2813
	2814	*/
	2815
	2816	int made=0;
	2817	if (!re_trie_maxbuff) {
	2818	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	2819	if (!SvIOK(re_trie_maxbuff))
	2820	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	2821	}
	2822	if ( SvIV(re_trie_maxbuff)>=0 ) {
	2823	regnode *cur;
	2824	regnode first = (regnode )NULL;
	2825	regnode last = (regnode )NULL;
	2826	regnode *tail = scan;
	2827	U8 optype = 0;
	2828	U32 count=0;
	2829
	2830	#ifdef DEBUGGING
	2831	SV * const mysv = sv_newmortal(); /* for dumping */
	2832	#endif
	2833	/* var tail is used because there may be a TAIL
	2834	regop in the way. Ie, the exacts will point to the
	2835	thing following the TAIL, but the last branch will
	2836	point at the TAIL. So we advance tail. If we
	2837	have nested (?:) we may have to move through several
	2838	tails.
	2839	*/
	2840
	2841	while ( OP( tail ) == TAIL ) {
	2842	/* this is the TAIL generated by (?:) */
	2843	tail = regnext( tail );
	2844	}
	2845
	2846
	2847	DEBUG_OPTIMISE_r({
	2848	regprop(RExC_rx, mysv, tail );
	2849	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	2850	(int)depth * 2 + 2, "",
	2851	"Looking for TRIE'able sequences. Tail node is: ",
	2852	SvPV_nolen_const( mysv )
	2853	);
	2854	});
	2855
	2856	/*
	2857
	2858	step through the branches, cur represents each
	2859	branch, noper is the first thing to be matched
	2860	as part of that branch and noper_next is the
	2861	regnext() of that node. if noper is an EXACT
	2862	and noper_next is the same as scan (our current
	2863	position in the regex) then the EXACT branch is
	2864	a possible optimization target. Once we have
	2865	two or more consecutive such branches we can
	2866	create a trie of the EXACT's contents and stich
	2867	it in place. If the sequence represents all of
	2868	the branches we eliminate the whole thing and
	2869	replace it with a single TRIE. If it is a
	2870	subsequence then we need to stitch it in. This
	2871	means the first branch has to remain, and needs
	2872	to be repointed at the item on the branch chain
	2873	following the last branch optimized. This could
	2874	be either a BRANCH, in which case the
	2875	subsequence is internal, or it could be the
	2876	item following the branch sequence in which
	2877	case the subsequence is at the end.
	2878
	2879	*/
	2880
	2881	/* dont use tail as the end marker for this traverse */
	2882	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	2883	regnode * const noper = NEXTOPER( cur );
	2884	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	2885	regnode * const noper_next = regnext( noper );
	2886	#endif
	2887
	2888	DEBUG_OPTIMISE_r({
	2889	regprop(RExC_rx, mysv, cur);
	2890	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	2891	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	2892
	2893	regprop(RExC_rx, mysv, noper);
	2894	PerlIO_printf( Perl_debug_log, " -> %s",
	2895	SvPV_nolen_const(mysv));
	2896
	2897	if ( noper_next ) {
	2898	regprop(RExC_rx, mysv, noper_next );
	2899	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	2900	SvPV_nolen_const(mysv));
	2901	}
	2902	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
	2903	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
	2904	});
	2905	if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
	2906	: PL_regkind[ OP( noper ) ] == EXACT )
	2907	\|\| OP(noper) == NOTHING )
	2908	#ifdef NOJUMPTRIE
	2909	&& noper_next == tail
	2910	#endif
	2911	&& count < U16_MAX)
	2912	{
	2913	count++;
	2914	if ( !first \|\| optype == NOTHING ) {
	2915	if (!first) first = cur;
	2916	optype = OP( noper );
	2917	} else {
	2918	last = cur;
	2919	}
	2920	} else {
	2921	/*
	2922	Currently we do not believe that the trie logic can
	2923	handle case insensitive matching properly when the
	2924	pattern is not unicode (thus forcing unicode semantics).
	2925
	2926	If/when this is fixed the following define can be swapped
	2927	in below to fully enable trie logic.
	2928
	2929	#define TRIE_TYPE_IS_SAFE 1
	2930
	2931	*/
	2932	#define TRIE_TYPE_IS_SAFE (UTF \|\| optype==EXACT)
	2933
	2934	if ( last && TRIE_TYPE_IS_SAFE ) {
	2935	make_trie( pRExC_state,
	2936	startbranch, first, cur, tail, count,
	2937	optype, depth+1 );
	2938	}
	2939	if ( PL_regkind[ OP( noper ) ] == EXACT
	2940	#ifdef NOJUMPTRIE
	2941	&& noper_next == tail
	2942	#endif
	2943	){
	2944	count = 1;
	2945	first = cur;
	2946	optype = OP( noper );
	2947	} else {
	2948	count = 0;
	2949	first = NULL;
	2950	optype = 0;
	2951	}
	2952	last = NULL;
	2953	}
	2954	}
	2955	DEBUG_OPTIMISE_r({
	2956	regprop(RExC_rx, mysv, cur);
	2957	PerlIO_printf( Perl_debug_log,
	2958	"%s- %s (%d) <SCAN FINISHED>\n", (int)depth 2 + 2,
	2959	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	2960
	2961	});
	2962
	2963	if ( last && TRIE_TYPE_IS_SAFE ) {
	2964	made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
	2965	#ifdef TRIE_STUDY_OPT
	2966	if ( ((made == MADE_EXACT_TRIE &&
	2967	startbranch == first)
	2968	\|\| ( first_non_open == first )) &&
	2969	depth==0 ) {
	2970	flags \|= SCF_TRIE_RESTUDY;
	2971	if ( startbranch == first
	2972	&& scan == tail )
	2973	{
	2974	RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
	2975	}
	2976	}
	2977	#endif
	2978	}
	2979	}
	2980
	2981	} /* do trie */
	2982
	2983	}
	2984	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	2985	scan = NEXTOPER(NEXTOPER(scan));
	2986	} else /* single branch is optimized. */
	2987	scan = NEXTOPER(scan);
	2988	continue;
	2989	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	2990	scan_frame *newframe = NULL;
	2991	I32 paren;
	2992	regnode *start;
	2993	regnode *end;
	2994
	2995	if (OP(scan) != SUSPEND) {
	2996	/* set the pointer */
	2997	if (OP(scan) == GOSUB) {
	2998	paren = ARG(scan);
	2999	RExC_recurse[ARG2L(scan)] = scan;
	3000	start = RExC_open_parens[paren-1];
	3001	end = RExC_close_parens[paren-1];
	3002	} else {
	3003	paren = 0;
	3004	start = RExC_rxi->program + 1;
	3005	end = RExC_opend;
	3006	}
	3007	if (!recursed) {
	3008	Newxz(recursed, (((RExC_npar)>>3) +1), U8);
	3009	SAVEFREEPV(recursed);
	3010	}
	3011	if (!PAREN_TEST(recursed,paren+1)) {
	3012	PAREN_SET(recursed,paren+1);
	3013	Newx(newframe,1,scan_frame);
	3014	} else {
	3015	if (flags & SCF_DO_SUBSTR) {
	3016	SCAN_COMMIT(pRExC_state,data,minlenp);
	3017	data->longest = &(data->longest_float);
	3018	}
	3019	is_inf = is_inf_internal = 1;
	3020	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3021	cl_anything(pRExC_state, data->start_class);
	3022	flags &= ~SCF_DO_STCLASS;
	3023	}
	3024	} else {
	3025	Newx(newframe,1,scan_frame);
	3026	paren = stopparen;
	3027	start = scan+2;
	3028	end = regnext(scan);
	3029	}
	3030	if (newframe) {
	3031	assert(start);
	3032	assert(end);
	3033	SAVEFREEPV(newframe);
	3034	newframe->next = regnext(scan);
	3035	newframe->last = last;
	3036	newframe->stop = stopparen;
	3037	newframe->prev = frame;
	3038
	3039	frame = newframe;
	3040	scan = start;
	3041	stopparen = paren;
	3042	last = end;
	3043
	3044	continue;
	3045	}
	3046	}
	3047	else if (OP(scan) == EXACT) {
	3048	I32 l = STR_LEN(scan);
	3049	UV uc;
	3050	if (UTF) {
	3051	const U8 * const s = (U8*)STRING(scan);
	3052	l = utf8_length(s, s + l);
	3053	uc = utf8_to_uvchr(s, NULL);
	3054	} else {
	3055	uc = ((U8)STRING(scan));
	3056	}
	3057	min += l;
	3058	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	3059	/* The code below prefers earlier match for fixed
	3060	offset, later match for variable offset. */
	3061	if (data->last_end == -1) { /* Update the start info. */
	3062	data->last_start_min = data->pos_min;
	3063	data->last_start_max = is_inf
	3064	? I32_MAX : data->pos_min + data->pos_delta;
	3065	}
	3066	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	3067	if (UTF)
	3068	SvUTF8_on(data->last_found);
	3069	{
	3070	SV * const sv = data->last_found;
	3071	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3072	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3073	if (mg && mg->mg_len >= 0)
	3074	mg->mg_len += utf8_length((U8*)STRING(scan),
	3075	(U8*)STRING(scan)+STR_LEN(scan));
	3076	}
	3077	data->last_end = data->pos_min + l;
	3078	data->pos_min += l; /* As in the first entry. */
	3079	data->flags &= ~SF_BEFORE_EOL;
	3080	}
	3081	if (flags & SCF_DO_STCLASS_AND) {
	3082	/* Check whether it is compatible with what we know already! */
	3083	int compat = 1;
	3084
	3085
	3086	/* If compatible, we or it in below. It is compatible if is
	3087	* in the bitmp and either 1) its bit or its fold is set, or 2)
	3088	* it's for a locale. Even if there isn't unicode semantics
	3089	* here, at runtime there may be because of matching against a
	3090	* utf8 string, so accept a possible false positive for
	3091	* latin1-range folds */
	3092	if (uc >= 0x100 \|\|
	3093	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3094	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3095	&& (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
	3096	\|\| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3097	)
	3098	compat = 0;
	3099	ANYOF_CLASS_ZERO(data->start_class);
	3100	ANYOF_BITMAP_ZERO(data->start_class);
	3101	if (compat)
	3102	ANYOF_BITMAP_SET(data->start_class, uc);
	3103	data->start_class->flags &= ~ANYOF_EOS;
	3104	if (uc < 0x100)
	3105	data->start_class->flags &= ~ANYOF_UNICODE_ALL;
	3106	}
	3107	else if (flags & SCF_DO_STCLASS_OR) {
	3108	/* false positive possible if the class is case-folded */
	3109	if (uc < 0x100)
	3110	ANYOF_BITMAP_SET(data->start_class, uc);
	3111	else
	3112	data->start_class->flags \|= ANYOF_UNICODE_ALL;
	3113	data->start_class->flags &= ~ANYOF_EOS;
	3114	cl_and(data->start_class, and_withp);
	3115	}
	3116	flags &= ~SCF_DO_STCLASS;
	3117	}
	3118	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
	3119	I32 l = STR_LEN(scan);
	3120	UV uc = ((U8)STRING(scan));
	3121
	3122	/* Search for fixed substrings supports EXACT only. */
	3123	if (flags & SCF_DO_SUBSTR) {
	3124	assert(data);
	3125	SCAN_COMMIT(pRExC_state, data, minlenp);
	3126	}
	3127	if (UTF) {
	3128	const U8 * const s = (U8 *)STRING(scan);
	3129	l = utf8_length(s, s + l);
	3130	uc = utf8_to_uvchr(s, NULL);
	3131	}
	3132	min += l;
	3133	if (flags & SCF_DO_SUBSTR)
	3134	data->pos_min += l;
	3135	if (flags & SCF_DO_STCLASS_AND) {
	3136	/* Check whether it is compatible with what we know already! */
	3137	int compat = 1;
	3138	if (uc >= 0x100 \|\|
	3139	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3140	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3141	&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3142	{
	3143	compat = 0;
	3144	}
	3145	ANYOF_CLASS_ZERO(data->start_class);
	3146	ANYOF_BITMAP_ZERO(data->start_class);
	3147	if (compat) {
	3148	ANYOF_BITMAP_SET(data->start_class, uc);
	3149	data->start_class->flags &= ~ANYOF_EOS;
	3150	data->start_class->flags \|= ANYOF_LOC_NONBITMAP_FOLD;
	3151	if (OP(scan) == EXACTFL) {
	3152	data->start_class->flags \|= ANYOF_LOCALE;
	3153	}
	3154	else {
	3155
	3156	/* Also set the other member of the fold pair. In case
	3157	* that unicode semantics is called for at runtime, use
	3158	* the full latin1 fold. (Can't do this for locale,
	3159	* because not known until runtime */
	3160	ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
	3161	}
	3162	}
	3163	}
	3164	else if (flags & SCF_DO_STCLASS_OR) {
	3165	if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
	3166	/* false positive possible if the class is case-folded.
	3167	Assume that the locale settings are the same... */
	3168	if (uc < 0x100) {
	3169	ANYOF_BITMAP_SET(data->start_class, uc);
	3170	if (OP(scan) != EXACTFL) {
	3171
	3172	/* And set the other member of the fold pair, but
	3173	* can't do that in locale because not known until
	3174	* run-time */
	3175	ANYOF_BITMAP_SET(data->start_class,
	3176	PL_fold_latin1[uc]);
	3177	}
	3178	}
	3179	data->start_class->flags &= ~ANYOF_EOS;
	3180	}
	3181	cl_and(data->start_class, and_withp);
	3182	}
	3183	flags &= ~SCF_DO_STCLASS;
	3184	}
	3185	else if (REGNODE_VARIES(OP(scan))) {
	3186	I32 mincount, maxcount, minnext, deltanext, fl = 0;
	3187	I32 f = flags, pos_before = 0;
	3188	regnode * const oscan = scan;
	3189	struct regnode_charclass_class this_class;
	3190	struct regnode_charclass_class *oclass = NULL;
	3191	I32 next_is_eval = 0;
	3192
	3193	switch (PL_regkind[OP(scan)]) {
	3194	case WHILEM: /* End of (?:...)* . */
	3195	scan = NEXTOPER(scan);
	3196	goto finish;
	3197	case PLUS:
	3198	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	3199	next = NEXTOPER(scan);
	3200	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	3201	mincount = 1;
	3202	maxcount = REG_INFTY;
	3203	next = regnext(scan);
	3204	scan = NEXTOPER(scan);
	3205	goto do_curly;
	3206	}
	3207	}
	3208	if (flags & SCF_DO_SUBSTR)
	3209	data->pos_min++;
	3210	min++;
	3211	/* Fall through. */
	3212	case STAR:
	3213	if (flags & SCF_DO_STCLASS) {
	3214	mincount = 0;
	3215	maxcount = REG_INFTY;
	3216	next = regnext(scan);
	3217	scan = NEXTOPER(scan);
	3218	goto do_curly;
	3219	}
	3220	is_inf = is_inf_internal = 1;
	3221	scan = regnext(scan);
	3222	if (flags & SCF_DO_SUBSTR) {
	3223	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
	3224	data->longest = &(data->longest_float);
	3225	}
	3226	goto optimize_curly_tail;
	3227	case CURLY:
	3228	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	3229	&& (scan->flags == stopparen))
	3230	{
	3231	mincount = 1;
	3232	maxcount = 1;
	3233	} else {
	3234	mincount = ARG1(scan);
	3235	maxcount = ARG2(scan);
	3236	}
	3237	next = regnext(scan);
	3238	if (OP(scan) == CURLYX) {
	3239	I32 lp = (data ? *(data->last_closep) : 0);
	3240	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	3241	}
	3242	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	3243	next_is_eval = (OP(scan) == EVAL);
	3244	do_curly:
	3245	if (flags & SCF_DO_SUBSTR) {
	3246	if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
	3247	pos_before = data->pos_min;
	3248	}
	3249	if (data) {
	3250	fl = data->flags;
	3251	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	3252	if (is_inf)
	3253	data->flags \|= SF_IS_INF;
	3254	}
	3255	if (flags & SCF_DO_STCLASS) {
	3256	cl_init(pRExC_state, &this_class);
	3257	oclass = data->start_class;
	3258	data->start_class = &this_class;
	3259	f \|= SCF_DO_STCLASS_AND;
	3260	f &= ~SCF_DO_STCLASS_OR;
	3261	}
	3262	/* Exclude from super-linear cache processing any {n,m}
	3263	regops for which the combination of input pos and regex
	3264	pos is not enough information to determine if a match
	3265	will be possible.
	3266
	3267	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	3268	regex pos at the \s*, the prospects for a match depend not
	3269	only on the input position but also on how many (bar\s*)
	3270	repeats into the {4,8} we are. */
	3271	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	3272	f &= ~SCF_WHILEM_VISITED_POS;
	3273
	3274	/* This will finish on WHILEM, setting scan, or on NULL: */
	3275	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3276	last, data, stopparen, recursed, NULL,
	3277	(mincount == 0
	3278	? (f & ~SCF_DO_SUBSTR) : f),depth+1);
	3279
	3280	if (flags & SCF_DO_STCLASS)
	3281	data->start_class = oclass;
	3282	if (mincount == 0 \|\| minnext == 0) {
	3283	if (flags & SCF_DO_STCLASS_OR) {
	3284	cl_or(pRExC_state, data->start_class, &this_class);
	3285	}
	3286	else if (flags & SCF_DO_STCLASS_AND) {
	3287	/* Switch to OR mode: cache the old value of
	3288	* data->start_class */
	3289	INIT_AND_WITHP;
	3290	StructCopy(data->start_class, and_withp,
	3291	struct regnode_charclass_class);
	3292	flags &= ~SCF_DO_STCLASS_AND;
	3293	StructCopy(&this_class, data->start_class,
	3294	struct regnode_charclass_class);
	3295	flags \|= SCF_DO_STCLASS_OR;
	3296	data->start_class->flags \|= ANYOF_EOS;
	3297	}
	3298	} else { /* Non-zero len */
	3299	if (flags & SCF_DO_STCLASS_OR) {
	3300	cl_or(pRExC_state, data->start_class, &this_class);
	3301	cl_and(data->start_class, and_withp);
	3302	}
	3303	else if (flags & SCF_DO_STCLASS_AND)
	3304	cl_and(data->start_class, &this_class);
	3305	flags &= ~SCF_DO_STCLASS;
	3306	}
	3307	if (!scan) /* It was not CURLYX, but CURLY. */
	3308	scan = next;
	3309	if ( /* ? quantifier ok, except for (?{ ... }) */
	3310	(next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	3311	&& (minnext == 0) && (deltanext == 0)
	3312	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	3313	&& maxcount <= REG_INFTY/3) /* Complement check for big count */
	3314	{
	3315	ckWARNreg(RExC_parse,
	3316	"Quantifier unexpected on zero-length expression");
	3317	}
	3318
	3319	min += minnext * mincount;
	3320	is_inf_internal \|= ((maxcount == REG_INFTY
	3321	&& (minnext + deltanext) > 0)
	3322	\|\| deltanext == I32_MAX);
	3323	is_inf \|= is_inf_internal;
	3324	delta += (minnext + deltanext) * maxcount - minnext * mincount;
	3325
	3326	/* Try powerful optimization CURLYX => CURLYN. */
	3327	if ( OP(oscan) == CURLYX && data
	3328	&& data->flags & SF_IN_PAR
	3329	&& !(data->flags & SF_HAS_EVAL)
	3330	&& !deltanext && minnext == 1 ) {
	3331	/* Try to optimize to CURLYN. */
	3332	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	3333	regnode * const nxt1 = nxt;
	3334	#ifdef DEBUGGING
	3335	regnode *nxt2;
	3336	#endif
	3337
	3338	/* Skip open. */
	3339	nxt = regnext(nxt);
	3340	if (!REGNODE_SIMPLE(OP(nxt))
	3341	&& !(PL_regkind[OP(nxt)] == EXACT
	3342	&& STR_LEN(nxt) == 1))
	3343	goto nogo;
	3344	#ifdef DEBUGGING
	3345	nxt2 = nxt;
	3346	#endif
	3347	nxt = regnext(nxt);
	3348	if (OP(nxt) != CLOSE)
	3349	goto nogo;
	3350	if (RExC_open_parens) {
	3351	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3352	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	3353	}
	3354	/* Now we know that nxt2 is the only contents: */
	3355	oscan->flags = (U8)ARG(nxt);
	3356	OP(oscan) = CURLYN;
	3357	OP(nxt1) = NOTHING; /* was OPEN. */
	3358
	3359	#ifdef DEBUGGING
	3360	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3361	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	3362	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	3363	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3364	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3365	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	3366	#endif
	3367	}
	3368	nogo:
	3369
	3370	/* Try optimization CURLYX => CURLYM. */
	3371	if ( OP(oscan) == CURLYX && data
	3372	&& !(data->flags & SF_HAS_PAR)
	3373	&& !(data->flags & SF_HAS_EVAL)
	3374	&& !deltanext /* atom is fixed width */
	3375	&& minnext != 0 /* CURLYM can't handle zero width */
	3376	) {
	3377	/* XXXX How to optimize if data == 0? */
	3378	/* Optimize to a simpler form. */
	3379	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	3380	regnode *nxt2;
	3381
	3382	OP(oscan) = CURLYM;
	3383	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	3384	&& (OP(nxt2) != WHILEM))
	3385	nxt = nxt2;
	3386	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	3387	/* Need to optimize away parenths. */
	3388	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	3389	/* Set the parenth number. */
	3390	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	3391
	3392	oscan->flags = (U8)ARG(nxt);
	3393	if (RExC_open_parens) {
	3394	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3395	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	3396	}
	3397	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	3398	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3399
	3400	#ifdef DEBUGGING
	3401	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3402	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3403	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	3404	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	3405	#endif
	3406	#if 0
	3407	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	3408	regnode *nnxt = regnext(nxt1);
	3409	if (nnxt == nxt) {
	3410	if (reg_off_by_arg[OP(nxt1)])
	3411	ARG_SET(nxt1, nxt2 - nxt1);
	3412	else if (nxt2 - nxt1 < U16_MAX)
	3413	NEXT_OFF(nxt1) = nxt2 - nxt1;
	3414	else
	3415	OP(nxt) = NOTHING; /* Cannot beautify */
	3416	}
	3417	nxt1 = nnxt;
	3418	}
	3419	#endif
	3420	/* Optimize again: */
	3421	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	3422	NULL, stopparen, recursed, NULL, 0,depth+1);
	3423	}
	3424	else
	3425	oscan->flags = 0;
	3426	}
	3427	else if ((OP(oscan) == CURLYX)
	3428	&& (flags & SCF_WHILEM_VISITED_POS)
	3429	/* See the comment on a similar expression above.
	3430	However, this time it's not a subexpression
	3431	we care about, but the expression itself. */
	3432	&& (maxcount == REG_INFTY)
	3433	&& data && ++data->whilem_c < 16) {
	3434	/* This stays as CURLYX, we can put the count/of pair. */
	3435	/* Find WHILEM (as in regexec.c) */
	3436	regnode *nxt = oscan + NEXT_OFF(oscan);
	3437
	3438	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	3439	nxt += ARG(nxt);
	3440	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	3441	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	3442	}
	3443	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	3444	pars++;
	3445	if (flags & SCF_DO_SUBSTR) {
	3446	SV *last_str = NULL;
	3447	int counted = mincount != 0;
	3448
	3449	if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
	3450	#if defined(SPARC64_GCC_WORKAROUND)
	3451	I32 b = 0;
	3452	STRLEN l = 0;
	3453	const char *s = NULL;
	3454	I32 old = 0;
	3455
	3456	if (pos_before >= data->last_start_min)
	3457	b = pos_before;
	3458	else
	3459	b = data->last_start_min;
	3460
	3461	l = 0;
	3462	s = SvPV_const(data->last_found, l);
	3463	old = b - data->last_start_min;
	3464
	3465	#else
	3466	I32 b = pos_before >= data->last_start_min
	3467	? pos_before : data->last_start_min;
	3468	STRLEN l;
	3469	const char * const s = SvPV_const(data->last_found, l);
	3470	I32 old = b - data->last_start_min;
	3471	#endif
	3472
	3473	if (UTF)
	3474	old = utf8_hop((U8)s, old) - (U8)s;
	3475	l -= old;
	3476	/* Get the added string: */
	3477	last_str = newSVpvn_utf8(s + old, l, UTF);
	3478	if (deltanext == 0 && pos_before == b) {
	3479	/* What was added is a constant string */
	3480	if (mincount > 1) {
	3481	SvGROW(last_str, (mincount * l) + 1);
	3482	repeatcpy(SvPVX(last_str) + l,
	3483	SvPVX_const(last_str), l, mincount - 1);
	3484	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	3485	/* Add additional parts. */
	3486	SvCUR_set(data->last_found,
	3487	SvCUR(data->last_found) - l);
	3488	sv_catsv(data->last_found, last_str);
	3489	{
	3490	SV * sv = data->last_found;
	3491	MAGIC *mg =
	3492	SvUTF8(sv) && SvMAGICAL(sv) ?
	3493	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3494	if (mg && mg->mg_len >= 0)
	3495	mg->mg_len += CHR_SVLEN(last_str) - l;
	3496	}
	3497	data->last_end += l * (mincount - 1);
	3498	}
	3499	} else {
	3500	/* start offset must point into the last copy */
	3501	data->last_start_min += minnext * (mincount - 1);
	3502	data->last_start_max += is_inf ? I32_MAX
	3503	: (maxcount - 1) * (minnext + data->pos_delta);
	3504	}
	3505	}
	3506	/* It is counted once already... */
	3507	data->pos_min += minnext * (mincount - counted);
	3508	data->pos_delta += - counted * deltanext +
	3509	(minnext + deltanext) * maxcount - minnext * mincount;
	3510	if (mincount != maxcount) {
	3511	/* Cannot extend fixed substrings found inside
	3512	the group. */
	3513	SCAN_COMMIT(pRExC_state,data,minlenp);
	3514	if (mincount && last_str) {
	3515	SV * const sv = data->last_found;
	3516	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3517	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3518
	3519	if (mg)
	3520	mg->mg_len = -1;
	3521	sv_setsv(sv, last_str);
	3522	data->last_end = data->pos_min;
	3523	data->last_start_min =
	3524	data->pos_min - CHR_SVLEN(last_str);
	3525	data->last_start_max = is_inf
	3526	? I32_MAX
	3527	: data->pos_min + data->pos_delta
	3528	- CHR_SVLEN(last_str);
	3529	}
	3530	data->longest = &(data->longest_float);
	3531	}
	3532	SvREFCNT_dec(last_str);
	3533	}
	3534	if (data && (fl & SF_HAS_EVAL))
	3535	data->flags \|= SF_HAS_EVAL;
	3536	optimize_curly_tail:
	3537	if (OP(oscan) != CURLYX) {
	3538	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	3539	&& NEXT_OFF(next))
	3540	NEXT_OFF(oscan) += NEXT_OFF(next);
	3541	}
	3542	continue;
	3543	default: /* REF, ANYOFV, and CLUMP only? */
	3544	if (flags & SCF_DO_SUBSTR) {
	3545	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3546	data->longest = &(data->longest_float);
	3547	}
	3548	is_inf = is_inf_internal = 1;
	3549	if (flags & SCF_DO_STCLASS_OR)
	3550	cl_anything(pRExC_state, data->start_class);
	3551	flags &= ~SCF_DO_STCLASS;
	3552	break;
	3553	}
	3554	}
	3555	else if (OP(scan) == LNBREAK) {
	3556	if (flags & SCF_DO_STCLASS) {
	3557	int value = 0;
	3558	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3559	if (flags & SCF_DO_STCLASS_AND) {
	3560	for (value = 0; value < 256; value++)
	3561	if (!is_VERTWS_cp(value))
	3562	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3563	}
	3564	else {
	3565	for (value = 0; value < 256; value++)
	3566	if (is_VERTWS_cp(value))
	3567	ANYOF_BITMAP_SET(data->start_class, value);
	3568	}
	3569	if (flags & SCF_DO_STCLASS_OR)
	3570	cl_and(data->start_class, and_withp);
	3571	flags &= ~SCF_DO_STCLASS;
	3572	}
	3573	min += 1;
	3574	delta += 1;
	3575	if (flags & SCF_DO_SUBSTR) {
	3576	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3577	data->pos_min += 1;
	3578	data->pos_delta += 1;
	3579	data->longest = &(data->longest_float);
	3580	}
	3581	}
	3582	else if (OP(scan) == FOLDCHAR) {
	3583	int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
	3584	flags &= ~SCF_DO_STCLASS;
	3585	min += 1;
	3586	delta += d;
	3587	if (flags & SCF_DO_SUBSTR) {
	3588	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3589	data->pos_min += 1;
	3590	data->pos_delta += d;
	3591	data->longest = &(data->longest_float);
	3592	}
	3593	}
	3594	else if (REGNODE_SIMPLE(OP(scan))) {
	3595	int value = 0;
	3596
	3597	if (flags & SCF_DO_SUBSTR) {
	3598	SCAN_COMMIT(pRExC_state,data,minlenp);
	3599	data->pos_min++;
	3600	}
	3601	min++;
	3602	if (flags & SCF_DO_STCLASS) {
	3603	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3604
	3605	/* Some of the logic below assumes that switching
	3606	locale on will only add false positives. */
	3607	switch (PL_regkind[OP(scan)]) {
	3608	case SANY:
	3609	default:
	3610	do_default:
	3611	/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
	3612	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3613	cl_anything(pRExC_state, data->start_class);
	3614	break;
	3615	case REG_ANY:
	3616	if (OP(scan) == SANY)
	3617	goto do_default;
	3618	if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
	3619	value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
	3620	\|\| ANYOF_CLASS_TEST_ANY_SET(data->start_class));
	3621	cl_anything(pRExC_state, data->start_class);
	3622	}
	3623	if (flags & SCF_DO_STCLASS_AND \|\| !value)
	3624	ANYOF_BITMAP_CLEAR(data->start_class,'\n');
	3625	break;
	3626	case ANYOF:
	3627	if (flags & SCF_DO_STCLASS_AND)
	3628	cl_and(data->start_class,
	3629	(struct regnode_charclass_class*)scan);
	3630	else
	3631	cl_or(pRExC_state, data->start_class,
	3632	(struct regnode_charclass_class*)scan);
	3633	break;
	3634	case ALNUM:
	3635	if (flags & SCF_DO_STCLASS_AND) {
	3636	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3637	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
	3638	if (OP(scan) == ALNUMU) {
	3639	for (value = 0; value < 256; value++) {
	3640	if (!isWORDCHAR_L1(value)) {
	3641	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3642	}
	3643	}
	3644	} else {
	3645	for (value = 0; value < 256; value++) {
	3646	if (!isALNUM(value)) {
	3647	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3648	}
	3649	}
	3650	}
	3651	}
	3652	}
	3653	else {
	3654	if (data->start_class->flags & ANYOF_LOCALE)
	3655	ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
	3656	else if (OP(scan) == ALNUMU) {
	3657	for (value = 0; value < 256; value++) {
	3658	if (isWORDCHAR_L1(value)) {
	3659	ANYOF_BITMAP_SET(data->start_class, value);
	3660	}
	3661	}
	3662	} else {
	3663	for (value = 0; value < 256; value++) {
	3664	if (isALNUM(value)) {
	3665	ANYOF_BITMAP_SET(data->start_class, value);
	3666	}
	3667	}
	3668	}
	3669	}
	3670	break;
	3671	case NALNUM:
	3672	if (flags & SCF_DO_STCLASS_AND) {
	3673	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3674	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
	3675	if (OP(scan) == NALNUMU) {
	3676	for (value = 0; value < 256; value++) {
	3677	if (isWORDCHAR_L1(value)) {
	3678	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3679	}
	3680	}
	3681	} else {
	3682	for (value = 0; value < 256; value++) {
	3683	if (isALNUM(value)) {
	3684	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3685	}
	3686	}
	3687	}
	3688	}
	3689	}
	3690	else {
	3691	if (data->start_class->flags & ANYOF_LOCALE)
	3692	ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
	3693	else {
	3694	if (OP(scan) == NALNUMU) {
	3695	for (value = 0; value < 256; value++) {
	3696	if (! isWORDCHAR_L1(value)) {
	3697	ANYOF_BITMAP_SET(data->start_class, value);
	3698	}
	3699	}
	3700	} else {
	3701	for (value = 0; value < 256; value++) {
	3702	if (! isALNUM(value)) {
	3703	ANYOF_BITMAP_SET(data->start_class, value);
	3704	}
	3705	}
	3706	}
	3707	}
	3708	}
	3709	break;
	3710	case SPACE:
	3711	if (flags & SCF_DO_STCLASS_AND) {
	3712	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3713	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
	3714	if (OP(scan) == SPACEU) {
	3715	for (value = 0; value < 256; value++) {
	3716	if (!isSPACE_L1(value)) {
	3717	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3718	}
	3719	}
	3720	} else {
	3721	for (value = 0; value < 256; value++) {
	3722	if (!isSPACE(value)) {
	3723	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3724	}
	3725	}
	3726	}
	3727	}
	3728	}
	3729	else {
	3730	if (data->start_class->flags & ANYOF_LOCALE) {
	3731	ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
	3732	}
	3733	else if (OP(scan) == SPACEU) {
	3734	for (value = 0; value < 256; value++) {
	3735	if (isSPACE_L1(value)) {
	3736	ANYOF_BITMAP_SET(data->start_class, value);
	3737	}
	3738	}
	3739	} else {
	3740	for (value = 0; value < 256; value++) {
	3741	if (isSPACE(value)) {
	3742	ANYOF_BITMAP_SET(data->start_class, value);
	3743	}
	3744	}
	3745	}
	3746	}
	3747	break;
	3748	case NSPACE:
	3749	if (flags & SCF_DO_STCLASS_AND) {
	3750	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3751	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
	3752	if (OP(scan) == NSPACEU) {
	3753	for (value = 0; value < 256; value++) {
	3754	if (isSPACE_L1(value)) {
	3755	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3756	}
	3757	}
	3758	} else {
	3759	for (value = 0; value < 256; value++) {
	3760	if (isSPACE(value)) {
	3761	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3762	}
	3763	}
	3764	}
	3765	}
	3766	}
	3767	else {
	3768	if (data->start_class->flags & ANYOF_LOCALE)
	3769	ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
	3770	else if (OP(scan) == NSPACEU) {
	3771	for (value = 0; value < 256; value++) {
	3772	if (!isSPACE_L1(value)) {
	3773	ANYOF_BITMAP_SET(data->start_class, value);
	3774	}
	3775	}
	3776	}
	3777	else {
	3778	for (value = 0; value < 256; value++) {
	3779	if (!isSPACE(value)) {
	3780	ANYOF_BITMAP_SET(data->start_class, value);
	3781	}
	3782	}
	3783	}
	3784	}
	3785	break;
	3786	case DIGIT:
	3787	if (flags & SCF_DO_STCLASS_AND) {
	3788	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
	3789	for (value = 0; value < 256; value++)
	3790	if (!isDIGIT(value))
	3791	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3792	}
	3793	else {
	3794	if (data->start_class->flags & ANYOF_LOCALE)
	3795	ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
	3796	else {
	3797	for (value = 0; value < 256; value++)
	3798	if (isDIGIT(value))
	3799	ANYOF_BITMAP_SET(data->start_class, value);
	3800	}
	3801	}
	3802	break;
	3803	case NDIGIT:
	3804	if (flags & SCF_DO_STCLASS_AND) {
	3805	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
	3806	for (value = 0; value < 256; value++)
	3807	if (isDIGIT(value))
	3808	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3809	}
	3810	else {
	3811	if (data->start_class->flags & ANYOF_LOCALE)
	3812	ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
	3813	else {
	3814	for (value = 0; value < 256; value++)
	3815	if (!isDIGIT(value))
	3816	ANYOF_BITMAP_SET(data->start_class, value);
	3817	}
	3818	}
	3819	break;
	3820	CASE_SYNST_FNC(VERTWS);
	3821	CASE_SYNST_FNC(HORIZWS);
	3822
	3823	}
	3824	if (flags & SCF_DO_STCLASS_OR)
	3825	cl_and(data->start_class, and_withp);
	3826	flags &= ~SCF_DO_STCLASS;
	3827	}
	3828	}
	3829	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	3830	data->flags \|= (OP(scan) == MEOL
	3831	? SF_BEFORE_MEOL
	3832	: SF_BEFORE_SEOL);
	3833	}
	3834	else if ( PL_regkind[OP(scan)] == BRANCHJ
	3835	/* Lookbehind, or need to calculate parens/evals/stclass: */
	3836	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	3837	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM)) {
	3838	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	3839	\|\| OP(scan) == UNLESSM )
	3840	{
	3841	/* Negative Lookahead/lookbehind
	3842	In this case we can't do fixed string optimisation.
	3843	*/
	3844
	3845	I32 deltanext, minnext, fake = 0;
	3846	regnode *nscan;
	3847	struct regnode_charclass_class intrnl;
	3848	int f = 0;
	3849
	3850	data_fake.flags = 0;
	3851	if (data) {
	3852	data_fake.whilem_c = data->whilem_c;
	3853	data_fake.last_closep = data->last_closep;
	3854	}
	3855	else
	3856	data_fake.last_closep = &fake;
	3857	data_fake.pos_delta = delta;
	3858	if ( flags & SCF_DO_STCLASS && !scan->flags
	3859	&& OP(scan) == IFMATCH ) { /* Lookahead */
	3860	cl_init(pRExC_state, &intrnl);
	3861	data_fake.start_class = &intrnl;
	3862	f \|= SCF_DO_STCLASS_AND;
	3863	}
	3864	if (flags & SCF_WHILEM_VISITED_POS)
	3865	f \|= SCF_WHILEM_VISITED_POS;
	3866	next = regnext(scan);
	3867	nscan = NEXTOPER(NEXTOPER(scan));
	3868	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	3869	last, &data_fake, stopparen, recursed, NULL, f, depth+1);
	3870	if (scan->flags) {
	3871	if (deltanext) {
	3872	FAIL("Variable length lookbehind not implemented");
	3873	}
	3874	else if (minnext > (I32)U8_MAX) {
	3875	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	3876	}
	3877	scan->flags = (U8)minnext;
	3878	}
	3879	if (data) {
	3880	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3881	pars++;
	3882	if (data_fake.flags & SF_HAS_EVAL)
	3883	data->flags \|= SF_HAS_EVAL;
	3884	data->whilem_c = data_fake.whilem_c;
	3885	}
	3886	if (f & SCF_DO_STCLASS_AND) {
	3887	if (flags & SCF_DO_STCLASS_OR) {
	3888	/* OR before, AND after: ideally we would recurse with
	3889	* data_fake to get the AND applied by study of the
	3890	* remainder of the pattern, and then derecurse;
	3891	* * HACK * for now just treat as "no information".
	3892	* See [perl #56690].
	3893	*/
	3894	cl_init(pRExC_state, data->start_class);
	3895	} else {
	3896	/* AND before and after: combine and continue */
	3897	const int was = (data->start_class->flags & ANYOF_EOS);
	3898
	3899	cl_and(data->start_class, &intrnl);
	3900	if (was)
	3901	data->start_class->flags \|= ANYOF_EOS;
	3902	}
	3903	}
	3904	}
	3905	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	3906	else {
	3907	/* Positive Lookahead/lookbehind
	3908	In this case we can do fixed string optimisation,
	3909	but we must be careful about it. Note in the case of
	3910	lookbehind the positions will be offset by the minimum
	3911	length of the pattern, something we won't know about
	3912	until after the recurse.
	3913	*/
	3914	I32 deltanext, fake = 0;
	3915	regnode *nscan;
	3916	struct regnode_charclass_class intrnl;
	3917	int f = 0;
	3918	/* We use SAVEFREEPV so that when the full compile
	3919	is finished perl will clean up the allocated
	3920	minlens when it's all done. This way we don't
	3921	have to worry about freeing them when we know
	3922	they wont be used, which would be a pain.
	3923	*/
	3924	I32 *minnextp;
	3925	Newx( minnextp, 1, I32 );
	3926	SAVEFREEPV(minnextp);
	3927
	3928	if (data) {
	3929	StructCopy(data, &data_fake, scan_data_t);
	3930	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	3931	f \|= SCF_DO_SUBSTR;
	3932	if (scan->flags)
	3933	SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
	3934	data_fake.last_found=newSVsv(data->last_found);
	3935	}
	3936	}
	3937	else
	3938	data_fake.last_closep = &fake;
	3939	data_fake.flags = 0;
	3940	data_fake.pos_delta = delta;
	3941	if (is_inf)
	3942	data_fake.flags \|= SF_IS_INF;
	3943	if ( flags & SCF_DO_STCLASS && !scan->flags
	3944	&& OP(scan) == IFMATCH ) { /* Lookahead */
	3945	cl_init(pRExC_state, &intrnl);
	3946	data_fake.start_class = &intrnl;
	3947	f \|= SCF_DO_STCLASS_AND;
	3948	}
	3949	if (flags & SCF_WHILEM_VISITED_POS)
	3950	f \|= SCF_WHILEM_VISITED_POS;
	3951	next = regnext(scan);
	3952	nscan = NEXTOPER(NEXTOPER(scan));
	3953
	3954	*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
	3955	last, &data_fake, stopparen, recursed, NULL, f,depth+1);
	3956	if (scan->flags) {
	3957	if (deltanext) {
	3958	FAIL("Variable length lookbehind not implemented");
	3959	}
	3960	else if (*minnextp > (I32)U8_MAX) {
	3961	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	3962	}
	3963	scan->flags = (U8)*minnextp;
	3964	}
	3965
	3966	*minnextp += min;
	3967
	3968	if (f & SCF_DO_STCLASS_AND) {
	3969	const int was = (data->start_class->flags & ANYOF_EOS);
	3970
	3971	cl_and(data->start_class, &intrnl);
	3972	if (was)
	3973	data->start_class->flags \|= ANYOF_EOS;
	3974	}
	3975	if (data) {
	3976	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3977	pars++;
	3978	if (data_fake.flags & SF_HAS_EVAL)
	3979	data->flags \|= SF_HAS_EVAL;
	3980	data->whilem_c = data_fake.whilem_c;
	3981	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	3982	if (RExC_rx->minlen<*minnextp)
	3983	RExC_rx->minlen=*minnextp;
	3984	SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
	3985	SvREFCNT_dec(data_fake.last_found);
	3986
	3987	if ( data_fake.minlen_fixed != minlenp )
	3988	{
	3989	data->offset_fixed= data_fake.offset_fixed;
	3990	data->minlen_fixed= data_fake.minlen_fixed;
	3991	data->lookbehind_fixed+= scan->flags;
	3992	}
	3993	if ( data_fake.minlen_float != minlenp )
	3994	{
	3995	data->minlen_float= data_fake.minlen_float;
	3996	data->offset_float_min=data_fake.offset_float_min;
	3997	data->offset_float_max=data_fake.offset_float_max;
	3998	data->lookbehind_float+= scan->flags;
	3999	}
	4000	}
	4001	}
	4002
	4003
	4004	}
	4005	#endif
	4006	}
	4007	else if (OP(scan) == OPEN) {
	4008	if (stopparen != (I32)ARG(scan))
	4009	pars++;
	4010	}
	4011	else if (OP(scan) == CLOSE) {
	4012	if (stopparen == (I32)ARG(scan)) {
	4013	break;
	4014	}
	4015	if ((I32)ARG(scan) == is_par) {
	4016	next = regnext(scan);
	4017
	4018	if ( next && (OP(next) != WHILEM) && next < last)
	4019	is_par = 0; /* Disable optimization */
	4020	}
	4021	if (data)
	4022	*(data->last_closep) = ARG(scan);
	4023	}
	4024	else if (OP(scan) == EVAL) {
	4025	if (data)
	4026	data->flags \|= SF_HAS_EVAL;
	4027	}
	4028	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	4029	if (flags & SCF_DO_SUBSTR) {
	4030	SCAN_COMMIT(pRExC_state,data,minlenp);
	4031	flags &= ~SCF_DO_SUBSTR;
	4032	}
	4033	if (data && OP(scan)==ACCEPT) {
	4034	data->flags \|= SCF_SEEN_ACCEPT;
	4035	if (stopmin > min)
	4036	stopmin = min;
	4037	}
	4038	}
	4039	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	4040	{
	4041	if (flags & SCF_DO_SUBSTR) {
	4042	SCAN_COMMIT(pRExC_state,data,minlenp);
	4043	data->longest = &(data->longest_float);
	4044	}
	4045	is_inf = is_inf_internal = 1;
	4046	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4047	cl_anything(pRExC_state, data->start_class);
	4048	flags &= ~SCF_DO_STCLASS;
	4049	}
	4050	else if (OP(scan) == GPOS) {
	4051	if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
	4052	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	4053	{
	4054	if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
	4055	RExC_rx->extflags \|= RXf_ANCH_GPOS;
	4056	if (RExC_rx->gofs < (U32)min)
	4057	RExC_rx->gofs = min;
	4058	} else {
	4059	RExC_rx->extflags \|= RXf_GPOS_FLOAT;
	4060	RExC_rx->gofs = 0;
	4061	}
	4062	}
	4063	#ifdef TRIE_STUDY_OPT
	4064	#ifdef FULL_TRIE_STUDY
	4065	else if (PL_regkind[OP(scan)] == TRIE) {
	4066	/* NOTE - There is similar code to this block above for handling
	4067	BRANCH nodes on the initial study. If you change stuff here
	4068	check there too. */
	4069	regnode *trie_node= scan;
	4070	regnode *tail= regnext(scan);
	4071	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4072	I32 max1 = 0, min1 = I32_MAX;
	4073	struct regnode_charclass_class accum;
	4074
	4075	if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
	4076	SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
	4077	if (flags & SCF_DO_STCLASS)
	4078	cl_init_zero(pRExC_state, &accum);
	4079
	4080	if (!trie->jump) {
	4081	min1= trie->minlen;
	4082	max1= trie->maxlen;
	4083	} else {
	4084	const regnode *nextbranch= NULL;
	4085	U32 word;
	4086
	4087	for ( word=1 ; word <= trie->wordcount ; word++)
	4088	{
	4089	I32 deltanext=0, minnext=0, f = 0, fake;
	4090	struct regnode_charclass_class this_class;
	4091
	4092	data_fake.flags = 0;
	4093	if (data) {
	4094	data_fake.whilem_c = data->whilem_c;
	4095	data_fake.last_closep = data->last_closep;
	4096	}
	4097	else
	4098	data_fake.last_closep = &fake;
	4099	data_fake.pos_delta = delta;
	4100	if (flags & SCF_DO_STCLASS) {
	4101	cl_init(pRExC_state, &this_class);
	4102	data_fake.start_class = &this_class;
	4103	f = SCF_DO_STCLASS_AND;
	4104	}
	4105	if (flags & SCF_WHILEM_VISITED_POS)
	4106	f \|= SCF_WHILEM_VISITED_POS;
	4107
	4108	if (trie->jump[word]) {
	4109	if (!nextbranch)
	4110	nextbranch = trie_node + trie->jump[0];
	4111	scan= trie_node + trie->jump[word];
	4112	/* We go from the jump point to the branch that follows
	4113	it. Note this means we need the vestigal unused branches
	4114	even though they arent otherwise used.
	4115	*/
	4116	minnext = study_chunk(pRExC_state, &scan, minlenp,
	4117	&deltanext, (regnode *)nextbranch, &data_fake,
	4118	stopparen, recursed, NULL, f,depth+1);
	4119	}
	4120	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	4121	nextbranch= regnext((regnode*)nextbranch);
	4122
	4123	if (min1 > (I32)(minnext + trie->minlen))
	4124	min1 = minnext + trie->minlen;
	4125	if (max1 < (I32)(minnext + deltanext + trie->maxlen))
	4126	max1 = minnext + deltanext + trie->maxlen;
	4127	if (deltanext == I32_MAX)
	4128	is_inf = is_inf_internal = 1;
	4129
	4130	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4131	pars++;
	4132	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	4133	if ( stopmin > min + min1)
	4134	stopmin = min + min1;
	4135	flags &= ~SCF_DO_SUBSTR;
	4136	if (data)
	4137	data->flags \|= SCF_SEEN_ACCEPT;
	4138	}
	4139	if (data) {
	4140	if (data_fake.flags & SF_HAS_EVAL)
	4141	data->flags \|= SF_HAS_EVAL;
	4142	data->whilem_c = data_fake.whilem_c;
	4143	}
	4144	if (flags & SCF_DO_STCLASS)
	4145	cl_or(pRExC_state, &accum, &this_class);
	4146	}
	4147	}
	4148	if (flags & SCF_DO_SUBSTR) {
	4149	data->pos_min += min1;
	4150	data->pos_delta += max1 - min1;
	4151	if (max1 != min1 \|\| is_inf)
	4152	data->longest = &(data->longest_float);
	4153	}
	4154	min += min1;
	4155	delta += max1 - min1;
	4156	if (flags & SCF_DO_STCLASS_OR) {
	4157	cl_or(pRExC_state, data->start_class, &accum);
	4158	if (min1) {
	4159	cl_and(data->start_class, and_withp);
	4160	flags &= ~SCF_DO_STCLASS;
	4161	}
	4162	}
	4163	else if (flags & SCF_DO_STCLASS_AND) {
	4164	if (min1) {
	4165	cl_and(data->start_class, &accum);
	4166	flags &= ~SCF_DO_STCLASS;
	4167	}
	4168	else {
	4169	/* Switch to OR mode: cache the old value of
	4170	* data->start_class */
	4171	INIT_AND_WITHP;
	4172	StructCopy(data->start_class, and_withp,
	4173	struct regnode_charclass_class);
	4174	flags &= ~SCF_DO_STCLASS_AND;
	4175	StructCopy(&accum, data->start_class,
	4176	struct regnode_charclass_class);
	4177	flags \|= SCF_DO_STCLASS_OR;
	4178	data->start_class->flags \|= ANYOF_EOS;
	4179	}
	4180	}
	4181	scan= tail;
	4182	continue;
	4183	}
	4184	#else
	4185	else if (PL_regkind[OP(scan)] == TRIE) {
	4186	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4187	U8*bang=NULL;
	4188
	4189	min += trie->minlen;
	4190	delta += (trie->maxlen - trie->minlen);
	4191	flags &= ~SCF_DO_STCLASS; /* xxx */
	4192	if (flags & SCF_DO_SUBSTR) {
	4193	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4194	data->pos_min += trie->minlen;
	4195	data->pos_delta += (trie->maxlen - trie->minlen);
	4196	if (trie->maxlen != trie->minlen)
	4197	data->longest = &(data->longest_float);
	4198	}
	4199	if (trie->jump) /* no more substrings -- for now /grr*/
	4200	flags &= ~SCF_DO_SUBSTR;
	4201	}
	4202	#endif /* old or new */
	4203	#endif /* TRIE_STUDY_OPT */
	4204
	4205	/* Else: zero-length, ignore. */
	4206	scan = regnext(scan);
	4207	}
	4208	if (frame) {
	4209	last = frame->last;
	4210	scan = frame->next;
	4211	stopparen = frame->stop;
	4212	frame = frame->prev;
	4213	goto fake_study_recurse;
	4214	}
	4215
	4216	finish:
	4217	assert(!frame);
	4218	DEBUG_STUDYDATA("pre-fin:",data,depth);
	4219
	4220	*scanp = scan;
	4221	*deltap = is_inf_internal ? I32_MAX : delta;
	4222	if (flags & SCF_DO_SUBSTR && is_inf)
	4223	data->pos_delta = I32_MAX - data->pos_min;
	4224	if (is_par > (I32)U8_MAX)
	4225	is_par = 0;
	4226	if (is_par && pars==1 && data) {
	4227	data->flags \|= SF_IN_PAR;
	4228	data->flags &= ~SF_HAS_PAR;
	4229	}
	4230	else if (pars && data) {
	4231	data->flags \|= SF_HAS_PAR;
	4232	data->flags &= ~SF_IN_PAR;
	4233	}
	4234	if (flags & SCF_DO_STCLASS_OR)
	4235	cl_and(data->start_class, and_withp);
	4236	if (flags & SCF_TRIE_RESTUDY)
	4237	data->flags \|= SCF_TRIE_RESTUDY;
	4238
	4239	DEBUG_STUDYDATA("post-fin:",data,depth);
	4240
	4241	return min < stopmin ? min : stopmin;
	4242	}
	4243
	4244	STATIC U32
	4245	S_add_data(RExC_state_t pRExC_state, U32 n, const char s)
	4246	{
	4247	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	4248
	4249	PERL_ARGS_ASSERT_ADD_DATA;
	4250
	4251	Renewc(RExC_rxi->data,
	4252	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	4253	char, struct reg_data);
	4254	if(count)
	4255	Renew(RExC_rxi->data->what, count + n, U8);
	4256	else
	4257	Newx(RExC_rxi->data->what, n, U8);
	4258	RExC_rxi->data->count = count + n;
	4259	Copy(s, RExC_rxi->data->what + count, n, U8);
	4260	return count;
	4261	}
	4262
	4263	/XXX: todo make this not included in a non debugging perl /
	4264	#ifndef PERL_IN_XSUB_RE
	4265	void
	4266	Perl_reginitcolors(pTHX)
	4267	{
	4268	dVAR;
	4269	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	4270	if (s) {
	4271	char *t = savepv(s);
	4272	int i = 0;
	4273	PL_colors[0] = t;
	4274	while (++i < 6) {
	4275	t = strchr(t, '\t');
	4276	if (t) {
	4277	*t = '\0';
	4278	PL_colors[i] = ++t;
	4279	}
	4280	else
	4281	PL_colors[i] = t = (char *)"";
	4282	}
	4283	} else {
	4284	int i = 0;
	4285	while (i < 6)
	4286	PL_colors[i++] = (char *)"";
	4287	}
	4288	PL_colorset = 1;
	4289	}
	4290	#endif
	4291
	4292
	4293	#ifdef TRIE_STUDY_OPT
	4294	#define CHECK_RESTUDY_GOTO \
	4295	if ( \
	4296	(data.flags & SCF_TRIE_RESTUDY) \
	4297	&& ! restudied++ \
	4298	) goto reStudy
	4299	#else
	4300	#define CHECK_RESTUDY_GOTO
	4301	#endif
	4302
	4303	/*
	4304	- pregcomp - compile a regular expression into internal code
	4305	*
	4306	* We can't allocate space until we know how big the compiled form will be,
	4307	* but we can't compile it (and thus know how big it is) until we've got a
	4308	* place to put the code. So we cheat: we compile it twice, once with code
	4309	* generation turned off and size counting turned on, and once "for real".
	4310	* This also means that we don't allocate space until we are sure that the
	4311	* thing really will compile successfully, and we never have to move the
	4312	* code and thus invalidate pointers into it. (Note that it has to be in
	4313	* one piece because free() must be able to free it all.) [NB: not true in perl]
	4314	*
	4315	* Beware that the optimization-preparation code in here knows about some
	4316	* of the structure of the compiled regexp. [I'll say.]
	4317	*/
	4318
	4319
	4320
	4321	#ifndef PERL_IN_XSUB_RE
	4322	#define RE_ENGINE_PTR &PL_core_reg_engine
	4323	#else
	4324	extern const struct regexp_engine my_reg_engine;
	4325	#define RE_ENGINE_PTR &my_reg_engine
	4326	#endif
	4327
	4328	#ifndef PERL_IN_XSUB_RE
	4329	REGEXP *
	4330	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	4331	{
	4332	dVAR;
	4333	HV * const table = GvHV(PL_hintgv);
	4334
	4335	PERL_ARGS_ASSERT_PREGCOMP;
	4336
	4337	/* Dispatch a request to compile a regexp to correct
	4338	regexp engine. */
	4339	if (table) {
	4340	SV **ptr= hv_fetchs(table, "regcomp", FALSE);
	4341	GET_RE_DEBUG_FLAGS_DECL;
	4342	if (ptr && SvIOK(ptr) && SvIV(ptr)) {
	4343	const regexp_engine eng=INT2PTR(regexp_engine,SvIV(*ptr));
	4344	DEBUG_COMPILE_r({
	4345	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	4346	SvIV(*ptr));
	4347	});
	4348	return CALLREGCOMP_ENG(eng, pattern, flags);
	4349	}
	4350	}
	4351	return Perl_re_compile(aTHX_ pattern, flags);
	4352	}
	4353	#endif
	4354
	4355	REGEXP *
	4356	Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
	4357	{
	4358	dVAR;
	4359	REGEXP *rx;
	4360	struct regexp *r;
	4361	register regexp_internal *ri;
	4362	STRLEN plen;
	4363	char *exp;
	4364	char* xend;
	4365	regnode *scan;
	4366	I32 flags;
	4367	I32 minlen = 0;
	4368	U32 pm_flags;
	4369
	4370	/* these are all flags - maybe they should be turned
	4371	* into a single int with different bit masks */
	4372	I32 sawlookahead = 0;
	4373	I32 sawplus = 0;
	4374	I32 sawopen = 0;
	4375	bool used_setjump = FALSE;
	4376
	4377	U8 jump_ret = 0;
	4378	dJMPENV;
	4379	scan_data_t data;
	4380	RExC_state_t RExC_state;
	4381	RExC_state_t * const pRExC_state = &RExC_state;
	4382	#ifdef TRIE_STUDY_OPT
	4383	int restudied;
	4384	RExC_state_t copyRExC_state;
	4385	#endif
	4386	GET_RE_DEBUG_FLAGS_DECL;
	4387
	4388	PERL_ARGS_ASSERT_RE_COMPILE;
	4389
	4390	DEBUG_r(if (!PL_colorset) reginitcolors());
	4391
	4392	RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
	4393
	4394	/**************** LONG JUMP TARGET HERE*********************/
	4395	/* Longjmp back to here if have to switch in midstream to utf8 */
	4396	if (! RExC_orig_utf8) {
	4397	JMPENV_PUSH(jump_ret);
	4398	used_setjump = TRUE;
	4399	}
	4400
	4401	if (jump_ret == 0) { /* First time through */
	4402	exp = SvPV(pattern, plen);
	4403	xend = exp + plen;
	4404	/* ignore the utf8ness if the pattern is 0 length */
	4405	if (plen == 0) {
	4406	RExC_utf8 = RExC_orig_utf8 = 0;
	4407	}
	4408
	4409	DEBUG_COMPILE_r({
	4410	SV *dsv= sv_newmortal();
	4411	RE_PV_QUOTED_DECL(s, RExC_utf8,
	4412	dsv, exp, plen, 60);
	4413	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	4414	PL_colors[4],PL_colors[5],s);
	4415	});
	4416	}
	4417	else { /* longjumped back */
	4418	STRLEN len = plen;
	4419
	4420	/* If the cause for the longjmp was other than changing to utf8, pop
	4421	* our own setjmp, and longjmp to the correct handler */
	4422	if (jump_ret != UTF8_LONGJMP) {
	4423	JMPENV_POP;
	4424	JMPENV_JUMP(jump_ret);
	4425	}
	4426
	4427	GET_RE_DEBUG_FLAGS;
	4428
	4429	/* It's possible to write a regexp in ascii that represents Unicode
	4430	codepoints outside of the byte range, such as via \x{100}. If we
	4431	detect such a sequence we have to convert the entire pattern to utf8
	4432	and then recompile, as our sizing calculation will have been based
	4433	on 1 byte == 1 character, but we will need to use utf8 to encode
	4434	at least some part of the pattern, and therefore must convert the whole
	4435	thing.
	4436	-- dmq */
	4437	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	4438	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	4439	exp = (char)Perl_bytes_to_utf8(aTHX_ (U8)SvPV(pattern, plen), &len);
	4440	xend = exp + len;
	4441	RExC_orig_utf8 = RExC_utf8 = 1;
	4442	SAVEFREEPV(exp);
	4443	}
	4444
	4445	#ifdef TRIE_STUDY_OPT
	4446	restudied = 0;
	4447	#endif
	4448
	4449	/* Set to use unicode semantics if the pattern is in utf8 and has the
	4450	* 'depends' charset specified, as it means unicode when utf8 */
	4451	pm_flags = orig_pm_flags;
	4452
	4453	if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
	4454	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4455	}
	4456
	4457	RExC_precomp = exp;
	4458	RExC_flags = pm_flags;
	4459	RExC_sawback = 0;
	4460
	4461	RExC_seen = 0;
	4462	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	4463	RExC_seen_evals = 0;
	4464	RExC_extralen = 0;
	4465
	4466	/* First pass: determine size, legality. */
	4467	RExC_parse = exp;
	4468	RExC_start = exp;
	4469	RExC_end = xend;
	4470	RExC_naughty = 0;
	4471	RExC_npar = 1;
	4472	RExC_nestroot = 0;
	4473	RExC_size = 0L;
	4474	RExC_emit = &PL_regdummy;
	4475	RExC_whilem_seen = 0;
	4476	RExC_open_parens = NULL;
	4477	RExC_close_parens = NULL;
	4478	RExC_opend = NULL;
	4479	RExC_paren_names = NULL;
	4480	#ifdef DEBUGGING
	4481	RExC_paren_name_list = NULL;
	4482	#endif
	4483	RExC_recurse = NULL;
	4484	RExC_recurse_count = 0;
	4485
	4486	#if 0 /* REGC() is (currently) a NOP at the first pass.
	4487	* Clever compilers notice this and complain. --jhi */
	4488	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	4489	#endif
	4490	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
	4491	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4492	RExC_precomp = NULL;
	4493	return(NULL);
	4494	}
	4495
	4496	/* Here, finished first pass. Get rid of any added setjmp */
	4497	if (used_setjump) {
	4498	JMPENV_POP;
	4499	}
	4500	DEBUG_PARSE_r({
	4501	PerlIO_printf(Perl_debug_log,
	4502	"Required size %"IVdf" nodes\n"
	4503	"Starting second pass (creation)\n",
	4504	(IV)RExC_size);
	4505	RExC_lastnum=0;
	4506	RExC_lastparse=NULL;
	4507	});
	4508	/* Small enough for pointer-storage convention?
	4509	If extralen==0, this means that we will not need long jumps. */
	4510	if (RExC_size >= 0x10000L && RExC_extralen)
	4511	RExC_size += RExC_extralen;
	4512	else
	4513	RExC_extralen = 0;
	4514	if (RExC_whilem_seen > 15)
	4515	RExC_whilem_seen = 15;
	4516
	4517	/* Allocate space and zero-initialize. Note, the two step process
	4518	of zeroing when in debug mode, thus anything assigned has to
	4519	happen after that */
	4520	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	4521	r = (struct regexp*)SvANY(rx);
	4522	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	4523	char, regexp_internal);
	4524	if ( r == NULL \|\| ri == NULL )
	4525	FAIL("Regexp out of space");
	4526	#ifdef DEBUGGING
	4527	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	4528	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
	4529	#else
	4530	/* bulk initialize base fields with 0. */
	4531	Zero(ri, sizeof(regexp_internal), char);
	4532	#endif
	4533
	4534	/* non-zero initialization begins here */
	4535	RXi_SET( r, ri );
	4536	r->engine= RE_ENGINE_PTR;
	4537	r->extflags = pm_flags;
	4538	{
	4539	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	4540	bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
	4541
	4542	/* The caret is output if there are any defaults: if not all the STD
	4543	* flags are set, or if no character set specifier is needed */
	4544	bool has_default =
	4545	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	4546	\|\| ! has_charset);
	4547	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
	4548	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	4549	>> RXf_PMf_STD_PMMOD_SHIFT);
	4550	const char fptr = STD_PAT_MODS; /"msix"*/
	4551	char *p;
	4552	/* Allocate for the worst case, which is all the std flags are turned
	4553	* on. If more precision is desired, we could do a population count of
	4554	* the flags set. This could be done with a small lookup table, or by
	4555	* shifting, masking and adding, or even, when available, assembly
	4556	* language for a machine-language population count.
	4557	* We never output a minus, as all those are defaults, so are
	4558	* covered by the caret */
	4559	const STRLEN wraplen = plen + has_p + has_runon
	4560	+ has_default /* If needs a caret */
	4561
	4562	/* If needs a character set specifier */
	4563	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	4564	+ (sizeof(STD_PAT_MODS) - 1)
	4565	+ (sizeof("(?:)") - 1);
	4566
	4567	p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
	4568	SvPOK_on(rx);
	4569	SvFLAGS(rx) \|= SvUTF8(pattern);
	4570	p++='('; p++='?';
	4571
	4572	/* If a default, cover it using the caret */
	4573	if (has_default) {
	4574	*p++= DEFAULT_PAT_MOD;
	4575	}
	4576	if (has_charset) {
	4577	STRLEN len;
	4578	const char* const name = get_regex_charset_name(r->extflags, &len);
	4579	Copy(name, p, len, char);
	4580	p += len;
	4581	}
	4582	if (has_p)
	4583	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	4584	{
	4585	char ch;
	4586	while((ch = *fptr++)) {
	4587	if(reganch & 1)
	4588	*p++ = ch;
	4589	reganch >>= 1;
	4590	}
	4591	}
	4592
	4593	*p++ = ':';
	4594	Copy(RExC_precomp, p, plen, char);
	4595	assert ((RX_WRAPPED(rx) - p) < 16);
	4596	r->pre_prefix = p - RX_WRAPPED(rx);
	4597	p += plen;
	4598	if (has_runon)
	4599	*p++ = '\n';
	4600	*p++ = ')';
	4601	*p = 0;
	4602	SvCUR_set(rx, p - SvPVX_const(rx));
	4603	}
	4604
	4605	r->intflags = 0;
	4606	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	4607
	4608	if (RExC_seen & REG_SEEN_RECURSE) {
	4609	Newxz(RExC_open_parens, RExC_npar,regnode *);
	4610	SAVEFREEPV(RExC_open_parens);
	4611	Newxz(RExC_close_parens,RExC_npar,regnode *);
	4612	SAVEFREEPV(RExC_close_parens);
	4613	}
	4614
	4615	/* Useful during FAIL. */
	4616	#ifdef RE_TRACK_PATTERN_OFFSETS
	4617	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	4618	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	4619	"%s %"UVuf" bytes for offset annotations.\n",
	4620	ri->u.offsets ? "Got" : "Couldn't get",
	4621	(UV)((2RExC_size+1) sizeof(U32))));
	4622	#endif
	4623	SetProgLen(ri,RExC_size);
	4624	RExC_rx_sv = rx;
	4625	RExC_rx = r;
	4626	RExC_rxi = ri;
	4627
	4628	/* Second pass: emit code. */
	4629	RExC_flags = pm_flags; /* don't let top level (?i) bleed */
	4630	RExC_parse = exp;
	4631	RExC_end = xend;
	4632	RExC_naughty = 0;
	4633	RExC_npar = 1;
	4634	RExC_emit_start = ri->program;
	4635	RExC_emit = ri->program;
	4636	RExC_emit_bound = ri->program + RExC_size + 1;
	4637
	4638	/* Store the count of eval-groups for security checks: */
	4639	RExC_rx->seen_evals = RExC_seen_evals;
	4640	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	4641	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4642	ReREFCNT_dec(rx);
	4643	return(NULL);
	4644	}
	4645	/* XXXX To minimize changes to RE engine we always allocate
	4646	3-units-long substrs field. */
	4647	Newx(r->substrs, 1, struct reg_substr_data);
	4648	if (RExC_recurse_count) {
	4649	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	4650	SAVEFREEPV(RExC_recurse);
	4651	}
	4652
	4653	reStudy:
	4654	r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
	4655	Zero(r->substrs, 1, struct reg_substr_data);
	4656
	4657	#ifdef TRIE_STUDY_OPT
	4658	if (!restudied) {
	4659	StructCopy(&zero_scan_data, &data, scan_data_t);
	4660	copyRExC_state = RExC_state;
	4661	} else {
	4662	U32 seen=RExC_seen;
	4663	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	4664
	4665	RExC_state = copyRExC_state;
	4666	if (seen & REG_TOP_LEVEL_BRANCHES)
	4667	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	4668	else
	4669	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
	4670	if (data.last_found) {
	4671	SvREFCNT_dec(data.longest_fixed);
	4672	SvREFCNT_dec(data.longest_float);
	4673	SvREFCNT_dec(data.last_found);
	4674	}
	4675	StructCopy(&zero_scan_data, &data, scan_data_t);
	4676	}
	4677	#else
	4678	StructCopy(&zero_scan_data, &data, scan_data_t);
	4679	#endif
	4680
	4681	/* Dig out information for optimizations. */
	4682	r->extflags = RExC_flags; /* was pm_op */
	4683	/dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; /
	4684
	4685	if (UTF)
	4686	SvUTF8_on(rx); /* Unicode in it? */
	4687	ri->regstclass = NULL;
	4688	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	4689	r->intflags \|= PREGf_NAUGHTY;
	4690	scan = ri->program + 1; /* First BRANCH. */
	4691
	4692	/* testing for BRANCH here tells us whether there is "must appear"
	4693	data in the pattern. If there is then we can use it for optimisations */
	4694	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
	4695	I32 fake;
	4696	STRLEN longest_float_length, longest_fixed_length;
	4697	struct regnode_charclass_class ch_class; /* pointed to by data */
	4698	int stclass_flag;
	4699	I32 last_close = 0; /* pointed to by data */
	4700	regnode *first= scan;
	4701	regnode *first_next= regnext(first);
	4702	/*
	4703	* Skip introductions and multiplicators >= 1
	4704	* so that we can extract the 'meat' of the pattern that must
	4705	* match in the large if() sequence following.
	4706	* NOTE that EXACT is NOT covered here, as it is normally
	4707	* picked up by the optimiser separately.
	4708	*
	4709	* This is unfortunate as the optimiser isnt handling lookahead
	4710	* properly currently.
	4711	*
	4712	*/
	4713	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	4714	/* An OR of one alternative - should not happen now. */
	4715	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	4716	/* for now we can't handle lookbehind IFMATCH*/
	4717	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	4718	(OP(first) == PLUS) \|\|
	4719	(OP(first) == MINMOD) \|\|
	4720	/* An {n,m} with n>0 */
	4721	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	4722	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	4723	{
	4724	/*
	4725	* the only op that could be a regnode is PLUS, all the rest
	4726	* will be regnode_1 or regnode_2.
	4727	*
	4728	*/
	4729	if (OP(first) == PLUS)
	4730	sawplus = 1;
	4731	else
	4732	first += regarglen[OP(first)];
	4733
	4734	first = NEXTOPER(first);
	4735	first_next= regnext(first);
	4736	}
	4737
	4738	/* Starting-point info. */
	4739	again:
	4740	DEBUG_PEEP("first:",first,0);
	4741	/* Ignore EXACT as we deal with it later. */
	4742	if (PL_regkind[OP(first)] == EXACT) {
	4743	if (OP(first) == EXACT)
	4744	NOOP; /* Empty, get anchored substr later. */
	4745	else
	4746	ri->regstclass = first;
	4747	}
	4748	#ifdef TRIE_STCLASS
	4749	else if (PL_regkind[OP(first)] == TRIE &&
	4750	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	4751	{
	4752	regnode *trie_op;
	4753	/* this can happen only on restudy */
	4754	if ( OP(first) == TRIE ) {
	4755	struct regnode_1 trieop = (struct regnode_1 )
	4756	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	4757	StructCopy(first,trieop,struct regnode_1);
	4758	trie_op=(regnode *)trieop;
	4759	} else {
	4760	struct regnode_charclass trieop = (struct regnode_charclass )
	4761	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	4762	StructCopy(first,trieop,struct regnode_charclass);
	4763	trie_op=(regnode *)trieop;
	4764	}
	4765	OP(trie_op)+=2;
	4766	make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
	4767	ri->regstclass = trie_op;
	4768	}
	4769	#endif
	4770	else if (REGNODE_SIMPLE(OP(first)))
	4771	ri->regstclass = first;
	4772	else if (PL_regkind[OP(first)] == BOUND \|\|
	4773	PL_regkind[OP(first)] == NBOUND)
	4774	ri->regstclass = first;
	4775	else if (PL_regkind[OP(first)] == BOL) {
	4776	r->extflags \|= (OP(first) == MBOL
	4777	? RXf_ANCH_MBOL
	4778	: (OP(first) == SBOL
	4779	? RXf_ANCH_SBOL
	4780	: RXf_ANCH_BOL));
	4781	first = NEXTOPER(first);
	4782	goto again;
	4783	}
	4784	else if (OP(first) == GPOS) {
	4785	r->extflags \|= RXf_ANCH_GPOS;
	4786	first = NEXTOPER(first);
	4787	goto again;
	4788	}
	4789	else if ((!sawopen \|\| !RExC_sawback) &&
	4790	(OP(first) == STAR &&
	4791	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	4792	!(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
	4793	{
	4794	/* turn .* into ^.* with an implied $=1 /
	4795	const int type =
	4796	(OP(NEXTOPER(first)) == REG_ANY)
	4797	? RXf_ANCH_MBOL
	4798	: RXf_ANCH_SBOL;
	4799	r->extflags \|= type;
	4800	r->intflags \|= PREGf_IMPLICIT;
	4801	first = NEXTOPER(first);
	4802	goto again;
	4803	}
	4804	if (sawplus && !sawlookahead && (!sawopen \|\| !RExC_sawback)
	4805	&& !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
	4806	/* x+ must match at the 1st pos of run of x's */
	4807	r->intflags \|= PREGf_SKIP;
	4808
	4809	/* Scan is after the zeroth branch, first is atomic matcher. */
	4810	#ifdef TRIE_STUDY_OPT
	4811	DEBUG_PARSE_r(
	4812	if (!restudied)
	4813	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	4814	(IV)(first - scan + 1))
	4815	);
	4816	#else
	4817	DEBUG_PARSE_r(
	4818	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	4819	(IV)(first - scan + 1))
	4820	);
	4821	#endif
	4822
	4823
	4824	/*
	4825	* If there's something expensive in the r.e., find the
	4826	* longest literal string that must appear and make it the
	4827	* regmust. Resolve ties in favor of later strings, since
	4828	* the regstart check works with the beginning of the r.e.
	4829	* and avoiding duplication strengthens checking. Not a
	4830	* strong reason, but sufficient in the absence of others.
	4831	* [Now we resolve ties in favor of the earlier string if
	4832	* it happens that c_offset_min has been invalidated, since the
	4833	* earlier string may buy us something the later one won't.]
	4834	*/
	4835
	4836	data.longest_fixed = newSVpvs("");
	4837	data.longest_float = newSVpvs("");
	4838	data.last_found = newSVpvs("");
	4839	data.longest = &(data.longest_fixed);
	4840	first = scan;
	4841	if (!ri->regstclass) {
	4842	cl_init(pRExC_state, &ch_class);
	4843	data.start_class = &ch_class;
	4844	stclass_flag = SCF_DO_STCLASS_AND;
	4845	} else /* XXXX Check for BOUND? */
	4846	stclass_flag = 0;
	4847	data.last_closep = &last_close;
	4848
	4849	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
	4850	&data, -1, NULL, NULL,
	4851	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag,0);
	4852
	4853
	4854	CHECK_RESTUDY_GOTO;
	4855
	4856
	4857	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	4858	&& data.last_start_min == 0 && data.last_end > 0
	4859	&& !RExC_seen_zerolen
	4860	&& !(RExC_seen & REG_SEEN_VERBARG)
	4861	&& (!(RExC_seen & REG_SEEN_GPOS) \|\| (r->extflags & RXf_ANCH_GPOS)))
	4862	r->extflags \|= RXf_CHECK_ALL;
	4863	scan_commit(pRExC_state, &data,&minlen,0);
	4864	SvREFCNT_dec(data.last_found);
	4865
	4866	/* Note that code very similar to this but for anchored string
	4867	follows immediately below, changes may need to be made to both.
	4868	Be careful.
	4869	*/
	4870	longest_float_length = CHR_SVLEN(data.longest_float);
	4871	if (longest_float_length
	4872	\|\| (data.flags & SF_FL_BEFORE_EOL
	4873	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	4874	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	4875	{
	4876	I32 t,ml;
	4877
	4878	if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	4879	&& data.offset_fixed == data.offset_float_min
	4880	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
	4881	goto remove_float; /* As in (a)+. */
	4882
	4883	/* copy the information about the longest float from the reg_scan_data
	4884	over to the program. */
	4885	if (SvUTF8(data.longest_float)) {
	4886	r->float_utf8 = data.longest_float;
	4887	r->float_substr = NULL;
	4888	} else {
	4889	r->float_substr = data.longest_float;
	4890	r->float_utf8 = NULL;
	4891	}
	4892	/* float_end_shift is how many chars that must be matched that
	4893	follow this item. We calculate it ahead of time as once the
	4894	lookbehind offset is added in we lose the ability to correctly
	4895	calculate it.*/
	4896	ml = data.minlen_float ? *(data.minlen_float)
	4897	: (I32)longest_float_length;
	4898	r->float_end_shift = ml - data.offset_float_min
	4899	- longest_float_length + (SvTAIL(data.longest_float) != 0)
	4900	+ data.lookbehind_float;
	4901	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	4902	r->float_max_offset = data.offset_float_max;
	4903	if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
	4904	r->float_max_offset -= data.lookbehind_float;
	4905
	4906	t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
	4907	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	4908	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	4909	fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
	4910	}
	4911	else {
	4912	remove_float:
	4913	r->float_substr = r->float_utf8 = NULL;
	4914	SvREFCNT_dec(data.longest_float);
	4915	longest_float_length = 0;
	4916	}
	4917
	4918	/* Note that code very similar to this but for floating string
	4919	is immediately above, changes may need to be made to both.
	4920	Be careful.
	4921	*/
	4922	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	4923	if (longest_fixed_length
	4924	\|\| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
	4925	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	4926	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	4927	{
	4928	I32 t,ml;
	4929
	4930	/* copy the information about the longest fixed
	4931	from the reg_scan_data over to the program. */
	4932	if (SvUTF8(data.longest_fixed)) {
	4933	r->anchored_utf8 = data.longest_fixed;
	4934	r->anchored_substr = NULL;
	4935	} else {
	4936	r->anchored_substr = data.longest_fixed;
	4937	r->anchored_utf8 = NULL;
	4938	}
	4939	/* fixed_end_shift is how many chars that must be matched that
	4940	follow this item. We calculate it ahead of time as once the
	4941	lookbehind offset is added in we lose the ability to correctly
	4942	calculate it.*/
	4943	ml = data.minlen_fixed ? *(data.minlen_fixed)
	4944	: (I32)longest_fixed_length;
	4945	r->anchored_end_shift = ml - data.offset_fixed
	4946	- longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
	4947	+ data.lookbehind_fixed;
	4948	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	4949
	4950	t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
	4951	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	4952	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	4953	fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
	4954	}
	4955	else {
	4956	r->anchored_substr = r->anchored_utf8 = NULL;
	4957	SvREFCNT_dec(data.longest_fixed);
	4958	longest_fixed_length = 0;
	4959	}
	4960	if (ri->regstclass
	4961	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	4962	ri->regstclass = NULL;
	4963
	4964	/* If the synthetic start class were to ever be used when EOS is set,
	4965	* that bit would have to be cleared, as it is shared with another */
	4966	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	4967	&& stclass_flag
	4968	&& !(data.start_class->flags & ANYOF_EOS)
	4969	&& !cl_is_anything(data.start_class))
	4970	{
	4971	const U32 n = add_data(pRExC_state, 1, "f");
	4972
	4973	Newx(RExC_rxi->data->data[n], 1,
	4974	struct regnode_charclass_class);
	4975	StructCopy(data.start_class,
	4976	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	4977	struct regnode_charclass_class);
	4978	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	4979	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	4980	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	4981	regprop(r, sv, (regnode*)data.start_class);
	4982	PerlIO_printf(Perl_debug_log,
	4983	"synthetic stclass \"%s\".\n",
	4984	SvPVX_const(sv));});
	4985	}
	4986
	4987	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
	4988	if (longest_fixed_length > longest_float_length) {
	4989	r->check_end_shift = r->anchored_end_shift;
	4990	r->check_substr = r->anchored_substr;
	4991	r->check_utf8 = r->anchored_utf8;
	4992	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	4993	if (r->extflags & RXf_ANCH_SINGLE)
	4994	r->extflags \|= RXf_NOSCAN;
	4995	}
	4996	else {
	4997	r->check_end_shift = r->float_end_shift;
	4998	r->check_substr = r->float_substr;
	4999	r->check_utf8 = r->float_utf8;
	5000	r->check_offset_min = r->float_min_offset;
	5001	r->check_offset_max = r->float_max_offset;
	5002	}
	5003	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
	5004	This should be changed ASAP! */
	5005	if ((r->check_substr \|\| r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
	5006	r->extflags \|= RXf_USE_INTUIT;
	5007	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	5008	r->extflags \|= RXf_INTUIT_TAIL;
	5009	}
	5010	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	5011	if ( (STRLEN)minlen < longest_float_length )
	5012	minlen= longest_float_length;
	5013	if ( (STRLEN)minlen < longest_fixed_length )
	5014	minlen= longest_fixed_length;
	5015	*/
	5016	}
	5017	else {
	5018	/* Several toplevels. Best we can is to set minlen. */
	5019	I32 fake;
	5020	struct regnode_charclass_class ch_class;
	5021	I32 last_close = 0;
	5022
	5023	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	5024
	5025	scan = ri->program + 1;
	5026	cl_init(pRExC_state, &ch_class);
	5027	data.start_class = &ch_class;
	5028	data.last_closep = &last_close;
	5029
	5030
	5031	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
	5032	&data, -1, NULL, NULL, SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS,0);
	5033
	5034	CHECK_RESTUDY_GOTO;
	5035
	5036	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	5037	= r->float_substr = r->float_utf8 = NULL;
	5038
	5039	/* If the synthetic start class were to ever be used when EOS is set,
	5040	* that bit would have to be cleared, as it is shared with another */
	5041	if (!(data.start_class->flags & ANYOF_EOS)
	5042	&& !cl_is_anything(data.start_class))
	5043	{
	5044	const U32 n = add_data(pRExC_state, 1, "f");
	5045
	5046	Newx(RExC_rxi->data->data[n], 1,
	5047	struct regnode_charclass_class);
	5048	StructCopy(data.start_class,
	5049	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5050	struct regnode_charclass_class);
	5051	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5052	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5053	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	5054	regprop(r, sv, (regnode*)data.start_class);
	5055	PerlIO_printf(Perl_debug_log,
	5056	"synthetic stclass \"%s\".\n",
	5057	SvPVX_const(sv));});
	5058	}
	5059	}
	5060
	5061	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	5062	the "real" pattern. */
	5063	DEBUG_OPTIMISE_r({
	5064	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
	5065	(IV)minlen, (IV)r->minlen);
	5066	});
	5067	r->minlenret = minlen;
	5068	if (r->minlen < minlen)
	5069	r->minlen = minlen;
	5070
	5071	if (RExC_seen & REG_SEEN_GPOS)
	5072	r->extflags \|= RXf_GPOS_SEEN;
	5073	if (RExC_seen & REG_SEEN_LOOKBEHIND)
	5074	r->extflags \|= RXf_LOOKBEHIND_SEEN;
	5075	if (RExC_seen & REG_SEEN_EVAL)
	5076	r->extflags \|= RXf_EVAL_SEEN;
	5077	if (RExC_seen & REG_SEEN_CANY)
	5078	r->extflags \|= RXf_CANY_SEEN;
	5079	if (RExC_seen & REG_SEEN_VERBARG)
	5080	r->intflags \|= PREGf_VERBARG_SEEN;
	5081	if (RExC_seen & REG_SEEN_CUTGROUP)
	5082	r->intflags \|= PREGf_CUTGROUP_SEEN;
	5083	if (RExC_paren_names)
	5084	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	5085	else
	5086	RXp_PAREN_NAMES(r) = NULL;
	5087
	5088	#ifdef STUPID_PATTERN_CHECKS
	5089	if (RX_PRELEN(rx) == 0)
	5090	r->extflags \|= RXf_NULL;
	5091	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5092	/* XXX: this should happen BEFORE we compile */
	5093	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5094	else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
	5095	r->extflags \|= RXf_WHITE;
	5096	else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
	5097	r->extflags \|= RXf_START_ONLY;
	5098	#else
	5099	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5100	/* XXX: this should happen BEFORE we compile */
	5101	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5102	else {
	5103	regnode *first = ri->program + 1;
	5104	U8 fop = OP(first);
	5105	U8 nop = OP(NEXTOPER(first));
	5106
	5107	if (PL_regkind[fop] == NOTHING && nop == END)
	5108	r->extflags \|= RXf_NULL;
	5109	else if (PL_regkind[fop] == BOL && nop == END)
	5110	r->extflags \|= RXf_START_ONLY;
	5111	else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
	5112	r->extflags \|= RXf_WHITE;
	5113	}
	5114	#endif
	5115	#ifdef DEBUGGING
	5116	if (RExC_paren_names) {
	5117	ri->name_list_idx = add_data( pRExC_state, 1, "a" );
	5118	ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
	5119	} else
	5120	#endif
	5121	ri->name_list_idx = 0;
	5122
	5123	if (RExC_recurse_count) {
	5124	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	5125	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	5126	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	5127	}
	5128	}
	5129	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	5130	/* assume we don't need to swap parens around before we match */
	5131
	5132	DEBUG_DUMP_r({
	5133	PerlIO_printf(Perl_debug_log,"Final program:\n");
	5134	regdump(r);
	5135	});
	5136	#ifdef RE_TRACK_PATTERN_OFFSETS
	5137	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	5138	const U32 len = ri->u.offsets[0];
	5139	U32 i;
	5140	GET_RE_DEBUG_FLAGS_DECL;
	5141	PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	5142	for (i = 1; i <= len; i++) {
	5143	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	5144	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	5145	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	5146	}
	5147	PerlIO_printf(Perl_debug_log, "\n");
	5148	});
	5149	#endif
	5150	return rx;
	5151	}
	5152
	5153	#undef RE_ENGINE_PTR
	5154
	5155
	5156	SV*
	5157	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	5158	const U32 flags)
	5159	{
	5160	PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	5161
	5162	PERL_UNUSED_ARG(value);
	5163
	5164	if (flags & RXapif_FETCH) {
	5165	return reg_named_buff_fetch(rx, key, flags);
	5166	} else if (flags & (RXapif_STORE \| RXapif_DELETE \| RXapif_CLEAR)) {
	5167	Perl_croak_no_modify(aTHX);
	5168	return NULL;
	5169	} else if (flags & RXapif_EXISTS) {
	5170	return reg_named_buff_exists(rx, key, flags)
	5171	? &PL_sv_yes
	5172	: &PL_sv_no;
	5173	} else if (flags & RXapif_REGNAMES) {
	5174	return reg_named_buff_all(rx, flags);
	5175	} else if (flags & (RXapif_SCALAR \| RXapif_REGNAMES_COUNT)) {
	5176	return reg_named_buff_scalar(rx, flags);
	5177	} else {
	5178	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	5179	return NULL;
	5180	}
	5181	}
	5182
	5183	SV*
	5184	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	5185	const U32 flags)
	5186	{
	5187	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	5188	PERL_UNUSED_ARG(lastkey);
	5189
	5190	if (flags & RXapif_FIRSTKEY)
	5191	return reg_named_buff_firstkey(rx, flags);
	5192	else if (flags & RXapif_NEXTKEY)
	5193	return reg_named_buff_nextkey(rx, flags);
	5194	else {
	5195	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
	5196	return NULL;
	5197	}
	5198	}
	5199
	5200	SV*
	5201	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	5202	const U32 flags)
	5203	{
	5204	AV *retarray = NULL;
	5205	SV *ret;
	5206	struct regexp const rx = (struct regexp )SvANY(r);
	5207
	5208	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	5209
	5210	if (flags & RXapif_ALL)
	5211	retarray=newAV();
	5212
	5213	if (rx && RXp_PAREN_NAMES(rx)) {
	5214	HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	5215	if (he_str) {
	5216	IV i;
	5217	SV* sv_dat=HeVAL(he_str);
	5218	I32 nums=(I32)SvPVX(sv_dat);
	5219	for ( i=0; i<SvIVX(sv_dat); i++ ) {
	5220	if ((I32)(rx->nparens) >= nums[i]
	5221	&& rx->offs[nums[i]].start != -1
	5222	&& rx->offs[nums[i]].end != -1)
	5223	{
	5224	ret = newSVpvs("");
	5225	CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	5226	if (!retarray)
	5227	return ret;
	5228	} else {
	5229	ret = newSVsv(&PL_sv_undef);
	5230	}
	5231	if (retarray)
	5232	av_push(retarray, ret);
	5233	}
	5234	if (retarray)
	5235	return newRV_noinc(MUTABLE_SV(retarray));
	5236	}
	5237	}
	5238	return NULL;
	5239	}
	5240
	5241	bool
	5242	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	5243	const U32 flags)
	5244	{
	5245	struct regexp const rx = (struct regexp )SvANY(r);
	5246
	5247	PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	5248
	5249	if (rx && RXp_PAREN_NAMES(rx)) {
	5250	if (flags & RXapif_ALL) {
	5251	return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	5252	} else {
	5253	SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	5254	if (sv) {
	5255	SvREFCNT_dec(sv);
	5256	return TRUE;
	5257	} else {
	5258	return FALSE;
	5259	}
	5260	}
	5261	} else {
	5262	return FALSE;
	5263	}
	5264	}
	5265
	5266	SV*
	5267	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	5268	{
	5269	struct regexp const rx = (struct regexp )SvANY(r);
	5270
	5271	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	5272
	5273	if ( rx && RXp_PAREN_NAMES(rx) ) {
	5274	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
	5275
	5276	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	5277	} else {
	5278	return FALSE;
	5279	}
	5280	}
	5281
	5282	SV*
	5283	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	5284	{
	5285	struct regexp const rx = (struct regexp )SvANY(r);
	5286	GET_RE_DEBUG_FLAGS_DECL;
	5287
	5288	PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	5289
	5290	if (rx && RXp_PAREN_NAMES(rx)) {
	5291	HV *hv = RXp_PAREN_NAMES(rx);
	5292	HE *temphe;
	5293	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5294	IV i;
	5295	IV parno = 0;
	5296	SV* sv_dat = HeVAL(temphe);
	5297	I32 nums = (I32)SvPVX(sv_dat);
	5298	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5299	if ((I32)(rx->lastparen) >= nums[i] &&
	5300	rx->offs[nums[i]].start != -1 &&
	5301	rx->offs[nums[i]].end != -1)
	5302	{
	5303	parno = nums[i];
	5304	break;
	5305	}
	5306	}
	5307	if (parno \|\| flags & RXapif_ALL) {
	5308	return newSVhek(HeKEY_hek(temphe));
	5309	}
	5310	}
	5311	}
	5312	return NULL;
	5313	}
	5314
	5315	SV*
	5316	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	5317	{
	5318	SV *ret;
	5319	AV *av;
	5320	I32 length;
	5321	struct regexp const rx = (struct regexp )SvANY(r);
	5322
	5323	PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	5324
	5325	if (rx && RXp_PAREN_NAMES(rx)) {
	5326	if (flags & (RXapif_ALL \| RXapif_REGNAMES_COUNT)) {
	5327	return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	5328	} else if (flags & RXapif_ONE) {
	5329	ret = CALLREG_NAMED_BUFF_ALL(r, (flags \| RXapif_REGNAMES));
	5330	av = MUTABLE_AV(SvRV(ret));
	5331	length = av_len(av);
	5332	SvREFCNT_dec(ret);
	5333	return newSViv(length + 1);
	5334	} else {
	5335	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
	5336	return NULL;
	5337	}
	5338	}
	5339	return &PL_sv_undef;
	5340	}
	5341
	5342	SV*
	5343	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	5344	{
	5345	struct regexp const rx = (struct regexp )SvANY(r);
	5346	AV *av = newAV();
	5347
	5348	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	5349
	5350	if (rx && RXp_PAREN_NAMES(rx)) {
	5351	HV *hv= RXp_PAREN_NAMES(rx);
	5352	HE *temphe;
	5353	(void)hv_iterinit(hv);
	5354	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5355	IV i;
	5356	IV parno = 0;
	5357	SV* sv_dat = HeVAL(temphe);
	5358	I32 nums = (I32)SvPVX(sv_dat);
	5359	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5360	if ((I32)(rx->lastparen) >= nums[i] &&
	5361	rx->offs[nums[i]].start != -1 &&
	5362	rx->offs[nums[i]].end != -1)
	5363	{
	5364	parno = nums[i];
	5365	break;
	5366	}
	5367	}
	5368	if (parno \|\| flags & RXapif_ALL) {
	5369	av_push(av, newSVhek(HeKEY_hek(temphe)));
	5370	}
	5371	}
	5372	}
	5373
	5374	return newRV_noinc(MUTABLE_SV(av));
	5375	}
	5376
	5377	void
	5378	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	5379	SV * const sv)
	5380	{
	5381	struct regexp const rx = (struct regexp )SvANY(r);
	5382	char *s = NULL;
	5383	I32 i = 0;
	5384	I32 s1, t1;
	5385
	5386	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	5387
	5388	if (!rx->subbeg) {
	5389	sv_setsv(sv,&PL_sv_undef);
	5390	return;
	5391	}
	5392	else
	5393	if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
	5394	/* $` */
	5395	i = rx->offs[0].start;
	5396	s = rx->subbeg;
	5397	}
	5398	else
	5399	if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
	5400	/* $' */
	5401	s = rx->subbeg + rx->offs[0].end;
	5402	i = rx->sublen - rx->offs[0].end;
	5403	}
	5404	else
	5405	if ( 0 <= paren && paren <= (I32)rx->nparens &&
	5406	(s1 = rx->offs[paren].start) != -1 &&
	5407	(t1 = rx->offs[paren].end) != -1)
	5408	{
	5409	/* $& $1 ... */
	5410	i = t1 - s1;
	5411	s = rx->subbeg + s1;
	5412	} else {
	5413	sv_setsv(sv,&PL_sv_undef);
	5414	return;
	5415	}
	5416	assert(rx->sublen >= (s - rx->subbeg) + i );
	5417	if (i >= 0) {
	5418	const int oldtainted = PL_tainted;
	5419	TAINT_NOT;
	5420	sv_setpvn(sv, s, i);
	5421	PL_tainted = oldtainted;
	5422	if ( (rx->extflags & RXf_CANY_SEEN)
	5423	? (RXp_MATCH_UTF8(rx)
	5424	&& (!i \|\| is_utf8_string((U8*)s, i)))
	5425	: (RXp_MATCH_UTF8(rx)) )
	5426	{
	5427	SvUTF8_on(sv);
	5428	}
	5429	else
	5430	SvUTF8_off(sv);
	5431	if (PL_tainting) {
	5432	if (RXp_MATCH_TAINTED(rx)) {
	5433	if (SvTYPE(sv) >= SVt_PVMG) {
	5434	MAGIC* const mg = SvMAGIC(sv);
	5435	MAGIC* mgt;
	5436	PL_tainted = 1;
	5437	SvMAGIC_set(sv, mg->mg_moremagic);
	5438	SvTAINT(sv);
	5439	if ((mgt = SvMAGIC(sv))) {
	5440	mg->mg_moremagic = mgt;
	5441	SvMAGIC_set(sv, mg);
	5442	}
	5443	} else {
	5444	PL_tainted = 1;
	5445	SvTAINT(sv);
	5446	}
	5447	} else
	5448	SvTAINTED_off(sv);
	5449	}
	5450	} else {
	5451	sv_setsv(sv,&PL_sv_undef);
	5452	return;
	5453	}
	5454	}
	5455
	5456	void
	5457	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	5458	SV const * const value)
	5459	{
	5460	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	5461
	5462	PERL_UNUSED_ARG(rx);
	5463	PERL_UNUSED_ARG(paren);
	5464	PERL_UNUSED_ARG(value);
	5465
	5466	if (!PL_localizing)
	5467	Perl_croak_no_modify(aTHX);
	5468	}
	5469
	5470	I32
	5471	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	5472	const I32 paren)
	5473	{
	5474	struct regexp const rx = (struct regexp )SvANY(r);
	5475	I32 i;
	5476	I32 s1, t1;
	5477
	5478	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	5479
	5480	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	5481	switch (paren) {
	5482	/* $` / ${^PREMATCH} */
	5483	case RX_BUFF_IDX_PREMATCH:
	5484	if (rx->offs[0].start != -1) {
	5485	i = rx->offs[0].start;
	5486	if (i > 0) {
	5487	s1 = 0;
	5488	t1 = i;
	5489	goto getlen;
	5490	}
	5491	}
	5492	return 0;
	5493	/* $' / ${^POSTMATCH} */
	5494	case RX_BUFF_IDX_POSTMATCH:
	5495	if (rx->offs[0].end != -1) {
	5496	i = rx->sublen - rx->offs[0].end;
	5497	if (i > 0) {
	5498	s1 = rx->offs[0].end;
	5499	t1 = rx->sublen;
	5500	goto getlen;
	5501	}
	5502	}
	5503	return 0;
	5504	/* $& / ${^MATCH}, $1, $2, ... */
	5505	default:
	5506	if (paren <= (I32)rx->nparens &&
	5507	(s1 = rx->offs[paren].start) != -1 &&
	5508	(t1 = rx->offs[paren].end) != -1)
	5509	{
	5510	i = t1 - s1;
	5511	goto getlen;
	5512	} else {
	5513	if (ckWARN(WARN_UNINITIALIZED))
	5514	report_uninit((const SV *)sv);
	5515	return 0;
	5516	}
	5517	}
	5518	getlen:
	5519	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	5520	const char * const s = rx->subbeg + s1;
	5521	const U8 *ep;
	5522	STRLEN el;
	5523
	5524	i = t1 - s1;
	5525	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	5526	i = el;
	5527	}
	5528	return i;
	5529	}
	5530
	5531	SV*
	5532	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	5533	{
	5534	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	5535	PERL_UNUSED_ARG(rx);
	5536	if (0)
	5537	return NULL;
	5538	else
	5539	return newSVpvs("Regexp");
	5540	}
	5541
	5542	/* Scans the name of a named buffer from the pattern.
	5543	* If flags is REG_RSN_RETURN_NULL returns null.
	5544	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	5545	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	5546	* to the parsed name as looked up in the RExC_paren_names hash.
	5547	* If there is an error throws a vFAIL().. type exception.
	5548	*/
	5549
	5550	#define REG_RSN_RETURN_NULL 0
	5551	#define REG_RSN_RETURN_NAME 1
	5552	#define REG_RSN_RETURN_DATA 2
	5553
	5554	STATIC SV*
	5555	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	5556	{
	5557	char *name_start = RExC_parse;
	5558
	5559	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	5560
	5561	if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	5562	/* skip IDFIRST by using do...while */
	5563	if (UTF)
	5564	do {
	5565	RExC_parse += UTF8SKIP(RExC_parse);
	5566	} while (isALNUM_utf8((U8*)RExC_parse));
	5567	else
	5568	do {
	5569	RExC_parse++;
	5570	} while (isALNUM(*RExC_parse));
	5571	}
	5572
	5573	if ( flags ) {
	5574	SV* sv_name
	5575	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	5576	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	5577	if ( flags == REG_RSN_RETURN_NAME)
	5578	return sv_name;
	5579	else if (flags==REG_RSN_RETURN_DATA) {
	5580	HE *he_str = NULL;
	5581	SV *sv_dat = NULL;
	5582	if ( ! sv_name ) /* should not happen*/
	5583	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	5584	if (RExC_paren_names)
	5585	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	5586	if ( he_str )
	5587	sv_dat = HeVAL(he_str);
	5588	if ( ! sv_dat )
	5589	vFAIL("Reference to nonexistent named group");
	5590	return sv_dat;
	5591	}
	5592	else {
	5593	Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
	5594	}
	5595	/* NOT REACHED */
	5596	}
	5597	return NULL;
	5598	}
	5599
	5600	#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
	5601	int rem=(int)(RExC_end - RExC_parse); \
	5602	int cut; \
	5603	int num; \
	5604	int iscut=0; \
	5605	if (rem>10) { \
	5606	rem=10; \
	5607	iscut=1; \
	5608	} \
	5609	cut=10-rem; \
	5610	if (RExC_lastparse!=RExC_parse) \
	5611	PerlIO_printf(Perl_debug_log," >%.s%-s", \
	5612	rem, RExC_parse, \
	5613	cut + 4, \
	5614	iscut ? "..." : "<" \
	5615	); \
	5616	else \
	5617	PerlIO_printf(Perl_debug_log,"%16s",""); \
	5618	\
	5619	if (SIZE_ONLY) \
	5620	num = RExC_size + 1; \
	5621	else \
	5622	num=REG_NODE_NUM(RExC_emit); \
	5623	if (RExC_lastnum!=num) \
	5624	PerlIO_printf(Perl_debug_log,"\|%4d",num); \
	5625	else \
	5626	PerlIO_printf(Perl_debug_log,"\|%4s",""); \
	5627	PerlIO_printf(Perl_debug_log,"\|%*s%-4s", \
	5628	(int)((depth*2)), "", \
	5629	(funcname) \
	5630	); \
	5631	RExC_lastnum=num; \
	5632	RExC_lastparse=RExC_parse; \
	5633	})
	5634
	5635
	5636
	5637	#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
	5638	DEBUG_PARSE_MSG((funcname)); \
	5639	PerlIO_printf(Perl_debug_log,"%4s","\n"); \
	5640	})
	5641	#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
	5642	DEBUG_PARSE_MSG((funcname)); \
	5643	PerlIO_printf(Perl_debug_log,fmt "\n",args); \
	5644	})
	5645	/*
	5646	- reg - regular expression, i.e. main body or parenthesized thing
	5647	*
	5648	* Caller must absorb opening parenthesis.
	5649	*
	5650	* Combining parenthesis handling with the base level of regular expression
	5651	* is a trifle forced, but the need to tie the tails of the branches to what
	5652	* follows makes it hard to avoid.
	5653	*/
	5654	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	5655	#ifdef DEBUGGING
	5656	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	5657	#else
	5658	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	5659	#endif
	5660
	5661	STATIC regnode *
	5662	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp,U32 depth)
	5663	/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
	5664	{
	5665	dVAR;
	5666	register regnode ret; / Will be the head of the group. */
	5667	register regnode *br;
	5668	register regnode *lastbr;
	5669	register regnode *ender = NULL;
	5670	register I32 parno = 0;
	5671	I32 flags;
	5672	U32 oregflags = RExC_flags;
	5673	bool have_branch = 0;
	5674	bool is_open = 0;
	5675	I32 freeze_paren = 0;
	5676	I32 after_freeze = 0;
	5677
	5678	/* for (?g), (?gc), and (?o) warnings; warning
	5679	about (?c) will warn about (?g) -- japhy */
	5680
	5681	#define WASTED_O 0x01
	5682	#define WASTED_G 0x02
	5683	#define WASTED_C 0x04
	5684	#define WASTED_GC (0x02\|0x04)
	5685	I32 wastedflags = 0x00;
	5686
	5687	char * parse_start = RExC_parse; /* MJD */
	5688	char * const oregcomp_parse = RExC_parse;
	5689
	5690	GET_RE_DEBUG_FLAGS_DECL;
	5691
	5692	PERL_ARGS_ASSERT_REG;
	5693	DEBUG_PARSE("reg ");
	5694
	5695	flagp = 0; / Tentatively. */
	5696
	5697
	5698	/* Make an OPEN node, if parenthesized. */
	5699	if (paren) {
	5700	if ( RExC_parse == '') { /* (VERB:ARG) /
	5701	char *start_verb = RExC_parse;
	5702	STRLEN verb_len = 0;
	5703	char *start_arg = NULL;
	5704	unsigned char op = 0;
	5705	int argok = 1;
	5706	int internal_argval = 0; /* internal_argval is only useful if !argok */
	5707	while ( RExC_parse && RExC_parse != ')' ) {
	5708	if ( *RExC_parse == ':' ) {
	5709	start_arg = RExC_parse + 1;
	5710	break;
	5711	}
	5712	RExC_parse++;
	5713	}
	5714	++start_verb;
	5715	verb_len = RExC_parse - start_verb;
	5716	if ( start_arg ) {
	5717	RExC_parse++;
	5718	while ( RExC_parse && RExC_parse != ')' )
	5719	RExC_parse++;
	5720	if ( *RExC_parse != ')' )
	5721	vFAIL("Unterminated verb pattern argument");
	5722	if ( RExC_parse == start_arg )
	5723	start_arg = NULL;
	5724	} else {
	5725	if ( *RExC_parse != ')' )
	5726	vFAIL("Unterminated verb pattern");
	5727	}
	5728
	5729	switch ( *start_verb ) {
	5730	case 'A': /* (ACCEPT) /
	5731	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	5732	op = ACCEPT;
	5733	internal_argval = RExC_nestroot;
	5734	}
	5735	break;
	5736	case 'C': /* (COMMIT) /
	5737	if ( memEQs(start_verb,verb_len,"COMMIT") )
	5738	op = COMMIT;
	5739	break;
	5740	case 'F': /* (FAIL) /
	5741	if ( verb_len==1 \|\| memEQs(start_verb,verb_len,"FAIL") ) {
	5742	op = OPFAIL;
	5743	argok = 0;
	5744	}
	5745	break;
	5746	case ':': /* (:NAME) /
	5747	case 'M': /* (MARK:NAME) /
	5748	if ( verb_len==0 \|\| memEQs(start_verb,verb_len,"MARK") ) {
	5749	op = MARKPOINT;
	5750	argok = -1;
	5751	}
	5752	break;
	5753	case 'P': /* (PRUNE) /
	5754	if ( memEQs(start_verb,verb_len,"PRUNE") )
	5755	op = PRUNE;
	5756	break;
	5757	case 'S': /* (SKIP) /
	5758	if ( memEQs(start_verb,verb_len,"SKIP") )
	5759	op = SKIP;
	5760	break;
	5761	case 'T': /* (THEN) /
	5762	/* [19:06] <TimToady> :: is then */
	5763	if ( memEQs(start_verb,verb_len,"THEN") ) {
	5764	op = CUTGROUP;
	5765	RExC_seen \|= REG_SEEN_CUTGROUP;
	5766	}
	5767	break;
	5768	}
	5769	if ( ! op ) {
	5770	RExC_parse++;
	5771	vFAIL3("Unknown verb pattern '%.*s'",
	5772	verb_len, start_verb);
	5773	}
	5774	if ( argok ) {
	5775	if ( start_arg && internal_argval ) {
	5776	vFAIL3("Verb pattern '%.*s' may not have an argument",
	5777	verb_len, start_verb);
	5778	} else if ( argok < 0 && !start_arg ) {
	5779	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	5780	verb_len, start_verb);
	5781	} else {
	5782	ret = reganode(pRExC_state, op, internal_argval);
	5783	if ( ! internal_argval && ! SIZE_ONLY ) {
	5784	if (start_arg) {
	5785	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	5786	ARG(ret) = add_data( pRExC_state, 1, "S" );
	5787	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	5788	ret->flags = 0;
	5789	} else {
	5790	ret->flags = 1;
	5791	}
	5792	}
	5793	}
	5794	if (!internal_argval)
	5795	RExC_seen \|= REG_SEEN_VERBARG;
	5796	} else if ( start_arg ) {
	5797	vFAIL3("Verb pattern '%.*s' may not have an argument",
	5798	verb_len, start_verb);
	5799	} else {
	5800	ret = reg_node(pRExC_state, op);
	5801	}
	5802	nextchar(pRExC_state);
	5803	return ret;
	5804	} else
	5805	if (RExC_parse == '?') { / (?...) */
	5806	bool is_logical = 0;
	5807	const char * const seqstart = RExC_parse;
	5808	bool has_use_defaults = FALSE;
	5809
	5810	RExC_parse++;
	5811	paren = *RExC_parse++;
	5812	ret = NULL; /* For look-ahead/behind. */
	5813	switch (paren) {
	5814
	5815	case 'P': /* (?P...) variants for those used to PCRE/Python */
	5816	paren = *RExC_parse++;
	5817	if ( paren == '<') /* (?P<...>) named capture */
	5818	goto named_capture;
	5819	else if (paren == '>') { /* (?P>name) named recursion */
	5820	goto named_recursion;
	5821	}
	5822	else if (paren == '=') { /* (?P=...) named backref */
	5823	/* this pretty much dupes the code for \k<NAME> in regatom(), if
	5824	you change this make sure you change that */
	5825	char* name_start = RExC_parse;
	5826	U32 num = 0;
	5827	SV *sv_dat = reg_scan_name(pRExC_state,
	5828	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	5829	if (RExC_parse == name_start \|\| *RExC_parse != ')')
	5830	vFAIL2("Sequence %.3s... not terminated",parse_start);
	5831
	5832	if (!SIZE_ONLY) {
	5833	num = add_data( pRExC_state, 1, "S" );
	5834	RExC_rxi->data->data[num]=(void*)sv_dat;
	5835	SvREFCNT_inc_simple_void(sv_dat);
	5836	}
	5837	RExC_sawback = 1;
	5838	ret = reganode(pRExC_state,
	5839	((! FOLD)
	5840	? NREF
	5841	: (UNI_SEMANTICS)
	5842	? NREFFU
	5843	: (LOC)
	5844	? NREFFL
	5845	: NREFF),
	5846	num);
	5847	*flagp \|= HASWIDTH;
	5848
	5849	Set_Node_Offset(ret, parse_start+1);
	5850	Set_Node_Cur_Length(ret); /* MJD */
	5851
	5852	nextchar(pRExC_state);
	5853	return ret;
	5854	}
	5855	RExC_parse++;
	5856	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	5857	/NOTREACHED/
	5858	case '<': /* (?<...) */
	5859	if (*RExC_parse == '!')
	5860	paren = ',';
	5861	else if (*RExC_parse != '=')
	5862	named_capture:
	5863	{ /* (?<...>) */
	5864	char *name_start;
	5865	SV *svname;
	5866	paren= '>';
	5867	case '\'': /* (?'...') */
	5868	name_start= RExC_parse;
	5869	svname = reg_scan_name(pRExC_state,
	5870	SIZE_ONLY ? /* reverse test from the others */
	5871	REG_RSN_RETURN_NAME :
	5872	REG_RSN_RETURN_NULL);
	5873	if (RExC_parse == name_start) {
	5874	RExC_parse++;
	5875	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	5876	/NOTREACHED/
	5877	}
	5878	if (*RExC_parse != paren)
	5879	vFAIL2("Sequence (?%c... not terminated",
	5880	paren=='>' ? '<' : paren);
	5881	if (SIZE_ONLY) {
	5882	HE *he_str;
	5883	SV *sv_dat = NULL;
	5884	if (!svname) /* shouldn't happen */
	5885	Perl_croak(aTHX_
	5886	"panic: reg_scan_name returned NULL");
	5887	if (!RExC_paren_names) {
	5888	RExC_paren_names= newHV();
	5889	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	5890	#ifdef DEBUGGING
	5891	RExC_paren_name_list= newAV();
	5892	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	5893	#endif
	5894	}
	5895	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	5896	if ( he_str )
	5897	sv_dat = HeVAL(he_str);
	5898	if ( ! sv_dat ) {
	5899	/* croak baby croak */
	5900	Perl_croak(aTHX_
	5901	"panic: paren_name hash element allocation failed");
	5902	} else if ( SvPOK(sv_dat) ) {
	5903	/* (?\|...) can mean we have dupes so scan to check
	5904	its already been stored. Maybe a flag indicating
	5905	we are inside such a construct would be useful,
	5906	but the arrays are likely to be quite small, so
	5907	for now we punt -- dmq */
	5908	IV count = SvIV(sv_dat);
	5909	I32 pv = (I32)SvPVX(sv_dat);
	5910	IV i;
	5911	for ( i = 0 ; i < count ; i++ ) {
	5912	if ( pv[i] == RExC_npar ) {
	5913	count = 0;
	5914	break;
	5915	}
	5916	}
	5917	if ( count ) {
	5918	pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
	5919	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	5920	pv[count] = RExC_npar;
	5921	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	5922	}
	5923	} else {
	5924	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	5925	sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
	5926	SvIOK_on(sv_dat);
	5927	SvIV_set(sv_dat, 1);
	5928	}
	5929	#ifdef DEBUGGING
	5930	if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
	5931	SvREFCNT_dec(svname);
	5932	#endif
	5933
	5934	/sv_dump(sv_dat);/
	5935	}
	5936	nextchar(pRExC_state);
	5937	paren = 1;
	5938	goto capturing_parens;
	5939	}
	5940	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	5941	RExC_parse++;
	5942	case '=': /* (?=...) */
	5943	RExC_seen_zerolen++;
	5944	break;
	5945	case '!': /* (?!...) */
	5946	RExC_seen_zerolen++;
	5947	if (*RExC_parse == ')') {
	5948	ret=reg_node(pRExC_state, OPFAIL);
	5949	nextchar(pRExC_state);
	5950	return ret;
	5951	}
	5952	break;
	5953	case '\|': /* (?\|...) */
	5954	/* branch reset, behave like a (?:...) except that
	5955	buffers in alternations share the same numbers */
	5956	paren = ':';
	5957	after_freeze = freeze_paren = RExC_npar;
	5958	break;
	5959	case ':': /* (?:...) */
	5960	case '>': /* (?>...) */
	5961	break;
	5962	case '$': /* (?$...) */
	5963	case '@': /* (?@...) */
	5964	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	5965	break;
	5966	case '#': /* (?#...) */
	5967	while (RExC_parse && RExC_parse != ')')
	5968	RExC_parse++;
	5969	if (*RExC_parse != ')')
	5970	FAIL("Sequence (?#... not terminated");
	5971	nextchar(pRExC_state);
	5972	*flagp = TRYAGAIN;
	5973	return NULL;
	5974	case '0' : /* (?0) */
	5975	case 'R' : /* (?R) */
	5976	if (*RExC_parse != ')')
	5977	FAIL("Sequence (?R) not terminated");
	5978	ret = reg_node(pRExC_state, GOSTART);
	5979	*flagp \|= POSTPONED;
	5980	nextchar(pRExC_state);
	5981	return ret;
	5982	/notreached/
	5983	{ /* named and numeric backreferences */
	5984	I32 num;
	5985	case '&': /* (?&NAME) */
	5986	parse_start = RExC_parse - 1;
	5987	named_recursion:
	5988	{
	5989	SV *sv_dat = reg_scan_name(pRExC_state,
	5990	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	5991	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	5992	}
	5993	goto gen_recurse_regop;
	5994	/* NOT REACHED */
	5995	case '+':
	5996	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	5997	RExC_parse++;
	5998	vFAIL("Illegal pattern");
	5999	}
	6000	goto parse_recursion;
	6001	/* NOT REACHED*/
	6002	case '-': /* (?-1) */
	6003	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	6004	RExC_parse--; /* rewind to let it be handled later */
	6005	goto parse_flags;
	6006	}
	6007	/FALLTHROUGH /
	6008	case '1': case '2': case '3': case '4': /* (?1) */
	6009	case '5': case '6': case '7': case '8': case '9':
	6010	RExC_parse--;
	6011	parse_recursion:
	6012	num = atoi(RExC_parse);
	6013	parse_start = RExC_parse - 1; /* MJD */
	6014	if (*RExC_parse == '-')
	6015	RExC_parse++;
	6016	while (isDIGIT(*RExC_parse))
	6017	RExC_parse++;
	6018	if (*RExC_parse!=')')
	6019	vFAIL("Expecting close bracket");
	6020
	6021	gen_recurse_regop:
	6022	if ( paren == '-' ) {
	6023	/*
	6024	Diagram of capture buffer numbering.
	6025	Top line is the normal capture buffer numbers
	6026	Bottom line is the negative indexing as from
	6027	the X (the (?-2))
	6028
	6029	+ 1 2 3 4 5 X 6 7
	6030	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	6031	- 5 4 3 2 1 X x x
	6032
	6033	*/
	6034	num = RExC_npar + num;
	6035	if (num < 1) {
	6036	RExC_parse++;
	6037	vFAIL("Reference to nonexistent group");
	6038	}
	6039	} else if ( paren == '+' ) {
	6040	num = RExC_npar + num - 1;
	6041	}
	6042
	6043	ret = reganode(pRExC_state, GOSUB, num);
	6044	if (!SIZE_ONLY) {
	6045	if (num > (I32)RExC_rx->nparens) {
	6046	RExC_parse++;
	6047	vFAIL("Reference to nonexistent group");
	6048	}
	6049	ARG2L_SET( ret, RExC_recurse_count++);
	6050	RExC_emit++;
	6051	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	6052	"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
	6053	} else {
	6054	RExC_size++;
	6055	}
	6056	RExC_seen \|= REG_SEEN_RECURSE;
	6057	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	6058	Set_Node_Offset(ret, parse_start); /* MJD */
	6059
	6060	*flagp \|= POSTPONED;
	6061	nextchar(pRExC_state);
	6062	return ret;
	6063	} /* named and numeric backreferences */
	6064	/* NOT REACHED */
	6065
	6066	case '?': /* (??...) */
	6067	is_logical = 1;
	6068	if (*RExC_parse != '{') {
	6069	RExC_parse++;
	6070	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6071	/NOTREACHED/
	6072	}
	6073	*flagp \|= POSTPONED;
	6074	paren = *RExC_parse++;
	6075	/* FALL THROUGH */
	6076	case '{': /* (?{...}) */
	6077	{
	6078	I32 count = 1;
	6079	U32 n = 0;
	6080	char c;
	6081	char *s = RExC_parse;
	6082
	6083	RExC_seen_zerolen++;
	6084	RExC_seen \|= REG_SEEN_EVAL;
	6085	while (count && (c = *RExC_parse)) {
	6086	if (c == '\\') {
	6087	if (RExC_parse[1])
	6088	RExC_parse++;
	6089	}
	6090	else if (c == '{')
	6091	count++;
	6092	else if (c == '}')
	6093	count--;
	6094	RExC_parse++;
	6095	}
	6096	if (*RExC_parse != ')') {
	6097	RExC_parse = s;
	6098	vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
	6099	}
	6100	if (!SIZE_ONLY) {
	6101	PAD *pad;
	6102	OP_4tree sop, rop;
	6103	SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
	6104
	6105	ENTER;
	6106	Perl_save_re_context(aTHX);
	6107	rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
	6108	sop->op_private \|= OPpREFCOUNTED;
	6109	/* re_dup will OpREFCNT_inc */
	6110	OpREFCNT_set(sop, 1);
	6111	LEAVE;
	6112
	6113	n = add_data(pRExC_state, 3, "nop");
	6114	RExC_rxi->data->data[n] = (void*)rop;
	6115	RExC_rxi->data->data[n+1] = (void*)sop;
	6116	RExC_rxi->data->data[n+2] = (void*)pad;
	6117	SvREFCNT_dec(sv);
	6118	}
	6119	else { /* First pass */
	6120	if (PL_reginterp_cnt < ++RExC_seen_evals
	6121	&& IN_PERL_RUNTIME)
	6122	/* No compiled RE interpolated, has runtime
	6123	components ===> unsafe. */
	6124	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	6125	if (PL_tainting && PL_tainted)
	6126	FAIL("Eval-group in insecure regular expression");
	6127	#if PERL_VERSION > 8
	6128	if (IN_PERL_COMPILETIME)
	6129	PL_cv_has_eval = 1;
	6130	#endif
	6131	}
	6132
	6133	nextchar(pRExC_state);
	6134	if (is_logical) {
	6135	ret = reg_node(pRExC_state, LOGICAL);
	6136	if (!SIZE_ONLY)
	6137	ret->flags = 2;
	6138	REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
	6139	/* deal with the length of this later - MJD */
	6140	return ret;
	6141	}
	6142	ret = reganode(pRExC_state, EVAL, n);
	6143	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	6144	Set_Node_Offset(ret, parse_start);
	6145	return ret;
	6146	}
	6147	case '(': /* (?(?{...})...) and (?(?=...)...) */
	6148	{
	6149	int is_define= 0;
	6150	if (RExC_parse[0] == '?') { /* (?(?...)) */
	6151	if (RExC_parse[1] == '=' \|\| RExC_parse[1] == '!'
	6152	\|\| RExC_parse[1] == '<'
	6153	\|\| RExC_parse[1] == '{') { /* Lookahead or eval. */
	6154	I32 flag;
	6155
	6156	ret = reg_node(pRExC_state, LOGICAL);
	6157	if (!SIZE_ONLY)
	6158	ret->flags = 1;
	6159	REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
	6160	goto insert_if;
	6161	}
	6162	}
	6163	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	6164	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	6165	{
	6166	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	6167	char *name_start= RExC_parse++;
	6168	U32 num = 0;
	6169	SV *sv_dat=reg_scan_name(pRExC_state,
	6170	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6171	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	6172	vFAIL2("Sequence (?(%c... not terminated",
	6173	(ch == '>' ? '<' : ch));
	6174	RExC_parse++;
	6175	if (!SIZE_ONLY) {
	6176	num = add_data( pRExC_state, 1, "S" );
	6177	RExC_rxi->data->data[num]=(void*)sv_dat;
	6178	SvREFCNT_inc_simple_void(sv_dat);
	6179	}
	6180	ret = reganode(pRExC_state,NGROUPP,num);
	6181	goto insert_if_check_paren;
	6182	}
	6183	else if (RExC_parse[0] == 'D' &&
	6184	RExC_parse[1] == 'E' &&
	6185	RExC_parse[2] == 'F' &&
	6186	RExC_parse[3] == 'I' &&
	6187	RExC_parse[4] == 'N' &&
	6188	RExC_parse[5] == 'E')
	6189	{
	6190	ret = reganode(pRExC_state,DEFINEP,0);
	6191	RExC_parse +=6 ;
	6192	is_define = 1;
	6193	goto insert_if_check_paren;
	6194	}
	6195	else if (RExC_parse[0] == 'R') {
	6196	RExC_parse++;
	6197	parno = 0;
	6198	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	6199	parno = atoi(RExC_parse++);
	6200	while (isDIGIT(*RExC_parse))
	6201	RExC_parse++;
	6202	} else if (RExC_parse[0] == '&') {
	6203	SV *sv_dat;
	6204	RExC_parse++;
	6205	sv_dat = reg_scan_name(pRExC_state,
	6206	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6207	parno = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	6208	}
	6209	ret = reganode(pRExC_state,INSUBP,parno);
	6210	goto insert_if_check_paren;
	6211	}
	6212	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	6213	/* (?(1)...) */
	6214	char c;
	6215	parno = atoi(RExC_parse++);
	6216
	6217	while (isDIGIT(*RExC_parse))
	6218	RExC_parse++;
	6219	ret = reganode(pRExC_state, GROUPP, parno);
	6220
	6221	insert_if_check_paren:
	6222	if ((c = *nextchar(pRExC_state)) != ')')
	6223	vFAIL("Switch condition not recognized");
	6224	insert_if:
	6225	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	6226	br = regbranch(pRExC_state, &flags, 1,depth+1);
	6227	if (br == NULL)
	6228	br = reganode(pRExC_state, LONGJMP, 0);
	6229	else
	6230	REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
	6231	c = *nextchar(pRExC_state);
	6232	if (flags&HASWIDTH)
	6233	*flagp \|= HASWIDTH;
	6234	if (c == '\|') {
	6235	if (is_define)
	6236	vFAIL("(?(DEFINE)....) does not allow branches");
	6237	lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
	6238	regbranch(pRExC_state, &flags, 1,depth+1);
	6239	REGTAIL(pRExC_state, ret, lastbr);
	6240	if (flags&HASWIDTH)
	6241	*flagp \|= HASWIDTH;
	6242	c = *nextchar(pRExC_state);
	6243	}
	6244	else
	6245	lastbr = NULL;
	6246	if (c != ')')
	6247	vFAIL("Switch (?(condition)... contains too many branches");
	6248	ender = reg_node(pRExC_state, TAIL);
	6249	REGTAIL(pRExC_state, br, ender);
	6250	if (lastbr) {
	6251	REGTAIL(pRExC_state, lastbr, ender);
	6252	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	6253	}
	6254	else
	6255	REGTAIL(pRExC_state, ret, ender);
	6256	RExC_size++; /* XXX WHY do we need this?!!
	6257	For large programs it seems to be required
	6258	but I can't figure out why. -- dmq*/
	6259	return ret;
	6260	}
	6261	else {
	6262	vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
	6263	}
	6264	}
	6265	case 0:
	6266	RExC_parse--; /* for vFAIL to print correctly */
	6267	vFAIL("Sequence (? incomplete");
	6268	break;
	6269	case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
	6270	that follow */
	6271	has_use_defaults = TRUE;
	6272	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	6273	if (RExC_utf8) { /* But the default for a utf8 pattern is
	6274	unicode semantics */
	6275	set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
	6276	}
	6277	goto parse_flags;
	6278	default:
	6279	--RExC_parse;
	6280	parse_flags: /* (?i) */
	6281	{
	6282	U32 posflags = 0, negflags = 0;
	6283	U32 *flagsp = &posflags;
	6284	bool has_charset_modifier = 0;
	6285	regex_charset cs = REGEX_DEPENDS_CHARSET;
	6286
	6287	while (*RExC_parse) {
	6288	/* && strchr("iogcmsx", RExC_parse) /
	6289	/* (?g), (?gc) and (?o) are useless here
	6290	and must be globally applied -- japhy */
	6291	switch (*RExC_parse) {
	6292	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	6293	case LOCALE_PAT_MOD:
	6294	if (has_charset_modifier \|\| flagsp == &negflags) {
	6295	goto fail_modifiers;
	6296	}
	6297	cs = REGEX_LOCALE_CHARSET;
	6298	has_charset_modifier = 1;
	6299	break;
	6300	case UNICODE_PAT_MOD:
	6301	if (has_charset_modifier \|\| flagsp == &negflags) {
	6302	goto fail_modifiers;
	6303	}
	6304	cs = REGEX_UNICODE_CHARSET;
	6305	has_charset_modifier = 1;
	6306	break;
	6307	case ASCII_RESTRICT_PAT_MOD:
	6308	if (has_charset_modifier \|\| flagsp == &negflags) {
	6309	goto fail_modifiers;
	6310	}
	6311	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	6312	has_charset_modifier = 1;
	6313	break;
	6314	case DEPENDS_PAT_MOD:
	6315	if (has_use_defaults
	6316	\|\| has_charset_modifier
	6317	\|\| flagsp == &negflags)
	6318	{
	6319	goto fail_modifiers;
	6320	}
	6321
	6322	/* The dual charset means unicode semantics if the
	6323	* pattern (or target, not known until runtime) are
	6324	* utf8 */
	6325	cs = (RExC_utf8)
	6326	? REGEX_UNICODE_CHARSET
	6327	: REGEX_DEPENDS_CHARSET;
	6328	has_charset_modifier = 1;
	6329	break;
	6330	case ONCE_PAT_MOD: /* 'o' */
	6331	case GLOBAL_PAT_MOD: /* 'g' */
	6332	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	6333	const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
	6334	if (! (wastedflags & wflagbit) ) {
	6335	wastedflags \|= wflagbit;
	6336	vWARN5(
	6337	RExC_parse + 1,
	6338	"Useless (%s%c) - %suse /%c modifier",
	6339	flagsp == &negflags ? "?-" : "?",
	6340	*RExC_parse,
	6341	flagsp == &negflags ? "don't " : "",
	6342	*RExC_parse
	6343	);
	6344	}
	6345	}
	6346	break;
	6347
	6348	case CONTINUE_PAT_MOD: /* 'c' */
	6349	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	6350	if (! (wastedflags & WASTED_C) ) {
	6351	wastedflags \|= WASTED_GC;
	6352	vWARN3(
	6353	RExC_parse + 1,
	6354	"Useless (%sc) - %suse /gc modifier",
	6355	flagsp == &negflags ? "?-" : "?",
	6356	flagsp == &negflags ? "don't " : ""
	6357	);
	6358	}
	6359	}
	6360	break;
	6361	case KEEPCOPY_PAT_MOD: /* 'p' */
	6362	if (flagsp == &negflags) {
	6363	if (SIZE_ONLY)
	6364	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	6365	} else {
	6366	*flagsp \|= RXf_PMf_KEEPCOPY;
	6367	}
	6368	break;
	6369	case '-':
	6370	/* A flag is a default iff it is following a minus, so
	6371	* if there is a minus, it means will be trying to
	6372	* re-specify a default which is an error */
	6373	if (has_use_defaults \|\| flagsp == &negflags) {
	6374	fail_modifiers:
	6375	RExC_parse++;
	6376	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6377	/NOTREACHED/
	6378	}
	6379	flagsp = &negflags;
	6380	wastedflags = 0; /* reset so (?g-c) warns twice */
	6381	break;
	6382	case ':':
	6383	paren = ':';
	6384	/FALLTHROUGH/
	6385	case ')':
	6386	RExC_flags \|= posflags;
	6387	RExC_flags &= ~negflags;
	6388	set_regex_charset(&RExC_flags, cs);
	6389	if (paren != ':') {
	6390	oregflags \|= posflags;
	6391	oregflags &= ~negflags;
	6392	set_regex_charset(&oregflags, cs);
	6393	}
	6394	nextchar(pRExC_state);
	6395	if (paren != ':') {
	6396	*flagp = TRYAGAIN;
	6397	return NULL;
	6398	} else {
	6399	ret = NULL;
	6400	goto parse_rest;
	6401	}
	6402	/NOTREACHED/
	6403	default:
	6404	RExC_parse++;
	6405	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6406	/NOTREACHED/
	6407	}
	6408	++RExC_parse;
	6409	}
	6410	}} /* one for the default block, one for the switch */
	6411	}
	6412	else { /* (...) */
	6413	capturing_parens:
	6414	parno = RExC_npar;
	6415	RExC_npar++;
	6416
	6417	ret = reganode(pRExC_state, OPEN, parno);
	6418	if (!SIZE_ONLY ){
	6419	if (!RExC_nestroot)
	6420	RExC_nestroot = parno;
	6421	if (RExC_seen & REG_SEEN_RECURSE
	6422	&& !RExC_open_parens[parno-1])
	6423	{
	6424	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	6425	"Setting open paren #%"IVdf" to %d\n",
	6426	(IV)parno, REG_NODE_NUM(ret)));
	6427	RExC_open_parens[parno-1]= ret;
	6428	}
	6429	}
	6430	Set_Node_Length(ret, 1); /* MJD */
	6431	Set_Node_Offset(ret, RExC_parse); /* MJD */
	6432	is_open = 1;
	6433	}
	6434	}
	6435	else /* ! paren */
	6436	ret = NULL;
	6437
	6438	parse_rest:
	6439	/* Pick up the branches, linking them together. */
	6440	parse_start = RExC_parse; /* MJD */
	6441	br = regbranch(pRExC_state, &flags, 1,depth+1);
	6442
	6443	if (freeze_paren) {
	6444	if (RExC_npar > after_freeze)
	6445	after_freeze = RExC_npar;
	6446	RExC_npar = freeze_paren;
	6447	}
	6448
	6449	/* branch_len = (paren != 0); */
	6450
	6451	if (br == NULL)
	6452	return(NULL);
	6453	if (*RExC_parse == '\|') {
	6454	if (!SIZE_ONLY && RExC_extralen) {
	6455	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	6456	}
	6457	else { /* MJD */
	6458	reginsert(pRExC_state, BRANCH, br, depth+1);
	6459	Set_Node_Length(br, paren != 0);
	6460	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	6461	}
	6462	have_branch = 1;
	6463	if (SIZE_ONLY)
	6464	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	6465	}
	6466	else if (paren == ':') {
	6467	*flagp \|= flags&SIMPLE;
	6468	}
	6469	if (is_open) { /* Starts with OPEN. */
	6470	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	6471	}
	6472	else if (paren != '?') /* Not Conditional */
	6473	ret = br;
	6474	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	6475	lastbr = br;
	6476	while (*RExC_parse == '\|') {
	6477	if (!SIZE_ONLY && RExC_extralen) {
	6478	ender = reganode(pRExC_state, LONGJMP,0);
	6479	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
	6480	}
	6481	if (SIZE_ONLY)
	6482	RExC_extralen += 2; /* Account for LONGJMP. */
	6483	nextchar(pRExC_state);
	6484	if (freeze_paren) {
	6485	if (RExC_npar > after_freeze)
	6486	after_freeze = RExC_npar;
	6487	RExC_npar = freeze_paren;
	6488	}
	6489	br = regbranch(pRExC_state, &flags, 0, depth+1);
	6490
	6491	if (br == NULL)
	6492	return(NULL);
	6493	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	6494	lastbr = br;
	6495	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	6496	}
	6497
	6498	if (have_branch \|\| paren != ':') {
	6499	/* Make a closing node, and hook it on the end. */
	6500	switch (paren) {
	6501	case ':':
	6502	ender = reg_node(pRExC_state, TAIL);
	6503	break;
	6504	case 1:
	6505	ender = reganode(pRExC_state, CLOSE, parno);
	6506	if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
	6507	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	6508	"Setting close paren #%"IVdf" to %d\n",
	6509	(IV)parno, REG_NODE_NUM(ender)));
	6510	RExC_close_parens[parno-1]= ender;
	6511	if (RExC_nestroot == parno)
	6512	RExC_nestroot = 0;
	6513	}
	6514	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	6515	Set_Node_Length(ender,1); /* MJD */
	6516	break;
	6517	case '<':
	6518	case ',':
	6519	case '=':
	6520	case '!':
	6521	*flagp &= ~HASWIDTH;
	6522	/* FALL THROUGH */
	6523	case '>':
	6524	ender = reg_node(pRExC_state, SUCCEED);
	6525	break;
	6526	case 0:
	6527	ender = reg_node(pRExC_state, END);
	6528	if (!SIZE_ONLY) {
	6529	assert(!RExC_opend); /* there can only be one! */
	6530	RExC_opend = ender;
	6531	}
	6532	break;
	6533	}
	6534	REGTAIL(pRExC_state, lastbr, ender);
	6535
	6536	if (have_branch && !SIZE_ONLY) {
	6537	if (depth==1)
	6538	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	6539
	6540	/* Hook the tails of the branches to the closing node. */
	6541	for (br = ret; br; br = regnext(br)) {
	6542	const U8 op = PL_regkind[OP(br)];
	6543	if (op == BRANCH) {
	6544	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	6545	}
	6546	else if (op == BRANCHJ) {
	6547	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	6548	}
	6549	}
	6550	}
	6551	}
	6552
	6553	{
	6554	const char *p;
	6555	static const char parens[] = "=!<,>";
	6556
	6557	if (paren && (p = strchr(parens, paren))) {
	6558	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	6559	int flag = (p - parens) > 1;
	6560
	6561	if (paren == '>')
	6562	node = SUSPEND, flag = 0;
	6563	reginsert(pRExC_state, node,ret, depth+1);
	6564	Set_Node_Cur_Length(ret);
	6565	Set_Node_Offset(ret, parse_start + 1);
	6566	ret->flags = flag;
	6567	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	6568	}
	6569	}
	6570
	6571	/* Check for proper termination. */
	6572	if (paren) {
	6573	RExC_flags = oregflags;
	6574	if (RExC_parse >= RExC_end \|\| *nextchar(pRExC_state) != ')') {
	6575	RExC_parse = oregcomp_parse;
	6576	vFAIL("Unmatched (");
	6577	}
	6578	}
	6579	else if (!paren && RExC_parse < RExC_end) {
	6580	if (*RExC_parse == ')') {
	6581	RExC_parse++;
	6582	vFAIL("Unmatched )");
	6583	}
	6584	else
	6585	FAIL("Junk on end of regexp"); /* "Can't happen". */
	6586	/* NOTREACHED */
	6587	}
	6588	if (after_freeze)
	6589	RExC_npar = after_freeze;
	6590	return(ret);
	6591	}
	6592
	6593	/*
	6594	- regbranch - one alternative of an \| operator
	6595	*
	6596	* Implements the concatenation operator.
	6597	*/
	6598	STATIC regnode *
	6599	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	6600	{
	6601	dVAR;
	6602	register regnode *ret;
	6603	register regnode *chain = NULL;
	6604	register regnode *latest;
	6605	I32 flags = 0, c = 0;
	6606	GET_RE_DEBUG_FLAGS_DECL;
	6607
	6608	PERL_ARGS_ASSERT_REGBRANCH;
	6609
	6610	DEBUG_PARSE("brnc");
	6611
	6612	if (first)
	6613	ret = NULL;
	6614	else {
	6615	if (!SIZE_ONLY && RExC_extralen)
	6616	ret = reganode(pRExC_state, BRANCHJ,0);
	6617	else {
	6618	ret = reg_node(pRExC_state, BRANCH);
	6619	Set_Node_Length(ret, 1);
	6620	}
	6621	}
	6622
	6623	if (!first && SIZE_ONLY)
	6624	RExC_extralen += 1; /* BRANCHJ */
	6625
	6626	flagp = WORST; / Tentatively. */
	6627
	6628	RExC_parse--;
	6629	nextchar(pRExC_state);
	6630	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	6631	flags &= ~TRYAGAIN;
	6632	latest = regpiece(pRExC_state, &flags,depth+1);
	6633	if (latest == NULL) {
	6634	if (flags & TRYAGAIN)
	6635	continue;
	6636	return(NULL);
	6637	}
	6638	else if (ret == NULL)
	6639	ret = latest;
	6640	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	6641	if (chain == NULL) /* First piece. */
	6642	*flagp \|= flags&SPSTART;
	6643	else {
	6644	RExC_naughty++;
	6645	REGTAIL(pRExC_state, chain, latest);
	6646	}
	6647	chain = latest;
	6648	c++;
	6649	}
	6650	if (chain == NULL) { /* Loop ran zero times. */
	6651	chain = reg_node(pRExC_state, NOTHING);
	6652	if (ret == NULL)
	6653	ret = chain;
	6654	}
	6655	if (c == 1) {
	6656	*flagp \|= flags&SIMPLE;
	6657	}
	6658
	6659	return ret;
	6660	}
	6661
	6662	/*
	6663	- regpiece - something followed by possible [*+?]
	6664	*
	6665	* Note that the branching code sequences used for ? and the general cases
	6666	* of * and + are somewhat optimized: they use the same NOTHING node as
	6667	* both the endmarker for their branch list and the body of the last branch.
	6668	* It might seem that this node could be dispensed with entirely, but the
	6669	* endmarker role is not redundant.
	6670	*/
	6671	STATIC regnode *
	6672	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	6673	{
	6674	dVAR;
	6675	register regnode *ret;
	6676	register char op;
	6677	register char *next;
	6678	I32 flags;
	6679	const char * const origparse = RExC_parse;
	6680	I32 min;
	6681	I32 max = REG_INFTY;
	6682	char *parse_start;
	6683	const char *maxpos = NULL;
	6684	GET_RE_DEBUG_FLAGS_DECL;
	6685
	6686	PERL_ARGS_ASSERT_REGPIECE;
	6687
	6688	DEBUG_PARSE("piec");
	6689
	6690	ret = regatom(pRExC_state, &flags,depth+1);
	6691	if (ret == NULL) {
	6692	if (flags & TRYAGAIN)
	6693	*flagp \|= TRYAGAIN;
	6694	return(NULL);
	6695	}
	6696
	6697	op = *RExC_parse;
	6698
	6699	if (op == '{' && regcurly(RExC_parse)) {
	6700	maxpos = NULL;
	6701	parse_start = RExC_parse; /* MJD */
	6702	next = RExC_parse + 1;
	6703	while (isDIGIT(next) \|\| next == ',') {
	6704	if (*next == ',') {
	6705	if (maxpos)
	6706	break;
	6707	else
	6708	maxpos = next;
	6709	}
	6710	next++;
	6711	}
	6712	if (next == '}') { / got one */
	6713	if (!maxpos)
	6714	maxpos = next;
	6715	RExC_parse++;
	6716	min = atoi(RExC_parse);
	6717	if (*maxpos == ',')
	6718	maxpos++;
	6719	else
	6720	maxpos = RExC_parse;
	6721	max = atoi(maxpos);
	6722	if (!max && *maxpos != '0')
	6723	max = REG_INFTY; /* meaning "infinity" */
	6724	else if (max >= REG_INFTY)
	6725	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	6726	RExC_parse = next;
	6727	nextchar(pRExC_state);
	6728
	6729	do_curly:
	6730	if ((flags&SIMPLE)) {
	6731	RExC_naughty += 2 + RExC_naughty / 2;
	6732	reginsert(pRExC_state, CURLY, ret, depth+1);
	6733	Set_Node_Offset(ret, parse_start+1); /* MJD */
	6734	Set_Node_Cur_Length(ret);
	6735	}
	6736	else {
	6737	regnode * const w = reg_node(pRExC_state, WHILEM);
	6738
	6739	w->flags = 0;
	6740	REGTAIL(pRExC_state, ret, w);
	6741	if (!SIZE_ONLY && RExC_extralen) {
	6742	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	6743	reginsert(pRExC_state, NOTHING,ret, depth+1);
	6744	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	6745	}
	6746	reginsert(pRExC_state, CURLYX,ret, depth+1);
	6747	/* MJD hk */
	6748	Set_Node_Offset(ret, parse_start+1);
	6749	Set_Node_Length(ret,
	6750	op == '{' ? (RExC_parse - parse_start) : 1);
	6751
	6752	if (!SIZE_ONLY && RExC_extralen)
	6753	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	6754	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	6755	if (SIZE_ONLY)
	6756	RExC_whilem_seen++, RExC_extralen += 3;
	6757	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	6758	}
	6759	ret->flags = 0;
	6760
	6761	if (min > 0)
	6762	*flagp = WORST;
	6763	if (max > 0)
	6764	*flagp \|= HASWIDTH;
	6765	if (max < min)
	6766	vFAIL("Can't do {n,m} with n > m");
	6767	if (!SIZE_ONLY) {
	6768	ARG1_SET(ret, (U16)min);
	6769	ARG2_SET(ret, (U16)max);
	6770	}
	6771
	6772	goto nest_check;
	6773	}
	6774	}
	6775
	6776	if (!ISMULT1(op)) {
	6777	*flagp = flags;
	6778	return(ret);
	6779	}
	6780
	6781	#if 0 /* Now runtime fix should be reliable. */
	6782
	6783	/* if this is reinstated, don't forget to put this back into perldiag:
	6784
	6785	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	6786
	6787	(F) The part of the regexp subject to either the * or + quantifier
	6788	could match an empty string. The {#} shows in the regular
	6789	expression about where the problem was discovered.
	6790
	6791	*/
	6792
	6793	if (!(flags&HASWIDTH) && op != '?')
	6794	vFAIL("Regexp *+ operand could be empty");
	6795	#endif
	6796
	6797	parse_start = RExC_parse;
	6798	nextchar(pRExC_state);
	6799
	6800	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	6801
	6802	if (op == '*' && (flags&SIMPLE)) {
	6803	reginsert(pRExC_state, STAR, ret, depth+1);
	6804	ret->flags = 0;
	6805	RExC_naughty += 4;
	6806	}
	6807	else if (op == '*') {
	6808	min = 0;
	6809	goto do_curly;
	6810	}
	6811	else if (op == '+' && (flags&SIMPLE)) {
	6812	reginsert(pRExC_state, PLUS, ret, depth+1);
	6813	ret->flags = 0;
	6814	RExC_naughty += 3;
	6815	}
	6816	else if (op == '+') {
	6817	min = 1;
	6818	goto do_curly;
	6819	}
	6820	else if (op == '?') {
	6821	min = 0; max = 1;
	6822	goto do_curly;
	6823	}
	6824	nest_check:
	6825	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	6826	ckWARN3reg(RExC_parse,
	6827	"%.*s matches null string many times",
	6828	(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
	6829	origparse);
	6830	}
	6831
	6832	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	6833	nextchar(pRExC_state);
	6834	reginsert(pRExC_state, MINMOD, ret, depth+1);
	6835	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	6836	}
	6837	#ifndef REG_ALLOW_MINMOD_SUSPEND
	6838	else
	6839	#endif
	6840	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	6841	regnode *ender;
	6842	nextchar(pRExC_state);
	6843	ender = reg_node(pRExC_state, SUCCEED);
	6844	REGTAIL(pRExC_state, ret, ender);
	6845	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	6846	ret->flags = 0;
	6847	ender = reg_node(pRExC_state, TAIL);
	6848	REGTAIL(pRExC_state, ret, ender);
	6849	/ret= ender;/
	6850	}
	6851
	6852	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	6853	RExC_parse++;
	6854	vFAIL("Nested quantifiers");
	6855	}
	6856
	6857	return(ret);
	6858	}
	6859
	6860
	6861	/* reg_namedseq(pRExC_state,UVp)
	6862
	6863	This is expected to be called by a parser routine that has
	6864	recognized '\N' and needs to handle the rest. RExC_parse is
	6865	expected to point at the first char following the N at the time
	6866	of the call.
	6867
	6868	The \N may be inside (indicated by valuep not being NULL) or outside a
	6869	character class.
	6870
	6871	\N may begin either a named sequence, or if outside a character class, mean
	6872	to match a non-newline. For non single-quoted regexes, the tokenizer has
	6873	attempted to decide which, and in the case of a named sequence converted it
	6874	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	6875	where c1... are the characters in the sequence. For single-quoted regexes,
	6876	the tokenizer passes the \N sequence through unchanged; this code will not
	6877	attempt to determine this nor expand those. The net effect is that if the
	6878	beginning of the passed-in pattern isn't '{U+' or there is no '}', it
	6879	signals that this \N occurrence means to match a non-newline.
	6880
	6881	Only the \N{U+...} form should occur in a character class, for the same
	6882	reason that '.' inside a character class means to just match a period: it
	6883	just doesn't make sense.
	6884
	6885	If valuep is non-null then it is assumed that we are parsing inside
	6886	of a charclass definition and the first codepoint in the resolved
	6887	string is returned via *valuep and the routine will return NULL.
	6888	In this mode if a multichar string is returned from the charnames
	6889	handler, a warning will be issued, and only the first char in the
	6890	sequence will be examined. If the string returned is zero length
	6891	then the value of *valuep is undefined and NON-NULL will
	6892	be returned to indicate failure. (This will NOT be a valid pointer
	6893	to a regnode.)
	6894
	6895	If valuep is null then it is assumed that we are parsing normal text and a
	6896	new EXACT node is inserted into the program containing the resolved string,
	6897	and a pointer to the new node is returned. But if the string is zero length
	6898	a NOTHING node is emitted instead.
	6899
	6900	On success RExC_parse is set to the char following the endbrace.
	6901	Parsing failures will generate a fatal error via vFAIL(...)
	6902	*/
	6903	STATIC regnode *
	6904	S_reg_namedseq(pTHX_ RExC_state_t pRExC_state, UV valuep, I32 *flagp)
	6905	{
	6906	char * endbrace; /* '}' following the name */
	6907	regnode *ret = NULL;
	6908	#ifdef DEBUGGING
	6909	char* parse_start = RExC_parse - 2; /* points to the '\N' */
	6910	#endif
	6911	char* p;
	6912
	6913	GET_RE_DEBUG_FLAGS_DECL;
	6914
	6915	PERL_ARGS_ASSERT_REG_NAMEDSEQ;
	6916
	6917	GET_RE_DEBUG_FLAGS;
	6918
	6919	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	6920	* modifier. The other meaning does not */
	6921	p = (RExC_flags & RXf_PMf_EXTENDED)
	6922	? regwhite( pRExC_state, RExC_parse )
	6923	: RExC_parse;
	6924
	6925	/* Disambiguate between \N meaning a named character versus \N meaning
	6926	* [^\n]. The former is assumed when it can't be the latter. */
	6927	if (*p != '{' \|\| regcurly(p)) {
	6928	RExC_parse = p;
	6929	if (valuep) {
	6930	/* no bare \N in a charclass */
	6931	vFAIL("\\N in a character class must be a named character: \\N{...}");
	6932	}
	6933	nextchar(pRExC_state);
	6934	ret = reg_node(pRExC_state, REG_ANY);
	6935	*flagp \|= HASWIDTH\|SIMPLE;
	6936	RExC_naughty++;
	6937	RExC_parse--;
	6938	Set_Node_Length(ret, 1); /* MJD */
	6939	return ret;
	6940	}
	6941
	6942	/* Here, we have decided it should be a named sequence */
	6943
	6944	/* The test above made sure that the next real character is a '{', but
	6945	* under the /x modifier, it could be separated by space (or a comment and
	6946	* \n) and this is not allowed (for consistency with \x{...} and the
	6947	* tokenizer handling of \N{NAME}). */
	6948	if (*RExC_parse != '{') {
	6949	vFAIL("Missing braces on \\N{}");
	6950	}
	6951
	6952	RExC_parse++; /* Skip past the '{' */
	6953
	6954	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	6955	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	6956	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
	6957	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
	6958	{
	6959	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	6960	vFAIL("\\N{NAME} must be resolved by the lexer");
	6961	}
	6962
	6963	if (endbrace == RExC_parse) { /* empty: \N{} */
	6964	if (! valuep) {
	6965	RExC_parse = endbrace + 1;
	6966	return reg_node(pRExC_state,NOTHING);
	6967	}
	6968
	6969	if (SIZE_ONLY) {
	6970	ckWARNreg(RExC_parse,
	6971	"Ignoring zero length \\N{} in character class"
	6972	);
	6973	RExC_parse = endbrace + 1;
	6974	}
	6975	*valuep = 0;
	6976	return (regnode ) &RExC_parse; / Invalid regnode pointer */
	6977	}
	6978
	6979	REQUIRE_UTF8; /* named sequences imply Unicode semantics */
	6980	RExC_parse += 2; /* Skip past the 'U+' */
	6981
	6982	if (valuep) { /* In a bracketed char class */
	6983	/* We only pay attention to the first char of
	6984	multichar strings being returned. I kinda wonder
	6985	if this makes sense as it does change the behaviour
	6986	from earlier versions, OTOH that behaviour was broken
	6987	as well. XXX Solution is to recharacterize as
	6988	[rest-of-class]\|multi1\|multi2... */
	6989
	6990	STRLEN length_of_hex;
	6991	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	6992	\| PERL_SCAN_DISALLOW_PREFIX
	6993	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	6994
	6995	char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
	6996	if (endchar < endbrace) {
	6997	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	6998	}
	6999
	7000	length_of_hex = (STRLEN)(endchar - RExC_parse);
	7001	*valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
	7002
	7003	/* The tokenizer should have guaranteed validity, but it's possible to
	7004	* bypass it by using single quoting, so check */
	7005	if (length_of_hex == 0
	7006	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	7007	{
	7008	RExC_parse += length_of_hex; /* Includes all the valid */
	7009	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	7010	? UTF8SKIP(RExC_parse)
	7011	: 1;
	7012	/* Guard against malformed utf8 */
	7013	if (RExC_parse >= endchar) RExC_parse = endchar;
	7014	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	7015	}
	7016
	7017	RExC_parse = endbrace + 1;
	7018	if (endchar == endbrace) return NULL;
	7019
	7020	ret = (regnode ) &RExC_parse; / Invalid regnode pointer */
	7021	}
	7022	else { /* Not a char class */
	7023	char s; / String to put in generated EXACT node */
	7024	STRLEN len = 0; /* Its current byte length */
	7025	char endchar; / Points to '.' or '}' ending cur char in the input
	7026	stream */
	7027
	7028	ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
	7029	: (LOC)
	7030	? EXACTFL
	7031	: UNI_SEMANTICS
	7032	? EXACTFU
	7033	: EXACTF));
	7034	s= STRING(ret);
	7035
	7036	/* Exact nodes can hold only a U8 length's of text = 255. Loop through
	7037	* the input which is of the form now 'c1.c2.c3...}' until find the
	7038	* ending brace or exceed length 255. The characters that exceed this
	7039	* limit are dropped. The limit could be relaxed should it become
	7040	* desirable by reparsing this as (?:\N{NAME}), so could generate
	7041	* multiple EXACT nodes, as is done for just regular input. But this
	7042	* is primarily a named character, and not intended to be a huge long
	7043	* string, so 255 bytes should be good enough */
	7044	while (1) {
	7045	STRLEN length_of_hex;
	7046	I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
	7047	\| PERL_SCAN_DISALLOW_PREFIX
	7048	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	7049	UV cp; /* Ord of current character */
	7050
	7051	/* Code points are separated by dots. If none, there is only one
	7052	* code point, and is terminated by the brace */
	7053	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	7054
	7055	/* The values are Unicode even on EBCDIC machines */
	7056	length_of_hex = (STRLEN)(endchar - RExC_parse);
	7057	cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
	7058	if ( length_of_hex == 0
	7059	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	7060	{
	7061	RExC_parse += length_of_hex; /* Includes all the valid */
	7062	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	7063	? UTF8SKIP(RExC_parse)
	7064	: 1;
	7065	/* Guard against malformed utf8 */
	7066	if (RExC_parse >= endchar) RExC_parse = endchar;
	7067	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	7068	}
	7069
	7070	if (! FOLD) { /* Not folding, just append to the string */
	7071	STRLEN unilen;
	7072
	7073	/* Quit before adding this character if would exceed limit */
	7074	if (len + UNISKIP(cp) > U8_MAX) break;
	7075
	7076	unilen = reguni(pRExC_state, cp, s);
	7077	if (unilen > 0) {
	7078	s += unilen;
	7079	len += unilen;
	7080	}
	7081	} else { /* Folding, output the folded equivalent */
	7082	STRLEN foldlen,numlen;
	7083	U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
	7084	cp = toFOLD_uni(cp, tmpbuf, &foldlen);
	7085
	7086	/* Quit before exceeding size limit */
	7087	if (len + foldlen > U8_MAX) break;
	7088
	7089	for (foldbuf = tmpbuf;
	7090	foldlen;
	7091	foldlen -= numlen)
	7092	{
	7093	cp = utf8_to_uvchr(foldbuf, &numlen);
	7094	if (numlen > 0) {
	7095	const STRLEN unilen = reguni(pRExC_state, cp, s);
	7096	s += unilen;
	7097	len += unilen;
	7098	/* In EBCDIC the numlen and unilen can differ. */
	7099	foldbuf += numlen;
	7100	if (numlen >= foldlen)
	7101	break;
	7102	}
	7103	else
	7104	break; /* "Can't happen." */
	7105	}
	7106	}
	7107
	7108	/* Point to the beginning of the next character in the sequence. */
	7109	RExC_parse = endchar + 1;
	7110
	7111	/* Quit if no more characters */
	7112	if (RExC_parse >= endbrace) break;
	7113	}
	7114
	7115
	7116	if (SIZE_ONLY) {
	7117	if (RExC_parse < endbrace) {
	7118	ckWARNreg(RExC_parse - 1,
	7119	"Using just the first characters returned by \\N{}");
	7120	}
	7121
	7122	RExC_size += STR_SZ(len);
	7123	} else {
	7124	STR_LEN(ret) = len;
	7125	RExC_emit += STR_SZ(len);
	7126	}
	7127
	7128	RExC_parse = endbrace + 1;
	7129
	7130	flagp \|= HASWIDTH; / Not SIMPLE, as that causes the engine to fail
	7131	with malformed in t/re/pat_advanced.t */
	7132	RExC_parse --;
	7133	Set_Node_Cur_Length(ret); /* MJD */
	7134	nextchar(pRExC_state);
	7135	}
	7136
	7137	return ret;
	7138	}
	7139
	7140
	7141	/*
	7142	* reg_recode
	7143	*
	7144	* It returns the code point in utf8 for the value in *encp.
	7145	* value: a code value in the source encoding
	7146	* encp: a pointer to an Encode object
	7147	*
	7148	* If the result from Encode is not a single character,
	7149	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	7150	*/
	7151	STATIC UV
	7152	S_reg_recode(pTHX_ const char value, SV **encp)
	7153	{
	7154	STRLEN numlen = 1;
	7155	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	7156	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	7157	const STRLEN newlen = SvCUR(sv);
	7158	UV uv = UNICODE_REPLACEMENT;
	7159
	7160	PERL_ARGS_ASSERT_REG_RECODE;
	7161
	7162	if (newlen)
	7163	uv = SvUTF8(sv)
	7164	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	7165	: (U8)s;
	7166
	7167	if (!newlen \|\| numlen != newlen) {
	7168	uv = UNICODE_REPLACEMENT;
	7169	*encp = NULL;
	7170	}
	7171	return uv;
	7172	}
	7173
	7174
	7175	/*
	7176	- regatom - the lowest level
	7177
	7178	Try to identify anything special at the start of the pattern. If there
	7179	is, then handle it as required. This may involve generating a single regop,
	7180	such as for an assertion; or it may involve recursing, such as to
	7181	handle a () structure.
	7182
	7183	If the string doesn't start with something special then we gobble up
	7184	as much literal text as we can.
	7185
	7186	Once we have been able to handle whatever type of thing started the
	7187	sequence, we return.
	7188
	7189	Note: we have to be careful with escapes, as they can be both literal
	7190	and special, and in the case of \10 and friends can either, depending
	7191	on context. Specifically there are two separate switches for handling
	7192	escape sequences, with the one for handling literal escapes requiring
	7193	a dummy entry for all of the special escapes that are actually handled
	7194	by the other.
	7195	*/
	7196
	7197	STATIC regnode *
	7198	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	7199	{
	7200	dVAR;
	7201	register regnode *ret = NULL;
	7202	I32 flags;
	7203	char *parse_start = RExC_parse;
	7204	U8 op;
	7205	GET_RE_DEBUG_FLAGS_DECL;
	7206	DEBUG_PARSE("atom");
	7207	flagp = WORST; / Tentatively. */
	7208
	7209	PERL_ARGS_ASSERT_REGATOM;
	7210
	7211	tryagain:
	7212	switch ((U8)*RExC_parse) {
	7213	case '^':
	7214	RExC_seen_zerolen++;
	7215	nextchar(pRExC_state);
	7216	if (RExC_flags & RXf_PMf_MULTILINE)
	7217	ret = reg_node(pRExC_state, MBOL);
	7218	else if (RExC_flags & RXf_PMf_SINGLELINE)
	7219	ret = reg_node(pRExC_state, SBOL);
	7220	else
	7221	ret = reg_node(pRExC_state, BOL);
	7222	Set_Node_Length(ret, 1); /* MJD */
	7223	break;
	7224	case '$':
	7225	nextchar(pRExC_state);
	7226	if (*RExC_parse)
	7227	RExC_seen_zerolen++;
	7228	if (RExC_flags & RXf_PMf_MULTILINE)
	7229	ret = reg_node(pRExC_state, MEOL);
	7230	else if (RExC_flags & RXf_PMf_SINGLELINE)
	7231	ret = reg_node(pRExC_state, SEOL);
	7232	else
	7233	ret = reg_node(pRExC_state, EOL);
	7234	Set_Node_Length(ret, 1); /* MJD */
	7235	break;
	7236	case '.':
	7237	nextchar(pRExC_state);
	7238	if (RExC_flags & RXf_PMf_SINGLELINE)
	7239	ret = reg_node(pRExC_state, SANY);
	7240	else
	7241	ret = reg_node(pRExC_state, REG_ANY);
	7242	*flagp \|= HASWIDTH\|SIMPLE;
	7243	RExC_naughty++;
	7244	Set_Node_Length(ret, 1); /* MJD */
	7245	break;
	7246	case '[':
	7247	{
	7248	char * const oregcomp_parse = ++RExC_parse;
	7249	ret = regclass(pRExC_state,depth+1);
	7250	if (*RExC_parse != ']') {
	7251	RExC_parse = oregcomp_parse;
	7252	vFAIL("Unmatched [");
	7253	}
	7254	nextchar(pRExC_state);
	7255	*flagp \|= HASWIDTH\|SIMPLE;
	7256	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	7257	break;
	7258	}
	7259	case '(':
	7260	nextchar(pRExC_state);
	7261	ret = reg(pRExC_state, 1, &flags,depth+1);
	7262	if (ret == NULL) {
	7263	if (flags & TRYAGAIN) {
	7264	if (RExC_parse == RExC_end) {
	7265	/* Make parent create an empty node if needed. */
	7266	*flagp \|= TRYAGAIN;
	7267	return(NULL);
	7268	}
	7269	goto tryagain;
	7270	}
	7271	return(NULL);
	7272	}
	7273	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	7274	break;
	7275	case '\|':
	7276	case ')':
	7277	if (flags & TRYAGAIN) {
	7278	*flagp \|= TRYAGAIN;
	7279	return NULL;
	7280	}
	7281	vFAIL("Internal urp");
	7282	/* Supposed to be caught earlier. */
	7283	break;
	7284	case '{':
	7285	if (!regcurly(RExC_parse)) {
	7286	RExC_parse++;
	7287	goto defchar;
	7288	}
	7289	/* FALL THROUGH */
	7290	case '?':
	7291	case '+':
	7292	case '*':
	7293	RExC_parse++;
	7294	vFAIL("Quantifier follows nothing");
	7295	break;
	7296	case LATIN_SMALL_LETTER_SHARP_S:
	7297	case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
	7298	case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
	7299	#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
	7300	#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
	7301	case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
	7302	#endif
	7303	do_foldchar:
	7304	if (!LOC && FOLD) {
	7305	U32 len,cp;
	7306	len=0; /* silence a spurious compiler warning */
	7307	if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
	7308	flagp \|= HASWIDTH; / could be SIMPLE too, but needs a handler in regexec.regrepeat */
	7309	RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
	7310	ret = reganode(pRExC_state, FOLDCHAR, cp);
	7311	Set_Node_Length(ret, 1); /* MJD */
	7312	nextchar(pRExC_state); /* kill whitespace under /x */
	7313	return ret;
	7314	}
	7315	}
	7316	goto outer_default;
	7317	case '\\':
	7318	/* Special Escapes
	7319
	7320	This switch handles escape sequences that resolve to some kind
	7321	of special regop and not to literal text. Escape sequnces that
	7322	resolve to literal text are handled below in the switch marked
	7323	"Literal Escapes".
	7324
	7325	Every entry in this switch must have a corresponding entry
	7326	in the literal escape switch. However, the opposite is not
	7327	required, as the default for this switch is to jump to the
	7328	literal text handling code.
	7329	*/
	7330	switch ((U8)*++RExC_parse) {
	7331	case LATIN_SMALL_LETTER_SHARP_S:
	7332	case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
	7333	case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
	7334	goto do_foldchar;
	7335	/* Special Escapes */
	7336	case 'A':
	7337	RExC_seen_zerolen++;
	7338	ret = reg_node(pRExC_state, SBOL);
	7339	*flagp \|= SIMPLE;
	7340	goto finish_meta_pat;
	7341	case 'G':
	7342	ret = reg_node(pRExC_state, GPOS);
	7343	RExC_seen \|= REG_SEEN_GPOS;
	7344	*flagp \|= SIMPLE;
	7345	goto finish_meta_pat;
	7346	case 'K':
	7347	RExC_seen_zerolen++;
	7348	ret = reg_node(pRExC_state, KEEPS);
	7349	*flagp \|= SIMPLE;
	7350	/* XXX:dmq : disabling in-place substitution seems to
	7351	* be necessary here to avoid cases of memory corruption, as
	7352	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	7353	*/
	7354	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	7355	goto finish_meta_pat;
	7356	case 'Z':
	7357	ret = reg_node(pRExC_state, SEOL);
	7358	*flagp \|= SIMPLE;
	7359	RExC_seen_zerolen++; /* Do not optimize RE away */
	7360	goto finish_meta_pat;
	7361	case 'z':
	7362	ret = reg_node(pRExC_state, EOS);
	7363	*flagp \|= SIMPLE;
	7364	RExC_seen_zerolen++; /* Do not optimize RE away */
	7365	goto finish_meta_pat;
	7366	case 'C':
	7367	ret = reg_node(pRExC_state, CANY);
	7368	RExC_seen \|= REG_SEEN_CANY;
	7369	*flagp \|= HASWIDTH\|SIMPLE;
	7370	goto finish_meta_pat;
	7371	case 'X':
	7372	ret = reg_node(pRExC_state, CLUMP);
	7373	*flagp \|= HASWIDTH;
	7374	goto finish_meta_pat;
	7375	case 'w':
	7376	switch (get_regex_charset(RExC_flags)) {
	7377	case REGEX_LOCALE_CHARSET:
	7378	op = ALNUML;
	7379	break;
	7380	case REGEX_UNICODE_CHARSET:
	7381	op = ALNUMU;
	7382	break;
	7383	case REGEX_ASCII_RESTRICTED_CHARSET:
	7384	op = ALNUMA;
	7385	break;
	7386	case REGEX_DEPENDS_CHARSET:
	7387	op = ALNUM;
	7388	break;
	7389	default:
	7390	goto bad_charset;
	7391	}
	7392	ret = reg_node(pRExC_state, op);
	7393	*flagp \|= HASWIDTH\|SIMPLE;
	7394	goto finish_meta_pat;
	7395	case 'W':
	7396	switch (get_regex_charset(RExC_flags)) {
	7397	case REGEX_LOCALE_CHARSET:
	7398	op = NALNUML;
	7399	break;
	7400	case REGEX_UNICODE_CHARSET:
	7401	op = NALNUMU;
	7402	break;
	7403	case REGEX_ASCII_RESTRICTED_CHARSET:
	7404	op = NALNUMA;
	7405	break;
	7406	case REGEX_DEPENDS_CHARSET:
	7407	op = NALNUM;
	7408	break;
	7409	default:
	7410	goto bad_charset;
	7411	}
	7412	ret = reg_node(pRExC_state, op);
	7413	*flagp \|= HASWIDTH\|SIMPLE;
	7414	goto finish_meta_pat;
	7415	case 'b':
	7416	RExC_seen_zerolen++;
	7417	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	7418	switch (get_regex_charset(RExC_flags)) {
	7419	case REGEX_LOCALE_CHARSET:
	7420	op = BOUNDL;
	7421	break;
	7422	case REGEX_UNICODE_CHARSET:
	7423	op = BOUNDU;
	7424	break;
	7425	case REGEX_ASCII_RESTRICTED_CHARSET:
	7426	op = BOUNDA;
	7427	break;
	7428	case REGEX_DEPENDS_CHARSET:
	7429	op = BOUND;
	7430	break;
	7431	default:
	7432	goto bad_charset;
	7433	}
	7434	ret = reg_node(pRExC_state, op);
	7435	FLAGS(ret) = get_regex_charset(RExC_flags);
	7436	*flagp \|= SIMPLE;
	7437	goto finish_meta_pat;
	7438	case 'B':
	7439	RExC_seen_zerolen++;
	7440	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	7441	switch (get_regex_charset(RExC_flags)) {
	7442	case REGEX_LOCALE_CHARSET:
	7443	op = NBOUNDL;
	7444	break;
	7445	case REGEX_UNICODE_CHARSET:
	7446	op = NBOUNDU;
	7447	break;
	7448	case REGEX_ASCII_RESTRICTED_CHARSET:
	7449	op = NBOUNDA;
	7450	break;
	7451	case REGEX_DEPENDS_CHARSET:
	7452	op = NBOUND;
	7453	break;
	7454	default:
	7455	goto bad_charset;
	7456	}
	7457	ret = reg_node(pRExC_state, op);
	7458	FLAGS(ret) = get_regex_charset(RExC_flags);
	7459	*flagp \|= SIMPLE;
	7460	goto finish_meta_pat;
	7461	case 's':
	7462	switch (get_regex_charset(RExC_flags)) {
	7463	case REGEX_LOCALE_CHARSET:
	7464	op = SPACEL;
	7465	break;
	7466	case REGEX_UNICODE_CHARSET:
	7467	op = SPACEU;
	7468	break;
	7469	case REGEX_ASCII_RESTRICTED_CHARSET:
	7470	op = SPACEA;
	7471	break;
	7472	case REGEX_DEPENDS_CHARSET:
	7473	op = SPACE;
	7474	break;
	7475	default:
	7476	goto bad_charset;
	7477	}
	7478	ret = reg_node(pRExC_state, op);
	7479	*flagp \|= HASWIDTH\|SIMPLE;
	7480	goto finish_meta_pat;
	7481	case 'S':
	7482	switch (get_regex_charset(RExC_flags)) {
	7483	case REGEX_LOCALE_CHARSET:
	7484	op = NSPACEL;
	7485	break;
	7486	case REGEX_UNICODE_CHARSET:
	7487	op = NSPACEU;
	7488	break;
	7489	case REGEX_ASCII_RESTRICTED_CHARSET:
	7490	op = NSPACEA;
	7491	break;
	7492	case REGEX_DEPENDS_CHARSET:
	7493	op = NSPACE;
	7494	break;
	7495	default:
	7496	goto bad_charset;
	7497	}
	7498	ret = reg_node(pRExC_state, op);
	7499	*flagp \|= HASWIDTH\|SIMPLE;
	7500	goto finish_meta_pat;
	7501	case 'd':
	7502	switch (get_regex_charset(RExC_flags)) {
	7503	case REGEX_LOCALE_CHARSET:
	7504	op = DIGITL;
	7505	break;
	7506	case REGEX_ASCII_RESTRICTED_CHARSET:
	7507	op = DIGITA;
	7508	break;
	7509	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	7510	case REGEX_UNICODE_CHARSET:
	7511	op = DIGIT;
	7512	break;
	7513	default:
	7514	goto bad_charset;
	7515	}
	7516	ret = reg_node(pRExC_state, op);
	7517	*flagp \|= HASWIDTH\|SIMPLE;
	7518	goto finish_meta_pat;
	7519	case 'D':
	7520	switch (get_regex_charset(RExC_flags)) {
	7521	case REGEX_LOCALE_CHARSET:
	7522	op = NDIGITL;
	7523	break;
	7524	case REGEX_ASCII_RESTRICTED_CHARSET:
	7525	op = NDIGITA;
	7526	break;
	7527	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	7528	case REGEX_UNICODE_CHARSET:
	7529	op = NDIGIT;
	7530	break;
	7531	default:
	7532	goto bad_charset;
	7533	}
	7534	ret = reg_node(pRExC_state, op);
	7535	*flagp \|= HASWIDTH\|SIMPLE;
	7536	goto finish_meta_pat;
	7537	case 'R':
	7538	ret = reg_node(pRExC_state, LNBREAK);
	7539	*flagp \|= HASWIDTH\|SIMPLE;
	7540	goto finish_meta_pat;
	7541	case 'h':
	7542	ret = reg_node(pRExC_state, HORIZWS);
	7543	*flagp \|= HASWIDTH\|SIMPLE;
	7544	goto finish_meta_pat;
	7545	case 'H':
	7546	ret = reg_node(pRExC_state, NHORIZWS);
	7547	*flagp \|= HASWIDTH\|SIMPLE;
	7548	goto finish_meta_pat;
	7549	case 'v':
	7550	ret = reg_node(pRExC_state, VERTWS);
	7551	*flagp \|= HASWIDTH\|SIMPLE;
	7552	goto finish_meta_pat;
	7553	case 'V':
	7554	ret = reg_node(pRExC_state, NVERTWS);
	7555	*flagp \|= HASWIDTH\|SIMPLE;
	7556	finish_meta_pat:
	7557	nextchar(pRExC_state);
	7558	Set_Node_Length(ret, 2); /* MJD */
	7559	break;
	7560	case 'p':
	7561	case 'P':
	7562	{
	7563	char* const oldregxend = RExC_end;
	7564	#ifdef DEBUGGING
	7565	char* parse_start = RExC_parse - 2;
	7566	#endif
	7567
	7568	if (RExC_parse[1] == '{') {
	7569	/* a lovely hack--pretend we saw [\pX] instead */
	7570	RExC_end = strchr(RExC_parse, '}');
	7571	if (!RExC_end) {
	7572	const U8 c = (U8)*RExC_parse;
	7573	RExC_parse += 2;
	7574	RExC_end = oldregxend;
	7575	vFAIL2("Missing right brace on \\%c{}", c);
	7576	}
	7577	RExC_end++;
	7578	}
	7579	else {
	7580	RExC_end = RExC_parse + 2;
	7581	if (RExC_end > oldregxend)
	7582	RExC_end = oldregxend;
	7583	}
	7584	RExC_parse--;
	7585
	7586	ret = regclass(pRExC_state,depth+1);
	7587
	7588	RExC_end = oldregxend;
	7589	RExC_parse--;
	7590
	7591	Set_Node_Offset(ret, parse_start + 2);
	7592	Set_Node_Cur_Length(ret);
	7593	nextchar(pRExC_state);
	7594	*flagp \|= HASWIDTH\|SIMPLE;
	7595	}
	7596	break;
	7597	case 'N':
	7598	/* Handle \N and \N{NAME} here and not below because it can be
	7599	multicharacter. join_exact() will join them up later on.
	7600	Also this makes sure that things like /\N{BLAH}+/ and
	7601	\N{BLAH} being multi char Just Happen. dmq*/
	7602	++RExC_parse;
	7603	ret= reg_namedseq(pRExC_state, NULL, flagp);
	7604	break;
	7605	case 'k': /* Handle \k<NAME> and \k'NAME' */
	7606	parse_named_seq:
	7607	{
	7608	char ch= RExC_parse[1];
	7609	if (ch != '<' && ch != '\'' && ch != '{') {
	7610	RExC_parse++;
	7611	vFAIL2("Sequence %.2s... not terminated",parse_start);
	7612	} else {
	7613	/* this pretty much dupes the code for (?P=...) in reg(), if
	7614	you change this make sure you change that */
	7615	char* name_start = (RExC_parse += 2);
	7616	U32 num = 0;
	7617	SV *sv_dat = reg_scan_name(pRExC_state,
	7618	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7619	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	7620	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	7621	vFAIL2("Sequence %.3s... not terminated",parse_start);
	7622
	7623	if (!SIZE_ONLY) {
	7624	num = add_data( pRExC_state, 1, "S" );
	7625	RExC_rxi->data->data[num]=(void*)sv_dat;
	7626	SvREFCNT_inc_simple_void(sv_dat);
	7627	}
	7628
	7629	RExC_sawback = 1;
	7630	ret = reganode(pRExC_state,
	7631	((! FOLD)
	7632	? NREF
	7633	: (AT_LEAST_UNI_SEMANTICS)
	7634	? NREFFU
	7635	: (LOC)
	7636	? NREFFL
	7637	: NREFF),
	7638	num);
	7639	*flagp \|= HASWIDTH;
	7640
	7641	/* override incorrect value set in reganode MJD */
	7642	Set_Node_Offset(ret, parse_start+1);
	7643	Set_Node_Cur_Length(ret); /* MJD */
	7644	nextchar(pRExC_state);
	7645
	7646	}
	7647	break;
	7648	}
	7649	case 'g':
	7650	case '1': case '2': case '3': case '4':
	7651	case '5': case '6': case '7': case '8': case '9':
	7652	{
	7653	I32 num;
	7654	bool isg = *RExC_parse == 'g';
	7655	bool isrel = 0;
	7656	bool hasbrace = 0;
	7657	if (isg) {
	7658	RExC_parse++;
	7659	if (*RExC_parse == '{') {
	7660	RExC_parse++;
	7661	hasbrace = 1;
	7662	}
	7663	if (*RExC_parse == '-') {
	7664	RExC_parse++;
	7665	isrel = 1;
	7666	}
	7667	if (hasbrace && !isDIGIT(*RExC_parse)) {
	7668	if (isrel) RExC_parse--;
	7669	RExC_parse -= 2;
	7670	goto parse_named_seq;
	7671	} }
	7672	num = atoi(RExC_parse);
	7673	if (isg && num == 0)
	7674	vFAIL("Reference to invalid group 0");
	7675	if (isrel) {
	7676	num = RExC_npar - num;
	7677	if (num < 1)
	7678	vFAIL("Reference to nonexistent or unclosed group");
	7679	}
	7680	if (!isg && num > 9 && num >= RExC_npar)
	7681	goto defchar;
	7682	else {
	7683	char * const parse_start = RExC_parse - 1; /* MJD */
	7684	while (isDIGIT(*RExC_parse))
	7685	RExC_parse++;
	7686	if (parse_start == RExC_parse - 1)
	7687	vFAIL("Unterminated \\g... pattern");
	7688	if (hasbrace) {
	7689	if (*RExC_parse != '}')
	7690	vFAIL("Unterminated \\g{...} pattern");
	7691	RExC_parse++;
	7692	}
	7693	if (!SIZE_ONLY) {
	7694	if (num > (I32)RExC_rx->nparens)
	7695	vFAIL("Reference to nonexistent group");
	7696	}
	7697	RExC_sawback = 1;
	7698	ret = reganode(pRExC_state,
	7699	((! FOLD)
	7700	? REF
	7701	: (AT_LEAST_UNI_SEMANTICS)
	7702	? REFFU
	7703	: (LOC)
	7704	? REFFL
	7705	: REFF),
	7706	num);
	7707	*flagp \|= HASWIDTH;
	7708
	7709	/* override incorrect value set in reganode MJD */
	7710	Set_Node_Offset(ret, parse_start+1);
	7711	Set_Node_Cur_Length(ret); /* MJD */
	7712	RExC_parse--;
	7713	nextchar(pRExC_state);
	7714	}
	7715	}
	7716	break;
	7717	case '\0':
	7718	if (RExC_parse >= RExC_end)
	7719	FAIL("Trailing \\");
	7720	/* FALL THROUGH */
	7721	default:
	7722	/* Do not generate "unrecognized" warnings here, we fall
	7723	back into the quick-grab loop below */
	7724	parse_start--;
	7725	goto defchar;
	7726	}
	7727	break;
	7728
	7729	case '#':
	7730	if (RExC_flags & RXf_PMf_EXTENDED) {
	7731	if ( reg_skipcomment( pRExC_state ) )
	7732	goto tryagain;
	7733	}
	7734	/* FALL THROUGH */
	7735
	7736	default:
	7737	outer_default:{
	7738	register STRLEN len;
	7739	register UV ender;
	7740	register char *p;
	7741	char *s;
	7742	STRLEN foldlen;
	7743	U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
	7744
	7745	parse_start = RExC_parse - 1;
	7746
	7747	RExC_parse++;
	7748
	7749	defchar:
	7750	ender = 0;
	7751	ret = reg_node(pRExC_state,
	7752	(U8) ((! FOLD) ? EXACT
	7753	: (LOC)
	7754	? EXACTFL
	7755	: (AT_LEAST_UNI_SEMANTICS)
	7756	? EXACTFU
	7757	: EXACTF)
	7758	);
	7759	s = STRING(ret);
	7760	for (len = 0, p = RExC_parse - 1;
	7761	len < 127 && p < RExC_end;
	7762	len++)
	7763	{
	7764	char * const oldp = p;
	7765
	7766	if (RExC_flags & RXf_PMf_EXTENDED)
	7767	p = regwhite( pRExC_state, p );
	7768	switch ((U8)*p) {
	7769	case LATIN_SMALL_LETTER_SHARP_S:
	7770	case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
	7771	case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
	7772	if (LOC \|\| !FOLD \|\| !is_TRICKYFOLD_safe(p,RExC_end,UTF))
	7773	goto normal_default;
	7774	case '^':
	7775	case '$':
	7776	case '.':
	7777	case '[':
	7778	case '(':
	7779	case ')':
	7780	case '\|':
	7781	goto loopdone;
	7782	case '\\':
	7783	/* Literal Escapes Switch
	7784
	7785	This switch is meant to handle escape sequences that
	7786	resolve to a literal character.
	7787
	7788	Every escape sequence that represents something
	7789	else, like an assertion or a char class, is handled
	7790	in the switch marked 'Special Escapes' above in this
	7791	routine, but also has an entry here as anything that
	7792	isn't explicitly mentioned here will be treated as
	7793	an unescaped equivalent literal.
	7794	*/
	7795
	7796	switch ((U8)*++p) {
	7797	/* These are all the special escapes. */
	7798	case LATIN_SMALL_LETTER_SHARP_S:
	7799	case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
	7800	case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
	7801	if (LOC \|\| !FOLD \|\| !is_TRICKYFOLD_safe(p,RExC_end,UTF))
	7802	goto normal_default;
	7803	case 'A': /* Start assertion */
	7804	case 'b': case 'B': /* Word-boundary assertion*/
	7805	case 'C': /* Single char !DANGEROUS! */
	7806	case 'd': case 'D': /* digit class */
	7807	case 'g': case 'G': /* generic-backref, pos assertion */
	7808	case 'h': case 'H': /* HORIZWS */
	7809	case 'k': case 'K': /* named backref, keep marker */
	7810	case 'N': /* named char sequence */
	7811	case 'p': case 'P': /* Unicode property */
	7812	case 'R': /* LNBREAK */
	7813	case 's': case 'S': /* space class */
	7814	case 'v': case 'V': /* VERTWS */
	7815	case 'w': case 'W': /* word class */
	7816	case 'X': /* eXtended Unicode "combining character sequence" */
	7817	case 'z': case 'Z': /* End of line/string assertion */
	7818	--p;
	7819	goto loopdone;
	7820
	7821	/* Anything after here is an escape that resolves to a
	7822	literal. (Except digits, which may or may not)
	7823	*/
	7824	case 'n':
	7825	ender = '\n';
	7826	p++;
	7827	break;
	7828	case 'r':
	7829	ender = '\r';
	7830	p++;
	7831	break;
	7832	case 't':
	7833	ender = '\t';
	7834	p++;
	7835	break;
	7836	case 'f':
	7837	ender = '\f';
	7838	p++;
	7839	break;
	7840	case 'e':
	7841	ender = ASCII_TO_NATIVE('\033');
	7842	p++;
	7843	break;
	7844	case 'a':
	7845	ender = ASCII_TO_NATIVE('\007');
	7846	p++;
	7847	break;
	7848	case 'o':
	7849	{
	7850	STRLEN brace_len = len;
	7851	UV result;
	7852	const char* error_msg;
	7853
	7854	bool valid = grok_bslash_o(p,
	7855	&result,
	7856	&brace_len,
	7857	&error_msg,
	7858	1);
	7859	p += brace_len;
	7860	if (! valid) {
	7861	RExC_parse = p; /* going to die anyway; point
	7862	to exact spot of failure */
	7863	vFAIL(error_msg);
	7864	}
	7865	else
	7866	{
	7867	ender = result;
	7868	}
	7869	if (PL_encoding && ender < 0x100) {
	7870	goto recode_encoding;
	7871	}
	7872	if (ender > 0xff) {
	7873	REQUIRE_UTF8;
	7874	}
	7875	break;
	7876	}
	7877	case 'x':
	7878	if (*++p == '{') {
	7879	char* const e = strchr(p, '}');
	7880
	7881	if (!e) {
	7882	RExC_parse = p + 1;
	7883	vFAIL("Missing right brace on \\x{}");
	7884	}
	7885	else {
	7886	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	7887	\| PERL_SCAN_DISALLOW_PREFIX;
	7888	STRLEN numlen = e - p - 1;
	7889	ender = grok_hex(p + 1, &numlen, &flags, NULL);
	7890	if (ender > 0xff)
	7891	REQUIRE_UTF8;
	7892	p = e + 1;
	7893	}
	7894	}
	7895	else {
	7896	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	7897	STRLEN numlen = 2;
	7898	ender = grok_hex(p, &numlen, &flags, NULL);
	7899	p += numlen;
	7900	}
	7901	if (PL_encoding && ender < 0x100)
	7902	goto recode_encoding;
	7903	break;
	7904	case 'c':
	7905	p++;
	7906	ender = grok_bslash_c(*p++, SIZE_ONLY);
	7907	break;
	7908	case '0': case '1': case '2': case '3':case '4':
	7909	case '5': case '6': case '7': case '8':case '9':
	7910	if (*p == '0' \|\|
	7911	(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
	7912	{
	7913	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	7914	STRLEN numlen = 3;
	7915	ender = grok_oct(p, &numlen, &flags, NULL);
	7916	if (ender > 0xff) {
	7917	REQUIRE_UTF8;
	7918	}
	7919	p += numlen;
	7920	}
	7921	else {
	7922	--p;
	7923	goto loopdone;
	7924	}
	7925	if (PL_encoding && ender < 0x100)
	7926	goto recode_encoding;
	7927	break;
	7928	recode_encoding:
	7929	{
	7930	SV* enc = PL_encoding;
	7931	ender = reg_recode((const char)(U8)ender, &enc);
	7932	if (!enc && SIZE_ONLY)
	7933	ckWARNreg(p, "Invalid escape in the specified encoding");
	7934	REQUIRE_UTF8;
	7935	}
	7936	break;
	7937	case '\0':
	7938	if (p >= RExC_end)
	7939	FAIL("Trailing \\");
	7940	/* FALL THROUGH */
	7941	default:
	7942	if (!SIZE_ONLY&& isALPHA(*p))
	7943	ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
	7944	goto normal_default;
	7945	}
	7946	break;
	7947	default:
	7948	normal_default:
	7949	if (UTF8_IS_START(*p) && UTF) {
	7950	STRLEN numlen;
	7951	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	7952	&numlen, UTF8_ALLOW_DEFAULT);
	7953	p += numlen;
	7954	}
	7955	else
	7956	ender = *p++;
	7957	break;
	7958	}
	7959	if ( RExC_flags & RXf_PMf_EXTENDED)
	7960	p = regwhite( pRExC_state, p );
	7961	if (UTF && FOLD) {
	7962	/* Prime the casefolded buffer. */
	7963	ender = toFOLD_uni(ender, tmpbuf, &foldlen);
	7964	}
	7965	if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+. /
	7966	if (len)
	7967	p = oldp;
	7968	else if (UTF) {
	7969	if (FOLD) {
	7970	/* Emit all the Unicode characters. */
	7971	STRLEN numlen;
	7972	for (foldbuf = tmpbuf;
	7973	foldlen;
	7974	foldlen -= numlen) {
	7975	ender = utf8_to_uvchr(foldbuf, &numlen);
	7976	if (numlen > 0) {
	7977	const STRLEN unilen = reguni(pRExC_state, ender, s);
	7978	s += unilen;
	7979	len += unilen;
	7980	/* In EBCDIC the numlen
	7981	* and unilen can differ. */
	7982	foldbuf += numlen;
	7983	if (numlen >= foldlen)
	7984	break;
	7985	}
	7986	else
	7987	break; /* "Can't happen." */
	7988	}
	7989	}
	7990	else {
	7991	const STRLEN unilen = reguni(pRExC_state, ender, s);
	7992	if (unilen > 0) {
	7993	s += unilen;
	7994	len += unilen;
	7995	}
	7996	}
	7997	}
	7998	else {
	7999	len++;
	8000	REGC((char)ender, s++);
	8001	}
	8002	break;
	8003	}
	8004	if (UTF) {
	8005	if (FOLD) {
	8006	/* Emit all the Unicode characters. */
	8007	STRLEN numlen;
	8008	for (foldbuf = tmpbuf;
	8009	foldlen;
	8010	foldlen -= numlen) {
	8011	ender = utf8_to_uvchr(foldbuf, &numlen);
	8012	if (numlen > 0) {
	8013	const STRLEN unilen = reguni(pRExC_state, ender, s);
	8014	len += unilen;
	8015	s += unilen;
	8016	/* In EBCDIC the numlen
	8017	* and unilen can differ. */
	8018	foldbuf += numlen;
	8019	if (numlen >= foldlen)
	8020	break;
	8021	}
	8022	else
	8023	break;
	8024	}
	8025	}
	8026	else {
	8027	const STRLEN unilen = reguni(pRExC_state, ender, s);
	8028	if (unilen > 0) {
	8029	s += unilen;
	8030	len += unilen;
	8031	}
	8032	}
	8033	len--;
	8034	}
	8035	else
	8036	REGC((char)ender, s++);
	8037	}
	8038	loopdone:
	8039	RExC_parse = p - 1;
	8040	Set_Node_Cur_Length(ret); /* MJD */
	8041	nextchar(pRExC_state);
	8042	{
	8043	/* len is STRLEN which is unsigned, need to copy to signed */
	8044	IV iv = len;
	8045	if (iv < 0)
	8046	vFAIL("Internal disaster");
	8047	}
	8048	if (len > 0)
	8049	*flagp \|= HASWIDTH;
	8050	if (len == 1 && UNI_IS_INVARIANT(ender))
	8051	*flagp \|= SIMPLE;
	8052
	8053	if (SIZE_ONLY)
	8054	RExC_size += STR_SZ(len);
	8055	else {
	8056	STR_LEN(ret) = len;
	8057	RExC_emit += STR_SZ(len);
	8058	}
	8059	}
	8060	break;
	8061	}
	8062
	8063	return(ret);
	8064
	8065	/* Jumped to when an unrecognized character set is encountered */
	8066	bad_charset:
	8067	Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
	8068	return(NULL);
	8069	}
	8070
	8071	STATIC char *
	8072	S_regwhite( RExC_state_t pRExC_state, char p )
	8073	{
	8074	const char *e = RExC_end;
	8075
	8076	PERL_ARGS_ASSERT_REGWHITE;
	8077
	8078	while (p < e) {
	8079	if (isSPACE(*p))
	8080	++p;
	8081	else if (*p == '#') {
	8082	bool ended = 0;
	8083	do {
	8084	if (*p++ == '\n') {
	8085	ended = 1;
	8086	break;
	8087	}
	8088	} while (p < e);
	8089	if (!ended)
	8090	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	8091	}
	8092	else
	8093	break;
	8094	}
	8095	return p;
	8096	}
	8097
	8098	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	8099	Character classes ([:foo:]) can also be negated ([:^foo:]).
	8100	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	8101	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	8102	but trigger failures because they are currently unimplemented. */
	8103
	8104	#define POSIXCC_DONE(c) ((c) == ':')
	8105	#define POSIXCC_NOTYET(c) ((c) == '=' \|\| (c) == '.')
	8106	#define POSIXCC(c) (POSIXCC_DONE(c) \|\| POSIXCC_NOTYET(c))
	8107
	8108	STATIC I32
	8109	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
	8110	{
	8111	dVAR;
	8112	I32 namedclass = OOB_NAMEDCLASS;
	8113
	8114	PERL_ARGS_ASSERT_REGPPOSIXCC;
	8115
	8116	if (value == '[' && RExC_parse + 1 < RExC_end &&
	8117	/* I smell either [: or [= or [. -- POSIX has been here, right? */
	8118	POSIXCC(UCHARAT(RExC_parse))) {
	8119	const char c = UCHARAT(RExC_parse);
	8120	char* const s = RExC_parse++;
	8121
	8122	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	8123	RExC_parse++;
	8124	if (RExC_parse == RExC_end)
	8125	/* Grandfather lone [:, [=, [. */
	8126	RExC_parse = s;
	8127	else {
	8128	const char* const t = RExC_parse++; /* skip over the c */
	8129	assert(*t == c);
	8130
	8131	if (UCHARAT(RExC_parse) == ']') {
	8132	const char *posixcc = s + 1;
	8133	RExC_parse++; /* skip over the ending ] */
	8134
	8135	if (*s == ':') {
	8136	const I32 complement = posixcc == '^' ? posixcc++ : 0;
	8137	const I32 skip = t - posixcc;
	8138
	8139	/* Initially switch on the length of the name. */
	8140	switch (skip) {
	8141	case 4:
	8142	if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
	8143	namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
	8144	break;
	8145	case 5:
	8146	/* Names all of length 5. */
	8147	/* alnum alpha ascii blank cntrl digit graph lower
	8148	print punct space upper */
	8149	/* Offset 4 gives the best switch position. */
	8150	switch (posixcc[4]) {
	8151	case 'a':
	8152	if (memEQ(posixcc, "alph", 4)) /* alpha */
	8153	namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
	8154	break;
	8155	case 'e':
	8156	if (memEQ(posixcc, "spac", 4)) /* space */
	8157	namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
	8158	break;
	8159	case 'h':
	8160	if (memEQ(posixcc, "grap", 4)) /* graph */
	8161	namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
	8162	break;
	8163	case 'i':
	8164	if (memEQ(posixcc, "asci", 4)) /* ascii */
	8165	namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
	8166	break;
	8167	case 'k':
	8168	if (memEQ(posixcc, "blan", 4)) /* blank */
	8169	namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
	8170	break;
	8171	case 'l':
	8172	if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	8173	namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
	8174	break;
	8175	case 'm':
	8176	if (memEQ(posixcc, "alnu", 4)) /* alnum */
	8177	namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
	8178	break;
	8179	case 'r':
	8180	if (memEQ(posixcc, "lowe", 4)) /* lower */
	8181	namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
	8182	else if (memEQ(posixcc, "uppe", 4)) /* upper */
	8183	namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
	8184	break;
	8185	case 't':
	8186	if (memEQ(posixcc, "digi", 4)) /* digit */
	8187	namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
	8188	else if (memEQ(posixcc, "prin", 4)) /* print */
	8189	namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
	8190	else if (memEQ(posixcc, "punc", 4)) /* punct */
	8191	namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
	8192	break;
	8193	}
	8194	break;
	8195	case 6:
	8196	if (memEQ(posixcc, "xdigit", 6))
	8197	namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
	8198	break;
	8199	}
	8200
	8201	if (namedclass == OOB_NAMEDCLASS)
	8202	Simple_vFAIL3("POSIX class [:%.*s:] unknown",
	8203	t - s - 1, s + 1);
	8204	assert (posixcc[skip] == ':');
	8205	assert (posixcc[skip+1] == ']');
	8206	} else if (!SIZE_ONLY) {
	8207	/* [[=foo=]] and [[.foo.]] are still future. */
	8208
	8209	/* adjust RExC_parse so the warning shows after
	8210	the class closes */
	8211	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	8212	RExC_parse++;
	8213	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	8214	}
	8215	} else {
	8216	/* Maternal grandfather:
	8217	* "[:" ending in ":" but not in ":]" */
	8218	RExC_parse = s;
	8219	}
	8220	}
	8221	}
	8222
	8223	return namedclass;
	8224	}
	8225
	8226	STATIC void
	8227	S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
	8228	{
	8229	dVAR;
	8230
	8231	PERL_ARGS_ASSERT_CHECKPOSIXCC;
	8232
	8233	if (POSIXCC(UCHARAT(RExC_parse))) {
	8234	const char *s = RExC_parse;
	8235	const char c = *s++;
	8236
	8237	while (isALNUM(*s))
	8238	s++;
	8239	if (s && c == s && s[1] == ']') {
	8240	ckWARN3reg(s+2,
	8241	"POSIX syntax [%c %c] belongs inside character classes",
	8242	c, c);
	8243
	8244	/* [[=foo=]] and [[.foo.]] are still future. */
	8245	if (POSIXCC_NOTYET(c)) {
	8246	/* adjust RExC_parse so the error shows after
	8247	the class closes */
	8248	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
	8249	NOOP;
	8250	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	8251	}
	8252	}
	8253	}
	8254	}
	8255
	8256	/* No locale test, and always Unicode semantics */
	8257	#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
	8258	ANYOF_##NAME: \
	8259	for (value = 0; value < 256; value++) \
	8260	if (TEST) \
	8261	stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
	8262	yesno = '+'; \
	8263	what = WORD; \
	8264	break; \
	8265	case ANYOF_N##NAME: \
	8266	for (value = 0; value < 256; value++) \
	8267	if (!TEST) \
	8268	stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
	8269	yesno = '!'; \
	8270	what = WORD; \
	8271	break
	8272
	8273	/* Like the above, but there are differences if we are in uni-8-bit or not, so
	8274	* there are two tests passed in, to use depending on that. There aren't any
	8275	* cases where the label is different from the name, so no need for that
	8276	* parameter */
	8277	#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
	8278	ANYOF_##NAME: \
	8279	if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
	8280	else if (UNI_SEMANTICS) { \
	8281	for (value = 0; value < 256; value++) { \
	8282	if (TEST_8) stored += \
	8283	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
	8284	} \
	8285	} \
	8286	else { \
	8287	for (value = 0; value < 128; value++) { \
	8288	if (TEST_7) stored += \
	8289	S_set_regclass_bit(aTHX_ pRExC_state, ret, \
	8290	(U8) UNI_TO_NATIVE(value)); \
	8291	} \
	8292	} \
	8293	yesno = '+'; \
	8294	what = WORD; \
	8295	break; \
	8296	case ANYOF_N##NAME: \
	8297	if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
	8298	else if (UNI_SEMANTICS) { \
	8299	for (value = 0; value < 256; value++) { \
	8300	if (! TEST_8) stored += \
	8301	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
	8302	} \
	8303	} \
	8304	else { \
	8305	for (value = 0; value < 128; value++) { \
	8306	if (! TEST_7) stored += \
	8307	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
	8308	} \
	8309	if (ASCII_RESTRICTED) { \
	8310	for (value = 128; value < 256; value++) { \
	8311	stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
	8312	} \
	8313	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL\|ANYOF_UTF8; \
	8314	} \
	8315	else { \
	8316	/* For a non-ut8 target string with DEPENDS semantics, all above \
	8317	* ASCII Latin1 code points match the complement of any of the \
	8318	* classes. But in utf8, they have their Unicode semantics, so \
	8319	* can't just set them in the bitmap, or else regexec.c will think \
	8320	* they matched when they shouldn't. */ \
	8321	ANYOF_FLAGS(ret) \|= ANYOF_NON_UTF8_LATIN1_ALL\|ANYOF_UTF8; \
	8322	} \
	8323	} \
	8324	yesno = '!'; \
	8325	what = WORD; \
	8326	break
	8327
	8328	/*
	8329	We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
	8330	so that it is possible to override the option here without having to
	8331	rebuild the entire core. as we are required to do if we change regcomp.h
	8332	which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
	8333	*/
	8334	#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
	8335	#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
	8336	#endif
	8337
	8338	#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
	8339	#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
	8340	#else
	8341	#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
	8342	#endif
	8343
	8344	STATIC U8
	8345	S_set_regclass_bit_fold(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value)
	8346	{
	8347
	8348	/* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
	8349	* Locale folding is done at run-time, so this function should not be
	8350	* called for nodes that are for locales.
	8351	*
	8352	* This function simply sets the bit corresponding to the fold of the input
	8353	* 'value', if not already set. The fold of 'f' is 'F', and the fold of
	8354	* 'F' is 'f'.
	8355	*
	8356	* It also sets any necessary flags, and returns the number of bits that
	8357	* actually changed from 0 to 1 */
	8358
	8359	U8 stored = 0;
	8360	U8 fold;
	8361
	8362	fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
	8363	: PL_fold[value];
	8364
	8365	/* It assumes the bit for 'value' has already been set */
	8366	if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
	8367	ANYOF_BITMAP_SET(node, fold);
	8368	stored++;
	8369	}
	8370	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
	8371	\|\| (! UNI_SEMANTICS
	8372	&& ! isASCII(value)
	8373	&& PL_fold_latin1[value] != value))
	8374	{ /* A character that has a fold outside of Latin1 matches outside the
	8375	bitmap, but only when the target string is utf8. Similarly when we
	8376	don't have unicode semantics for the above ASCII Latin-1 characters,
	8377	and they have a fold, they should match if the target is utf8, and
	8378	not otherwise */
	8379	ANYOF_FLAGS(node) \|= ANYOF_UTF8;
	8380	}
	8381
	8382	return stored;
	8383	}
	8384
	8385
	8386	PERL_STATIC_INLINE U8
	8387	S_set_regclass_bit(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value)
	8388	{
	8389	/* This inline function sets a bit in the bitmap if not already set, and if
	8390	* appropriate, its fold, returning the number of bits that actually
	8391	* changed from 0 to 1 */
	8392
	8393	U8 stored;
	8394
	8395	if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
	8396	return 0;
	8397	}
	8398
	8399	ANYOF_BITMAP_SET(node, value);
	8400	stored = 1;
	8401
	8402	if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
	8403	stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
	8404	}
	8405
	8406	return stored;
	8407	}
	8408
	8409	/*
	8410	parse a class specification and produce either an ANYOF node that
	8411	matches the pattern or if the pattern matches a single char only and
	8412	that char is < 256 and we are case insensitive then we produce an
	8413	EXACT node instead.
	8414	*/
	8415
	8416	STATIC regnode *
	8417	S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
	8418	{
	8419	dVAR;
	8420	register UV nextvalue;
	8421	register IV prevvalue = OOB_UNICODE;
	8422	register IV range = 0;
	8423	UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
	8424	register regnode *ret;
	8425	STRLEN numlen;
	8426	IV namedclass;
	8427	char *rangebegin = NULL;
	8428	bool need_class = 0;
	8429	SV *listsv = NULL;
	8430	UV n;
	8431	AV* unicode_alternate = NULL;
	8432	#ifdef EBCDIC
	8433	UV literal_endpoint = 0;
	8434	#endif
	8435	UV stored = 0; /* how many chars stored in the bitmap */
	8436
	8437	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	8438	case we need to change the emitted regop to an EXACT. */
	8439	const char * orig_parse = RExC_parse;
	8440	GET_RE_DEBUG_FLAGS_DECL;
	8441
	8442	PERL_ARGS_ASSERT_REGCLASS;
	8443	#ifndef DEBUGGING
	8444	PERL_UNUSED_ARG(depth);
	8445	#endif
	8446
	8447	DEBUG_PARSE("clas");
	8448
	8449	/* Assume we are going to generate an ANYOF node. */
	8450	ret = reganode(pRExC_state, ANYOF, 0);
	8451
	8452	if (!SIZE_ONLY)
	8453	ANYOF_FLAGS(ret) = 0;
	8454
	8455	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	8456	RExC_naughty++;
	8457	RExC_parse++;
	8458	if (!SIZE_ONLY)
	8459	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	8460	}
	8461
	8462	if (SIZE_ONLY) {
	8463	RExC_size += ANYOF_SKIP;
	8464	#ifdef ANYOF_ADD_LOC_SKIP
	8465	if (LOC) {
	8466	RExC_size += ANYOF_ADD_LOC_SKIP;
	8467	}
	8468	#endif
	8469	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	8470	}
	8471	else {
	8472	RExC_emit += ANYOF_SKIP;
	8473	if (LOC) {
	8474	ANYOF_FLAGS(ret) \|= ANYOF_LOCALE;
	8475	#ifdef ANYOF_ADD_LOC_SKIP
	8476	RExC_emit += ANYOF_ADD_LOC_SKIP;
	8477	#endif
	8478	}
	8479	ANYOF_BITMAP_ZERO(ret);
	8480	listsv = newSVpvs("# comment\n");
	8481	}
	8482
	8483	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	8484
	8485	if (!SIZE_ONLY && POSIXCC(nextvalue))
	8486	checkposixcc(pRExC_state);
	8487
	8488	/* allow 1st char to be ] (allowing it to be - is dealt with later) */
	8489	if (UCHARAT(RExC_parse) == ']')
	8490	goto charclassloop;
	8491
	8492	parseit:
	8493	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	8494
	8495	charclassloop:
	8496
	8497	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	8498
	8499	if (!range)
	8500	rangebegin = RExC_parse;
	8501	if (UTF) {
	8502	value = utf8n_to_uvchr((U8*)RExC_parse,
	8503	RExC_end - RExC_parse,
	8504	&numlen, UTF8_ALLOW_DEFAULT);
	8505	RExC_parse += numlen;
	8506	}
	8507	else
	8508	value = UCHARAT(RExC_parse++);
	8509
	8510	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	8511	if (value == '[' && POSIXCC(nextvalue))
	8512	namedclass = regpposixcc(pRExC_state, value);
	8513	else if (value == '\\') {
	8514	if (UTF) {
	8515	value = utf8n_to_uvchr((U8*)RExC_parse,
	8516	RExC_end - RExC_parse,
	8517	&numlen, UTF8_ALLOW_DEFAULT);
	8518	RExC_parse += numlen;
	8519	}
	8520	else
	8521	value = UCHARAT(RExC_parse++);
	8522	/* Some compilers cannot handle switching on 64-bit integer
	8523	* values, therefore value cannot be an UV. Yes, this will
	8524	* be a problem later if we want switch on Unicode.
	8525	* A similar issue a little bit later when switching on
	8526	* namedclass. --jhi */
	8527	switch ((I32)value) {
	8528	case 'w': namedclass = ANYOF_ALNUM; break;
	8529	case 'W': namedclass = ANYOF_NALNUM; break;
	8530	case 's': namedclass = ANYOF_SPACE; break;
	8531	case 'S': namedclass = ANYOF_NSPACE; break;
	8532	case 'd': namedclass = ANYOF_DIGIT; break;
	8533	case 'D': namedclass = ANYOF_NDIGIT; break;
	8534	case 'v': namedclass = ANYOF_VERTWS; break;
	8535	case 'V': namedclass = ANYOF_NVERTWS; break;
	8536	case 'h': namedclass = ANYOF_HORIZWS; break;
	8537	case 'H': namedclass = ANYOF_NHORIZWS; break;
	8538	case 'N': /* Handle \N{NAME} in class */
	8539	{
	8540	/* We only pay attention to the first char of
	8541	multichar strings being returned. I kinda wonder
	8542	if this makes sense as it does change the behaviour
	8543	from earlier versions, OTOH that behaviour was broken
	8544	as well. */
	8545	UV v; /* value is register so we cant & it /grrr */
	8546	if (reg_namedseq(pRExC_state, &v, NULL)) {
	8547	goto parseit;
	8548	}
	8549	value= v;
	8550	}
	8551	break;
	8552	case 'p':
	8553	case 'P':
	8554	{
	8555	char *e;
	8556	if (RExC_parse >= RExC_end)
	8557	vFAIL2("Empty \\%c{}", (U8)value);
	8558	if (*RExC_parse == '{') {
	8559	const U8 c = (U8)value;
	8560	e = strchr(RExC_parse++, '}');
	8561	if (!e)
	8562	vFAIL2("Missing right brace on \\%c{}", c);
	8563	while (isSPACE(UCHARAT(RExC_parse)))
	8564	RExC_parse++;
	8565	if (e == RExC_parse)
	8566	vFAIL2("Empty \\%c{}", c);
	8567	n = e - RExC_parse;
	8568	while (isSPACE(UCHARAT(RExC_parse + n - 1)))
	8569	n--;
	8570	}
	8571	else {
	8572	e = RExC_parse;
	8573	n = 1;
	8574	}
	8575	if (!SIZE_ONLY) {
	8576	if (UCHARAT(RExC_parse) == '^') {
	8577	RExC_parse++;
	8578	n--;
	8579	value = value == 'p' ? 'P' : 'p'; /* toggle */
	8580	while (isSPACE(UCHARAT(RExC_parse))) {
	8581	RExC_parse++;
	8582	n--;
	8583	}
	8584	}
	8585	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
	8586	(value=='p' ? '+' : '!'), (int)n, RExC_parse);
	8587	}
	8588	RExC_parse = e + 1;
	8589
	8590	/* The \p could match something in the Latin1 range, hence
	8591	* something that isn't utf8 */
	8592	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP;
	8593	if (FOLD) { /* And one of these could have a multi-char fold */
	8594	OP(ret) = ANYOFV;
	8595	}
	8596	namedclass = ANYOF_MAX; /* no official name, but it's named */
	8597	}
	8598	break;
	8599	case 'n': value = '\n'; break;
	8600	case 'r': value = '\r'; break;
	8601	case 't': value = '\t'; break;
	8602	case 'f': value = '\f'; break;
	8603	case 'b': value = '\b'; break;
	8604	case 'e': value = ASCII_TO_NATIVE('\033');break;
	8605	case 'a': value = ASCII_TO_NATIVE('\007');break;
	8606	case 'o':
	8607	RExC_parse--; /* function expects to be pointed at the 'o' */
	8608	{
	8609	const char* error_msg;
	8610	bool valid = grok_bslash_o(RExC_parse,
	8611	&value,
	8612	&numlen,
	8613	&error_msg,
	8614	SIZE_ONLY);
	8615	RExC_parse += numlen;
	8616	if (! valid) {
	8617	vFAIL(error_msg);
	8618	}
	8619	}
	8620	if (PL_encoding && value < 0x100) {
	8621	goto recode_encoding;
	8622	}
	8623	break;
	8624	case 'x':
	8625	if (*RExC_parse == '{') {
	8626	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	8627	\| PERL_SCAN_DISALLOW_PREFIX;
	8628	char * const e = strchr(RExC_parse++, '}');
	8629	if (!e)
	8630	vFAIL("Missing right brace on \\x{}");
	8631
	8632	numlen = e - RExC_parse;
	8633	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	8634	RExC_parse = e + 1;
	8635	}
	8636	else {
	8637	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	8638	numlen = 2;
	8639	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	8640	RExC_parse += numlen;
	8641	}
	8642	if (PL_encoding && value < 0x100)
	8643	goto recode_encoding;
	8644	break;
	8645	case 'c':
	8646	value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
	8647	break;
	8648	case '0': case '1': case '2': case '3': case '4':
	8649	case '5': case '6': case '7':
	8650	{
	8651	/* Take 1-3 octal digits */
	8652	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	8653	numlen = 3;
	8654	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	8655	RExC_parse += numlen;
	8656	if (PL_encoding && value < 0x100)
	8657	goto recode_encoding;
	8658	break;
	8659	}
	8660	recode_encoding:
	8661	{
	8662	SV* enc = PL_encoding;
	8663	value = reg_recode((const char)(U8)value, &enc);
	8664	if (!enc && SIZE_ONLY)
	8665	ckWARNreg(RExC_parse,
	8666	"Invalid escape in the specified encoding");
	8667	break;
	8668	}
	8669	default:
	8670	/* Allow \_ to not give an error */
	8671	if (!SIZE_ONLY && isALNUM(value) && value != '_') {
	8672	ckWARN2reg(RExC_parse,
	8673	"Unrecognized escape \\%c in character class passed through",
	8674	(int)value);
	8675	}
	8676	break;
	8677	}
	8678	} /* end of \blah */
	8679	#ifdef EBCDIC
	8680	else
	8681	literal_endpoint++;
	8682	#endif
	8683
	8684	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	8685
	8686	/* What matches in a locale is not known until runtime, so need to
	8687	* (one time per class) allocate extra space to pass to regexec.
	8688	* The space will contain a bit for each named class that is to be
	8689	* matched against. This isn't needed for \p{} and pseudo-classes,
	8690	* as they are not affected by locale, and hence are dealt with
	8691	* separately */
	8692	if (LOC && namedclass < ANYOF_MAX && ! need_class) {
	8693	need_class = 1;
	8694	if (SIZE_ONLY) {
	8695	#ifdef ANYOF_CLASS_ADD_SKIP
	8696	RExC_size += ANYOF_CLASS_ADD_SKIP;
	8697	#endif
	8698	}
	8699	else {
	8700	#ifdef ANYOF_CLASS_ADD_SKIP
	8701	RExC_emit += ANYOF_CLASS_ADD_SKIP;
	8702	#endif
	8703	ANYOF_CLASS_ZERO(ret);
	8704	}
	8705	ANYOF_FLAGS(ret) \|= ANYOF_CLASS;
	8706	}
	8707
	8708	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	8709	* literal */
	8710	if (range) {
	8711	if (!SIZE_ONLY) {
	8712	const int w =
	8713	RExC_parse >= rangebegin ?
	8714	RExC_parse - rangebegin : 0;
	8715	ckWARN4reg(RExC_parse,
	8716	"False [] range \"%.s\"",
	8717	w, w, rangebegin);
	8718
	8719	if (prevvalue < 256) {
	8720	stored +=
	8721	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
	8722	stored +=
	8723	S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
	8724	}
	8725	else {
	8726	ANYOF_FLAGS(ret) \|= ANYOF_UTF8;
	8727	Perl_sv_catpvf(aTHX_ listsv,
	8728	"%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
	8729	}
	8730	}
	8731
	8732	range = 0; /* this was not a true range */
	8733	}
	8734
	8735
	8736
	8737	if (!SIZE_ONLY) {
	8738	const char *what = NULL;
	8739	char yesno = 0;
	8740
	8741	/* Possible truncation here but in some 64-bit environments
	8742	* the compiler gets heartburn about switch on 64-bit values.
	8743	* A similar issue a little earlier when switching on value.
	8744	* --jhi */
	8745	switch ((I32)namedclass) {
	8746
	8747	case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
	8748	case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
	8749	case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
	8750	case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
	8751	case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
	8752	case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
	8753	case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
	8754	case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
	8755	case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
	8756	case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
	8757	#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
	8758	/* \s, \w match all unicode if utf8. */
	8759	case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
	8760	case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
	8761	#else
	8762	/* \s, \w match ascii and locale only */
	8763	case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
	8764	case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
	8765	#endif
	8766	case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
	8767	case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
	8768	case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
	8769	case ANYOF_ASCII:
	8770	if (LOC)
	8771	ANYOF_CLASS_SET(ret, ANYOF_ASCII);
	8772	else {
	8773	for (value = 0; value < 128; value++)
	8774	stored +=
	8775	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
	8776	}
	8777	yesno = '+';
	8778	what = NULL; /* Doesn't match outside ascii, so
	8779	don't want to add +utf8:: */
	8780	break;
	8781	case ANYOF_NASCII:
	8782	if (LOC)
	8783	ANYOF_CLASS_SET(ret, ANYOF_NASCII);
	8784	else {
	8785	for (value = 128; value < 256; value++)
	8786	stored +=
	8787	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
	8788	}
	8789	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	8790	yesno = '!';
	8791	what = "ASCII";
	8792	break;
	8793	case ANYOF_DIGIT:
	8794	if (LOC)
	8795	ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
	8796	else {
	8797	/* consecutive digits assumed */
	8798	for (value = '0'; value <= '9'; value++)
	8799	stored +=
	8800	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
	8801	}
	8802	yesno = '+';
	8803	what = POSIX_CC_UNI_NAME("Digit");
	8804	break;
	8805	case ANYOF_NDIGIT:
	8806	if (LOC)
	8807	ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
	8808	else {
	8809	/* consecutive digits assumed */
	8810	for (value = 0; value < '0'; value++)
	8811	stored +=
	8812	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
	8813	for (value = '9' + 1; value < 256; value++)
	8814	stored +=
	8815	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
	8816	}
	8817	yesno = '!';
	8818	what = POSIX_CC_UNI_NAME("Digit");
	8819	if (ASCII_RESTRICTED ) {
	8820	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	8821	}
	8822	break;
	8823	case ANYOF_MAX:
	8824	/* this is to handle \p and \P */
	8825	break;
	8826	default:
	8827	vFAIL("Invalid [::] class");
	8828	break;
	8829	}
	8830	if (what && ! (ASCII_RESTRICTED)) {
	8831	/* Strings such as "+utf8::isWord\n" */
	8832	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
	8833	ANYOF_FLAGS(ret) \|= ANYOF_UTF8;
	8834	}
	8835
	8836	continue;
	8837	}
	8838	} /* end of namedclass \blah */
	8839
	8840	if (range) {
	8841	if (prevvalue > (IV)value) /* b-a */ {
	8842	const int w = RExC_parse - rangebegin;
	8843	Simple_vFAIL4("Invalid [] range \"%.s\"", w, w, rangebegin);
	8844	range = 0; /* not a valid range */
	8845	}
	8846	}
	8847	else {
	8848	prevvalue = value; /* save the beginning of the range */
	8849	if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
	8850	RExC_parse[1] != ']') {
	8851	RExC_parse++;
	8852
	8853	/* a bad range like \w-, [:word:]- ? */
	8854	if (namedclass > OOB_NAMEDCLASS) {
	8855	if (ckWARN(WARN_REGEXP)) {
	8856	const int w =
	8857	RExC_parse >= rangebegin ?
	8858	RExC_parse - rangebegin : 0;
	8859	vWARN4(RExC_parse,
	8860	"False [] range \"%.s\"",
	8861	w, w, rangebegin);
	8862	}
	8863	if (!SIZE_ONLY)
	8864	stored +=
	8865	S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
	8866	} else
	8867	range = 1; /* yeah, it's a range! */
	8868	continue; /* but do it the next time */
	8869	}
	8870	}
	8871
	8872	/* now is the next time */
	8873	if (!SIZE_ONLY) {
	8874	if (prevvalue < 256) {
	8875	const IV ceilvalue = value < 256 ? value : 255;
	8876	IV i;
	8877	#ifdef EBCDIC
	8878	/* In EBCDIC [\x89-\x91] should include
	8879	* the \x8e but [i-j] should not. */
	8880	if (literal_endpoint == 2 &&
	8881	((isLOWER(prevvalue) && isLOWER(ceilvalue)) \|\|
	8882	(isUPPER(prevvalue) && isUPPER(ceilvalue))))
	8883	{
	8884	if (isLOWER(prevvalue)) {
	8885	for (i = prevvalue; i <= ceilvalue; i++)
	8886	if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	8887	stored +=
	8888	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
	8889	}
	8890	} else {
	8891	for (i = prevvalue; i <= ceilvalue; i++)
	8892	if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	8893	stored +=
	8894	S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
	8895	}
	8896	}
	8897	}
	8898	else
	8899	#endif
	8900	for (i = prevvalue; i <= ceilvalue; i++) {
	8901	stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
	8902	}
	8903	}
	8904	if (value > 255 \|\| UTF) {
	8905	const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
	8906	const UV natvalue = NATIVE_TO_UNI(value);
	8907
	8908	/* If the code point requires utf8 to represent, and we are not
	8909	* folding, it can't match unless the target is in utf8. Only
	8910	* a few code points above 255 fold to below it, so XXX an
	8911	* optimization would be to know which ones and set the flag
	8912	* appropriately. */
	8913	ANYOF_FLAGS(ret) \|= (FOLD \|\| value < 256)
	8914	? ANYOF_NONBITMAP
	8915	: ANYOF_UTF8;
	8916	if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
	8917
	8918	/* The \t sets the whole range */
	8919	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
	8920	prevnatvalue, natvalue);
	8921
	8922	/* Currently, we don't look at every value in the range.
	8923	* Therefore we have to assume the worst case: that if
	8924	* folding, it will match more than one character */
	8925	if (FOLD) {
	8926	OP(ret) = ANYOFV;
	8927	}
	8928	}
	8929	else if (prevnatvalue == natvalue) {
	8930	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
	8931	if (FOLD) {
	8932	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	8933	STRLEN foldlen;
	8934	const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
	8935
	8936	#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
	8937	if (RExC_precomp[0] == ':' &&
	8938	RExC_precomp[1] == '[' &&
	8939	(f == 0xDF \|\| f == 0x92)) {
	8940	f = NATIVE_TO_UNI(f);
	8941	}
	8942	#endif
	8943	/* If folding and foldable and a single
	8944	* character, insert also the folded version
	8945	* to the charclass. */
	8946	if (f != value) {
	8947	#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
	8948	if ((RExC_precomp[0] == ':' &&
	8949	RExC_precomp[1] == '[' &&
	8950	(f == 0xA2 &&
	8951	(value == 0xFB05 \|\| value == 0xFB06))) ?
	8952	foldlen == ((STRLEN)UNISKIP(f) - 1) :
	8953	foldlen == (STRLEN)UNISKIP(f) )
	8954	#else
	8955	if (foldlen == (STRLEN)UNISKIP(f))
	8956	#endif
	8957	Perl_sv_catpvf(aTHX_ listsv,
	8958	"%04"UVxf"\n", f);
	8959	else {
	8960	/* Any multicharacter foldings
	8961	* require the following transform:
	8962	* [ABCDEF] -> (?:[ABCabcDEFd]\|pq\|rst)
	8963	* where E folds into "pq" and F folds
	8964	* into "rst", all other characters
	8965	* fold to single characters. We save
	8966	* away these multicharacter foldings,
	8967	* to be later saved as part of the
	8968	* additional "s" data. */
	8969	SV *sv;
	8970
	8971	if (!unicode_alternate)
	8972	unicode_alternate = newAV();
	8973	sv = newSVpvn_utf8((char*)foldbuf, foldlen,
	8974	TRUE);
	8975	av_push(unicode_alternate, sv);
	8976	OP(ret) = ANYOFV;
	8977	}
	8978	}
	8979
	8980	/* If folding and the value is one of the Greek
	8981	* sigmas insert a few more sigmas to make the
	8982	* folding rules of the sigmas to work right.
	8983	* Note that not all the possible combinations
	8984	* are handled here: some of them are handled
	8985	* by the standard folding rules, and some of
	8986	* them (literal or EXACTF cases) are handled
	8987	* during runtime in regexec.c:S_find_byclass(). */
	8988	if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
	8989	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
	8990	(UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
	8991	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
	8992	(UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
	8993	}
	8994	else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
	8995	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
	8996	(UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
	8997	}
	8998	}
	8999	}
	9000	#ifdef EBCDIC
	9001	literal_endpoint = 0;
	9002	#endif
	9003	}
	9004
	9005	range = 0; /* this range (if it was one) is done now */
	9006	}
	9007
	9008
	9009
	9010	if (SIZE_ONLY)
	9011	return ret;
	9012	/**** !SIZE_ONLY AFTER HERE *******/
	9013
	9014	/* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
	9015	* set the FOLD flag yet, so this this does optimize those. It doesn't
	9016	* optimize locale. Doing so perhaps could be done as long as there is
	9017	* nothing like \w in it; some thought also would have to be given to the
	9018	* interaction with above 0x100 chars */
	9019	if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
	9020	for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
	9021	ANYOF_BITMAP(ret)[value] ^= 0xFF;
	9022	stored = 256 - stored;
	9023
	9024	/* The inversion means that everything above 255 is matched; and at the
	9025	* same time we clear the invert flag */
	9026	ANYOF_FLAGS(ret) = ANYOF_UTF8\|ANYOF_UNICODE_ALL;
	9027	}
	9028
	9029	if (FOLD) {
	9030	SV *sv;
	9031
	9032	/* This is the one character in the bitmap that needs special handling
	9033	* under non-locale folding, as it folds to two characters 'ss'. This
	9034	* happens if it is set and not inverting, or isn't set and are
	9035	* inverting */
	9036	if (! LOC
	9037	&& (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
	9038	^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
	9039	{
	9040	OP(ret) = ANYOFV; /* Can match more than a single char */
	9041
	9042	/* Under Unicode semantics), it can do this when the target string
	9043	* isn't in utf8 */
	9044	if (UNI_SEMANTICS) {
	9045	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	9046	}
	9047
	9048	if (!unicode_alternate) {
	9049	unicode_alternate = newAV();
	9050	}
	9051	sv = newSVpvn_utf8("ss", 2, TRUE);
	9052	av_push(unicode_alternate, sv);
	9053	}
	9054
	9055	/* Folding in the bitmap is taken care of above, but not for locale
	9056	* (for which we have to wait to see what folding is in effect at
	9057	* runtime), and for things not in the bitmap. Set run-time fold flag
	9058	* for these */
	9059	if ((LOC \|\| (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
	9060	ANYOF_FLAGS(ret) \|= ANYOF_LOC_NONBITMAP_FOLD;
	9061	}
	9062	}
	9063
	9064	/* A single character class can be "optimized" into an EXACTish node.
	9065	* Note that since we don't currently count how many characters there are
	9066	* outside the bitmap, we are XXX missing optimization possibilities for
	9067	* them. This optimization can't happen unless this is a truly single
	9068	* character class, which means that it can't be an inversion into a
	9069	* many-character class, and there must be no possibility of there being
	9070	* things outside the bitmap. 'stored' (only) for locales doesn't include
	9071	* \w, etc, so have to make a special test that they aren't present
	9072	*
	9073	* Similarly A 2-character class of the very special form like [bB] can be
	9074	* optimized into an EXACTFish node, but only for non-locales, and for
	9075	* characters which only have the two folds; so things like 'fF' and 'Ii'
	9076	* wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
	9077	* FI'. */
	9078	if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP\|ANYOF_INVERT\|ANYOF_UNICODE_ALL))
	9079	&& (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	9080	\|\| (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
	9081	\|\| (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	9082	&& (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
	9083	/* If the latest code point has a fold whose
	9084	* bit is set, it must be the only other one */
	9085	&& ((prevvalue = PL_fold_latin1[value]) != (IV)value)
	9086	&& ANYOF_BITMAP_TEST(ret, prevvalue)))))
	9087	{
	9088	/* Note that the information needed to decide to do this optimization
	9089	* is not currently available until the 2nd pass, and that the actually
	9090	* used EXACTish node takes less space than the calculated ANYOF node,
	9091	* and hence the amount of space calculated in the first pass is larger
	9092	* than actually used, so this optimization doesn't gain us any space.
	9093	* But an EXACT node is faster than an ANYOF node, and can be combined
	9094	* with any adjacent EXACT nodes later by the optimizer for further
	9095	* gains. The speed of executing an EXACTF is similar to an ANYOF
	9096	* node, so the optimization advantage comes from the ability to join
	9097	* it to adjacent EXACT nodes */
	9098
	9099	const char * cur_parse= RExC_parse;
	9100	U8 op;
	9101	RExC_emit = (regnode *)orig_emit;
	9102	RExC_parse = (char *)orig_parse;
	9103
	9104	if (stored == 1) {
	9105
	9106	/* A locale node with one point can be folded; all the other cases
	9107	* with folding will have two points, since we calculate them above
	9108	*/
	9109	if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
	9110	op = EXACTFL;
	9111	}
	9112	else {
	9113	op = EXACT;
	9114	}
	9115	} /* else 2 chars in the bit map: the folds of each other */
	9116	else if (AT_LEAST_UNI_SEMANTICS \|\| !isASCII(value)) {
	9117
	9118	/* To join adjacent nodes, they must be the exact EXACTish type.
	9119	* Try to use the most likely type, by using EXACTFU if the regex
	9120	* calls for them, or is required because the character is
	9121	* non-ASCII */
	9122	op = EXACTFU;
	9123	}
	9124	else { /* Otherwise, more likely to be EXACTF type */
	9125	op = EXACTF;
	9126	}
	9127
	9128	ret = reg_node(pRExC_state, op);
	9129	RExC_parse = (char *)cur_parse;
	9130	if (UTF && ! NATIVE_IS_INVARIANT(value)) {
	9131	*STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
	9132	*(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
	9133	STR_LEN(ret)= 2;
	9134	RExC_emit += STR_SZ(2);
	9135	}
	9136	else {
	9137	*STRING(ret)= (char)value;
	9138	STR_LEN(ret)= 1;
	9139	RExC_emit += STR_SZ(1);
	9140	}
	9141	SvREFCNT_dec(listsv);
	9142	return ret;
	9143	}
	9144
	9145	{
	9146	AV * const av = newAV();
	9147	SV *rv;
	9148	/* The 0th element stores the character class description
	9149	* in its textual form: used later (regexec.c:Perl_regclass_swash())
	9150	* to initialize the appropriate swash (which gets stored in
	9151	* the 1st element), and also useful for dumping the regnode.
	9152	* The 2nd element stores the multicharacter foldings,
	9153	* used later (regexec.c:S_reginclass()). */
	9154	av_store(av, 0, listsv);
	9155	av_store(av, 1, NULL);
	9156	av_store(av, 2, MUTABLE_SV(unicode_alternate));
	9157	rv = newRV_noinc(MUTABLE_SV(av));
	9158	n = add_data(pRExC_state, 1, "s");
	9159	RExC_rxi->data->data[n] = (void*)rv;
	9160	ARG_SET(ret, n);
	9161	}
	9162	return ret;
	9163	}
	9164	#undef _C_C_T_
	9165
	9166
	9167	/* reg_skipcomment()
	9168
	9169	Absorbs an /x style # comments from the input stream.
	9170	Returns true if there is more text remaining in the stream.
	9171	Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
	9172	terminates the pattern without including a newline.
	9173
	9174	Note its the callers responsibility to ensure that we are
	9175	actually in /x mode
	9176
	9177	*/
	9178
	9179	STATIC bool
	9180	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
	9181	{
	9182	bool ended = 0;
	9183
	9184	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	9185
	9186	while (RExC_parse < RExC_end)
	9187	if (*RExC_parse++ == '\n') {
	9188	ended = 1;
	9189	break;
	9190	}
	9191	if (!ended) {
	9192	/* we ran off the end of the pattern without ending
	9193	the comment, so we have to add an \n when wrapping */
	9194	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	9195	return 0;
	9196	} else
	9197	return 1;
	9198	}
	9199
	9200	/* nextchar()
	9201
	9202	Advances the parse position, and optionally absorbs
	9203	"whitespace" from the inputstream.
	9204
	9205	Without /x "whitespace" means (?#...) style comments only,
	9206	with /x this means (?#...) and # comments and whitespace proper.
	9207
	9208	Returns the RExC_parse point from BEFORE the scan occurs.
	9209
	9210	This is the /x friendly way of saying RExC_parse++.
	9211	*/
	9212
	9213	STATIC char*
	9214	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	9215	{
	9216	char* const retval = RExC_parse++;
	9217
	9218	PERL_ARGS_ASSERT_NEXTCHAR;
	9219
	9220	for (;;) {
	9221	if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
	9222	RExC_parse[2] == '#') {
	9223	while (*RExC_parse != ')') {
	9224	if (RExC_parse == RExC_end)
	9225	FAIL("Sequence (?#... not terminated");
	9226	RExC_parse++;
	9227	}
	9228	RExC_parse++;
	9229	continue;
	9230	}
	9231	if (RExC_flags & RXf_PMf_EXTENDED) {
	9232	if (isSPACE(*RExC_parse)) {
	9233	RExC_parse++;
	9234	continue;
	9235	}
	9236	else if (*RExC_parse == '#') {
	9237	if ( reg_skipcomment( pRExC_state ) )
	9238	continue;
	9239	}
	9240	}
	9241	return retval;
	9242	}
	9243	}
	9244
	9245	/*
	9246	- reg_node - emit a node
	9247	*/
	9248	STATIC regnode * /* Location. */
	9249	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	9250	{
	9251	dVAR;
	9252	register regnode *ptr;
	9253	regnode * const ret = RExC_emit;
	9254	GET_RE_DEBUG_FLAGS_DECL;
	9255
	9256	PERL_ARGS_ASSERT_REG_NODE;
	9257
	9258	if (SIZE_ONLY) {
	9259	SIZE_ALIGN(RExC_size);
	9260	RExC_size += 1;
	9261	return(ret);
	9262	}
	9263	if (RExC_emit >= RExC_emit_bound)
	9264	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
	9265
	9266	NODE_ALIGN_FILL(ret);
	9267	ptr = ret;
	9268	FILL_ADVANCE_NODE(ptr, op);
	9269	#ifdef RE_TRACK_PATTERN_OFFSETS
	9270	if (RExC_offsets) { /* MJD */
	9271	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	9272	"reg_node", __LINE__,
	9273	PL_reg_name[op],
	9274	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	9275	? "Overwriting end of array!\n" : "OK",
	9276	(UV)(RExC_emit - RExC_emit_start),
	9277	(UV)(RExC_parse - RExC_start),
	9278	(UV)RExC_offsets[0]));
	9279	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
	9280	}
	9281	#endif
	9282	RExC_emit = ptr;
	9283	return(ret);
	9284	}
	9285
	9286	/*
	9287	- reganode - emit a node with an argument
	9288	*/
	9289	STATIC regnode * /* Location. */
	9290	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	9291	{
	9292	dVAR;
	9293	register regnode *ptr;
	9294	regnode * const ret = RExC_emit;
	9295	GET_RE_DEBUG_FLAGS_DECL;
	9296
	9297	PERL_ARGS_ASSERT_REGANODE;
	9298
	9299	if (SIZE_ONLY) {
	9300	SIZE_ALIGN(RExC_size);
	9301	RExC_size += 2;
	9302	/*
	9303	We can't do this:
	9304
	9305	assert(2==regarglen[op]+1);
	9306
	9307	Anything larger than this has to allocate the extra amount.
	9308	If we changed this to be:
	9309
	9310	RExC_size += (1 + regarglen[op]);
	9311
	9312	then it wouldn't matter. Its not clear what side effect
	9313	might come from that so its not done so far.
	9314	-- dmq
	9315	*/
	9316	return(ret);
	9317	}
	9318	if (RExC_emit >= RExC_emit_bound)
	9319	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
	9320
	9321	NODE_ALIGN_FILL(ret);
	9322	ptr = ret;
	9323	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	9324	#ifdef RE_TRACK_PATTERN_OFFSETS
	9325	if (RExC_offsets) { /* MJD */
	9326	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	9327	"reganode",
	9328	__LINE__,
	9329	PL_reg_name[op],
	9330	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	9331	"Overwriting end of array!\n" : "OK",
	9332	(UV)(RExC_emit - RExC_emit_start),
	9333	(UV)(RExC_parse - RExC_start),
	9334	(UV)RExC_offsets[0]));
	9335	Set_Cur_Node_Offset;
	9336	}
	9337	#endif
	9338	RExC_emit = ptr;
	9339	return(ret);
	9340	}
	9341
	9342	/*
	9343	- reguni - emit (if appropriate) a Unicode character
	9344	*/
	9345	STATIC STRLEN
	9346	S_reguni(pTHX_ const RExC_state_t pRExC_state, UV uv, char s)
	9347	{
	9348	dVAR;
	9349
	9350	PERL_ARGS_ASSERT_REGUNI;
	9351
	9352	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8)s, uv) - (U8)s);
	9353	}
	9354
	9355	/*
	9356	- reginsert - insert an operator in front of already-emitted operand
	9357	*
	9358	* Means relocating the operand.
	9359	*/
	9360	STATIC void
	9361	S_reginsert(pTHX_ RExC_state_t pRExC_state, U8 op, regnode opnd, U32 depth)
	9362	{
	9363	dVAR;
	9364	register regnode *src;
	9365	register regnode *dst;
	9366	register regnode *place;
	9367	const int offset = regarglen[(U8)op];
	9368	const int size = NODE_STEP_REGNODE + offset;
	9369	GET_RE_DEBUG_FLAGS_DECL;
	9370
	9371	PERL_ARGS_ASSERT_REGINSERT;
	9372	PERL_UNUSED_ARG(depth);
	9373	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	9374	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	9375	if (SIZE_ONLY) {
	9376	RExC_size += size;
	9377	return;
	9378	}
	9379
	9380	src = RExC_emit;
	9381	RExC_emit += size;
	9382	dst = RExC_emit;
	9383	if (RExC_open_parens) {
	9384	int paren;
	9385	/DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);/
	9386	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	9387	if ( RExC_open_parens[paren] >= opnd ) {
	9388	/DEBUG_PARSE_FMT("open"," - %d",size);/
	9389	RExC_open_parens[paren] += size;
	9390	} else {
	9391	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	9392	}
	9393	if ( RExC_close_parens[paren] >= opnd ) {
	9394	/DEBUG_PARSE_FMT("close"," - %d",size);/
	9395	RExC_close_parens[paren] += size;
	9396	} else {
	9397	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	9398	}
	9399	}
	9400	}
	9401
	9402	while (src > opnd) {
	9403	StructCopy(--src, --dst, regnode);
	9404	#ifdef RE_TRACK_PATTERN_OFFSETS
	9405	if (RExC_offsets) { /* MJD 20010112 */
	9406	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	9407	"reg_insert",
	9408	__LINE__,
	9409	PL_reg_name[op],
	9410	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	9411	? "Overwriting end of array!\n" : "OK",
	9412	(UV)(src - RExC_emit_start),
	9413	(UV)(dst - RExC_emit_start),
	9414	(UV)RExC_offsets[0]));
	9415	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	9416	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	9417	}
	9418	#endif
	9419	}
	9420
	9421
	9422	place = opnd; /* Op node, where operand used to be. */
	9423	#ifdef RE_TRACK_PATTERN_OFFSETS
	9424	if (RExC_offsets) { /* MJD */
	9425	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	9426	"reginsert",
	9427	__LINE__,
	9428	PL_reg_name[op],
	9429	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	9430	? "Overwriting end of array!\n" : "OK",
	9431	(UV)(place - RExC_emit_start),
	9432	(UV)(RExC_parse - RExC_start),
	9433	(UV)RExC_offsets[0]));
	9434	Set_Node_Offset(place, RExC_parse);
	9435	Set_Node_Length(place, 1);
	9436	}
	9437	#endif
	9438	src = NEXTOPER(place);
	9439	FILL_ADVANCE_NODE(place, op);
	9440	Zero(src, offset, regnode);
	9441	}
	9442
	9443	/*
	9444	- regtail - set the next-pointer at the end of a node chain of p to val.
	9445	- SEE ALSO: regtail_study
	9446	*/
	9447	/* TODO: All three parms should be const */
	9448	STATIC void
	9449	S_regtail(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	9450	{
	9451	dVAR;
	9452	register regnode *scan;
	9453	GET_RE_DEBUG_FLAGS_DECL;
	9454
	9455	PERL_ARGS_ASSERT_REGTAIL;
	9456	#ifndef DEBUGGING
	9457	PERL_UNUSED_ARG(depth);
	9458	#endif
	9459
	9460	if (SIZE_ONLY)
	9461	return;
	9462
	9463	/* Find last node. */
	9464	scan = p;
	9465	for (;;) {
	9466	regnode * const temp = regnext(scan);
	9467	DEBUG_PARSE_r({
	9468	SV * const mysv=sv_newmortal();
	9469	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	9470	regprop(RExC_rx, mysv, scan);
	9471	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	9472	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	9473	(temp == NULL ? "->" : ""),
	9474	(temp == NULL ? PL_reg_name[OP(val)] : "")
	9475	);
	9476	});
	9477	if (temp == NULL)
	9478	break;
	9479	scan = temp;
	9480	}
	9481
	9482	if (reg_off_by_arg[OP(scan)]) {
	9483	ARG_SET(scan, val - scan);
	9484	}
	9485	else {
	9486	NEXT_OFF(scan) = val - scan;
	9487	}
	9488	}
	9489
	9490	#ifdef DEBUGGING
	9491	/*
	9492	- regtail_study - set the next-pointer at the end of a node chain of p to val.
	9493	- Look for optimizable sequences at the same time.
	9494	- currently only looks for EXACT chains.
	9495
	9496	This is experimental code. The idea is to use this routine to perform
	9497	in place optimizations on branches and groups as they are constructed,
	9498	with the long term intention of removing optimization from study_chunk so
	9499	that it is purely analytical.
	9500
	9501	Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	9502	to control which is which.
	9503
	9504	*/
	9505	/* TODO: All four parms should be const */
	9506
	9507	STATIC U8
	9508	S_regtail_study(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	9509	{
	9510	dVAR;
	9511	register regnode *scan;
	9512	U8 exact = PSEUDO;
	9513	#ifdef EXPERIMENTAL_INPLACESCAN
	9514	I32 min = 0;
	9515	#endif
	9516	GET_RE_DEBUG_FLAGS_DECL;
	9517
	9518	PERL_ARGS_ASSERT_REGTAIL_STUDY;
	9519
	9520
	9521	if (SIZE_ONLY)
	9522	return exact;
	9523
	9524	/* Find last node. */
	9525
	9526	scan = p;
	9527	for (;;) {
	9528	regnode * const temp = regnext(scan);
	9529	#ifdef EXPERIMENTAL_INPLACESCAN
	9530	if (PL_regkind[OP(scan)] == EXACT)
	9531	if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
	9532	return EXACT;
	9533	#endif
	9534	if ( exact ) {
	9535	switch (OP(scan)) {
	9536	case EXACT:
	9537	case EXACTF:
	9538	case EXACTFU:
	9539	case EXACTFL:
	9540	if( exact == PSEUDO )
	9541	exact= OP(scan);
	9542	else if ( exact != OP(scan) )
	9543	exact= 0;
	9544	case NOTHING:
	9545	break;
	9546	default:
	9547	exact= 0;
	9548	}
	9549	}
	9550	DEBUG_PARSE_r({
	9551	SV * const mysv=sv_newmortal();
	9552	DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	9553	regprop(RExC_rx, mysv, scan);
	9554	PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	9555	SvPV_nolen_const(mysv),
	9556	REG_NODE_NUM(scan),
	9557	PL_reg_name[exact]);
	9558	});
	9559	if (temp == NULL)
	9560	break;
	9561	scan = temp;
	9562	}
	9563	DEBUG_PARSE_r({
	9564	SV * const mysv_val=sv_newmortal();
	9565	DEBUG_PARSE_MSG("");
	9566	regprop(RExC_rx, mysv_val, val);
	9567	PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	9568	SvPV_nolen_const(mysv_val),
	9569	(IV)REG_NODE_NUM(val),
	9570	(IV)(val - scan)
	9571	);
	9572	});
	9573	if (reg_off_by_arg[OP(scan)]) {
	9574	ARG_SET(scan, val - scan);
	9575	}
	9576	else {
	9577	NEXT_OFF(scan) = val - scan;
	9578	}
	9579
	9580	return exact;
	9581	}
	9582	#endif
	9583
	9584	/*
	9585	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	9586	*/
	9587	#ifdef DEBUGGING
	9588	static void
	9589	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	9590	{
	9591	int bit;
	9592	int set=0;
	9593	regex_charset cs;
	9594
	9595	for (bit=0; bit<32; bit++) {
	9596	if (flags & (1<<bit)) {
	9597	if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
	9598	continue;
	9599	}
	9600	if (!set++ && lead)
	9601	PerlIO_printf(Perl_debug_log, "%s",lead);
	9602	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	9603	}
	9604	}
	9605	if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	9606	if (!set++ && lead) {
	9607	PerlIO_printf(Perl_debug_log, "%s",lead);
	9608	}
	9609	switch (cs) {
	9610	case REGEX_UNICODE_CHARSET:
	9611	PerlIO_printf(Perl_debug_log, "UNICODE");
	9612	break;
	9613	case REGEX_LOCALE_CHARSET:
	9614	PerlIO_printf(Perl_debug_log, "LOCALE");
	9615	break;
	9616	case REGEX_ASCII_RESTRICTED_CHARSET:
	9617	PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	9618	break;
	9619	default:
	9620	PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	9621	break;
	9622	}
	9623	}
	9624	if (lead) {
	9625	if (set)
	9626	PerlIO_printf(Perl_debug_log, "\n");
	9627	else
	9628	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	9629	}
	9630	}
	9631	#endif
	9632
	9633	void
	9634	Perl_regdump(pTHX_ const regexp *r)
	9635	{
	9636	#ifdef DEBUGGING
	9637	dVAR;
	9638	SV * const sv = sv_newmortal();
	9639	SV *dsv= sv_newmortal();
	9640	RXi_GET_DECL(r,ri);
	9641	GET_RE_DEBUG_FLAGS_DECL;
	9642
	9643	PERL_ARGS_ASSERT_REGDUMP;
	9644
	9645	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	9646
	9647	/* Header fields of interest. */
	9648	if (r->anchored_substr) {
	9649	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	9650	RE_SV_DUMPLEN(r->anchored_substr), 30);
	9651	PerlIO_printf(Perl_debug_log,
	9652	"anchored %s%s at %"IVdf" ",
	9653	s, RE_SV_TAIL(r->anchored_substr),
	9654	(IV)r->anchored_offset);
	9655	} else if (r->anchored_utf8) {
	9656	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	9657	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	9658	PerlIO_printf(Perl_debug_log,
	9659	"anchored utf8 %s%s at %"IVdf" ",
	9660	s, RE_SV_TAIL(r->anchored_utf8),
	9661	(IV)r->anchored_offset);
	9662	}
	9663	if (r->float_substr) {
	9664	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	9665	RE_SV_DUMPLEN(r->float_substr), 30);
	9666	PerlIO_printf(Perl_debug_log,
	9667	"floating %s%s at %"IVdf"..%"UVuf" ",
	9668	s, RE_SV_TAIL(r->float_substr),
	9669	(IV)r->float_min_offset, (UV)r->float_max_offset);
	9670	} else if (r->float_utf8) {
	9671	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	9672	RE_SV_DUMPLEN(r->float_utf8), 30);
	9673	PerlIO_printf(Perl_debug_log,
	9674	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	9675	s, RE_SV_TAIL(r->float_utf8),
	9676	(IV)r->float_min_offset, (UV)r->float_max_offset);
	9677	}
	9678	if (r->check_substr \|\| r->check_utf8)
	9679	PerlIO_printf(Perl_debug_log,
	9680	(const char *)
	9681	(r->check_substr == r->float_substr
	9682	&& r->check_utf8 == r->float_utf8
	9683	? "(checking floating" : "(checking anchored"));
	9684	if (r->extflags & RXf_NOSCAN)
	9685	PerlIO_printf(Perl_debug_log, " noscan");
	9686	if (r->extflags & RXf_CHECK_ALL)
	9687	PerlIO_printf(Perl_debug_log, " isall");
	9688	if (r->check_substr \|\| r->check_utf8)
	9689	PerlIO_printf(Perl_debug_log, ") ");
	9690
	9691	if (ri->regstclass) {
	9692	regprop(r, sv, ri->regstclass);
	9693	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	9694	}
	9695	if (r->extflags & RXf_ANCH) {
	9696	PerlIO_printf(Perl_debug_log, "anchored");
	9697	if (r->extflags & RXf_ANCH_BOL)
	9698	PerlIO_printf(Perl_debug_log, "(BOL)");
	9699	if (r->extflags & RXf_ANCH_MBOL)
	9700	PerlIO_printf(Perl_debug_log, "(MBOL)");
	9701	if (r->extflags & RXf_ANCH_SBOL)
	9702	PerlIO_printf(Perl_debug_log, "(SBOL)");
	9703	if (r->extflags & RXf_ANCH_GPOS)
	9704	PerlIO_printf(Perl_debug_log, "(GPOS)");
	9705	PerlIO_putc(Perl_debug_log, ' ');
	9706	}
	9707	if (r->extflags & RXf_GPOS_SEEN)
	9708	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	9709	if (r->intflags & PREGf_SKIP)
	9710	PerlIO_printf(Perl_debug_log, "plus ");
	9711	if (r->intflags & PREGf_IMPLICIT)
	9712	PerlIO_printf(Perl_debug_log, "implicit ");
	9713	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	9714	if (r->extflags & RXf_EVAL_SEEN)
	9715	PerlIO_printf(Perl_debug_log, "with eval ");
	9716	PerlIO_printf(Perl_debug_log, "\n");
	9717	DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
	9718	#else
	9719	PERL_ARGS_ASSERT_REGDUMP;
	9720	PERL_UNUSED_CONTEXT;
	9721	PERL_UNUSED_ARG(r);
	9722	#endif /* DEBUGGING */
	9723	}
	9724
	9725	/*
	9726	- regprop - printable representation of opcode
	9727	*/
	9728	#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
	9729	STMT_START { \
	9730	if (do_sep) { \
	9731	Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
	9732	if (flags & ANYOF_INVERT) \
	9733	/make sure the invert info is in each / \
	9734	sv_catpvs(sv, "^"); \
	9735	do_sep = 0; \
	9736	} \
	9737	} STMT_END
	9738
	9739	void
	9740	Perl_regprop(pTHX_ const regexp prog, SV sv, const regnode *o)
	9741	{
	9742	#ifdef DEBUGGING
	9743	dVAR;
	9744	register int k;
	9745	RXi_GET_DECL(prog,progi);
	9746	GET_RE_DEBUG_FLAGS_DECL;
	9747
	9748	PERL_ARGS_ASSERT_REGPROP;
	9749
	9750	sv_setpvs(sv, "");
	9751
	9752	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	9753	/* It would be nice to FAIL() here, but this may be called from
	9754	regexec.c, and it would be hard to supply pRExC_state. */
	9755	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
	9756	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	9757
	9758	k = PL_regkind[OP(o)];
	9759
	9760	if (k == EXACT) {
	9761	sv_catpvs(sv, " ");
	9762	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	9763	* is a crude hack but it may be the best for now since
	9764	* we have no flag "this EXACTish node was UTF-8"
	9765	* --jhi */
	9766	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	9767	PERL_PV_ESCAPE_UNI_DETECT \|
	9768	PERL_PV_ESCAPE_NONASCII \|
	9769	PERL_PV_PRETTY_ELLIPSES \|
	9770	PERL_PV_PRETTY_LTGT \|
	9771	PERL_PV_PRETTY_NOCLEAR
	9772	);
	9773	} else if (k == TRIE) {
	9774	/* print the details of the trie in dumpuntil instead, as
	9775	* progi->data isn't available here */
	9776	const char op = OP(o);
	9777	const U32 n = ARG(o);
	9778	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	9779	(reg_ac_data *)progi->data->data[n] :
	9780	NULL;
	9781	const reg_trie_data * const trie
	9782	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	9783
	9784	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	9785	DEBUG_TRIE_COMPILE_r(
	9786	Perl_sv_catpvf(aTHX_ sv,
	9787	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	9788	(UV)trie->startstate,
	9789	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	9790	(UV)trie->wordcount,
	9791	(UV)trie->minlen,
	9792	(UV)trie->maxlen,
	9793	(UV)TRIE_CHARCOUNT(trie),
	9794	(UV)trie->uniquecharcount
	9795	)
	9796	);
	9797	if ( IS_ANYOF_TRIE(op) \|\| trie->bitmap ) {
	9798	int i;
	9799	int rangestart = -1;
	9800	U8* bitmap = IS_ANYOF_TRIE(op) ? (U8)ANYOF_BITMAP(o) : (U8)TRIE_BITMAP(trie);
	9801	sv_catpvs(sv, "[");
	9802	for (i = 0; i <= 256; i++) {
	9803	if (i < 256 && BITMAP_TEST(bitmap,i)) {
	9804	if (rangestart == -1)
	9805	rangestart = i;
	9806	} else if (rangestart != -1) {
	9807	if (i <= rangestart + 3)
	9808	for (; rangestart < i; rangestart++)
	9809	put_byte(sv, rangestart);
	9810	else {
	9811	put_byte(sv, rangestart);
	9812	sv_catpvs(sv, "-");
	9813	put_byte(sv, i - 1);
	9814	}
	9815	rangestart = -1;
	9816	}
	9817	}
	9818	sv_catpvs(sv, "]");
	9819	}
	9820
	9821	} else if (k == CURLY) {
	9822	if (OP(o) == CURLYM \|\| OP(o) == CURLYN \|\| OP(o) == CURLYX)
	9823	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	9824	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	9825	}
	9826	else if (k == WHILEM && o->flags) /* Ordinal/of */
	9827	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	9828	else if (k == REF \|\| k == OPEN \|\| k == CLOSE \|\| k == GROUPP \|\| OP(o)==ACCEPT) {
	9829	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	9830	if ( RXp_PAREN_NAMES(prog) ) {
	9831	if ( k != REF \|\| (OP(o) < NREF)) {
	9832	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	9833	SV **name= av_fetch(list, ARG(o), 0 );
	9834	if (name)
	9835	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	9836	}
	9837	else {
	9838	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	9839	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	9840	I32 nums=(I32)SvPVX(sv_dat);
	9841	SV **name= av_fetch(list, nums[0], 0 );
	9842	I32 n;
	9843	if (name) {
	9844	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	9845	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	9846	(n ? "," : ""), (IV)nums[n]);
	9847	}
	9848	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	9849	}
	9850	}
	9851	}
	9852	} else if (k == GOSUB)
	9853	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
	9854	else if (k == VERB) {
	9855	if (!o->flags)
	9856	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	9857	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	9858	} else if (k == LOGICAL)
	9859	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
	9860	else if (k == FOLDCHAR)
	9861	Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
	9862	else if (k == ANYOF) {
	9863	int i, rangestart = -1;
	9864	const U8 flags = ANYOF_FLAGS(o);
	9865	int do_sep = 0;
	9866
	9867	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	9868	static const char * const anyofs[] = {
	9869	"\\w",
	9870	"\\W",
	9871	"\\s",
	9872	"\\S",
	9873	"\\d",
	9874	"\\D",
	9875	"[:alnum:]",
	9876	"[:^alnum:]",
	9877	"[:alpha:]",
	9878	"[:^alpha:]",
	9879	"[:ascii:]",
	9880	"[:^ascii:]",
	9881	"[:cntrl:]",
	9882	"[:^cntrl:]",
	9883	"[:graph:]",
	9884	"[:^graph:]",
	9885	"[:lower:]",
	9886	"[:^lower:]",
	9887	"[:print:]",
	9888	"[:^print:]",
	9889	"[:punct:]",
	9890	"[:^punct:]",
	9891	"[:upper:]",
	9892	"[:^upper:]",
	9893	"[:xdigit:]",
	9894	"[:^xdigit:]",
	9895	"[:space:]",
	9896	"[:^space:]",
	9897	"[:blank:]",
	9898	"[:^blank:]"
	9899	};
	9900
	9901	if (flags & ANYOF_LOCALE)
	9902	sv_catpvs(sv, "{loc}");
	9903	if (flags & ANYOF_LOC_NONBITMAP_FOLD)
	9904	sv_catpvs(sv, "{i}");
	9905	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	9906	if (flags & ANYOF_INVERT)
	9907	sv_catpvs(sv, "^");
	9908
	9909	/* output what the standard cp 0-255 bitmap matches */
	9910	for (i = 0; i <= 256; i++) {
	9911	if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
	9912	if (rangestart == -1)
	9913	rangestart = i;
	9914	} else if (rangestart != -1) {
	9915	if (i <= rangestart + 3)
	9916	for (; rangestart < i; rangestart++)
	9917	put_byte(sv, rangestart);
	9918	else {
	9919	put_byte(sv, rangestart);
	9920	sv_catpvs(sv, "-");
	9921	put_byte(sv, i - 1);
	9922	}
	9923	do_sep = 1;
	9924	rangestart = -1;
	9925	}
	9926	}
	9927
	9928	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	9929	/* output any special charclass tests (used entirely under use locale) */
	9930	if (ANYOF_CLASS_TEST_ANY_SET(o))
	9931	for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
	9932	if (ANYOF_CLASS_TEST(o,i)) {
	9933	sv_catpv(sv, anyofs[i]);
	9934	do_sep = 1;
	9935	}
	9936
	9937	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	9938
	9939	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	9940	sv_catpvs(sv, "{non-utf8-latin1-all}");
	9941	}
	9942
	9943	/* output information about the unicode matching */
	9944	if (flags & ANYOF_UNICODE_ALL)
	9945	sv_catpvs(sv, "{unicode_all}");
	9946	else if (flags & ANYOF_UTF8)
	9947	sv_catpvs(sv, "{unicode}");
	9948	if (flags & ANYOF_NONBITMAP_NON_UTF8)
	9949	sv_catpvs(sv, "{outside bitmap}");
	9950
	9951	{
	9952	SV *lv;
	9953	SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
	9954
	9955	if (lv) {
	9956	if (sw) {
	9957	U8 s[UTF8_MAXBYTES_CASE+1];
	9958
	9959	for (i = 0; i <= 256; i++) { /* just the first 256 */
	9960	uvchr_to_utf8(s, i);
	9961
	9962	if (i < 256 && swash_fetch(sw, s, TRUE)) {
	9963	if (rangestart == -1)
	9964	rangestart = i;
	9965	} else if (rangestart != -1) {
	9966	if (i <= rangestart + 3)
	9967	for (; rangestart < i; rangestart++) {
	9968	const U8 * const e = uvchr_to_utf8(s,rangestart);
	9969	U8 *p;
	9970	for(p = s; p < e; p++)
	9971	put_byte(sv, *p);
	9972	}
	9973	else {
	9974	const U8 *e = uvchr_to_utf8(s,rangestart);
	9975	U8 *p;
	9976	for (p = s; p < e; p++)
	9977	put_byte(sv, *p);
	9978	sv_catpvs(sv, "-");
	9979	e = uvchr_to_utf8(s, i-1);
	9980	for (p = s; p < e; p++)
	9981	put_byte(sv, *p);
	9982	}
	9983	rangestart = -1;
	9984	}
	9985	}
	9986
	9987	sv_catpvs(sv, "..."); /* et cetera */
	9988	}
	9989
	9990	{
	9991	char *s = savesvpv(lv);
	9992	char * const origs = s;
	9993
	9994	while (s && s != '\n')
	9995	s++;
	9996
	9997	if (*s == '\n') {
	9998	const char * const t = ++s;
	9999
	10000	while (*s) {
	10001	if (*s == '\n')
	10002	*s = ' ';
	10003	s++;
	10004	}
	10005	if (s[-1] == ' ')
	10006	s[-1] = 0;
	10007
	10008	sv_catpv(sv, t);
	10009	}
	10010
	10011	Safefree(origs);
	10012	}
	10013	}
	10014	}
	10015
	10016	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	10017	}
	10018	else if (k == BRANCHJ && (OP(o) == UNLESSM \|\| OP(o) == IFMATCH))
	10019	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	10020	#else
	10021	PERL_UNUSED_CONTEXT;
	10022	PERL_UNUSED_ARG(sv);
	10023	PERL_UNUSED_ARG(o);
	10024	PERL_UNUSED_ARG(prog);
	10025	#endif /* DEBUGGING */
	10026	}
	10027
	10028	SV *
	10029	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	10030	{ /* Assume that RE_INTUIT is set */
	10031	dVAR;
	10032	struct regexp const prog = (struct regexp )SvANY(r);
	10033	GET_RE_DEBUG_FLAGS_DECL;
	10034
	10035	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	10036	PERL_UNUSED_CONTEXT;
	10037
	10038	DEBUG_COMPILE_r(
	10039	{
	10040	const char * const s = SvPV_nolen_const(prog->check_substr
	10041	? prog->check_substr : prog->check_utf8);
	10042
	10043	if (!PL_colorset) reginitcolors();
	10044	PerlIO_printf(Perl_debug_log,
	10045	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	10046	PL_colors[4],
	10047	prog->check_substr ? "" : "utf8 ",
	10048	PL_colors[5],PL_colors[0],
	10049	s,
	10050	PL_colors[1],
	10051	(strlen(s) > 60 ? "..." : ""));
	10052	} );
	10053
	10054	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	10055	}
	10056
	10057	/*
	10058	pregfree()
	10059
	10060	handles refcounting and freeing the perl core regexp structure. When
	10061	it is necessary to actually free the structure the first thing it
	10062	does is call the 'free' method of the regexp_engine associated to
	10063	the regexp, allowing the handling of the void *pprivate; member
	10064	first. (This routine is not overridable by extensions, which is why
	10065	the extensions free is called first.)
	10066
	10067	See regdupe and regdupe_internal if you change anything here.
	10068	*/
	10069	#ifndef PERL_IN_XSUB_RE
	10070	void
	10071	Perl_pregfree(pTHX_ REGEXP *r)
	10072	{
	10073	SvREFCNT_dec(r);
	10074	}
	10075
	10076	void
	10077	Perl_pregfree2(pTHX_ REGEXP *rx)
	10078	{
	10079	dVAR;
	10080	struct regexp const r = (struct regexp )SvANY(rx);
	10081	GET_RE_DEBUG_FLAGS_DECL;
	10082
	10083	PERL_ARGS_ASSERT_PREGFREE2;
	10084
	10085	if (r->mother_re) {
	10086	ReREFCNT_dec(r->mother_re);
	10087	} else {
	10088	CALLREGFREE_PVT(rx); /* free the private data */
	10089	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	10090	}
	10091	if (r->substrs) {
	10092	SvREFCNT_dec(r->anchored_substr);
	10093	SvREFCNT_dec(r->anchored_utf8);
	10094	SvREFCNT_dec(r->float_substr);
	10095	SvREFCNT_dec(r->float_utf8);
	10096	Safefree(r->substrs);
	10097	}
	10098	RX_MATCH_COPY_FREE(rx);
	10099	#ifdef PERL_OLD_COPY_ON_WRITE
	10100	SvREFCNT_dec(r->saved_copy);
	10101	#endif
	10102	Safefree(r->offs);
	10103	}
	10104
	10105	/* reg_temp_copy()
	10106
	10107	This is a hacky workaround to the structural issue of match results
	10108	being stored in the regexp structure which is in turn stored in
	10109	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	10110	could be PL_curpm in multiple contexts, and could require multiple
	10111	result sets being associated with the pattern simultaneously, such
	10112	as when doing a recursive match with (??{$qr})
	10113
	10114	The solution is to make a lightweight copy of the regexp structure
	10115	when a qr// is returned from the code executed by (??{$qr}) this
	10116	lightweight copy doesn't actually own any of its data except for
	10117	the starp/end and the actual regexp structure itself.
	10118
	10119	*/
	10120
	10121
	10122	REGEXP *
	10123	Perl_reg_temp_copy (pTHX_ REGEXP ret_x, REGEXP rx)
	10124	{
	10125	struct regexp *ret;
	10126	struct regexp const r = (struct regexp )SvANY(rx);
	10127	register const I32 npar = r->nparens+1;
	10128
	10129	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	10130
	10131	if (!ret_x)
	10132	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	10133	ret = (struct regexp *)SvANY(ret_x);
	10134
	10135	(void)ReREFCNT_inc(rx);
	10136	/* We can take advantage of the existing "copied buffer" mechanism in SVs
	10137	by pointing directly at the buffer, but flagging that the allocated
	10138	space in the copy is zero. As we've just done a struct copy, it's now
	10139	a case of zero-ing that, rather than copying the current length. */
	10140	SvPV_set(ret_x, RX_WRAPPED(rx));
	10141	SvFLAGS(ret_x) \|= SvFLAGS(rx) & (SVf_POK\|SVp_POK\|SVf_UTF8);
	10142	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	10143	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	10144	SvLEN_set(ret_x, 0);
	10145	SvSTASH_set(ret_x, NULL);
	10146	SvMAGIC_set(ret_x, NULL);
	10147	Newx(ret->offs, npar, regexp_paren_pair);
	10148	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	10149	if (r->substrs) {
	10150	Newx(ret->substrs, 1, struct reg_substr_data);
	10151	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	10152
	10153	SvREFCNT_inc_void(ret->anchored_substr);
	10154	SvREFCNT_inc_void(ret->anchored_utf8);
	10155	SvREFCNT_inc_void(ret->float_substr);
	10156	SvREFCNT_inc_void(ret->float_utf8);
	10157
	10158	/* check_substr and check_utf8, if non-NULL, point to either their
	10159	anchored or float namesakes, and don't hold a second reference. */
	10160	}
	10161	RX_MATCH_COPIED_off(ret_x);
	10162	#ifdef PERL_OLD_COPY_ON_WRITE
	10163	ret->saved_copy = NULL;
	10164	#endif
	10165	ret->mother_re = rx;
	10166
	10167	return ret_x;
	10168	}
	10169	#endif
	10170
	10171	/* regfree_internal()
	10172
	10173	Free the private data in a regexp. This is overloadable by
	10174	extensions. Perl takes care of the regexp structure in pregfree(),
	10175	this covers the *pprivate pointer which technically perl doesn't
	10176	know about, however of course we have to handle the
	10177	regexp_internal structure when no extension is in use.
	10178
	10179	Note this is called before freeing anything in the regexp
	10180	structure.
	10181	*/
	10182
	10183	void
	10184	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	10185	{
	10186	dVAR;
	10187	struct regexp const r = (struct regexp )SvANY(rx);
	10188	RXi_GET_DECL(r,ri);
	10189	GET_RE_DEBUG_FLAGS_DECL;
	10190
	10191	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	10192
	10193	DEBUG_COMPILE_r({
	10194	if (!PL_colorset)
	10195	reginitcolors();
	10196	{
	10197	SV *dsv= sv_newmortal();
	10198	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	10199	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	10200	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	10201	PL_colors[4],PL_colors[5],s);
	10202	}
	10203	});
	10204	#ifdef RE_TRACK_PATTERN_OFFSETS
	10205	if (ri->u.offsets)
	10206	Safefree(ri->u.offsets); /* 20010421 MJD */
	10207	#endif
	10208	if (ri->data) {
	10209	int n = ri->data->count;
	10210	PAD* new_comppad = NULL;
	10211	PAD* old_comppad;
	10212	PADOFFSET refcnt;
	10213
	10214	while (--n >= 0) {
	10215	/* If you add a ->what type here, update the comment in regcomp.h */
	10216	switch (ri->data->what[n]) {
	10217	case 'a':
	10218	case 's':
	10219	case 'S':
	10220	case 'u':
	10221	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	10222	break;
	10223	case 'f':
	10224	Safefree(ri->data->data[n]);
	10225	break;
	10226	case 'p':
	10227	new_comppad = MUTABLE_AV(ri->data->data[n]);
	10228	break;
	10229	case 'o':
	10230	if (new_comppad == NULL)
	10231	Perl_croak(aTHX_ "panic: pregfree comppad");
	10232	PAD_SAVE_LOCAL(old_comppad,
	10233	/* Watch out for global destruction's random ordering. */
	10234	(SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
	10235	);
	10236	OP_REFCNT_LOCK;
	10237	refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
	10238	OP_REFCNT_UNLOCK;
	10239	if (!refcnt)
	10240	op_free((OP_4tree*)ri->data->data[n]);
	10241
	10242	PAD_RESTORE_LOCAL(old_comppad);
	10243	SvREFCNT_dec(MUTABLE_SV(new_comppad));
	10244	new_comppad = NULL;
	10245	break;
	10246	case 'n':
	10247	break;
	10248	case 'T':
	10249	{ /* Aho Corasick add-on structure for a trie node.
	10250	Used in stclass optimization only */
	10251	U32 refcount;
	10252	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	10253	OP_REFCNT_LOCK;
	10254	refcount = --aho->refcount;
	10255	OP_REFCNT_UNLOCK;
	10256	if ( !refcount ) {
	10257	PerlMemShared_free(aho->states);
	10258	PerlMemShared_free(aho->fail);
	10259	/* do this last!!!! */
	10260	PerlMemShared_free(ri->data->data[n]);
	10261	PerlMemShared_free(ri->regstclass);
	10262	}
	10263	}
	10264	break;
	10265	case 't':
	10266	{
	10267	/* trie structure. */
	10268	U32 refcount;
	10269	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	10270	OP_REFCNT_LOCK;
	10271	refcount = --trie->refcount;
	10272	OP_REFCNT_UNLOCK;
	10273	if ( !refcount ) {
	10274	PerlMemShared_free(trie->charmap);
	10275	PerlMemShared_free(trie->states);
	10276	PerlMemShared_free(trie->trans);
	10277	if (trie->bitmap)
	10278	PerlMemShared_free(trie->bitmap);
	10279	if (trie->jump)
	10280	PerlMemShared_free(trie->jump);
	10281	PerlMemShared_free(trie->wordinfo);
	10282	/* do this last!!!! */
	10283	PerlMemShared_free(ri->data->data[n]);
	10284	}
	10285	}
	10286	break;
	10287	default:
	10288	Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
	10289	}
	10290	}
	10291	Safefree(ri->data->what);
	10292	Safefree(ri->data);
	10293	}
	10294
	10295	Safefree(ri);
	10296	}
	10297
	10298	#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
	10299	#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
	10300	#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
	10301
	10302	/*
	10303	re_dup - duplicate a regexp.
	10304
	10305	This routine is expected to clone a given regexp structure. It is only
	10306	compiled under USE_ITHREADS.
	10307
	10308	After all of the core data stored in struct regexp is duplicated
	10309	the regexp_engine.dupe method is used to copy any private data
	10310	stored in the *pprivate pointer. This allows extensions to handle
	10311	any duplication it needs to do.
	10312
	10313	See pregfree() and regfree_internal() if you change anything here.
	10314	*/
	10315	#if defined(USE_ITHREADS)
	10316	#ifndef PERL_IN_XSUB_RE
	10317	void
	10318	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	10319	{
	10320	dVAR;
	10321	I32 npar;
	10322	const struct regexp r = (const struct regexp )SvANY(sstr);
	10323	struct regexp ret = (struct regexp )SvANY(dstr);
	10324
	10325	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	10326
	10327	npar = r->nparens+1;
	10328	Newx(ret->offs, npar, regexp_paren_pair);
	10329	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	10330	if(ret->swap) {
	10331	/* no need to copy these */
	10332	Newx(ret->swap, npar, regexp_paren_pair);
	10333	}
	10334
	10335	if (ret->substrs) {
	10336	/* Do it this way to avoid reading from *r after the StructCopy().
	10337	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	10338	cache, it doesn't matter. */
	10339	const bool anchored = r->check_substr
	10340	? r->check_substr == r->anchored_substr
	10341	: r->check_utf8 == r->anchored_utf8;
	10342	Newx(ret->substrs, 1, struct reg_substr_data);
	10343	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	10344
	10345	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	10346	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	10347	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	10348	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	10349
	10350	/* check_substr and check_utf8, if non-NULL, point to either their
	10351	anchored or float namesakes, and don't hold a second reference. */
	10352
	10353	if (ret->check_substr) {
	10354	if (anchored) {
	10355	assert(r->check_utf8 == r->anchored_utf8);
	10356	ret->check_substr = ret->anchored_substr;
	10357	ret->check_utf8 = ret->anchored_utf8;
	10358	} else {
	10359	assert(r->check_substr == r->float_substr);
	10360	assert(r->check_utf8 == r->float_utf8);
	10361	ret->check_substr = ret->float_substr;
	10362	ret->check_utf8 = ret->float_utf8;
	10363	}
	10364	} else if (ret->check_utf8) {
	10365	if (anchored) {
	10366	ret->check_utf8 = ret->anchored_utf8;
	10367	} else {
	10368	ret->check_utf8 = ret->float_utf8;
	10369	}
	10370	}
	10371	}
	10372
	10373	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	10374
	10375	if (ret->pprivate)
	10376	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	10377
	10378	if (RX_MATCH_COPIED(dstr))
	10379	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	10380	else
	10381	ret->subbeg = NULL;
	10382	#ifdef PERL_OLD_COPY_ON_WRITE
	10383	ret->saved_copy = NULL;
	10384	#endif
	10385
	10386	if (ret->mother_re) {
	10387	if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
	10388	/* Our storage points directly to our mother regexp, but that's
	10389	1: a buffer in a different thread
	10390	2: something we no longer hold a reference on
	10391	so we need to copy it locally. */
	10392	/* Note we need to sue SvCUR() on our mother_re, because it, in
	10393	turn, may well be pointing to its own mother_re. */
	10394	SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
	10395	SvCUR(ret->mother_re)+1));
	10396	SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
	10397	}
	10398	ret->mother_re = NULL;
	10399	}
	10400	ret->gofs = 0;
	10401	}
	10402	#endif /* PERL_IN_XSUB_RE */
	10403
	10404	/*
	10405	regdupe_internal()
	10406
	10407	This is the internal complement to regdupe() which is used to copy
	10408	the structure pointed to by the *pprivate pointer in the regexp.
	10409	This is the core version of the extension overridable cloning hook.
	10410	The regexp structure being duplicated will be copied by perl prior
	10411	to this and will be provided as the regexp *r argument, however
	10412	with the /old/ structures pprivate pointer value. Thus this routine
	10413	may override any copying normally done by perl.
	10414
	10415	It returns a pointer to the new regexp_internal structure.
	10416	*/
	10417
	10418	void *
	10419	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	10420	{
	10421	dVAR;
	10422	struct regexp const r = (struct regexp )SvANY(rx);
	10423	regexp_internal *reti;
	10424	int len, npar;
	10425	RXi_GET_DECL(r,ri);
	10426
	10427	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	10428
	10429	npar = r->nparens+1;
	10430	len = ProgLen(ri);
	10431
	10432	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
	10433	Copy(ri->program, reti->program, len+1, regnode);
	10434
	10435
	10436	reti->regstclass = NULL;
	10437
	10438	if (ri->data) {
	10439	struct reg_data *d;
	10440	const int count = ri->data->count;
	10441	int i;
	10442
	10443	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	10444	char, struct reg_data);
	10445	Newx(d->what, count, U8);
	10446
	10447	d->count = count;
	10448	for (i = 0; i < count; i++) {
	10449	d->what[i] = ri->data->what[i];
	10450	switch (d->what[i]) {
	10451	/* legal options are one of: sSfpontTua
	10452	see also regcomp.h and pregfree() */
	10453	case 'a': /* actually an AV, but the dup function is identical. */
	10454	case 's':
	10455	case 'S':
	10456	case 'p': /* actually an AV, but the dup function is identical. */
	10457	case 'u': /* actually an HV, but the dup function is identical. */
	10458	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	10459	break;
	10460	case 'f':
	10461	/* This is cheating. */
	10462	Newx(d->data[i], 1, struct regnode_charclass_class);
	10463	StructCopy(ri->data->data[i], d->data[i],
	10464	struct regnode_charclass_class);
	10465	reti->regstclass = (regnode*)d->data[i];
	10466	break;
	10467	case 'o':
	10468	/* Compiled op trees are readonly and in shared memory,
	10469	and can thus be shared without duplication. */
	10470	OP_REFCNT_LOCK;
	10471	d->data[i] = (void)OpREFCNT_inc((OP)ri->data->data[i]);
	10472	OP_REFCNT_UNLOCK;
	10473	break;
	10474	case 'T':
	10475	/* Trie stclasses are readonly and can thus be shared
	10476	* without duplication. We free the stclass in pregfree
	10477	* when the corresponding reg_ac_data struct is freed.
	10478	*/
	10479	reti->regstclass= ri->regstclass;
	10480	/* Fall through */
	10481	case 't':
	10482	OP_REFCNT_LOCK;
	10483	((reg_trie_data*)ri->data->data[i])->refcount++;
	10484	OP_REFCNT_UNLOCK;
	10485	/* Fall through */
	10486	case 'n':
	10487	d->data[i] = ri->data->data[i];
	10488	break;
	10489	default:
	10490	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
	10491	}
	10492	}
	10493
	10494	reti->data = d;
	10495	}
	10496	else
	10497	reti->data = NULL;
	10498
	10499	reti->name_list_idx = ri->name_list_idx;
	10500
	10501	#ifdef RE_TRACK_PATTERN_OFFSETS
	10502	if (ri->u.offsets) {
	10503	Newx(reti->u.offsets, 2*len+1, U32);
	10504	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	10505	}
	10506	#else
	10507	SetProgLen(reti,len);
	10508	#endif
	10509
	10510	return (void*)reti;
	10511	}
	10512
	10513	#endif /* USE_ITHREADS */
	10514
	10515	#ifndef PERL_IN_XSUB_RE
	10516
	10517	/*
	10518	- regnext - dig the "next" pointer out of a node
	10519	*/
	10520	regnode *
	10521	Perl_regnext(pTHX_ register regnode *p)
	10522	{
	10523	dVAR;
	10524	register I32 offset;
	10525
	10526	if (!p)
	10527	return(NULL);
	10528
	10529	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	10530	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
	10531	}
	10532
	10533	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	10534	if (offset == 0)
	10535	return(NULL);
	10536
	10537	return(p+offset);
	10538	}
	10539	#endif
	10540
	10541	STATIC void
	10542	S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
	10543	{
	10544	va_list args;
	10545	STRLEN l1 = strlen(pat1);
	10546	STRLEN l2 = strlen(pat2);
	10547	char buf[512];
	10548	SV *msv;
	10549	const char *message;
	10550
	10551	PERL_ARGS_ASSERT_RE_CROAK2;
	10552
	10553	if (l1 > 510)
	10554	l1 = 510;
	10555	if (l1 + l2 > 510)
	10556	l2 = 510 - l1;
	10557	Copy(pat1, buf, l1 , char);
	10558	Copy(pat2, buf + l1, l2 , char);
	10559	buf[l1 + l2] = '\n';
	10560	buf[l1 + l2 + 1] = '\0';
	10561	#ifdef I_STDARG
	10562	/* ANSI variant takes additional second argument */
	10563	va_start(args, pat2);
	10564	#else
	10565	va_start(args);
	10566	#endif
	10567	msv = vmess(buf, &args);
	10568	va_end(args);
	10569	message = SvPV_const(msv,l1);
	10570	if (l1 > 512)
	10571	l1 = 512;
	10572	Copy(message, buf, l1 , char);
	10573	buf[l1-1] = '\0'; /* Overwrite \n */
	10574	Perl_croak(aTHX_ "%s", buf);
	10575	}
	10576
	10577	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	10578
	10579	#ifndef PERL_IN_XSUB_RE
	10580	void
	10581	Perl_save_re_context(pTHX)
	10582	{
	10583	dVAR;
	10584
	10585	struct re_save_state *state;
	10586
	10587	SAVEVPTR(PL_curcop);
	10588	SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
	10589
	10590	state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
	10591	PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
	10592	SSPUSHUV(SAVEt_RE_STATE);
	10593
	10594	Copy(&PL_reg_state, state, 1, struct re_save_state);
	10595
	10596	PL_reg_start_tmp = 0;
	10597	PL_reg_start_tmpl = 0;
	10598	PL_reg_oldsaved = NULL;
	10599	PL_reg_oldsavedlen = 0;
	10600	PL_reg_maxiter = 0;
	10601	PL_reg_leftiter = 0;
	10602	PL_reg_poscache = NULL;
	10603	PL_reg_poscache_size = 0;
	10604	#ifdef PERL_OLD_COPY_ON_WRITE
	10605	PL_nrs = NULL;
	10606	#endif
	10607
	10608	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	10609	if (PL_curpm) {
	10610	const REGEXP * const rx = PM_GETRE(PL_curpm);
	10611	if (rx) {
	10612	U32 i;
	10613	for (i = 1; i <= RX_NPARENS(rx); i++) {
	10614	char digits[TYPE_CHARS(long)];
	10615	const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
	10616	GV const const gvp
	10617	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	10618
	10619	if (gvp) {
	10620	GV * const gv = *gvp;
	10621	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	10622	save_scalar(gv);
	10623	}
	10624	}
	10625	}
	10626	}
	10627	}
	10628	#endif
	10629
	10630	static void
	10631	clear_re(pTHX_ void *r)
	10632	{
	10633	dVAR;
	10634	ReREFCNT_dec((REGEXP *)r);
	10635	}
	10636
	10637	#ifdef DEBUGGING
	10638
	10639	STATIC void
	10640	S_put_byte(pTHX_ SV *sv, int c)
	10641	{
	10642	PERL_ARGS_ASSERT_PUT_BYTE;
	10643
	10644	/* Our definition of isPRINT() ignores locales, so only bytes that are
	10645	not part of UTF-8 are considered printable. I assume that the same
	10646	holds for UTF-EBCDIC.
	10647	Also, code point 255 is not printable in either (it's E0 in EBCDIC,
	10648	which Wikipedia says:
	10649
	10650	EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
	10651	ones (binary 1111 1111, hexadecimal FF). It is similar, but not
	10652	identical, to the ASCII delete (DEL) or rubout control character.
	10653	) So the old condition can be simplified to !isPRINT(c) */
	10654	if (!isPRINT(c)) {
	10655	if (c < 256) {
	10656	Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
	10657	}
	10658	else {
	10659	Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
	10660	}
	10661	}
	10662	else {
	10663	const char string = c;
	10664	if (c == '-' \|\| c == ']' \|\| c == '\\' \|\| c == '^')
	10665	sv_catpvs(sv, "\\");
	10666	sv_catpvn(sv, &string, 1);
	10667	}
	10668	}
	10669
	10670
	10671	#define CLEAR_OPTSTART \
	10672	if (optstart) STMT_START { \
	10673	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	10674	optstart=NULL; \
	10675	} STMT_END
	10676
	10677	#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
	10678
	10679	STATIC const regnode *
	10680	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	10681	const regnode last, const regnode plast,
	10682	SV* sv, I32 indent, U32 depth)
	10683	{
	10684	dVAR;
	10685	register U8 op = PSEUDO; /* Arbitrary non-END op. */
	10686	register const regnode *next;
	10687	const regnode *optstart= NULL;
	10688
	10689	RXi_GET_DECL(r,ri);
	10690	GET_RE_DEBUG_FLAGS_DECL;
	10691
	10692	PERL_ARGS_ASSERT_DUMPUNTIL;
	10693
	10694	#ifdef DEBUG_DUMPUNTIL
	10695	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	10696	last ? last-start : 0,plast ? plast-start : 0);
	10697	#endif
	10698
	10699	if (plast && plast < last)
	10700	last= plast;
	10701
	10702	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	10703	/* While that wasn't END last time... */
	10704	NODE_ALIGN(node);
	10705	op = OP(node);
	10706	if (op == CLOSE \|\| op == WHILEM)
	10707	indent--;
	10708	next = regnext((regnode *)node);
	10709
	10710	/* Where, what. */
	10711	if (OP(node) == OPTIMIZED) {
	10712	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	10713	optstart = node;
	10714	else
	10715	goto after_print;
	10716	} else
	10717	CLEAR_OPTSTART;
	10718
	10719	regprop(r, sv, node);
	10720	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	10721	(int)(2*indent + 1), "", SvPVX_const(sv));
	10722
	10723	if (OP(node) != OPTIMIZED) {
	10724	if (next == NULL) /* Next ptr. */
	10725	PerlIO_printf(Perl_debug_log, " (0)");
	10726	else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
	10727	PerlIO_printf(Perl_debug_log, " (FAIL)");
	10728	else
	10729	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	10730	(void)PerlIO_putc(Perl_debug_log, '\n');
	10731	}
	10732
	10733	after_print:
	10734	if (PL_regkind[(U8)op] == BRANCHJ) {
	10735	assert(next);
	10736	{
	10737	register const regnode *nnode = (OP(next) == LONGJMP
	10738	? regnext((regnode *)next)
	10739	: next);
	10740	if (last && nnode > last)
	10741	nnode = last;
	10742	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	10743	}
	10744	}
	10745	else if (PL_regkind[(U8)op] == BRANCH) {
	10746	assert(next);
	10747	DUMPUNTIL(NEXTOPER(node), next);
	10748	}
	10749	else if ( PL_regkind[(U8)op] == TRIE ) {
	10750	const regnode *this_trie = node;
	10751	const char op = OP(node);
	10752	const U32 n = ARG(node);
	10753	const reg_ac_data * const ac = op>=AHOCORASICK ?
	10754	(reg_ac_data *)ri->data->data[n] :
	10755	NULL;
	10756	const reg_trie_data * const trie =
	10757	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	10758	#ifdef DEBUGGING
	10759	AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	10760	#endif
	10761	const regnode *nextbranch= NULL;
	10762	I32 word_idx;
	10763	sv_setpvs(sv, "");
	10764	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	10765	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	10766
	10767	PerlIO_printf(Perl_debug_log, "%*s%s ",
	10768	(int)(2*(indent+3)), "",
	10769	elem_ptr ? pv_pretty(sv, SvPV_nolen_const(elem_ptr), SvCUR(elem_ptr), 60,
	10770	PL_colors[0], PL_colors[1],
	10771	(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) \|
	10772	PERL_PV_PRETTY_ELLIPSES \|
	10773	PERL_PV_PRETTY_LTGT
	10774	)
	10775	: "???"
	10776	);
	10777	if (trie->jump) {
	10778	U16 dist= trie->jump[word_idx+1];
	10779	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	10780	(UV)((dist ? this_trie + dist : next) - start));
	10781	if (dist) {
	10782	if (!nextbranch)
	10783	nextbranch= this_trie + trie->jump[0];
	10784	DUMPUNTIL(this_trie + dist, nextbranch);
	10785	}
	10786	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	10787	nextbranch= regnext((regnode *)nextbranch);
	10788	} else {
	10789	PerlIO_printf(Perl_debug_log, "\n");
	10790	}
	10791	}
	10792	if (last && next > last)
	10793	node= last;
	10794	else
	10795	node= next;
	10796	}
	10797	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	10798	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	10799	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	10800	}
	10801	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	10802	assert(next);
	10803	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	10804	}
	10805	else if ( op == PLUS \|\| op == STAR) {
	10806	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	10807	}
	10808	else if (PL_regkind[(U8)op] == ANYOF) {
	10809	/* arglen 1 + class block */
	10810	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
	10811	? ANYOF_CLASS_SKIP : ANYOF_SKIP);
	10812	node = NEXTOPER(node);
	10813	}
	10814	else if (PL_regkind[(U8)op] == EXACT) {
	10815	/* Literal string, where present. */
	10816	node += NODE_SZ_STR(node) - 1;
	10817	node = NEXTOPER(node);
	10818	}
	10819	else {
	10820	node = NEXTOPER(node);
	10821	node += regarglen[(U8)op];
	10822	}
	10823	if (op == CURLYX \|\| op == OPEN)
	10824	indent++;
	10825	}
	10826	CLEAR_OPTSTART;
	10827	#ifdef DEBUG_DUMPUNTIL
	10828	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	10829	#endif
	10830	return node;
	10831	}
	10832
	10833	#endif /* DEBUGGING */
	10834
	10835	/*
	10836	* Local variables:
	10837	* c-indentation-style: bsd
	10838	* c-basic-offset: 4
	10839	* indent-tabs-mode: t
	10840	* End:
	10841	*
	10842	* ex: set ts=8 sts=4 sw=4 noet:
	10843	*/